From f1a92878342b10a899f2dc881bb7b81c23d66778 Mon Sep 17 00:00:00 2001 From: oscarz Date: Wed, 2 Jul 2025 09:05:59 +0800 Subject: [PATCH] modify scripts --- iafd/src/fetch.py | 4 ++-- iafd/src/iafd_scraper.py | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/iafd/src/fetch.py b/iafd/src/fetch.py index bc31628..728801b 100644 --- a/iafd/src/fetch.py +++ b/iafd/src/fetch.py @@ -241,7 +241,7 @@ def fetch_movies_by_dist(): else : logging.warning(f'parse_page_movie error. url: {url}') time.sleep(1) - elif status_code and status_code == 404: + elif status_code and status_code in [scraper.http_code_404, scraper.http_code_login, scraper.http_code_url]: logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...') break else: @@ -281,7 +281,7 @@ def fetch_movies_by_stu(): else : logging.warning(f'parse_page_movie error. url: {url}') time.sleep(1) - elif status_code and status_code == 404: + elif status_code and status_code in [scraper.http_code_404, scraper.http_code_login, scraper.http_code_url]: logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...') break else: diff --git a/iafd/src/iafd_scraper.py b/iafd/src/iafd_scraper.py index da4992e..613aa70 100644 --- a/iafd/src/iafd_scraper.py +++ b/iafd/src/iafd_scraper.py @@ -70,7 +70,7 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor return None, http_code_404 # 直接返回 404,调用方可以跳过 response.raise_for_status() # 处理 HTTP 错误 - + # 过期的网页,与404相同处理 if "invalid or outdated page" in response.text.lower(): logging.debug(f"invalid or outdated page: {url}") @@ -85,6 +85,11 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor soup = BeautifulSoup(html_text, parser) if validator(soup): # 进行自定义页面检查 return soup, response.status_code + else: + # 检查是否发生跳转,比如到登录页面 + if response.history: + logging.warning(f"Page redirected on {url}. Validation failed.") + return None, http_code_login logging.warning(f"Validation failed on attempt {attempt + 1} for {url}") except cloudscraper.exceptions.CloudflareChallengeError as e: