diff --git a/iafd/src/fetch.py b/iafd/src/fetch.py index bc31628..728801b 100644 --- a/iafd/src/fetch.py +++ b/iafd/src/fetch.py @@ -241,7 +241,7 @@ def fetch_movies_by_dist(): else : logging.warning(f'parse_page_movie error. url: {url}') time.sleep(1) - elif status_code and status_code == 404: + elif status_code and status_code in [scraper.http_code_404, scraper.http_code_login, scraper.http_code_url]: logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...') break else: @@ -281,7 +281,7 @@ def fetch_movies_by_stu(): else : logging.warning(f'parse_page_movie error. url: {url}') time.sleep(1) - elif status_code and status_code == 404: + elif status_code and status_code in [scraper.http_code_404, scraper.http_code_login, scraper.http_code_url]: logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...') break else: diff --git a/iafd/src/iafd_scraper.py b/iafd/src/iafd_scraper.py index da4992e..613aa70 100644 --- a/iafd/src/iafd_scraper.py +++ b/iafd/src/iafd_scraper.py @@ -70,7 +70,7 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor return None, http_code_404 # 直接返回 404,调用方可以跳过 response.raise_for_status() # 处理 HTTP 错误 - + # 过期的网页,与404相同处理 if "invalid or outdated page" in response.text.lower(): logging.debug(f"invalid or outdated page: {url}") @@ -85,6 +85,11 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor soup = BeautifulSoup(html_text, parser) if validator(soup): # 进行自定义页面检查 return soup, response.status_code + else: + # 检查是否发生跳转,比如到登录页面 + if response.history: + logging.warning(f"Page redirected on {url}. Validation failed.") + return None, http_code_login logging.warning(f"Validation failed on attempt {attempt + 1} for {url}") except cloudscraper.exceptions.CloudflareChallengeError as e: