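Modify scripts: skip pages on http_code_login / http_code_url as well as 404, and return http_code_login when a redirected page fails validation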

This commit is contained in:
oscarz
2025-07-02 09:05:59 +08:00
parent 8cd0a67b64
commit f1a9287834
2 changed files with 8 additions and 3 deletions

View File

@@ -241,7 +241,7 @@ def fetch_movies_by_dist():
else : else :
logging.warning(f'parse_page_movie error. url: {url}') logging.warning(f'parse_page_movie error. url: {url}')
time.sleep(1) time.sleep(1)
elif status_code and status_code == 404: elif status_code and status_code in [scraper.http_code_404, scraper.http_code_login, scraper.http_code_url]:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...') logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
break break
else: else:
@@ -281,7 +281,7 @@ def fetch_movies_by_stu():
else : else :
logging.warning(f'parse_page_movie error. url: {url}') logging.warning(f'parse_page_movie error. url: {url}')
time.sleep(1) time.sleep(1)
elif status_code and status_code == 404: elif status_code and status_code in [scraper.http_code_404, scraper.http_code_login, scraper.http_code_url]:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...') logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
break break
else: else:

View File

@@ -70,7 +70,7 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
return None, http_code_404 # 直接返回 404调用方可以跳过 return None, http_code_404 # 直接返回 404调用方可以跳过
response.raise_for_status() # 处理 HTTP 错误 response.raise_for_status() # 处理 HTTP 错误
# 过期的网页与404相同处理 # 过期的网页与404相同处理
if "invalid or outdated page" in response.text.lower(): if "invalid or outdated page" in response.text.lower():
logging.debug(f"invalid or outdated page: {url}") logging.debug(f"invalid or outdated page: {url}")
@@ -85,6 +85,11 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
soup = BeautifulSoup(html_text, parser) soup = BeautifulSoup(html_text, parser)
if validator(soup): # 进行自定义页面检查 if validator(soup): # 进行自定义页面检查
return soup, response.status_code return soup, response.status_code
else:
# 检查是否发生跳转,比如到登录页面
if response.history:
logging.warning(f"Page redirected on {url}. Validation failed.")
return None, http_code_login
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}") logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
except cloudscraper.exceptions.CloudflareChallengeError as e: except cloudscraper.exceptions.CloudflareChallengeError as e: