modify scripts
This commit is contained in:
@ -241,7 +241,7 @@ def fetch_movies_by_dist():
|
||||
else :
|
||||
logging.warning(f'parse_page_movie error. url: {url}')
|
||||
time.sleep(1)
|
||||
elif status_code and status_code == 404:
|
||||
elif status_code and status_code in [scraper.http_code_404, scraper.http_code_login, scraper.http_code_url]:
|
||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
|
||||
break
|
||||
else:
|
||||
@ -281,7 +281,7 @@ def fetch_movies_by_stu():
|
||||
else :
|
||||
logging.warning(f'parse_page_movie error. url: {url}')
|
||||
time.sleep(1)
|
||||
elif status_code and status_code == 404:
|
||||
elif status_code and status_code in [scraper.http_code_404, scraper.http_code_login, scraper.http_code_url]:
|
||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
|
||||
break
|
||||
else:
|
||||
|
||||
@ -70,7 +70,7 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
|
||||
return None, http_code_404 # 直接返回 404,调用方可以跳过
|
||||
|
||||
response.raise_for_status() # 处理 HTTP 错误
|
||||
|
||||
|
||||
# 过期的网页,与404相同处理
|
||||
if "invalid or outdated page" in response.text.lower():
|
||||
logging.debug(f"invalid or outdated page: {url}")
|
||||
@ -85,6 +85,11 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
|
||||
soup = BeautifulSoup(html_text, parser)
|
||||
if validator(soup): # 进行自定义页面检查
|
||||
return soup, response.status_code
|
||||
else:
|
||||
# 检查是否发生跳转,比如到登录页面
|
||||
if response.history:
|
||||
logging.warning(f"Page redirected on {url}. Validation failed.")
|
||||
return None, http_code_login
|
||||
|
||||
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
|
||||
except cloudscraper.exceptions.CloudflareChallengeError as e:
|
||||
|
||||
Reference in New Issue
Block a user