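Modify scripts: skip pages on 404, login-redirect, and invalid-URL status codes; make fetch_page report a login redirect when validation fails after a redirect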
This commit is contained in:
@ -241,7 +241,7 @@ def fetch_movies_by_dist():
|
|||||||
else :
|
else :
|
||||||
logging.warning(f'parse_page_movie error. url: {url}')
|
logging.warning(f'parse_page_movie error. url: {url}')
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
elif status_code and status_code == 404:
|
elif status_code and status_code in [scraper.http_code_404, scraper.http_code_login, scraper.http_code_url]:
|
||||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
|
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
@ -281,7 +281,7 @@ def fetch_movies_by_stu():
|
|||||||
else :
|
else :
|
||||||
logging.warning(f'parse_page_movie error. url: {url}')
|
logging.warning(f'parse_page_movie error. url: {url}')
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
elif status_code and status_code == 404:
|
elif status_code and status_code in [scraper.http_code_404, scraper.http_code_login, scraper.http_code_url]:
|
||||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
|
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -70,7 +70,7 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
|
|||||||
return None, http_code_404 # 直接返回 404,调用方可以跳过
|
return None, http_code_404 # 直接返回 404,调用方可以跳过
|
||||||
|
|
||||||
response.raise_for_status() # 处理 HTTP 错误
|
response.raise_for_status() # 处理 HTTP 错误
|
||||||
|
|
||||||
# 过期的网页,与404相同处理
|
# 过期的网页,与404相同处理
|
||||||
if "invalid or outdated page" in response.text.lower():
|
if "invalid or outdated page" in response.text.lower():
|
||||||
logging.debug(f"invalid or outdated page: {url}")
|
logging.debug(f"invalid or outdated page: {url}")
|
||||||
@ -85,6 +85,11 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
|
|||||||
soup = BeautifulSoup(html_text, parser)
|
soup = BeautifulSoup(html_text, parser)
|
||||||
if validator(soup): # 进行自定义页面检查
|
if validator(soup): # 进行自定义页面检查
|
||||||
return soup, response.status_code
|
return soup, response.status_code
|
||||||
|
else:
|
||||||
|
# 检查是否发生跳转,比如到登录页面
|
||||||
|
if response.history:
|
||||||
|
logging.warning(f"Page redirected on {url}. Validation failed.")
|
||||||
|
return None, http_code_login
|
||||||
|
|
||||||
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
|
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
|
||||||
except cloudscraper.exceptions.CloudflareChallengeError as e:
|
except cloudscraper.exceptions.CloudflareChallengeError as e:
|
||||||
|
|||||||
Reference in New Issue
Block a user