Modify scripts: log 404 and invalid/outdated-page skips in fetch_page at debug level instead of warning
This commit is contained in:
@@ -47,14 +47,14 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
|
|||||||
|
|
||||||
# 处理 HTTP 状态码
|
# 处理 HTTP 状态码
|
||||||
if response.status_code == 404:
|
if response.status_code == 404:
|
||||||
logging.warning(f"Page not found (404): {url}")
|
logging.debug(f"Page not found (404): {url}")
|
||||||
return None, 404 # 直接返回 404,调用方可以跳过
|
return None, 404 # 直接返回 404,调用方可以跳过
|
||||||
|
|
||||||
response.raise_for_status() # 处理 HTTP 错误
|
response.raise_for_status() # 处理 HTTP 错误
|
||||||
|
|
||||||
# 过期的网页,与404相同处理
|
# 过期的网页,与404相同处理
|
||||||
if "invalid or outdated page" in response.text.lower():
|
if "invalid or outdated page" in response.text.lower():
|
||||||
logging.warning(f"invalid or outdated page: {url}")
|
logging.debug(f"invalid or outdated page: {url}")
|
||||||
return None, 404 # 直接返回 404,调用方可以跳过
|
return None, 404 # 直接返回 404,调用方可以跳过
|
||||||
|
|
||||||
# 预处理 HTML(如果提供了 preprocessor)
|
# 预处理 HTML(如果提供了 preprocessor)
|
||||||
|
|||||||
Reference in New Issue
Block a user