modify scripts
@@ -2,11 +2,13 @@ import logging
import sys
import requests
import re
import time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import src.utils.utils as utils

http_code_404 = 404
http_code_403 = 403
http_code_redirect = 401
http_code_url = 601
http_code_local = 99
@@ -59,10 +61,10 @@ class GenericCrawler:
response = self.scraper.get(url, headers=self.headers, cookies=self.cookies)

# Handle HTTP status codes
if response.status_code == http_code_404:
    logging.debug(f"Page not found (404): {url}")
    return None, http_code_404  # Return 404 directly; the caller can skip this page

if response.status_code in [http_code_404, http_code_403]:
    logging.debug(f"get http code: {response.status_code}, url: {url}")
    return None, response.status_code  # Return the code directly; the caller can skip this page

response.raise_for_status()  # Raise for other HTTP errors

# Check whether the request was redirected, e.g. to a login page
@@ -86,6 +88,7 @@ class GenericCrawler:
        logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
    except Exception as e:
        logging.error(f"Unexpected error on {url}: {e}, Retrying...")
        time.sleep(0.3)

logging.error(f'Fetching failed after max retries. {url}')
return None, None  # Still failing after the maximum number of retries
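For context, here is a minimal caller-side sketch of the (content, status_code) contract this commit settles on: 404/403 pages are skipped outright, while a (None, None) result signals that all retries were exhausted. The method name fetch_page, the module path, and the surrounding loop are assumptions for illustration, not part of the commit.

import logging

# Hypothetical caller. GenericCrawler's actual fetch method name, signature,
# and module path are not shown in this diff; fetch_page(url) -> (html, status)
# and the import below are assumed for illustration only.
from src.crawlers.generic_crawler import GenericCrawler

def crawl_pages(urls):
    crawler = GenericCrawler()
    pages = {}
    for url in urls:
        html, status = crawler.fetch_page(url)
        if html is None:
            if status in (404, 403):
                # Skippable pages, matching the early returns added in this commit.
                continue
            # status is None once the max retry count is exhausted.
            logging.warning("Giving up on %s (status=%s)", url, status)
            continue
        pages[url] = html
    return pages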