modify scripts
@@ -2,11 +2,13 @@ import logging
import sys
import requests
import re
import time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import src.utils.utils as utils

http_code_404 = 404
http_code_403 = 403
http_code_redirect = 401
http_code_url = 601
http_code_local = 99
@@ -59,10 +61,10 @@ class GenericCrawler:
response = self.scraper.get(url, headers=self.headers, cookies=self.cookies)

# Handle HTTP status codes
if response.status_code == http_code_404:
    logging.debug(f"Page not found (404): {url}")
    return None, http_code_404  # Return 404 directly; the caller can skip this page

if response.status_code in [http_code_404, http_code_403]:
    logging.debug(f"get http code: {response.status_code}, url: {url}")
    return None, response.status_code  # Return the code directly; the caller can skip this page

response.raise_for_status()  # Raise for other HTTP errors

# Check whether the request was redirected, e.g. to a login page
@@ -86,6 +88,7 @@ class GenericCrawler:
        logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
    except Exception as e:
        logging.error(f"Unexpected error on {url}: {e}, Retrying...")
        time.sleep(0.3)

logging.error(f'Fetching failed after max retries. {url}')
return None, None  # Still failing after the maximum number of retries
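For context, here is a minimal caller-side sketch of the (content, status_code) contract this commit settles on: 404/403 pages are skipped outright, while a (None, None) result signals that all retries were exhausted. The method name fetch_page, the module path, and the surrounding loop are assumptions for illustration, not part of the commit.

import logging

# Hypothetical caller. GenericCrawler's actual fetch method name, signature,
# and module path are not shown in this diff; fetch_page(url) -> (html, status)
# and the import below are assumed for illustration only.
from src.crawlers.generic_crawler import GenericCrawler

def crawl_pages(urls):
    crawler = GenericCrawler()
    pages = {}
    for url in urls:
        html, status = crawler.fetch_page(url)
        if html is None:
            if status in (404, 403):
                # Skippable pages, matching the early returns added in this commit.
                continue
            # status is None once the max retry count is exhausted.
            logging.warning("Giving up on %s (status=%s)", url, status)
            continue
        pages[url] = html
    return pages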