From c2e94e043ae6d82158135646f36e4630feec1d7b Mon Sep 17 00:00:00 2001 From: sophon Date: Thu, 31 Jul 2025 11:27:24 +0800 Subject: [PATCH] modify scripts --- scrapy_proj/scrapy_proj/settings.py | 7 +++++-- .../scrapy_proj/spiders/iafd_spider.py | 19 +++++++++++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/scrapy_proj/scrapy_proj/settings.py b/scrapy_proj/scrapy_proj/settings.py index 7281e84..0779b79 100644 --- a/scrapy_proj/scrapy_proj/settings.py +++ b/scrapy_proj/scrapy_proj/settings.py @@ -10,6 +10,7 @@ import os from datetime import datetime +home_dir = os.path.expanduser("~") # 创建日志目录 LOG_DIR = './log' os.makedirs(LOG_DIR, exist_ok=True) @@ -165,15 +166,17 @@ RETRY_BACKOFF_MAX = 60 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings HTTPCACHE_ENABLED = True HTTPCACHE_EXPIRATION_SECS = 0 -HTTPCACHE_DIR = "~/sharedata/scrapy_cached" +HTTPCACHE_DIR = f"{home_dir}/sharedata/scrapy_cached" HTTPCACHE_IGNORE_HTTP_CODES = [] HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" +os.makedirs(HTTPCACHE_DIR, exist_ok=True) # Set settings whose default value is deprecated to a future-proof value FEED_EXPORT_ENCODING = "utf-8" # 设置磁盘队列存储目录 -JOBDIR = 'crawl_state' +JOBDIR = f"{home_dir}/sharedata/scrapy_job" +os.makedirs(JOBDIR, exist_ok=True) # 调度器配置 #SCHEDULER = "scrapy.core.scheduler.Scheduler" diff --git a/scrapy_proj/scrapy_proj/spiders/iafd_spider.py b/scrapy_proj/scrapy_proj/spiders/iafd_spider.py index cb3566e..55ca526 100644 --- a/scrapy_proj/scrapy_proj/spiders/iafd_spider.py +++ b/scrapy_proj/scrapy_proj/spiders/iafd_spider.py @@ -50,10 +50,15 @@ class IAFDSpider(BaseSpider): def custom_start_requests(self): self.crawler.stats.set_value(f"{self.name}/actor_all", 0) self.crawler.stats.set_value(f"{self.name}/actor_done", 0) - self.crawler.stats.set_value(f"{self.name}/actor_404", 0) self.crawler.stats.set_value(f"{self.name}/movie_all", 0) self.crawler.stats.set_value(f"{self.name}/movie_done", 0) - self.crawler.stats.set_value(f"{self.name}/movie_404", 0) + self.crawler.stats.set_value(f"{self.name}/ethnic_pages", 0) + self.crawler.stats.set_value(f"{self.name}/dist_pages", 0) + self.crawler.stats.set_value(f"{self.name}/stu_pages", 0) + self.crawler.stats.set_value(f"{self.name}/4xx_cnt", 0) + self.crawler.stats.set_value(f"{self.name}/5xx_cnt", 0) + self.crawler.stats.set_value(f"{self.name}/other_cnt", 0) + # 根据命令字执行 if self.cmd_astro in self.cmd_list: # 关键:迭代 start_astro 产生的生成器,转发其中的 Request @@ -178,6 +183,7 @@ class IAFDSpider(BaseSpider): data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic) if data: self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}") + self.crawler.stats.inc_value(f"{self.name}/ethnic_pages") for item in data: yield from self._create_performer_request(href=item['href'], name=item['person']) @@ -228,6 +234,7 @@ class IAFDSpider(BaseSpider): data, next_url = common_parser(html=response.text, page=list_type) if data: self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}") + self.crawler.stats.inc_value(f"{self.name}/{list_type}_pages") for movie in data: yield from self._create_movie_request(href=movie['href'], title=movie['title']) else: @@ -322,13 +329,21 @@ class IAFDSpider(BaseSpider): if response.status in [200]: if "invalid or outdated page" in response.text.lower(): self.logger.warning(f"invalid or outdated page. url: {response.url}, status_code: {response.status}") + self.crawler.stats.inc_value(f"{self.name}/4xx_cnt") else: self.logger.warning(f"unkown page. url:{response.url}, content: {response.text[:500]}") + self.crawler.stats.inc_value(f"{self.name}/other_cnt") elif response.status in [404, 403]: self.logger.warning(f"get 404 page. url: {response.url}") + self.crawler.stats.inc_value(f"{self.name}/4xx_cnt") + + elif response.status in [500, 502, 503, 504, 521, 522, 524]: + self.logger.error(f"get 5xx page. url: {response.url}") + self.crawler.stats.inc_value(f"{self.name}/5xx_cnt") else: self.logger.warning(f"unkown page. url:{response.url}, status: {response.status}, content: {response.text[:500]}") + self.crawler.stats.inc_value(f"{self.name}/other_cnt") if page: if page == 'actor':