diff --git a/scrapy_proj/scrapy_proj/extensions/stats_extension.py b/scrapy_proj/scrapy_proj/extensions/stats_extension.py index fb125b8..1a3a41a 100644 --- a/scrapy_proj/scrapy_proj/extensions/stats_extension.py +++ b/scrapy_proj/scrapy_proj/extensions/stats_extension.py @@ -76,6 +76,14 @@ class StatsExtension: # 获取当前统计信息 stats = self.stats.get_stats() + # 获取spider自定义的信息 + spider_stat = {'sp': '-------'} + prefix = f"{self.spider_name}/" + for key, value in stats.items(): + if key.startswith(prefix): + short_key = key.replace(prefix, "", 1) + spider_stat[short_key] = value + # 构建统计摘要 stats_summary = { 'spider': self.spider_name, @@ -87,6 +95,8 @@ class StatsExtension: '404_cnt': stats.get('downloader/response_status_count/404', 0), 'log_err_cnt': stats.get('log_count/ERROR', 0), 'status': self.current_status, + **spider_stat, + 'db': '-------', **db_stat } diff --git a/scrapy_proj/scrapy_proj/spiders/clm_spider.py b/scrapy_proj/scrapy_proj/spiders/clm_spider.py index 11decdb..b70ec5b 100644 --- a/scrapy_proj/scrapy_proj/spiders/clm_spider.py +++ b/scrapy_proj/scrapy_proj/spiders/clm_spider.py @@ -99,6 +99,10 @@ class ClmSpider(BaseSpider): callback=self.handle_redirect ) + # 添加统计项 + self.crawler.stats.set_value(f"{self.name}/req_words_all", len(keywords)) + self.crawler.stats.set_value(f"{self.name}/req_words_done", 0) + # 处理POST过来的302请求 def handle_redirect(self, response): """处理302重定向,获取location并访问结果页面""" @@ -242,6 +246,7 @@ class ClmSpider(BaseSpider): self.logger.info(f'debug模式下停止翻页. {total_text}. url: {response.url}') return elif not need_next or not next_page_url: + self.crawler.stats.inc_value(f"{self.name}/req_words_done") total_rows = db_clm.get_count_by_keywords_id(response.meta.get('words_id', 0)) curr_words = response.meta.get('words', '') self.logger.info(f'停止翻页. 更新模式: {self.update_mod}. {total_text}, 共 {total_rows} 条记录。 key words: ({curr_words}), url: {response.url}') diff --git a/scrapy_proj/scrapy_proj/spiders/pornbox_spider.py b/scrapy_proj/scrapy_proj/spiders/pornbox_spider.py index 82056da..91441aa 100644 --- a/scrapy_proj/scrapy_proj/spiders/pornbox_spider.py +++ b/scrapy_proj/scrapy_proj/spiders/pornbox_spider.py @@ -70,6 +70,8 @@ class PornboxSpider(BaseSpider): if self.cmd_studio in self.cmd_list: url = self._build_studio_list_url() yield scrapy.Request(url, callback=self.parse_studios_list) + self.crawler.stats.set_value(f"{self.name}/req_list_all", 0) + self.crawler.stats.set_value(f"{self.name}/req_list_done", 0) # 获取每个stutio, 获取详情 if self.cmd_movie in self.cmd_list: @@ -80,7 +82,10 @@ class PornboxSpider(BaseSpider): for stu in stu_list: url = self._build_studio_url(stu['label_id']) yield scrapy.Request(url, callback=self.parse_studio, meta={'stu_id':stu['label_id'], 'name': stu['name'], 'scene_count': stu['scene_count']}) - + + # 添加统计项 + self.crawler.stats.set_value(f"{self.name}/req_mov_all", len(stu_list)) + self.crawler.stats.set_value(f"{self.name}/req_mov_done", 0) def parse_studios_list(self, response): # 尝试解析 JSON 响应 @@ -100,6 +105,8 @@ class PornboxSpider(BaseSpider): current_page = data.get('current_page', 1) total_pages = data.get('total_pages', 1) + self.crawler.stats.inc_value(f"{self.name}/req_list_done") + self.crawler.stats.set_value(f"{self.name}/req_list_all", total_pages) self.logger.info(f"url: {response.url}, total: {total_pages}, items: {len(data.get('items', []))}") # 处理每个工作室项目 @@ -214,6 +221,7 @@ class PornboxSpider(BaseSpider): stu_name = response.meta['name'] scene_count = response.meta['scene_count'] if not need_next or current_page >= total_pages or (self.debug and current_page >= 50000): + self.crawler.stats.inc_value(f"{self.name}/req_mov_done") total_rows = db_tools.get_stu_mov_count(stu_id) self.logger.info(f'停止翻页. 更新模式: {self.update_mod}. studio: ({stu_name}), total movies: {total_rows}, scene_count: {scene_count}, url: {response.url}') return