modify scripts

This commit is contained in:
2025-07-23 13:16:27 +08:00
parent 558ceee49c
commit 950294e882
3 changed files with 24 additions and 1 deletions

View File

@ -76,6 +76,14 @@ class StatsExtension:
# 获取当前统计信息
stats = self.stats.get_stats()
# 获取spider自定义的信息
spider_stat = {'sp': '-------'}
prefix = f"{self.spider_name}/"
for key, value in stats.items():
if key.startswith(prefix):
short_key = key.replace(prefix, "", 1)
spider_stat[short_key] = value
# 构建统计摘要
stats_summary = {
'spider': self.spider_name,
@ -87,6 +95,8 @@ class StatsExtension:
'404_cnt': stats.get('downloader/response_status_count/404', 0),
'log_err_cnt': stats.get('log_count/ERROR', 0),
'status': self.current_status,
**spider_stat,
'db': '-------',
**db_stat
}

View File

@ -99,6 +99,10 @@ class ClmSpider(BaseSpider):
callback=self.handle_redirect
)
# 添加统计项
self.crawler.stats.set_value(f"{self.name}/req_words_all", len(keywords))
self.crawler.stats.set_value(f"{self.name}/req_words_done", 0)
# 处理POST过来的302请求
def handle_redirect(self, response):
"""处理302重定向获取location并访问结果页面"""
@ -242,6 +246,7 @@ class ClmSpider(BaseSpider):
self.logger.info(f'debug模式下停止翻页. {total_text}. url: {response.url}')
return
elif not need_next or not next_page_url:
self.crawler.stats.inc_value(f"{self.name}/req_words_done")
total_rows = db_clm.get_count_by_keywords_id(response.meta.get('words_id', 0))
curr_words = response.meta.get('words', '')
self.logger.info(f'停止翻页. 更新模式: {self.update_mod}. {total_text}, 共 {total_rows} 条记录。 key words: ({curr_words}), url: {response.url}')

View File

@ -70,6 +70,8 @@ class PornboxSpider(BaseSpider):
if self.cmd_studio in self.cmd_list:
url = self._build_studio_list_url()
yield scrapy.Request(url, callback=self.parse_studios_list)
self.crawler.stats.set_value(f"{self.name}/req_list_all", 0)
self.crawler.stats.set_value(f"{self.name}/req_list_done", 0)
# 获取每个studio, 获取详情
if self.cmd_movie in self.cmd_list:
@ -80,7 +82,10 @@ class PornboxSpider(BaseSpider):
for stu in stu_list:
url = self._build_studio_url(stu['label_id'])
yield scrapy.Request(url, callback=self.parse_studio, meta={'stu_id':stu['label_id'], 'name': stu['name'], 'scene_count': stu['scene_count']})
# 添加统计项
self.crawler.stats.set_value(f"{self.name}/req_mov_all", len(stu_list))
self.crawler.stats.set_value(f"{self.name}/req_mov_done", 0)
def parse_studios_list(self, response):
# 尝试解析 JSON 响应
@ -100,6 +105,8 @@ class PornboxSpider(BaseSpider):
current_page = data.get('current_page', 1)
total_pages = data.get('total_pages', 1)
self.crawler.stats.inc_value(f"{self.name}/req_list_done")
self.crawler.stats.set_value(f"{self.name}/req_list_all", total_pages)
self.logger.info(f"url: {response.url}, total: {total_pages}, items: {len(data.get('items', []))}")
# 处理每个工作室项目
@ -214,6 +221,7 @@ class PornboxSpider(BaseSpider):
stu_name = response.meta['name']
scene_count = response.meta['scene_count']
if not need_next or current_page >= total_pages or (self.debug and current_page >= 50000):
self.crawler.stats.inc_value(f"{self.name}/req_mov_done")
total_rows = db_tools.get_stu_mov_count(stu_id)
self.logger.info(f'停止翻页. 更新模式: {self.update_mod}. studio: ({stu_name}), total movies: {total_rows}, scene_count: {scene_count}, url: {response.url}')
return