modify scripts
This commit is contained in:
@ -76,6 +76,14 @@ class StatsExtension:
|
||||
|
||||
# 获取当前统计信息
|
||||
stats = self.stats.get_stats()
|
||||
# 获取spider自定义的信息
|
||||
spider_stat = {'sp': '-------'}
|
||||
prefix = f"{self.spider_name}/"
|
||||
for key, value in stats.items():
|
||||
if key.startswith(prefix):
|
||||
short_key = key.replace(prefix, "", 1)
|
||||
spider_stat[short_key] = value
|
||||
|
||||
# 构建统计摘要
|
||||
stats_summary = {
|
||||
'spider': self.spider_name,
|
||||
@ -87,6 +95,8 @@ class StatsExtension:
|
||||
'404_cnt': stats.get('downloader/response_status_count/404', 0),
|
||||
'log_err_cnt': stats.get('log_count/ERROR', 0),
|
||||
'status': self.current_status,
|
||||
**spider_stat,
|
||||
'db': '-------',
|
||||
**db_stat
|
||||
}
|
||||
|
||||
|
||||
@ -99,6 +99,10 @@ class ClmSpider(BaseSpider):
|
||||
callback=self.handle_redirect
|
||||
)
|
||||
|
||||
# 添加统计项
|
||||
self.crawler.stats.set_value(f"{self.name}/req_words_all", len(keywords))
|
||||
self.crawler.stats.set_value(f"{self.name}/req_words_done", 0)
|
||||
|
||||
# 处理POST过来的302请求
|
||||
def handle_redirect(self, response):
|
||||
"""处理302重定向,获取location并访问结果页面"""
|
||||
@ -242,6 +246,7 @@ class ClmSpider(BaseSpider):
|
||||
self.logger.info(f'debug模式下停止翻页. {total_text}. url: {response.url}')
|
||||
return
|
||||
elif not need_next or not next_page_url:
|
||||
self.crawler.stats.inc_value(f"{self.name}/req_words_done")
|
||||
total_rows = db_clm.get_count_by_keywords_id(response.meta.get('words_id', 0))
|
||||
curr_words = response.meta.get('words', '')
|
||||
self.logger.info(f'停止翻页. 更新模式: {self.update_mod}. {total_text}, 共 {total_rows} 条记录。 key words: ({curr_words}), url: {response.url}')
|
||||
|
||||
@ -70,6 +70,8 @@ class PornboxSpider(BaseSpider):
|
||||
if self.cmd_studio in self.cmd_list:
|
||||
url = self._build_studio_list_url()
|
||||
yield scrapy.Request(url, callback=self.parse_studios_list)
|
||||
self.crawler.stats.set_value(f"{self.name}/req_list_all", 0)
|
||||
self.crawler.stats.set_value(f"{self.name}/req_list_done", 0)
|
||||
|
||||
# 获取每个studio, 获取详情
|
||||
if self.cmd_movie in self.cmd_list:
|
||||
@ -80,7 +82,10 @@ class PornboxSpider(BaseSpider):
|
||||
for stu in stu_list:
|
||||
url = self._build_studio_url(stu['label_id'])
|
||||
yield scrapy.Request(url, callback=self.parse_studio, meta={'stu_id':stu['label_id'], 'name': stu['name'], 'scene_count': stu['scene_count']})
|
||||
|
||||
|
||||
# 添加统计项
|
||||
self.crawler.stats.set_value(f"{self.name}/req_mov_all", len(stu_list))
|
||||
self.crawler.stats.set_value(f"{self.name}/req_mov_done", 0)
|
||||
|
||||
def parse_studios_list(self, response):
|
||||
# 尝试解析 JSON 响应
|
||||
@ -100,6 +105,8 @@ class PornboxSpider(BaseSpider):
|
||||
current_page = data.get('current_page', 1)
|
||||
total_pages = data.get('total_pages', 1)
|
||||
|
||||
self.crawler.stats.inc_value(f"{self.name}/req_list_done")
|
||||
self.crawler.stats.set_value(f"{self.name}/req_list_all", total_pages)
|
||||
self.logger.info(f"url: {response.url}, total: {total_pages}, items: {len(data.get('items', []))}")
|
||||
|
||||
# 处理每个工作室项目
|
||||
@ -214,6 +221,7 @@ class PornboxSpider(BaseSpider):
|
||||
stu_name = response.meta['name']
|
||||
scene_count = response.meta['scene_count']
|
||||
if not need_next or current_page >= total_pages or (self.debug and current_page >= 50000):
|
||||
self.crawler.stats.inc_value(f"{self.name}/req_mov_done")
|
||||
total_rows = db_tools.get_stu_mov_count(stu_id)
|
||||
self.logger.info(f'停止翻页. 更新模式: {self.update_mod}. studio: ({stu_name}), total movies: {total_rows}, scene_count: {scene_count}, url: {response.url}')
|
||||
return
|
||||
|
||||
Reference in New Issue
Block a user