modify scripts
This commit is contained in:
@ -76,6 +76,14 @@ class StatsExtension:
|
|||||||
|
|
||||||
# 获取当前统计信息
|
# 获取当前统计信息
|
||||||
stats = self.stats.get_stats()
|
stats = self.stats.get_stats()
|
||||||
|
# 获取spider自定义的信息
|
||||||
|
spider_stat = {'sp': '-------'}
|
||||||
|
prefix = f"{self.spider_name}/"
|
||||||
|
for key, value in stats.items():
|
||||||
|
if key.startswith(prefix):
|
||||||
|
short_key = key.replace(prefix, "", 1)
|
||||||
|
spider_stat[short_key] = value
|
||||||
|
|
||||||
# 构建统计摘要
|
# 构建统计摘要
|
||||||
stats_summary = {
|
stats_summary = {
|
||||||
'spider': self.spider_name,
|
'spider': self.spider_name,
|
||||||
@ -87,6 +95,8 @@ class StatsExtension:
|
|||||||
'404_cnt': stats.get('downloader/response_status_count/404', 0),
|
'404_cnt': stats.get('downloader/response_status_count/404', 0),
|
||||||
'log_err_cnt': stats.get('log_count/ERROR', 0),
|
'log_err_cnt': stats.get('log_count/ERROR', 0),
|
||||||
'status': self.current_status,
|
'status': self.current_status,
|
||||||
|
**spider_stat,
|
||||||
|
'db': '-------',
|
||||||
**db_stat
|
**db_stat
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -99,6 +99,10 @@ class ClmSpider(BaseSpider):
|
|||||||
callback=self.handle_redirect
|
callback=self.handle_redirect
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# 添加统计项
|
||||||
|
self.crawler.stats.set_value(f"{self.name}/req_words_all", len(keywords))
|
||||||
|
self.crawler.stats.set_value(f"{self.name}/req_words_done", 0)
|
||||||
|
|
||||||
# 处理POST过来的302请求
|
# 处理POST过来的302请求
|
||||||
def handle_redirect(self, response):
|
def handle_redirect(self, response):
|
||||||
"""处理302重定向,获取location并访问结果页面"""
|
"""处理302重定向,获取location并访问结果页面"""
|
||||||
@ -242,6 +246,7 @@ class ClmSpider(BaseSpider):
|
|||||||
self.logger.info(f'debug模式下停止翻页. {total_text}. url: {response.url}')
|
self.logger.info(f'debug模式下停止翻页. {total_text}. url: {response.url}')
|
||||||
return
|
return
|
||||||
elif not need_next or not next_page_url:
|
elif not need_next or not next_page_url:
|
||||||
|
self.crawler.stats.inc_value(f"{self.name}/req_words_done")
|
||||||
total_rows = db_clm.get_count_by_keywords_id(response.meta.get('words_id', 0))
|
total_rows = db_clm.get_count_by_keywords_id(response.meta.get('words_id', 0))
|
||||||
curr_words = response.meta.get('words', '')
|
curr_words = response.meta.get('words', '')
|
||||||
self.logger.info(f'停止翻页. 更新模式: {self.update_mod}. {total_text}, 共 {total_rows} 条记录。 key words: ({curr_words}), url: {response.url}')
|
self.logger.info(f'停止翻页. 更新模式: {self.update_mod}. {total_text}, 共 {total_rows} 条记录。 key words: ({curr_words}), url: {response.url}')
|
||||||
|
|||||||
@ -70,6 +70,8 @@ class PornboxSpider(BaseSpider):
|
|||||||
if self.cmd_studio in self.cmd_list:
|
if self.cmd_studio in self.cmd_list:
|
||||||
url = self._build_studio_list_url()
|
url = self._build_studio_list_url()
|
||||||
yield scrapy.Request(url, callback=self.parse_studios_list)
|
yield scrapy.Request(url, callback=self.parse_studios_list)
|
||||||
|
self.crawler.stats.set_value(f"{self.name}/req_list_all", 0)
|
||||||
|
self.crawler.stats.set_value(f"{self.name}/req_list_done", 0)
|
||||||
|
|
||||||
# 获取每个stutio, 获取详情
|
# 获取每个stutio, 获取详情
|
||||||
if self.cmd_movie in self.cmd_list:
|
if self.cmd_movie in self.cmd_list:
|
||||||
@ -81,6 +83,9 @@ class PornboxSpider(BaseSpider):
|
|||||||
url = self._build_studio_url(stu['label_id'])
|
url = self._build_studio_url(stu['label_id'])
|
||||||
yield scrapy.Request(url, callback=self.parse_studio, meta={'stu_id':stu['label_id'], 'name': stu['name'], 'scene_count': stu['scene_count']})
|
yield scrapy.Request(url, callback=self.parse_studio, meta={'stu_id':stu['label_id'], 'name': stu['name'], 'scene_count': stu['scene_count']})
|
||||||
|
|
||||||
|
# 添加统计项
|
||||||
|
self.crawler.stats.set_value(f"{self.name}/req_mov_all", len(stu_list))
|
||||||
|
self.crawler.stats.set_value(f"{self.name}/req_mov_done", 0)
|
||||||
|
|
||||||
def parse_studios_list(self, response):
|
def parse_studios_list(self, response):
|
||||||
# 尝试解析 JSON 响应
|
# 尝试解析 JSON 响应
|
||||||
@ -100,6 +105,8 @@ class PornboxSpider(BaseSpider):
|
|||||||
current_page = data.get('current_page', 1)
|
current_page = data.get('current_page', 1)
|
||||||
total_pages = data.get('total_pages', 1)
|
total_pages = data.get('total_pages', 1)
|
||||||
|
|
||||||
|
self.crawler.stats.inc_value(f"{self.name}/req_list_done")
|
||||||
|
self.crawler.stats.set_value(f"{self.name}/req_list_all", total_pages)
|
||||||
self.logger.info(f"url: {response.url}, total: {total_pages}, items: {len(data.get('items', []))}")
|
self.logger.info(f"url: {response.url}, total: {total_pages}, items: {len(data.get('items', []))}")
|
||||||
|
|
||||||
# 处理每个工作室项目
|
# 处理每个工作室项目
|
||||||
@ -214,6 +221,7 @@ class PornboxSpider(BaseSpider):
|
|||||||
stu_name = response.meta['name']
|
stu_name = response.meta['name']
|
||||||
scene_count = response.meta['scene_count']
|
scene_count = response.meta['scene_count']
|
||||||
if not need_next or current_page >= total_pages or (self.debug and current_page >= 50000):
|
if not need_next or current_page >= total_pages or (self.debug and current_page >= 50000):
|
||||||
|
self.crawler.stats.inc_value(f"{self.name}/req_mov_done")
|
||||||
total_rows = db_tools.get_stu_mov_count(stu_id)
|
total_rows = db_tools.get_stu_mov_count(stu_id)
|
||||||
self.logger.info(f'停止翻页. 更新模式: {self.update_mod}. studio: ({stu_name}), total movies: {total_rows}, scene_count: {scene_count}, url: {response.url}')
|
self.logger.info(f'停止翻页. 更新模式: {self.update_mod}. studio: ({stu_name}), total movies: {total_rows}, scene_count: {scene_count}, url: {response.url}')
|
||||||
return
|
return
|
||||||
|
|||||||
Reference in New Issue
Block a user