modify scripts

This commit is contained in:
2025-07-31 11:27:24 +08:00
parent c3f765f6c7
commit c2e94e043a
2 changed files with 22 additions and 4 deletions

View File

@@ -10,6 +10,7 @@
import os
from datetime import datetime
# Absolute path of the current user's home directory ("~" expanded once here).
home_dir = os.path.expanduser("~")
# Create the log directory
# The path is relative, so it resolves against the process's working directory.
LOG_DIR = './log'
# exist_ok=True makes this a no-op when the directory already exists.
os.makedirs(LOG_DIR, exist_ok=True)
@@ -165,15 +166,17 @@ RETRY_BACKOFF_MAX = 60
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# Turn on Scrapy's HTTP cache middleware.
HTTPCACHE_ENABLED = True
# Per the Scrapy docs linked above, 0 means cached responses never expire.
HTTPCACHE_EXPIRATION_SECS = 0
# NOTE(review): this listing looks like a rendered diff whose +/- markers were
# stripped, so the next line is presumably the removed version. A plain string
# does not expand "~"; os.makedirs would treat it as a literal directory name.
HTTPCACHE_DIR = "~/sharedata/scrapy_cached"
# Last assignment wins: cache directory under home_dir (os.path.expanduser("~")).
HTTPCACHE_DIR = f"{home_dir}/sharedata/scrapy_cached"
# Empty list: no HTTP status codes are excluded from caching.
HTTPCACHE_IGNORE_HTTP_CODES = []
# Cache storage backend: Scrapy's filesystem-based storage class.
HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Create the cache directory when this module is loaded; an existing directory is tolerated.
os.makedirs(HTTPCACHE_DIR, exist_ok=True)
# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"
# Set the storage directory for the on-disk request queue
# NOTE(review): the next line is presumably the removed version of the diff;
# it is a path relative to the working directory.
JOBDIR = 'crawl_state'
# Last assignment wins: job directory under home_dir.
JOBDIR = f"{home_dir}/sharedata/scrapy_job"
# Create the job directory when this module is loaded; an existing directory is tolerated.
os.makedirs(JOBDIR, exist_ok=True)
# Scheduler configuration
#SCHEDULER = "scrapy.core.scheduler.Scheduler"

View File

@@ -50,10 +50,15 @@ class IAFDSpider(BaseSpider):
def custom_start_requests(self):
self.crawler.stats.set_value(f"{self.name}/actor_all", 0)
self.crawler.stats.set_value(f"{self.name}/actor_done", 0)
self.crawler.stats.set_value(f"{self.name}/actor_404", 0)
self.crawler.stats.set_value(f"{self.name}/movie_all", 0)
self.crawler.stats.set_value(f"{self.name}/movie_done", 0)
self.crawler.stats.set_value(f"{self.name}/movie_404", 0)
self.crawler.stats.set_value(f"{self.name}/ethnic_pages", 0)
self.crawler.stats.set_value(f"{self.name}/dist_pages", 0)
self.crawler.stats.set_value(f"{self.name}/stu_pages", 0)
self.crawler.stats.set_value(f"{self.name}/4xx_cnt", 0)
self.crawler.stats.set_value(f"{self.name}/5xx_cnt", 0)
self.crawler.stats.set_value(f"{self.name}/other_cnt", 0)
# 根据命令字执行
if self.cmd_astro in self.cmd_list:
# 关键:迭代 start_astro 产生的生成器,转发其中的 Request
@@ -178,6 +183,7 @@ class IAFDSpider(BaseSpider):
data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic)
if data:
self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
self.crawler.stats.inc_value(f"{self.name}/ethnic_pages")
for item in data:
yield from self._create_performer_request(href=item['href'], name=item['person'])
@@ -228,6 +234,7 @@ class IAFDSpider(BaseSpider):
data, next_url = common_parser(html=response.text, page=list_type)
if data:
self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
self.crawler.stats.inc_value(f"{self.name}/{list_type}_pages")
for movie in data:
yield from self._create_movie_request(href=movie['href'], title=movie['title'])
else:
@@ -322,13 +329,21 @@ class IAFDSpider(BaseSpider):
if response.status in [200]:
if "invalid or outdated page" in response.text.lower():
self.logger.warning(f"invalid or outdated page. url: {response.url}, status_code: {response.status}")
self.crawler.stats.inc_value(f"{self.name}/4xx_cnt")
else:
self.logger.warning(f"unkown page. url:{response.url}, content: {response.text[:500]}")
self.crawler.stats.inc_value(f"{self.name}/other_cnt")
elif response.status in [404, 403]:
self.logger.warning(f"get 404 page. url: {response.url}")
self.crawler.stats.inc_value(f"{self.name}/4xx_cnt")
elif response.status in [500, 502, 503, 504, 521, 522, 524]:
self.logger.error(f"get 5xx page. url: {response.url}")
self.crawler.stats.inc_value(f"{self.name}/5xx_cnt")
else:
self.logger.warning(f"unkown page. url:{response.url}, status: {response.status}, content: {response.text[:500]}")
self.crawler.stats.inc_value(f"{self.name}/other_cnt")
if page:
if page == 'actor':