modify scripts
This commit is contained in:
@ -10,6 +10,7 @@
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
home_dir = os.path.expanduser("~")
|
||||
# Create the log directory
|
||||
LOG_DIR = './log'
|
||||
os.makedirs(LOG_DIR, exist_ok=True)
|
||||
@ -165,15 +166,17 @@ RETRY_BACKOFF_MAX = 60
|
||||
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
|
||||
HTTPCACHE_ENABLED = True
|
||||
HTTPCACHE_EXPIRATION_SECS = 0
|
||||
HTTPCACHE_DIR = "~/sharedata/scrapy_cached"
|
||||
HTTPCACHE_DIR = f"{home_dir}/sharedata/scrapy_cached"
|
||||
HTTPCACHE_IGNORE_HTTP_CODES = []
|
||||
HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
|
||||
os.makedirs(HTTPCACHE_DIR, exist_ok=True)
|
||||
|
||||
# Set settings whose default value is deprecated to a future-proof value
|
||||
FEED_EXPORT_ENCODING = "utf-8"
|
||||
|
||||
# Set the storage directory for the disk queue
|
||||
JOBDIR = 'crawl_state'
|
||||
JOBDIR = f"{home_dir}/sharedata/scrapy_job"
|
||||
os.makedirs(JOBDIR, exist_ok=True)
|
||||
|
||||
# Scheduler configuration
|
||||
#SCHEDULER = "scrapy.core.scheduler.Scheduler"
|
||||
|
||||
@ -50,10 +50,15 @@ class IAFDSpider(BaseSpider):
|
||||
def custom_start_requests(self):
|
||||
self.crawler.stats.set_value(f"{self.name}/actor_all", 0)
|
||||
self.crawler.stats.set_value(f"{self.name}/actor_done", 0)
|
||||
self.crawler.stats.set_value(f"{self.name}/actor_404", 0)
|
||||
self.crawler.stats.set_value(f"{self.name}/movie_all", 0)
|
||||
self.crawler.stats.set_value(f"{self.name}/movie_done", 0)
|
||||
self.crawler.stats.set_value(f"{self.name}/movie_404", 0)
|
||||
self.crawler.stats.set_value(f"{self.name}/ethnic_pages", 0)
|
||||
self.crawler.stats.set_value(f"{self.name}/dist_pages", 0)
|
||||
self.crawler.stats.set_value(f"{self.name}/stu_pages", 0)
|
||||
self.crawler.stats.set_value(f"{self.name}/4xx_cnt", 0)
|
||||
self.crawler.stats.set_value(f"{self.name}/5xx_cnt", 0)
|
||||
self.crawler.stats.set_value(f"{self.name}/other_cnt", 0)
|
||||
|
||||
# Dispatch according to the command keyword
|
||||
if self.cmd_astro in self.cmd_list:
|
||||
# Important: iterate over the generator produced by start_astro and forward the Requests it yields
|
||||
@ -178,6 +183,7 @@ class IAFDSpider(BaseSpider):
|
||||
data, next_url = common_parser(html=response.text, page='ethnic', ethnic=ethnic)
|
||||
if data:
|
||||
self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
|
||||
self.crawler.stats.inc_value(f"{self.name}/ethnic_pages")
|
||||
for item in data:
|
||||
yield from self._create_performer_request(href=item['href'], name=item['person'])
|
||||
|
||||
@ -228,6 +234,7 @@ class IAFDSpider(BaseSpider):
|
||||
data, next_url = common_parser(html=response.text, page=list_type)
|
||||
if data:
|
||||
self.logger.debug(f"fetched data from {response.url}, data len: {len(data)}")
|
||||
self.crawler.stats.inc_value(f"{self.name}/{list_type}_pages")
|
||||
for movie in data:
|
||||
yield from self._create_movie_request(href=movie['href'], title=movie['title'])
|
||||
else:
|
||||
@ -322,13 +329,21 @@ class IAFDSpider(BaseSpider):
|
||||
if response.status in [200]:
|
||||
if "invalid or outdated page" in response.text.lower():
|
||||
self.logger.warning(f"invalid or outdated page. url: {response.url}, status_code: {response.status}")
|
||||
self.crawler.stats.inc_value(f"{self.name}/4xx_cnt")
|
||||
else:
|
||||
self.logger.warning(f"unkown page. url:{response.url}, content: {response.text[:500]}")
|
||||
self.crawler.stats.inc_value(f"{self.name}/other_cnt")
|
||||
|
||||
elif response.status in [404, 403]:
|
||||
self.logger.warning(f"get 404 page. url: {response.url}")
|
||||
self.crawler.stats.inc_value(f"{self.name}/4xx_cnt")
|
||||
|
||||
elif response.status in [500, 502, 503, 504, 521, 522, 524]:
|
||||
self.logger.error(f"get 5xx page. url: {response.url}")
|
||||
self.crawler.stats.inc_value(f"{self.name}/5xx_cnt")
|
||||
else:
|
||||
self.logger.warning(f"unkown page. url:{response.url}, status: {response.status}, content: {response.text[:500]}")
|
||||
self.crawler.stats.inc_value(f"{self.name}/other_cnt")
|
||||
|
||||
if page:
|
||||
if page == 'actor':
|
||||
|
||||
Reference in New Issue
Block a user