From c34cfb458c3b057756bd307d05a863f951a4f537 Mon Sep 17 00:00:00 2001
From: oscarz
Date: Thu, 3 Jul 2025 16:07:47 +0800
Subject: [PATCH] modify scripts

---
 .../scrapy_proj/db_wapper/iafd_query.py       |  2 +-
 .../scrapy_proj/extensions/stats_extension.py | 18 +---
 scrapy_proj/scrapy_proj/middlewares.py        | 38 ++++++---
 scrapy_proj/scrapy_proj/pipelines.py          | 25 +++---
 scrapy_proj/scrapy_proj/settings.py           |  2 +-
 .../scrapy_proj/spiders/iafd_spider.py        | 85 ++++++++++++-------
 .../scrapy_proj/spiders/u3c3_spider.py        | 10 ++-
 7 files changed, 108 insertions(+), 72 deletions(-)

diff --git a/scrapy_proj/scrapy_proj/db_wapper/iafd_query.py b/scrapy_proj/scrapy_proj/db_wapper/iafd_query.py
index ecf9b17..06f7508 100644
--- a/scrapy_proj/scrapy_proj/db_wapper/iafd_query.py
+++ b/scrapy_proj/scrapy_proj/db_wapper/iafd_query.py
@@ -61,7 +61,7 @@ class IAFDQuery(SQLiteDBHandler):
     # Query the list of hrefs matching the given filters
     def get_movies(self, **filters):
         try:
-            sql = f"SELECT href, title, id FROM {self.tbl_name_performers} WHERE 1=1"
+            sql = f"SELECT href, title, id FROM {self.tbl_name_movies} WHERE 1=1"
             params = []
 
             conditions = {
diff --git a/scrapy_proj/scrapy_proj/extensions/stats_extension.py b/scrapy_proj/scrapy_proj/extensions/stats_extension.py
index b12aa58..d339c71 100644
--- a/scrapy_proj/scrapy_proj/extensions/stats_extension.py
+++ b/scrapy_proj/scrapy_proj/extensions/stats_extension.py
@@ -49,23 +49,13 @@ class StatsExtension:
     def _export_stats(self, spider):
         # Get the current stats
         stats = self.stats.get_stats()
-
-        # Fix: calculate how long the spider has been running
-        start_time = stats.get('start_time')
-        if start_time:
-            # Convert the datetime object to a timestamp
-            start_timestamp = start_time.timestamp()
-            uptime = time.time() - start_timestamp
-        else:
-            uptime = 0
-
         # Build the stats summary
         stats_summary = {
-            't': datetime.now().strftime('%H:%M:%S'),
             'spider': self.spider_name,
-            'interval(s)': int(uptime),
-            'recv_cnt': stats.get('response_received_count', 0),
-            'total_req': stats.get('downloader/request_count', 0),
+            'scrapy_req': stats.get('downloader/request_count', 0),
+            'middle_req': stats.get('cloudscraper/request_count', 0),
+            'total_req': stats.get('cloudscraper/request_count', 0) + stats.get('downloader/request_count', 0),
+            'total_rsp': stats.get('downloader/response_count', 0),
             '200_cnt': stats.get('downloader/response_status_count/200', 0),
             '404_cnt': stats.get('downloader/response_status_count/404', 0),
             'log_err_cnt': stats.get('log_count/ERROR', 0)
diff --git a/scrapy_proj/scrapy_proj/middlewares.py b/scrapy_proj/scrapy_proj/middlewares.py
index fdfd870..a0651c4 100644
--- a/scrapy_proj/scrapy_proj/middlewares.py
+++ b/scrapy_proj/scrapy_proj/middlewares.py
@@ -102,14 +102,15 @@ class ScrapyProjDownloaderMiddleware:
 import cloudscraper
 from scrapy.http import TextResponse
-import datetime
+from datetime import datetime
+from urllib.parse import urlparse
 
 # Use cloudscraper as a proxy to request the target sites
 class CloudScraperMiddleware:
     def __init__(self, stats):
         self.scraper = cloudscraper.create_scraper()
         self.stats = stats  # injected stats collector
         # Domains that should be fetched through cloudscraper
-        self.target_domains = {'iafd.com', 'another-domain.com'}
+        self.target_domains = ['iafd.com', 'another-domain.com']
 
         # Set up the headers and the scraper
         self.ifad_headers = {
@@ -123,8 +124,19 @@ class CloudScraperMiddleware:
         )
 
     def process_request(self, request, spider):
+        hostname = urlparse(request.url).hostname or ''
+        matched = False
+        for domain in self.target_domains:
+            if domain in hostname:
+                matched = True
+                break
+
+        # Requests for non-target domains fall back to the default downloader
+        if not matched:
+            return None
+
         # Record the request start time
-        start_time = datetime.datetime.now()
+        start_time = datetime.now()
 
         try:
             # Send the request
@@ -135,18 +147,18 @@ class CloudScraperMiddleware:
             )
 
             # Calculate the request duration (milliseconds)
-            duration = (datetime.datetime.now() - start_time).total_seconds() * 1000
+            duration = (datetime.now() - start_time).total_seconds() * 1000
 
             # Update the stats
-            self.stats.inc_value('downloader/request_count')
-            self.stats.inc_value('downloader/request_method_count/GET')
-            self.stats.inc_value('downloader/request_bytes', len(str(request.headers)) + len(request.url))
+            self.stats.inc_value('cloudscraper/request_count')
+            self.stats.inc_value('cloudscraper/request_method_count/GET')
+            self.stats.inc_value('cloudscraper/request_bytes', len(str(request.headers)) + len(request.url))
 
-            self.stats.inc_value('downloader/response_count')
-            self.stats.inc_value(f'downloader/response_status_count/{response.status_code}')
-            self.stats.inc_value('downloader/response_bytes', len(response.content))
+            self.stats.inc_value('cloudscraper/response_count')
+            self.stats.inc_value(f'cloudscraper/response_status_count/{response.status_code}')
+            self.stats.inc_value('cloudscraper/response_bytes', len(response.content))
 
-            self.stats.set_value(f'response_received_count', self.stats.get_value('downloader/response_status_count/200', 0))
+            #self.stats.set_value(f'response_received_count', self.stats.get_value('cloudscraper/response_status_count/200', 0))
 
             # Build a Scrapy response object
             return TextResponse(
@@ -159,7 +171,7 @@ class CloudScraperMiddleware:
 
         except Exception as e:
             # Record the error
-            self.stats.inc_value('downloader/exception_count')
-            self.stats.inc_value(f'downloader/exception_type_count/{e.__class__.__name__}')
+            self.stats.inc_value('cloudscraper/exception_count')
+            self.stats.inc_value(f'cloudscraper/exception_type_count/{e.__class__.__name__}')
             spider.logger.error(f"CloudScraper request failed: {e}")
             return None  # fall back to the default downloader on failure
diff --git a/scrapy_proj/scrapy_proj/pipelines.py b/scrapy_proj/scrapy_proj/pipelines.py
index 18a3e4e..77015b3 100644
--- a/scrapy_proj/scrapy_proj/pipelines.py
+++ b/scrapy_proj/scrapy_proj/pipelines.py
@@ -61,23 +61,24 @@ class SQLitePipeline(SQLiteDBHandler):
 
     def process_item(self, item, spider):
         if isinstance(item, U001Item):
-            self._process_u001_item(item)
+            self._process_u001_item(item, spider)
         elif isinstance(item, Sis001Item):
-            self._process_sis001_item(item)
+            self._process_sis001_item(item, spider)
         elif isinstance(item, IAFDPersonItem):
-            self._process_iafd_person_item(item)
+            self._process_iafd_person_item(item, spider)
         elif isinstance(item, IAFDPersonDetailItem):
-            self._process_iafd_person_detail_item(item)
+            self._process_iafd_person_detail_item(item, spider)
         elif isinstance(item, IAFDMovieItem):
-            self._process_iafd_movie_item(item)
+            self._process_iafd_movie_item(item, spider)
         elif isinstance(item, IAFDMovieDetailItem):
-            self._process_iafd_movie_detail_item(item)
+            self._process_iafd_movie_detail_item(item, spider)
         return item
 
-    def _process_u001_item(self, item):
+    def _process_u001_item(self, item, spider):
+        logging.debug(f"insert one item. spider:{spider.name}")
         return self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url')
 
-    def _process_sis001_item(self, item):
+    def _process_sis001_item(self, item, spider):
         self.cursor.execute('''
             INSERT OR IGNORE INTO sis001_data
             (title, url, plate_name)
@@ -89,16 +90,16 @@ class SQLitePipeline(SQLiteDBHandler):
         ))
         self.conn.commit()
 
-    def _process_iafd_person_item(self, item):
+    def _process_iafd_person_item(self, item, spider):
         logging.info(f"deal with person item. {item}")
 
-    def _process_iafd_movie_item(self, item):
+    def _process_iafd_movie_item(self, item, spider):
         logging.info(f"deal with movie item. {item}")
 
-    def _process_iafd_person_detail_item(self, item):
+    def _process_iafd_person_detail_item(self, item, spider):
         logging.info(f"deal with person item. {item}")
 
-    def _process_iafd_movie_detail_item(self, item):
+    def _process_iafd_movie_detail_item(self, item, spider):
         logging.info(f"deal with movie item. {item}")
 
     def close_spider(self, spider):
diff --git a/scrapy_proj/scrapy_proj/settings.py b/scrapy_proj/scrapy_proj/settings.py
index b020070..0635ebe 100644
--- a/scrapy_proj/scrapy_proj/settings.py
+++ b/scrapy_proj/scrapy_proj/settings.py
@@ -34,7 +34,7 @@ CONCURRENT_REQUESTS_PER_DOMAIN = 1
 CONCURRENT_ITEMS = 100
 
 # Download delay
-DOWNLOAD_DELAY = 1
+DOWNLOAD_DELAY = 0.3
 
 # Enable the item pipelines
 ITEM_PIPELINES = {
diff --git a/scrapy_proj/scrapy_proj/spiders/iafd_spider.py b/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
index d28f324..25b7cd2 100644
--- a/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
+++ b/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
@@ -1,6 +1,5 @@
 import scrapy
 import re
-import logging
 
 from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
 from scrapy_proj.db_wapper.iafd_query import IAFDQuery
@@ -21,33 +20,41 @@ class IAFDSpider(scrapy.Spider):
     def __init__(self, debug='false', cmd='', update='0', *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
-        self.cmd_list = cmd
+        self.cmd_str = cmd
         self.update = int(update)
+        self.logger.info(f"debug mode: {self.debug}, cmd: {self.cmd_str}, update: {self.update}")
+
+        self.cmd_astro = 'astro'
+        self.cmd_birth = 'birth'
+        self.cmd_ethnic = 'ethnic'
+        self.cmd_dist = 'dist'
+        self.cmd_stu = 'stu'
+        self.cmd_performers = 'performers'
+        self.cmd_movies = 'movies'
+        self.cmd_list = [c for c in self.cmd_str.split(',') if c]
+        if len(self.cmd_list) == 0:
+            self.cmd_list = [self.cmd_astro, self.cmd_birth, self.cmd_ethnic, self.cmd_dist, self.cmd_stu, self.cmd_performers, self.cmd_movies]
 
     def start_requests(self):
-        # Fetch the performer lists by astrological sign
-        for astro in self.astro_list:
-            url = self.astr_base_url + astro
-            yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})
-            if self.debug:
-                break
+        # Dispatch according to the command words
+        if self.cmd_astro in self.cmd_list:
+            yield from self.start_astro()
 
         # Fetch the performer lists by birthday
-        for month in range(1, 13):
-            for day in range(1, 32):
-                url = self.birth_base_url.format(month=month, day=day)
-                yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
-            if self.debug:
-                break
+        if self.cmd_birth in self.cmd_list:
+            yield from self.start_birth()
 
         # Fetch the ethnicity list
-        yield scrapy.Request(self.ethnic_list_url, callback=self.parse_ethnic_list_page)
+        if self.cmd_ethnic in self.cmd_list:
+            yield scrapy.Request(self.ethnic_list_url, callback=self.parse_ethnic_list_page)
 
         # Fetch the distributors list
-        yield scrapy.Request(self.distributors_list_url, callback=self.parse_distributors_list_page)
+        if self.cmd_dist in self.cmd_list:
+            yield scrapy.Request(self.distributors_list_url, callback=self.parse_distributors_list_page)
 
         # Fetch the studios list
-        yield scrapy.Request(self.studios_list_url, callback=self.parse_studios_list_page)
+        if self.cmd_stu in self.cmd_list:
+            yield scrapy.Request(self.studios_list_url, callback=self.parse_studios_list_page)
 
         query_args = {}
         if self.debug:
@@ -56,23 +63,41 @@
             query_args['is_full_data'] = 0
 
         # Load the performers that still need updating
-        actors = db_tools.get_performers(**query_args)
-        if actors:
-            for item in actors:
-                href = item.get('href', '')
-                movies_cnt = item['movies_cnt'] if item['movies_cnt'] else 0
-                logging.info(f"fetch from db. item: {item}")
-                yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt})
+        if self.cmd_performers in self.cmd_list:
+            actors = db_tools.get_performers(**query_args)
+            if actors:
+                for item in actors:
+                    href = item.get('href', '')
+                    movies_cnt = item['movies_cnt'] if item['movies_cnt'] else 0
+                    self.logger.info(f"fetch from db. item: {item}")
+                    yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt})
 
         # Load the movies that still need updating
-        movies = db_tools.get_movies(**query_args)
-        if movies:
-            for item in movies:
-                href = item.get('href', '')
-                logging.info(f"fetch from db. item: {item}")
-                yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')})
+        if self.cmd_movies in self.cmd_list:
+            movies = db_tools.get_movies(**query_args)
+            if movies:
+                for item in movies:
+                    href = item.get('href', '')
+                    self.logger.info(f"fetch from db. item: {item}")
+                    yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')})
 
+    def start_astro(self):
+        # Fetch the performer lists by astrological sign
+        for astro in self.astro_list:
+            url = self.astr_base_url + astro
+            yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})
+            if self.debug:
+                break
+
+    def start_birth(self):
+        # Fetch the performer lists by birthday
+        for month in range(1, 13):
+            for day in range(1, 32):
+                url = self.birth_base_url.format(month=month, day=day)
+                yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
+            if self.debug:
+                break
+
     async def start(self):
         # Delegate to the original start_requests method
         async for request in super().start():
diff --git a/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py b/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py
index ae87757..94be268 100644
--- a/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py
+++ b/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py
@@ -27,6 +27,11 @@ class U001Spider(scrapy.Spider):
     allowed_domains = ["u001.25img.com"]
     start_urls = ["https://u001.25img.com/?p=1"]
 
+    def __init__(self, debug='False', *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
+        self.logger.info(f"debug mode: {self.debug}")
+
     def parse(self, response):
         for row in response.css('table.torrent-list tbody tr'):
             item = U001Item()
@@ -49,4 +54,7 @@ class U001Spider(scrapy.Spider):
         current_page = int(response.url.split('=')[-1])
         total_pages = int(response.css('script:contains("totalPages")').re_first(r'totalPages:\s*(\d+)'))
         if current_page < total_pages:
-            yield response.follow(f"?p={current_page + 1}", self.parse)
\ No newline at end of file
+            if self.debug and current_page >= 5:
+                self.logger.info("debug mode, stop crawling.")
+            else:
+                yield response.follow(f"?p={current_page + 1}", self.parse)
\ No newline at end of file
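
Usage note (not part of the patch): a minimal sketch of how the modified spiders might be launched with the new arguments. The class names and the debug/cmd/update parameters come from the diff above; the module import paths and the example cmd values are assumptions made for illustration.

# Hypothetical launch script, assuming it runs from the Scrapy project root
# so the project settings are picked up.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from scrapy_proj.spiders.iafd_spider import IAFDSpider
from scrapy_proj.spiders.u3c3_spider import U001Spider

process = CrawlerProcess(get_project_settings())

# Run only the 'astro' and 'performers' command words; an empty cmd falls back to all commands.
process.crawl(IAFDSpider, cmd='astro,performers', debug='0', update='0')

# In debug mode the u3c3 spider stops following pagination after page 5.
process.crawl(U001Spider, debug='1')

process.start()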