modify scripts

oscarz
2025-07-03 16:07:47 +08:00
parent ff49046212
commit c34cfb458c
7 changed files with 108 additions and 72 deletions

View File

@@ -61,7 +61,7 @@ class IAFDQuery(SQLiteDBHandler):
     # Query the list of hrefs by filter conditions
     def get_movies(self, **filters):
         try:
-            sql = f"SELECT href, title, id FROM {self.tbl_name_performers} WHERE 1=1"
+            sql = f"SELECT href, title, id FROM {self.tbl_name_movies} WHERE 1=1"
             params = []
             conditions = {
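get_movies builds its SQL dynamically from the passed-in filters, starting from a constant-true WHERE 1=1 clause and appending one parameterized condition per filter. A minimal sketch of that pattern follows; the concrete filter and column names (is_full_data, limit) are assumptions for illustration, not taken from the diff.

def build_movies_query(tbl_name, **filters):
    # Start from a condition that is always true so every filter can be appended with AND.
    sql = f"SELECT href, title, id FROM {tbl_name} WHERE 1=1"
    params = []
    if 'is_full_data' in filters:
        sql += " AND is_full_data = ?"
        params.append(filters['is_full_data'])
    if 'limit' in filters:
        sql += " LIMIT ?"
        params.append(filters['limit'])
    return sql, params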

View File

@@ -49,23 +49,13 @@ class StatsExtension:
     def _export_stats(self, spider):
         # Get the current stats
         stats = self.stats.get_stats()
-        # Fix: compute the spider's uptime
-        start_time = stats.get('start_time')
-        if start_time:
-            # Convert the datetime object to a timestamp
-            start_timestamp = start_time.timestamp()
-            uptime = time.time() - start_timestamp
-        else:
-            uptime = 0
         # Build the stats summary
         stats_summary = {
+            't': datetime.now().strftime('%H:%M:%S'),
             'spider': self.spider_name,
-            'interval(s)': int(uptime),
-            'recv_cnt': stats.get('response_received_count', 0),
-            'total_req': stats.get('downloader/request_count', 0),
-            'total_rsp': stats.get('downloader/response_count', 0),
+            'scrapy_req': stats.get('downloader/request_count', 0),
+            'middle_req': stats.get('cloudscraper/request_count', 0),
+            'total_req': stats.get('cloudscraper/request_count', 0) + stats.get('downloader/request_count', 0),
             '200_cnt': stats.get('downloader/response_status_count/200', 0),
             '404_cnt': stats.get('downloader/response_status_count/404', 0),
             'log_err_cnt': stats.get('log_count/ERROR', 0)
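The reworked summary drops the computed uptime and reports raw counters instead: scrapy_req for requests handled by the default downloader, middle_req for requests sent through the cloudscraper middleware (see the middleware change below), and total_req as their sum. For context, a hedged sketch of how a stats-export extension like this is typically registered with the crawler; the actual wiring of StatsExtension is not shown in this diff, and the EXTENSIONS path is an assumption.

from scrapy import signals

class StatsExtensionSketch:
    def __init__(self, stats):
        self.stats = stats
        self.spider_name = ''

    @classmethod
    def from_crawler(cls, crawler):
        # crawler.stats is the same collector that get_stats() reads from above.
        ext = cls(crawler.stats)
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def spider_opened(self, spider):
        self.spider_name = spider.name

    def spider_closed(self, spider):
        self._export_stats(spider)  # the method shown in the hunk above

# settings.py (module path assumed):
# EXTENSIONS = {'scrapy_proj.extensions.StatsExtension': 500}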

View File

@@ -102,14 +102,15 @@ class ScrapyProjDownloaderMiddleware:
 import cloudscraper
 from scrapy.http import TextResponse
-import datetime
+from datetime import datetime
+from urllib.parse import urlparse

 # Use cloudscraper as a proxy to request the site
 class CloudScraperMiddleware:
     def __init__(self, stats):
         self.scraper = cloudscraper.create_scraper()
         self.stats = stats  # Injected stats collector
         # Domains that must be fetched through cloudscraper
-        self.target_domains = {'iafd.com', 'another-domain.com'}
+        self.target_domains = ['iafd.com', 'another-domain.com']
         # Set up headers and the scraper
         self.ifad_headers = {
@@ -123,8 +124,19 @@ class CloudScraperMiddleware:
         )

     def process_request(self, request, spider):
+        hostname = urlparse(request.url).hostname or ''
+        matched = False
+        for domain in self.target_domains:
+            if domain in hostname:
+                matched = True
+                break
+        # Requests for non-target domains fall through to the default downloader
+        if not matched:
+            return None
         # Record the request start time
-        start_time = datetime.datetime.now()
+        start_time = datetime.now()
         try:
             # Send the request
@@ -135,18 +147,18 @@ class CloudScraperMiddleware:
             )
             # Compute the request duration (in milliseconds)
-            duration = (datetime.datetime.now() - start_time).total_seconds() * 1000
+            duration = (datetime.now() - start_time).total_seconds() * 1000
             # Update the stats
-            self.stats.inc_value('downloader/request_count')
-            self.stats.inc_value('downloader/request_method_count/GET')
-            self.stats.inc_value('downloader/request_bytes', len(str(request.headers)) + len(request.url))
-            self.stats.inc_value('downloader/response_count')
-            self.stats.inc_value(f'downloader/response_status_count/{response.status_code}')
-            self.stats.inc_value('downloader/response_bytes', len(response.content))
-            self.stats.set_value('response_received_count', self.stats.get_value('downloader/response_status_count/200', 0))
+            self.stats.inc_value('cloudscraper/request_count')
+            self.stats.inc_value('cloudscraper/request_method_count/GET')
+            self.stats.inc_value('cloudscraper/request_bytes', len(str(request.headers)) + len(request.url))
+            self.stats.inc_value('cloudscraper/response_count')
+            self.stats.inc_value(f'cloudscraper/response_status_count/{response.status_code}')
+            self.stats.inc_value('cloudscraper/response_bytes', len(response.content))
+            # self.stats.set_value('response_received_count', self.stats.get_value('cloudscraper/response_status_count/200', 0))

             # Build a Scrapy response object
             return TextResponse(
@@ -159,7 +171,7 @@ class CloudScraperMiddleware:
         except Exception as e:
             # Record the error
-            self.stats.inc_value('downloader/exception_count')
-            self.stats.inc_value(f'downloader/exception_type_count/{e.__class__.__name__}')
+            self.stats.inc_value('cloudscraper/exception_count')
+            self.stats.inc_value(f'cloudscraper/exception_type_count/{e.__class__.__name__}')
             spider.logger.error(f"CloudScraper request failed: {e}")
             return None  # Fall back to the default downloader on failure
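For the middleware to run and for its cloudscraper/* counters to reach the stats extension, it has to be enabled as a downloader middleware and constructed with the crawler's stats collector. A hedged sketch of that wiring; the settings path and priority are assumptions, and the real from_crawler hook exists in the file but outside this hunk.

# settings.py (module path and priority assumed)
DOWNLOADER_MIDDLEWARES = {
    'scrapy_proj.middlewares.CloudScraperMiddleware': 543,
}

# middlewares.py
class CloudScraperMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # crawler.stats is the same collector the StatsExtension reads from.
        return cls(crawler.stats)

As a side note on the new domain check, the explicit matched loop could equally be written as matched = any(domain in hostname for domain in self.target_domains).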

View File

@@ -61,23 +61,24 @@ class SQLitePipeline(SQLiteDBHandler):
     def process_item(self, item, spider):
         if isinstance(item, U001Item):
-            self._process_u001_item(item)
+            self._process_u001_item(item, spider)
         elif isinstance(item, Sis001Item):
-            self._process_sis001_item(item)
+            self._process_sis001_item(item, spider)
         elif isinstance(item, IAFDPersonItem):
-            self._process_iafd_person_item(item)
+            self._process_iafd_person_item(item, spider)
         elif isinstance(item, IAFDPersonDetailItem):
-            self._process_iafd_person_detail_item(item)
+            self._process_iafd_person_detail_item(item, spider)
         elif isinstance(item, IAFDMovieItem):
-            self._process_iafd_movie_item(item)
+            self._process_iafd_movie_item(item, spider)
         elif isinstance(item, IAFDMovieDetailItem):
-            self._process_iafd_movie_detail_item(item)
+            self._process_iafd_movie_detail_item(item, spider)
         return item

-    def _process_u001_item(self, item):
+    def _process_u001_item(self, item, spider):
+        logging.debug(f"insert one item. spider: {spider.name}")
         return self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url')

-    def _process_sis001_item(self, item):
+    def _process_sis001_item(self, item, spider):
         self.cursor.execute('''
             INSERT OR IGNORE INTO sis001_data
             (title, url, plate_name)
@@ -89,16 +90,16 @@ class SQLitePipeline(SQLiteDBHandler):
         ))
         self.conn.commit()

-    def _process_iafd_person_item(self, item):
+    def _process_iafd_person_item(self, item, spider):
         logging.info(f"deal with person item. {item}")

-    def _process_iafd_movie_item(self, item):
+    def _process_iafd_movie_item(self, item, spider):
         logging.info(f"deal with movie item. {item}")

-    def _process_iafd_person_detail_item(self, item):
+    def _process_iafd_person_detail_item(self, item, spider):
         logging.info(f"deal with person detail item. {item}")

-    def _process_iafd_movie_detail_item(self, item):
+    def _process_iafd_movie_detail_item(self, item, spider):
         logging.info(f"deal with movie detail item. {item}")

     def close_spider(self, spider):
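_process_u001_item delegates to insert_or_update_common, whose body is not part of this diff. For context only, a plausible sketch of a generic SQLite upsert keyed on a unique column (requires SQLite 3.24+); the column handling here is an assumption for illustration, not the pipeline's actual helper.

def insert_or_update_common_sketch(cursor, conn, item, tbl_name, uniq_key):
    # Build an INSERT ... ON CONFLICT upsert from the item's fields.
    columns = list(item.keys())
    placeholders = ', '.join('?' for _ in columns)
    updates = ', '.join(f"{col} = excluded.{col}" for col in columns if col != uniq_key)
    sql = (
        f"INSERT INTO {tbl_name} ({', '.join(columns)}) VALUES ({placeholders}) "
        f"ON CONFLICT({uniq_key}) DO UPDATE SET {updates}"
    )
    cursor.execute(sql, [item[col] for col in columns])
    conn.commit()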

View File

@@ -34,7 +34,7 @@ CONCURRENT_REQUESTS_PER_DOMAIN = 1
 CONCURRENT_ITEMS = 100
 # Download delay
-DOWNLOAD_DELAY = 1
+DOWNLOAD_DELAY = 0.3
 # Enable item pipelines
 ITEM_PIPELINES = {
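Lowering DOWNLOAD_DELAY from 1 to 0.3 roughly triples the per-domain request rate, since CONCURRENT_REQUESTS_PER_DOMAIN is 1 here. For reference, two related Scrapy settings that interact with this delay; they are not part of this commit.

DOWNLOAD_DELAY = 0.3
RANDOMIZE_DOWNLOAD_DELAY = True   # Scrapy default: actual wait is 0.5x-1.5x of DOWNLOAD_DELAY
# AUTOTHROTTLE_ENABLED = True     # optional: let AutoThrottle adjust delays from observed latency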

View File

@@ -1,6 +1,5 @@
 import scrapy
 import re
-import logging
 from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
 from scrapy_proj.db_wapper.iafd_query import IAFDQuery
@@ -21,33 +20,41 @@ class IAFDSpider(scrapy.Spider):
     def __init__(self, debug='false', cmd='', update='0', *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
-        self.cmd_list = cmd
+        self.cmd_str = cmd
         self.update = int(update)
+        self.logger.info(f"debug mode: {self.debug}, cmd: {self.cmd_str}, update: {self.update}")
+        self.cmd_astro = 'astro'
+        self.cmd_birth = 'birth'
+        self.cmd_ethnic = 'ethnic'
+        self.cmd_dist = 'dist'
+        self.cmd_stu = 'stu'
+        self.cmd_performers = 'performers'
+        self.cmd_movies = 'movies'
+        self.cmd_list = [c for c in self.cmd_str.split(',') if c]
+        if not self.cmd_list:
+            self.cmd_list = [self.cmd_astro, self.cmd_birth, self.cmd_ethnic, self.cmd_dist, self.cmd_stu, self.cmd_performers, self.cmd_movies]

     def start_requests(self):
-        # Get the performer list by zodiac sign
-        for astro in self.astro_list:
-            url = self.astr_base_url + astro
-            yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})
-            if self.debug:
-                break
+        # Dispatch according to the requested commands
+        if self.cmd_astro in self.cmd_list:
+            yield from self.start_astro()
         # Get the performer list by birthday
-        for month in range(1, 13):
-            for day in range(1, 32):
-                url = self.birth_base_url.format(month=month, day=day)
-                yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
-            if self.debug:
-                break
+        if self.cmd_birth in self.cmd_list:
+            yield from self.start_birth()
         # Get the ethnicity list
-        yield scrapy.Request(self.ethnic_list_url, callback=self.parse_ethnic_list_page)
+        if self.cmd_ethnic in self.cmd_list:
+            yield scrapy.Request(self.ethnic_list_url, callback=self.parse_ethnic_list_page)
         # Get the distributors list
-        yield scrapy.Request(self.distributors_list_url, callback=self.parse_distributors_list_page)
+        if self.cmd_dist in self.cmd_list:
+            yield scrapy.Request(self.distributors_list_url, callback=self.parse_distributors_list_page)
         # Get the studios list
-        yield scrapy.Request(self.studios_list_url, callback=self.parse_studios_list_page)
+        if self.cmd_stu in self.cmd_list:
+            yield scrapy.Request(self.studios_list_url, callback=self.parse_studios_list_page)

         query_args = {}
         if self.debug:
@@ -56,23 +63,41 @@ class IAFDSpider(scrapy.Spider):
             query_args['is_full_data'] = 0

         # Read the list of performers pending update
-        actors = db_tools.get_performers(**query_args)
-        if actors:
-            for item in actors:
-                href = item.get('href', '')
-                movies_cnt = item['movies_cnt'] if item['movies_cnt'] else 0
-                logging.info(f"fetch from db. item: {item}")
-                yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt})
+        if self.cmd_performers in self.cmd_list:
+            actors = db_tools.get_performers(**query_args)
+            if actors:
+                for item in actors:
+                    href = item.get('href', '')
+                    movies_cnt = item['movies_cnt'] if item['movies_cnt'] else 0
+                    self.logger.info(f"fetch from db. item: {item}")
+                    yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt})

         # Read the list of movies pending update
-        movies = db_tools.get_movies(**query_args)
-        if movies:
-            for item in movies:
-                href = item.get('href', '')
-                logging.info(f"fetch from db. item: {item}")
-                yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')})
+        if self.cmd_movies in self.cmd_list:
+            movies = db_tools.get_movies(**query_args)
+            if movies:
+                for item in movies:
+                    href = item.get('href', '')
+                    self.logger.info(f"fetch from db. item: {item}")
+                    yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')})

+    def start_astro(self):
+        # Get the performer list by zodiac sign
+        for astro in self.astro_list:
+            url = self.astr_base_url + astro
+            yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})
+            if self.debug:
+                break

+    def start_birth(self):
+        for month in range(1, 13):
+            for day in range(1, 32):
+                url = self.birth_base_url.format(month=month, day=day)
+                yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
+            if self.debug:
+                break

     async def start(self):
         # Call the original start_requests method
         async for request in super().start():
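The new cmd argument selects which start-up branches run. Since Scrapy passes -a arguments to the spider's __init__, the spider can be driven from the command line (e.g. scrapy crawl <spider_name> -a cmd=performers,movies -a debug=true -a update=1) or programmatically. A hedged sketch of the programmatic form; the module path and import are assumptions.

from scrapy.crawler import CrawlerProcess
from scrapy_proj.spiders.iafd import IAFDSpider  # module path assumed

process = CrawlerProcess()
# Only the performers and movies branches of start_requests will run.
process.crawl(IAFDSpider, cmd='performers,movies', debug='true', update='1')
process.start()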

View File

@@ -27,6 +27,11 @@ class U001Spider(scrapy.Spider):
     allowed_domains = ["u001.25img.com"]
     start_urls = ["https://u001.25img.com/?p=1"]

+    def __init__(self, debug='False', *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
+        self.logger.info(f"debug mode: {self.debug}")

     def parse(self, response):
         for row in response.css('table.torrent-list tbody tr'):
             item = U001Item()
@@ -49,4 +54,7 @@ class U001Spider(scrapy.Spider):
         current_page = int(response.url.split('=')[-1])
         total_pages = int(response.css('script:contains("totalPages")').re_first(r'totalPages:\s*(\d+)'))
         if current_page < total_pages:
-            yield response.follow(f"?p={current_page + 1}", self.parse)
+            if self.debug and current_page >= 5:
+                self.logger.info("debug mode, stop crawling.")
+            else:
+                yield response.follow(f"?p={current_page + 1}", self.parse)
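One caveat in the surrounding context lines: re_first returns None when the totalPages script block is missing, and int(None) raises a TypeError. A defensive variant, not part of this commit, would guard before converting:

total_pages_raw = response.css('script:contains("totalPages")').re_first(r'totalPages:\s*(\d+)')
total_pages = int(total_pages_raw) if total_pages_raw else 0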