modify scripts

oscarz
2025-07-03 16:07:47 +08:00
parent ff49046212
commit c34cfb458c
7 changed files with 108 additions and 72 deletions

View File

@@ -61,7 +61,7 @@ class IAFDQuery(SQLiteDBHandler):
# Query the href list by filter conditions
def get_movies(self, **filters):
try:
sql = f"SELECT href, title, id FROM {self.tbl_name_performers} WHERE 1=1"
sql = f"SELECT href, title, id FROM {self.tbl_name_movies} WHERE 1=1"
params = []
conditions = {

View File

@@ -49,23 +49,13 @@ class StatsExtension:
def _export_stats(self, spider):
# Get the current stats snapshot
stats = self.stats.get_stats()
# Fix: compute how long the spider has been running
start_time = stats.get('start_time')
if start_time:
# Convert the datetime object to a timestamp
start_timestamp = start_time.timestamp()
uptime = time.time() - start_timestamp
else:
uptime = 0
# Build the stats summary
stats_summary = {
't': datetime.now().strftime('%H:%M:%S'),
'spider': self.spider_name,
'interval(s)': int(uptime),
'recv_cnt': stats.get('response_received_count', 0),
'total_req': stats.get('downloader/request_count', 0),
'scrapy_req': stats.get('downloader/request_count', 0),
'middle_req': stats.get('cloudscraper/request_count', 0),
'total_req': stats.get('cloudscraper/request_count', 0) + stats.get('downloader/request_count', 0),
'total_rsp': stats.get('downloader/response_count', 0),
'200_cnt': stats.get('downloader/response_status_count/200', 0),
'404_cnt': stats.get('downloader/response_status_count/404', 0),
'log_err_cnt': stats.get('log_count/ERROR', 0)
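For context, a minimal sketch of how an extension like this is usually registered and wired to crawler signals; the class name, module path, and signal choice below are illustrative assumptions, not the project's actual wiring.

from scrapy import signals

class StatsExportSketch:
    # Sketch only: the from_crawler/signal plumbing a stats-export extension
    # such as StatsExtension typically relies on.
    def __init__(self, stats):
        self.stats = stats
        self.spider_name = ''

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls(crawler.stats)
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def spider_opened(self, spider):
        self.spider_name = spider.name

    def spider_closed(self, spider):
        # Dump the final counters once the crawl finishes
        spider.logger.info(self.stats.get_stats())

It would be enabled with something like EXTENSIONS = {"scrapy_proj.extensions.StatsExportSketch": 500} in settings.py (the dotted path is assumed).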

View File

@@ -102,14 +102,15 @@ class ScrapyProjDownloaderMiddleware:
import cloudscraper
from scrapy.http import TextResponse
import datetime
from datetime import datetime
from urllib.parse import urlparse
# Use cloudscraper to request the target sites on the spider's behalf
class CloudScraperMiddleware:
def __init__(self, stats):
self.scraper = cloudscraper.create_scraper()
self.stats = stats # injected stats collector
# Domains that should go through cloudscraper
self.target_domains = {'iafd.com', 'another-domain.com'}
self.target_domains = ['iafd.com', 'another-domain.com']
# Set up headers and the scraper
self.ifad_headers = {
@@ -123,8 +124,19 @@ class CloudScraperMiddleware:
)
def process_request(self, request, spider):
hostname = urlparse(request.url).hostname or ''
matched = False
for domain in self.target_domains:
if domain in hostname:
matched = True
break
# Requests to non-target domains fall through to the default downloader
if not matched:
return None
# Record the request start time
start_time = datetime.datetime.now()
start_time = datetime.now()
try:
# Send the request
@@ -135,18 +147,18 @@
)
# Compute the request duration in milliseconds
duration = (datetime.datetime.now() - start_time).total_seconds() * 1000
duration = (datetime.now() - start_time).total_seconds() * 1000
# Update the stats counters
self.stats.inc_value('downloader/request_count')
self.stats.inc_value('downloader/request_method_count/GET')
self.stats.inc_value('downloader/request_bytes', len(str(request.headers)) + len(request.url))
self.stats.inc_value('cloudscraper/request_count')
self.stats.inc_value('cloudscraper/request_method_count/GET')
self.stats.inc_value('cloudscraper/request_bytes', len(str(request.headers)) + len(request.url))
self.stats.inc_value('downloader/response_count')
self.stats.inc_value(f'downloader/response_status_count/{response.status_code}')
self.stats.inc_value('downloader/response_bytes', len(response.content))
self.stats.inc_value('cloudscraper/response_count')
self.stats.inc_value(f'cloudscraper/response_status_count/{response.status_code}')
self.stats.inc_value('cloudscraper/response_bytes', len(response.content))
self.stats.set_value(f'response_received_count', self.stats.get_value('downloader/response_status_count/200', 0))
#self.stats.set_value(f'response_received_count', self.stats.get_value('cloudscraper/response_status_count/200', 0))
# Build a Scrapy response object
return TextResponse(
@@ -159,7 +171,7 @@ class CloudScraperMiddleware:
except Exception as e:
# Record the error
self.stats.inc_value('downloader/exception_count')
self.stats.inc_value(f'downloader/exception_type_count/{e.__class__.__name__}')
self.stats.inc_value('cloudscraper/exception_count')
self.stats.inc_value(f'cloudscraper/exception_type_count/{e.__class__.__name__}')
spider.logger.error(f"CloudScraper request failed: {e}")
return None # fall back to the default downloader on failure
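Returning a TextResponse from process_request makes Scrapy skip its built-in downloader for that request, which is why the middleware keeps its own cloudscraper/* counters. A sketch of how it would be enabled in settings.py, assuming the class lives in the project's middlewares module (adjust the dotted path if not):

# settings.py (sketch)
DOWNLOADER_MIDDLEWARES = {
    "scrapy_proj.middlewares.CloudScraperMiddleware": 543,
}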

View File

@@ -61,23 +61,24 @@ class SQLitePipeline(SQLiteDBHandler):
def process_item(self, item, spider):
if isinstance(item, U001Item):
self._process_u001_item(item)
self._process_u001_item(item, spider)
elif isinstance(item, Sis001Item):
self._process_sis001_item(item)
self._process_sis001_item(item, spider)
elif isinstance(item, IAFDPersonItem):
self._process_iafd_person_item(item)
self._process_iafd_person_item(item, spider)
elif isinstance(item, IAFDPersonDetailItem):
self._process_iafd_person_detail_item(item)
self._process_iafd_person_detail_item(item, spider)
elif isinstance(item, IAFDMovieItem):
self._process_iafd_movie_item(item)
self._process_iafd_movie_item(item, spider)
elif isinstance(item, IAFDMovieDetailItem):
self._process_iafd_movie_detail_item(item)
self._process_iafd_movie_detail_item(item, spider)
return item
def _process_u001_item(self, item):
def _process_u001_item(self, item, spider):
logging.debug(f"insert one item. spider: {spider.name}")
return self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url')
def _process_sis001_item(self, item):
def _process_sis001_item(self, item, spider):
self.cursor.execute('''
INSERT OR IGNORE INTO sis001_data
(title, url, plate_name)
@@ -89,16 +90,16 @@ class SQLitePipeline(SQLiteDBHandler):
))
self.conn.commit()
def _process_iafd_person_item(self, item):
def _process_iafd_person_item(self, item, spider):
logging.info(f"deal with persion item. {item}")
def _process_iafd_movie_item(self, item):
def _process_iafd_movie_item(self, item, spider):
logging.info(f"deal with movie item. {item}")
def _process_iafd_person_detail_item(self, item):
def _process_iafd_person_detail_item(self, item, spider):
logging.info(f"deal with persion item. {item}")
def _process_iafd_movie_detail_item(self, item):
def _process_iafd_movie_detail_item(self, item, spider):
logging.info(f"deal with movie item. {item}")
def close_spider(self, spider):
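_process_u001_item delegates to insert_or_update_common from the SQLiteDBHandler base class, which this diff does not show. A sketch of an upsert consistent with how it is called, assuming the table has a UNIQUE constraint on the uniq_key column and SQLite 3.24+ (the real helper may differ):

def insert_or_update_common(self, item, tbl_name, uniq_key):
    # Sketch only: upsert every item field, keyed on uniq_key.
    cols = list(item.keys())
    placeholders = ', '.join('?' for _ in cols)
    updates = ', '.join(f"{c}=excluded.{c}" for c in cols if c != uniq_key)
    sql = (
        f"INSERT INTO {tbl_name} ({', '.join(cols)}) VALUES ({placeholders}) "
        f"ON CONFLICT({uniq_key}) DO UPDATE SET {updates}"
    )
    self.cursor.execute(sql, [item[c] for c in cols])
    self.conn.commit()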

View File

@@ -34,7 +34,7 @@ CONCURRENT_REQUESTS_PER_DOMAIN = 1
CONCURRENT_ITEMS = 100
# Download delay
DOWNLOAD_DELAY = 1
DOWNLOAD_DELAY = 0.3
# Enable item pipelines
ITEM_PIPELINES = {

View File

@@ -1,6 +1,5 @@
import scrapy
import re
import logging
from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
from scrapy_proj.db_wapper.iafd_query import IAFDQuery
@@ -21,33 +20,41 @@ class IAFDSpider(scrapy.Spider):
def __init__(self, debug='false', cmd='', update='0', *args, **kwargs):
super().__init__(*args, **kwargs)
self.debug = str(debug).lower() in ('true', '1')
self.cmd_list = cmd
self.cmd_str = cmd
self.update = int(update)
self.logger.info(f"debug mod: {self.debug}, cmd: {self.cmd_str}, update: {self.update}")
self.cmd_astro = 'astro'
self.cmd_birth = 'birth'
self.cmd_ethnic = 'ethnic'
self.cmd_dist = 'dist'
self.cmd_stu = 'stu'
self.cmd_performers = 'performers'
self.cmd_movies = 'movies'
self.cmd_list = [c for c in self.cmd_str.split(',') if c]
if not self.cmd_list:
self.cmd_list = [self.cmd_astro, self.cmd_birth, self.cmd_ethnic, self.cmd_dist, self.cmd_stu, self.cmd_performers, self.cmd_movies]
def start_requests(self):
# Fetch performer lists by zodiac sign
for astro in self.astro_list:
url = self.astr_base_url + astro
yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})
if self.debug:
break
# Dispatch according to the command words
if self.cmd_astro in self.cmd_list:
yield from self.start_astro()
# Fetch performer lists by birthday
for month in range(1, 13):
for day in range(1, 32):
url = self.birth_base_url.format(month=month, day=day)
yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
if self.debug:
break
if self.cmd_birth in self.cmd_list:
yield from self.start_birth()
# Fetch the ethnicity list
yield scrapy.Request(self.ethnic_list_url, callback=self.parse_ethnic_list_page)
if self.cmd_ethnic in self.cmd_list:
yield scrapy.Request(self.ethnic_list_url, callback=self.parse_ethnic_list_page)
# Fetch the distributors list
yield scrapy.Request(self.distributors_list_url, callback=self.parse_distributors_list_page)
if self.cmd_dist in self.cmd_list:
yield scrapy.Request(self.distributors_list_url, callback=self.parse_distributors_list_page)
# Fetch the studios list
yield scrapy.Request(self.studios_list_url, callback=self.parse_studios_list_page)
if self.cmd_stu in self.cmd_list:
yield scrapy.Request(self.studios_list_url, callback=self.parse_studios_list_page)
query_args = {}
if self.debug:
@@ -56,23 +63,41 @@ class IAFDSpider(scrapy.Spider):
query_args['is_full_data'] = 0
# Read the performers pending update from the DB
actors = db_tools.get_performers(**query_args)
if actors:
for item in actors:
href = item.get('href', '')
movies_cnt = item['movies_cnt'] if item['movies_cnt'] else 0
logging.info(f"fetch from db. item: {item}")
yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt})
if self.cmd_performers in self.cmd_list:
actors = db_tools.get_performers(**query_args)
if actors:
for item in actors:
href = item.get('href', '')
movies_cnt = item['movies_cnt'] if item['movies_cnt'] else 0
self.logger.info(f"fetch from db. item: {item}")
yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt})
# Read the movies pending update from the DB
movies = db_tools.get_movies(**query_args)
if movies:
for item in movies:
href = item.get('href', '')
logging.info(f"fetch from db. item: {item}")
yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')})
if self.cmd_movies in self.cmd_list:
movies = db_tools.get_movies(**query_args)
if movies:
for item in movies:
href = item.get('href', '')
self.logger.info(f"fetch from db. item: {item}")
yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')})
def start_astro(self):
# Fetch performer lists by zodiac sign
for astro in self.astro_list:
url = self.astr_base_url + astro
yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})
if self.debug:
break
def start_birth(self):
for month in range(1, 13):
for day in range(1, 32):
url = self.birth_base_url.format(month=month, day=day)
yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
if self.debug:
break
async def start(self):
# Delegate to the original start_requests method
async for request in super().start():
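For reference, one way the new cmd/debug/update arguments could be passed when launching the spider programmatically; the spider's import path below is an assumption, not taken from this diff.

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from scrapy_proj.spiders.iafd import IAFDSpider  # assumed module path

process = CrawlerProcess(get_project_settings())
# Equivalent to: scrapy crawl iafd -a cmd=performers,movies -a debug=1 -a update=0
process.crawl(IAFDSpider, cmd='performers,movies', debug='1', update='0')
process.start()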

View File

@@ -27,6 +27,11 @@ class U001Spider(scrapy.Spider):
allowed_domains = ["u001.25img.com"]
start_urls = ["https://u001.25img.com/?p=1"]
def __init__(self, debug='False', *args, **kwargs):
super().__init__(*args, **kwargs)
self.debug = str(debug).lower() in ('true', '1')
self.logger.info(f"debug mod: {self.debug}")
def parse(self, response):
for row in response.css('table.torrent-list tbody tr'):
item = U001Item()
@@ -49,4 +54,7 @@ class U001Spider(scrapy.Spider):
current_page = int(response.url.split('=')[-1])
total_pages = int(response.css('script:contains("totalPages")').re_first(r'totalPages:\s*(\d+)'))
if current_page < total_pages:
yield response.follow(f"?p={current_page + 1}", self.parse)
if self.debug and current_page >= 5:
self.logger.info(f"debug mod. stop crawling.")
else:
yield response.follow(f"?p={current_page + 1}", self.parse)