modify scripts
@@ -61,7 +61,7 @@ class IAFDQuery(SQLiteDBHandler):
     # Query the href list by filter conditions
     def get_movies(self, **filters):
         try:
-            sql = f"SELECT href, title, id FROM {self.tbl_name_performers} WHERE 1=1"
+            sql = f"SELECT href, title, id FROM {self.tbl_name_movies} WHERE 1=1"
             params = []

             conditions = {
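The one-line change above fixes get_movies() reading from the performers table instead of the movies table. A hypothetical usage sketch of the corrected helper; the no-argument constructor and the is_full_data filter name are assumptions borrowed from the spider code further down:

    db_tools = IAFDQuery()
    # Rows behave like dicts, as the spider's item.get('href', '') calls suggest.
    for row in db_tools.get_movies(is_full_data=0):
        print(row['href'], row['title'], row['id'])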
@@ -49,23 +49,13 @@ class StatsExtension:
     def _export_stats(self, spider):
         # Grab the current stats
         stats = self.stats.get_stats()

-        # Fix: compute the spider's uptime
-        start_time = stats.get('start_time')
-        if start_time:
-            # Convert the datetime object to a timestamp
-            start_timestamp = start_time.timestamp()
-            uptime = time.time() - start_timestamp
-        else:
-            uptime = 0
-
         # Build the stats summary
         stats_summary = {
             't': datetime.now().strftime('%H:%M:%S'),
             'spider': self.spider_name,
-            'interval(s)': int(uptime),
-            'total_req': stats.get('downloader/request_count', 0),
+            'recv_cnt': stats.get('response_received_count', 0),
+            'scrapy_req': stats.get('downloader/request_count', 0),
+            'middle_req': stats.get('cloudscraper/request_count', 0),
+            'total_req': stats.get('cloudscraper/request_count', 0) + stats.get('downloader/request_count', 0),
             'total_rsp': stats.get('downloader/response_count', 0),
             '200_cnt': stats.get('downloader/response_status_count/200', 0),
             '404_cnt': stats.get('downloader/response_status_count/404', 0),
             'log_err_cnt': stats.get('log_count/ERROR', 0)
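For context, a minimal sketch of how an extension like this is typically handed the stats object; crawler.stats and signals.connect are standard Scrapy API, but the constructor shape and the signal used are assumptions (given the interval-style output, the project may well export on a periodic timer instead):

    from scrapy import signals

    class StatsExtension:
        def __init__(self, stats, spider_name=''):
            self.stats = stats
            self.spider_name = spider_name

        @classmethod
        def from_crawler(cls, crawler):
            ext = cls(crawler.stats)
            # Export once at shutdown; relies on the _export_stats shown above.
            crawler.signals.connect(ext._export_stats, signal=signals.spider_closed)
            return ext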
@@ -102,14 +102,15 @@ class ScrapyProjDownloaderMiddleware:

 import cloudscraper
 from scrapy.http import TextResponse
-import datetime
+from datetime import datetime
+from urllib.parse import urlparse

 # Use cloudscraper as a proxy downloader for the target sites
 class CloudScraperMiddleware:
     def __init__(self, stats):
         self.scraper = cloudscraper.create_scraper()
         self.stats = stats  # injected stats collector
         # Domains that should be fetched through cloudscraper
-        self.target_domains = {'iafd.com', 'another-domain.com'}
+        self.target_domains = ['iafd.com', 'another-domain.com']

         # Set up the headers and scraper
         self.ifad_headers = {
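Since the constructor takes a stats argument, the middleware presumably exposes the usual from_crawler hook and is registered in settings; the module path and priority value below are assumptions:

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.stats)

    # settings.py (hypothetical path and priority)
    DOWNLOADER_MIDDLEWARES = {
        'scrapy_proj.middlewares.CloudScraperMiddleware': 543,
    }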
@@ -123,8 +124,19 @@ class CloudScraperMiddleware:
         )

     def process_request(self, request, spider):
+        hostname = urlparse(request.url).hostname or ''
+        matched = False
+        for domain in self.target_domains:
+            if domain in hostname:
+                matched = True
+                break
+
+        # Requests for non-target domains fall through to the default downloader
+        if not matched:
+            return None
+
         # Record the request start time
-        start_time = datetime.datetime.now()
+        start_time = datetime.now()

         try:
             # Send the request
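One caveat with the new matching loop: `domain in hostname` is a substring test, so 'iafd.com' would also match a host like 'notiafd.com'. A stricter variant, should that ever matter:

    matched = any(
        hostname == domain or hostname.endswith('.' + domain)
        for domain in self.target_domains
    )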
@@ -135,18 +147,18 @@ class CloudScraperMiddleware:
             )

             # Compute the request duration (ms)
-            duration = (datetime.datetime.now() - start_time).total_seconds() * 1000
+            duration = (datetime.now() - start_time).total_seconds() * 1000

             # Update the stats
-            self.stats.inc_value('downloader/request_count')
-            self.stats.inc_value('downloader/request_method_count/GET')
-            self.stats.inc_value('downloader/request_bytes', len(str(request.headers)) + len(request.url))
+            self.stats.inc_value('cloudscraper/request_count')
+            self.stats.inc_value('cloudscraper/request_method_count/GET')
+            self.stats.inc_value('cloudscraper/request_bytes', len(str(request.headers)) + len(request.url))

-            self.stats.inc_value('downloader/response_count')
-            self.stats.inc_value(f'downloader/response_status_count/{response.status_code}')
-            self.stats.inc_value('downloader/response_bytes', len(response.content))
+            self.stats.inc_value('cloudscraper/response_count')
+            self.stats.inc_value(f'cloudscraper/response_status_count/{response.status_code}')
+            self.stats.inc_value('cloudscraper/response_bytes', len(response.content))

-            self.stats.set_value('response_received_count', self.stats.get_value('downloader/response_status_count/200', 0))
+            #self.stats.set_value('response_received_count', self.stats.get_value('cloudscraper/response_status_count/200', 0))

             # Create a Scrapy response object
             return TextResponse(
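The TextResponse construction is truncated by the hunk; given the cloudscraper response object in scope, the argument list presumably resembles the following (the keyword values are assumptions, though the parameters themselves are real TextResponse arguments):

    return TextResponse(
        url=request.url,
        status=response.status_code,
        body=response.content,
        encoding='utf-8',
        request=request,
    )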
@@ -159,7 +171,7 @@ class CloudScraperMiddleware:

         except Exception as e:
             # Record the error
-            self.stats.inc_value('downloader/exception_count')
-            self.stats.inc_value(f'downloader/exception_type_count/{e.__class__.__name__}')
+            self.stats.inc_value('cloudscraper/exception_count')
+            self.stats.inc_value(f'cloudscraper/exception_type_count/{e.__class__.__name__}')
             spider.logger.error(f"CloudScraper request failed: {e}")
             return None  # fall back to the default downloader on failure
@@ -61,23 +61,24 @@ class SQLitePipeline(SQLiteDBHandler):

     def process_item(self, item, spider):
         if isinstance(item, U001Item):
-            self._process_u001_item(item)
+            self._process_u001_item(item, spider)
         elif isinstance(item, Sis001Item):
-            self._process_sis001_item(item)
+            self._process_sis001_item(item, spider)
         elif isinstance(item, IAFDPersonItem):
-            self._process_iafd_person_item(item)
+            self._process_iafd_person_item(item, spider)
         elif isinstance(item, IAFDPersonDetailItem):
-            self._process_iafd_person_detail_item(item)
+            self._process_iafd_person_detail_item(item, spider)
         elif isinstance(item, IAFDMovieItem):
-            self._process_iafd_movie_item(item)
+            self._process_iafd_movie_item(item, spider)
         elif isinstance(item, IAFDMovieDetailItem):
-            self._process_iafd_movie_detail_item(item)
+            self._process_iafd_movie_detail_item(item, spider)
         return item

-    def _process_u001_item(self, item):
+    def _process_u001_item(self, item, spider):
+        logging.debug(f"insert one item. spider: {spider.name}")
         return self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url')

-    def _process_sis001_item(self, item):
+    def _process_sis001_item(self, item, spider):
         self.cursor.execute('''
             INSERT OR IGNORE INTO sis001_data
             (title, url, plate_name)
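With six isinstance branches and counting, a dispatch table is a natural alternative; a sketch of the same routing (not part of the commit):

    HANDLERS = {
        U001Item: '_process_u001_item',
        Sis001Item: '_process_sis001_item',
        IAFDPersonItem: '_process_iafd_person_item',
        IAFDPersonDetailItem: '_process_iafd_person_detail_item',
        IAFDMovieItem: '_process_iafd_movie_item',
        IAFDMovieDetailItem: '_process_iafd_movie_detail_item',
    }

    def process_item(self, item, spider):
        # Look up the handler by the item's concrete type.
        handler = HANDLERS.get(type(item))
        if handler:
            getattr(self, handler)(item, spider)
        return item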
@@ -89,16 +90,16 @@ class SQLitePipeline(SQLiteDBHandler):
         ))
         self.conn.commit()

-    def _process_iafd_person_item(self, item):
+    def _process_iafd_person_item(self, item, spider):
         logging.info(f"deal with person item. {item}")

-    def _process_iafd_movie_item(self, item):
+    def _process_iafd_movie_item(self, item, spider):
         logging.info(f"deal with movie item. {item}")

-    def _process_iafd_person_detail_item(self, item):
+    def _process_iafd_person_detail_item(self, item, spider):
         logging.info(f"deal with person detail item. {item}")

-    def _process_iafd_movie_detail_item(self, item):
+    def _process_iafd_movie_detail_item(self, item, spider):
         logging.info(f"deal with movie detail item. {item}")

     def close_spider(self, spider):
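close_spider is cut off by the hunk; for a pipeline holding a SQLite connection it presumably just flushes and closes, along the lines of:

    def close_spider(self, spider):
        # Commit any pending writes, then release the connection.
        self.conn.commit()
        self.conn.close()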
@@ -34,7 +34,7 @@ CONCURRENT_REQUESTS_PER_DOMAIN = 1
 CONCURRENT_ITEMS = 100

 # Download delay
-DOWNLOAD_DELAY = 1
+DOWNLOAD_DELAY = 0.3

 # Enable the item pipelines
 ITEM_PIPELINES = {
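Since Scrapy's RANDOMIZE_DOWNLOAD_DELAY defaults to True, the new value means an actual per-domain wait of roughly 0.15-0.45 s:

    DOWNLOAD_DELAY = 0.3
    RANDOMIZE_DOWNLOAD_DELAY = True  # Scrapy default: wait 0.5x-1.5x the delay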
@@ -1,6 +1,5 @@
 import scrapy
 import re
-import logging
 from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
 from scrapy_proj.db_wapper.iafd_query import IAFDQuery

@@ -21,33 +20,41 @@ class IAFDSpider(scrapy.Spider):
     def __init__(self, debug='false', cmd='', update='0', *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
-        self.cmd_list = cmd
+        self.cmd_str = cmd
         self.update = int(update)
         self.logger.info(f"debug mode: {self.debug}, cmd: {self.cmd_str}, update: {self.update}")

+        self.cmd_astro = 'astro'
+        self.cmd_birth = 'birth'
+        self.cmd_ethnic = 'ethnic'
+        self.cmd_dist = 'dist'
+        self.cmd_stu = 'stu'
+        self.cmd_performers = 'performers'
+        self.cmd_movies = 'movies'
+        self.cmd_list = self.cmd_str.split(',')
+        if not self.cmd_str:
+            self.cmd_list = [self.cmd_astro, self.cmd_birth, self.cmd_ethnic, self.cmd_dist, self.cmd_stu, self.cmd_performers, self.cmd_movies]

     def start_requests(self):
-        # Fetch performer lists by astrological sign
-        for astro in self.astro_list:
-            url = self.astr_base_url + astro
-            yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})
-            if self.debug:
-                break
+        # Dispatch according to the command words
+        if self.cmd_astro in self.cmd_list:
+            yield from self.start_astro()

-        # Fetch performer lists by birthday
-        for month in range(1, 13):
-            for day in range(1, 32):
-                url = self.birth_base_url.format(month=month, day=day)
-                yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
-                if self.debug:
-                    break
+        if self.cmd_birth in self.cmd_list:
+            yield from self.start_birth()

         # Fetch the ethnicity list
-        yield scrapy.Request(self.ethnic_list_url, callback=self.parse_ethnic_list_page)
+        if self.cmd_ethnic in self.cmd_list:
+            yield scrapy.Request(self.ethnic_list_url, callback=self.parse_ethnic_list_page)

         # Fetch the distributors list
-        yield scrapy.Request(self.distributors_list_url, callback=self.parse_distributors_list_page)
+        if self.cmd_dist in self.cmd_list:
+            yield scrapy.Request(self.distributors_list_url, callback=self.parse_distributors_list_page)

         # Fetch the studios list
-        yield scrapy.Request(self.studios_list_url, callback=self.parse_studios_list_page)
+        if self.cmd_stu in self.cmd_list:
+            yield scrapy.Request(self.studios_list_url, callback=self.parse_studios_list_page)

         query_args = {}
         if self.debug:
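A hypothetical invocation of the new command-word interface, assuming the spider's name attribute is 'iafd' (the attribute itself is outside this diff):

    scrapy crawl iafd -a cmd=performers,movies -a debug=0 -a update=1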
@@ -56,23 +63,41 @@ class IAFDSpider(scrapy.Spider):
             query_args['is_full_data'] = 0

         # Read the list of performers pending update
-        actors = db_tools.get_performers(**query_args)
-        if actors:
-            for item in actors:
-                href = item.get('href', '')
-                movies_cnt = item['movies_cnt'] if item['movies_cnt'] else 0
-                logging.info(f"fetch from db. item: {item}")
-                yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt})
+        if self.cmd_performers in self.cmd_list:
+            actors = db_tools.get_performers(**query_args)
+            if actors:
+                for item in actors:
+                    href = item.get('href', '')
+                    movies_cnt = item['movies_cnt'] if item['movies_cnt'] else 0
+                    self.logger.info(f"fetch from db. item: {item}")
+                    yield scrapy.Request(href, callback=self.parse_person_detail_page, meta={'id': item.get('id', 0), 'name': item.get('name', ''), 'movies_cnt': movies_cnt})

         # Read the list of movies pending update
-        movies = db_tools.get_movies(**query_args)
-        if movies:
-            for item in movies:
-                href = item.get('href', '')
-                logging.info(f"fetch from db. item: {item}")
-                yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')})
+        if self.cmd_movies in self.cmd_list:
+            movies = db_tools.get_movies(**query_args)
+            if movies:
+                for item in movies:
+                    href = item.get('href', '')
+                    self.logger.info(f"fetch from db. item: {item}")
+                    yield scrapy.Request(href, callback=self.parse_movie_detail_page, meta={'id': item.get('id', 0), 'title': item.get('title', '')})


+    def start_astro(self):
+        # Fetch performer lists by astrological sign
+        for astro in self.astro_list:
+            url = self.astr_base_url + astro
+            yield scrapy.Request(url, callback=self.parse_astro_page, meta={'astro': astro})
+            if self.debug:
+                break
+
+    def start_birth(self):
+        for month in range(1, 13):
+            for day in range(1, 32):
+                url = self.birth_base_url.format(month=month, day=day)
+                yield scrapy.Request(url, callback=self.parse_birth_page, meta={'month': month, 'day': day})
+                if self.debug:
+                    break
+
     async def start(self):
         # Invoke the original start_requests method
         async for request in super().start():
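The async start() method is cut off by the hunk; under Scrapy >= 2.13, where start() supersedes start_requests(), the body presumably just re-yields each request from the base implementation:

    async def start(self):
        # Invoke the original start_requests method via the base class.
        async for request in super().start():
            yield request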
@@ -27,6 +27,11 @@ class U001Spider(scrapy.Spider):
     allowed_domains = ["u001.25img.com"]
     start_urls = ["https://u001.25img.com/?p=1"]

+    def __init__(self, debug='False', *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
+        self.logger.info(f"debug mode: {self.debug}")
+
     def parse(self, response):
         for row in response.css('table.torrent-list tbody tr'):
             item = U001Item()
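A side note on the new __init__: the `True if ... else False` wrapper is redundant; the same parse reduces to:

    self.debug = str(debug).lower() in ('true', '1')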
@@ -49,4 +54,7 @@ class U001Spider(scrapy.Spider):
         current_page = int(response.url.split('=')[-1])
         total_pages = int(response.css('script:contains("totalPages")').re_first(r'totalPages:\s*(\d+)'))
         if current_page < total_pages:
-            yield response.follow(f"?p={current_page + 1}", self.parse)
+            if self.debug and current_page >= 5:
+                self.logger.info("debug mode. stop crawling.")
+            else:
+                yield response.follow(f"?p={current_page + 1}", self.parse)
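Note that re_first returns None when the pattern is absent, so the int(...) above would raise TypeError if the page stops embedding totalPages; a defensive variant (an assumption, not part of the commit):

    raw = response.css('script:contains("totalPages")').re_first(r'totalPages:\s*(\d+)')
    total_pages = int(raw) if raw else current_page  # stop paginating if the marker vanishes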