modify scripts

2025-07-18 16:54:22 +08:00
parent eeb879a293
commit 8db1a71d04
10 changed files with 306 additions and 30 deletions

View File

@@ -10,6 +10,7 @@ SPIDER_NAME_SIS = 'sis'
SPIDER_NAME_U3C3 = 'u3c3'
SPIDER_NAME_IAFD = 'iafd'
SPIDER_NAME_PBOX = 'pbox'
SPIDER_NAME_CLM = 'clm'

ITEM_TYPE_LIST = 'list'
ITEM_TYPE_STUDIO = 'studio'
@@ -17,3 +18,6 @@ ITEM_TYPE_MOVIE_INDEX = 'movie_index'
ITEM_TYPE_ACTOR_INDEX = 'actor_index'
ITEM_TYPE_MOVIE_DETAIL = 'movie_detail'
ITEM_TYPE_ACTOR_DETAIL = 'actor_detail'
ITEM_TYPE_CLM_KEYWORDS = 'keywords'
ITEM_TYPE_CLM_INDEX = 'index'

View File

@@ -71,6 +71,45 @@ class U3C3DBHandler(SQLiteDBHandler):
        self.conn.commit()

@register_handler(comm.SPIDER_NAME_CLM)
class ClmDBHandler(SQLiteDBHandler):
    def __init__(self, db_path=default_dbpath):
        super().__init__(db_path)
        self.tbl_name_clm_index = 'clm_index'
        self.tbl_name_clm_keywords = 'clm_keywords'

    def insert_item(self, item):
        if item['item_type'] == comm.ITEM_TYPE_CLM_INDEX:
            self.insert_or_update_common(item, self.tbl_name_clm_index, uniq_key='href', exists_do_nothing=True)
        elif item['item_type'] == comm.ITEM_TYPE_CLM_KEYWORDS:
            self.insert_or_update_common(item, self.tbl_name_clm_keywords, uniq_key=None, exists_do_nothing=True)
        else:
            logging.error(f"unknown item type: {item['item_type']}")
        return item

    def _create_tables(self):
        # Create the clm_index table
        self.cursor.execute(f'''
            CREATE TABLE clm_index (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                category TEXT,
                title TEXT,
                href TEXT UNIQUE,
                magnet_href TEXT,
                size_text TEXT,
                size_gb REAL,
                heat INTEGER default 0,
                add_date TEXT,
                last_down_date TEXT,
                created_at TEXT DEFAULT (datetime('now', 'localtime')),
                updated_at TEXT DEFAULT (datetime('now', 'localtime'))
            );
        ''')
        self.conn.commit()

@register_handler(comm.SPIDER_NAME_IAFD)
class IAFDDBHandler(SQLiteDBHandler):
    def __init__(self, db_path=shared_db_path):

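Note that insert_item also writes to clm_keywords, but only the clm_index DDL appears in this hunk. A minimal sketch of a companion statement inside _create_tables, assuming the keyword items map straight onto columns (the real schema is not part of this commit, so the column names here are illustrative):

        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS clm_keywords (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                item_type TEXT,
                words TEXT,
                created_at TEXT DEFAULT (datetime('now', 'localtime'))
            );
        ''')
        self.conn.commit()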
View File

@@ -80,6 +80,20 @@ class SQLiteDBHandler:
        values = list(processed_data.values())
        placeholders = ', '.join(['?' for _ in values])

        # No unique key: insert directly
        if uniq_key is None:
            sql = f'''
                INSERT INTO {tbl_name} ({columns})
                VALUES ({placeholders})
            '''
            self.cursor.execute(sql, values)
            self.conn.commit()
            # Fetch the ID of the row just inserted
            self.cursor.execute("SELECT last_insert_rowid()")
            record_id = self.cursor.fetchone()[0]
            return record_id

        # Unique key present: handle the conflict
        if exists_do_nothing:
            conflict_clause = f'ON CONFLICT ({uniq_key}) DO NOTHING'
        else:
@@ -112,7 +126,20 @@ class SQLiteDBHandler:
        values = list(processed_data.values())
        placeholders = ', '.join(['?' for _ in values])

        # No unique key: insert directly
        if uniq_key is None:
            sql = f'''
                INSERT INTO {tbl_name} ({columns})
                VALUES ({placeholders})
            '''
            self.cursor.execute(sql, values)
            self.conn.commit()
            # Fetch the ID of the row just inserted
            self.cursor.execute("SELECT last_insert_rowid()")
            record_id = self.cursor.fetchone()[0]
            return record_id

        # Unique key present: handle the conflict
        try:
            sql = f'''
                INSERT INTO {tbl_name} ({columns})
@@ -120,7 +147,7 @@ class SQLiteDBHandler:
            '''
            self.cursor.execute(sql, values)
            self.conn.commit()
        except sqlite3.IntegrityError:  # unique-key conflict
            if not exists_do_nothing:
                update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key])
                update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key]

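A quick illustration of the two call modes this change enables (the arguments mirror ClmDBHandler.insert_item above; the handler and row variables are placeholders):

    # Keyed insert: rows with an existing href are skipped (ON CONFLICT ... DO NOTHING)
    handler.insert_or_update_common(index_item, 'clm_index', uniq_key='href', exists_do_nothing=True)
    # Keyless insert: always appends a new row and returns last_insert_rowid()
    new_id = handler.insert_or_update_common(keyword_item, 'clm_keywords', uniq_key=None)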
View File

@@ -4,11 +4,12 @@ from scrapy.exceptions import NotConfigured
import time

class FailureMonitorExtension:
    def __init__(self, crawler, max_consecutive_failures, failure_rate_threshold, time_window, min_requests):
        self.crawler = crawler
        self.max_consecutive_failures = max_consecutive_failures
        self.failure_rate_threshold = failure_rate_threshold
        self.time_window = time_window  # seconds
        self.min_requests = min_requests

        self.consecutive_failures = 0
        self.total_requests = 0
@@ -21,16 +22,17 @@ class FailureMonitorExtension:
        max_consecutive = crawler.settings.getint('EXT_FAIL_MONI_MAX_CONSECUTIVE_FAILURES', 100)
        failure_rate = crawler.settings.getfloat('EXT_FAIL_MONI_RATE_THRESHOLD', 0.5)
        time_window = crawler.settings.getint('EXT_FAIL_MONI_FAILURE_TIME_WINDOW', 60)
        min_requests = crawler.settings.getint('EXT_FAIL_MONI_MIN_REQUESTS', 10)

        if max_consecutive <= 0 and failure_rate <= 0:
            raise NotConfigured

        ext = cls(crawler, max_consecutive, failure_rate, time_window, min_requests)

        # Register the signal handlers
        crawler.signals.connect(ext.request_succeeded, signal=signals.response_received)
        crawler.signals.connect(ext.request_dropped, signal=signals.request_dropped)
        crawler.signals.connect(ext.spider_error, signal=signals.spider_error)

        return ext
@@ -40,7 +42,23 @@ class FailureMonitorExtension:
        self.request_times.append(time.time())
        self._cleanup_old_requests()  # Drop requests that fall outside the time window

    '''Sent when a Request, scheduled by the engine to be downloaded later, is rejected by the scheduler.'''
    def request_dropped(self, request, spider):
        spider.logger.warning(f"request_dropped on url {request.url}")
        self.calculate_failure(spider)

    '''
    Sent when a spider callback generates an error (i.e. raises an exception).
    https://docs.scrapy.org/en/latest/topics/signals.html#request-failed
    '''
    def spider_error(self, failure, response, spider):
        # Ignore failures caused by redirect responses (core filtering logic)
        if response.status in [301, 302, 307, 308]:
            spider.logger.info(f"Ignoring 302 redirect: {response.url}")
            return  # Do not count this as a failure
        self.calculate_failure(spider)

    def calculate_failure(self, spider):
        self.consecutive_failures += 1
        self.failed_requests += 1
        self.total_requests += 1
@@ -53,7 +71,7 @@ class FailureMonitorExtension:
            self.crawler.engine.close_spider(spider, 'consecutive_failures_exceeded')

        # Check the failure rate
        if self.total_requests >= self.min_requests and self.failure_rate_threshold > 0:
            current_failure_rate = self.failed_requests / self.total_requests
            if current_failure_rate >= self.failure_rate_threshold:
                spider.logger.error(f"Failure rate exceeded the threshold ({current_failure_rate:.2%} >= {self.failure_rate_threshold:.2%}); stopping the spider")

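The new min_requests guard changes when the failure-rate check can fire. A standalone sketch of the decision (default values mirror the settings below; the return labels are illustrative, only 'consecutive_failures_exceeded' appears in the diff):

    def should_close(consecutive, failed, total,
                     max_consecutive=100, rate_threshold=0.6, min_requests=10):
        # Too many consecutive failures always closes the spider
        if max_consecutive > 0 and consecutive >= max_consecutive:
            return 'consecutive_failures_exceeded'
        # The rate check only applies once enough requests have been seen in the window
        if total >= min_requests and rate_threshold > 0 and failed / total >= rate_threshold:
            return 'failure_rate_exceeded'
        return None

    should_close(consecutive=3, failed=3, total=3)    # None: 3 < min_requests, an early burst no longer stops the crawl
    should_close(consecutive=3, failed=9, total=12)   # 'failure_rate_exceeded': 9/12 = 75% >= 60%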
View File

@@ -21,11 +21,12 @@ class StatsExtension:
    def from_crawler(cls, crawler):
        interval = crawler.settings.getint('STATS_EXPORT_INTERVAL', 600)
        script_path = crawler.settings.get('STATS_EXPORT_SCRIPT')
        flag_send_msg = crawler.settings.getbool('STATS_PUSH_MSG', True)

        if interval <= 0:
            raise NotConfigured

        ext = cls(crawler.stats, interval, script_path if flag_send_msg else None)
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

View File

@@ -135,3 +135,18 @@ class PBoxMovItem(scrapy.Item):
    mov_tags_list = scrapy.Field()
    mov_alt_list = scrapy.Field()

class ClmKeyWordsItem(scrapy.Item):
    item_type = scrapy.Field()
    words = scrapy.Field()

class ClmIndexItem(scrapy.Item):
    item_type = scrapy.Field()
    category = scrapy.Field()
    title = scrapy.Field()
    href = scrapy.Field()
    magnet_href = scrapy.Field()
    size_text = scrapy.Field()
    size_gb = scrapy.Field()
    heat = scrapy.Field()
    add_date = scrapy.Field()
    last_down_date = scrapy.Field()

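For orientation, the path these items take through the pieces touched in this commit (a summary of the hunks above, not new code):

    # ClmSpider yields ClmKeyWordsItem / ClmIndexItem
    #   -> SQLitePipeline.process_item() looks up db_handlers['clm']
    #   -> ClmDBHandler.insert_item() routes ITEM_TYPE_CLM_INDEX to clm_index (uniq_key='href')
    #      and ITEM_TYPE_CLM_KEYWORDS to clm_keywords (uniq_key=None)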
View File

@@ -36,12 +36,12 @@ class SQLitePipeline():
        spider_name = spider.name.lower()
        handler = self.db_handlers.get(spider_name)
        if not handler:
            raise ValueError(f"No database handler found for spider {spider_name}")

        # Convert to single-line JSON
        #item_json = json.dumps(dict(item), ensure_ascii=False, separators=(',', ':'))
        #spider.logger.debug(f"spider name: {spider_name}, item: {item_json}")

        handler.insert_item(item)

View File

@@ -65,6 +65,7 @@ EXTENSIONS = {
EXT_FAIL_MONI_MAX_CONSECUTIVE_FAILURES = 100  # Exit after 100 consecutive failures
EXT_FAIL_MONI_RATE_THRESHOLD = 0.6  # Exit when the failure rate exceeds 60%
EXT_FAIL_MONI_FAILURE_TIME_WINDOW = 300  # Time window of 300 seconds
EXT_FAIL_MONI_MIN_REQUESTS = 10  # Only check the failure rate once the window holds at least this many requests, so a single failure cannot stop the spider

# Block-detection and retry settings
@@ -76,6 +77,7 @@ BASE_SPIDER_RETRY_DELAY = 5
BASE_SPIDER_CLOSE_ON_MAX_RETRIES = False

# Stats export settings
STATS_PUSH_MSG = True
STATS_EXPORT_INTERVAL = 1800  # Export every 30 minutes
STATS_EXPORT_SCRIPT = '/root/projects/resources/scrapy_proj/scrapy_proj/extensions/push_to_wecom.sh'  # Path to the local shell script

View File

@@ -0,0 +1,184 @@
from datetime import datetime
import scrapy
from urllib.parse import urljoin, quote_plus
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
from scrapy_proj.items import ClmIndexItem, ClmKeyWordsItem
from scrapy_proj.comm.comm_def import SPIDER_NAME_CLM, ITEM_TYPE_CLM_INDEX, ITEM_TYPE_CLM_KEYWORDS
default_keywords = [
    'vixen', 'tushy', 'tushyraw', 'blacked', 'blackedraw', 'deeper',  # Vixen group
    'Cuck4K', 'Daddy4k', 'Loan4k', 'Dyke4K', 'Rim4k', 'Pie4k', 'Ignore4K', 'Daddy4k', 'Stuck4k', 'Tutor4k',  # VIP 4K
    'anal-angels', 'Anal-Beauty', 'Beauty4k', 'creampie-angels', 'Beauty-Angels', 'FirstBGG', 'FuckStudies', 'OhMyHoles', 'X-Angels',  # Teen Mega World
    'BBCPie', 'Tiny4k', 'Cum4K', 'Anal4K', 'Exotic4K', 'Facials4k', 'Holed', 'Lubed', 'Mom4K', 'passion hd',  # Fuck You Cash
    'Naughty Office', 'Naughty Americans', 'Naughty America', 'Naughty Weddings',  # Naughty America (Network)
    'MyFamilyPies', 'StepSiblingsCaught', 'nubilesporn',  # Nubiles Porn (Network)
    'Real Wife Stories', 'brazzers',  # Brazzers
    'teenpies', 'shoplyfter',  # TeamSkeet (Network)
    'BangBus', 'BangBros',  # BangBros
    'nfbusty', 'NubileFilms',  # Nubile Films
    'DDFBusty',  # DDF Network
    'AdultTime', 'BurningAngel',  # Adult Time (Network)
    'AnalVids',  # Anal Vids
    'LegalPorno',
    'Pornworld',  # Pornbox
    'WowGirls',  # Wow (Network)
    'x-art',  # Malibu Media
    'VIPissy',  # VIPissy Cash
    'Japan AV Blu-Ray',  # Japan
    'siterip',  # siterip
    'NewMFX',  # Brazil
    'Wicked',  # Wicked
    'Swallowed',  # Sticky Dollars
    'ManyVids',  # ManyVids
    'AnalOverdose',  # PervCity
]
class ClmSpider(BaseSpider):
    name = SPIDER_NAME_CLM
    allowed_domains = ["clmclm.com"]
    search_url = 'https://www.clmclm.com/search'

    def __init__(self, debug='False', keywords=None, min_size=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.debug = str(debug).lower() in ('true', '1')
        self.logger.info(f"debug mode: {self.debug}")
        self.keywords = keywords
        self.min_size = float(min_size) if min_size else 1.0

    # Entry point, triggered by the base class
    def custom_start_requests(self):
        list_words = self.keywords.split(',') if self.keywords else default_keywords

        item = ClmKeyWordsItem()
        item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
        item['words'] = self.keywords
        yield item

        for item in list_words:
            encoded_keyword = quote_plus(item.strip())
            # Build the POST form data
            form_data = {
                #'csrf_token': self.csrf_token,
                'search': encoded_keyword
            }
            # Send the search POST request
            yield scrapy.FormRequest(
                url=self.search_url,
                method='POST',
                formdata=form_data,
                #headers=self._get_headers(),
                # Do not follow the redirect automatically; handle the 302 manually
                meta={'dont_redirect': True, 'handle_httpstatus_list': [302]},
                callback=self.handle_redirect
            )

    # Handle the 302 returned by the POST
    def handle_redirect(self, response):
        """Handle the 302 redirect: read the Location header and request the result page."""
        # Get the redirect target from the response headers
        location = response.headers.get('Location', None)
        if not location:
            self.logger.error("No Location header found in the 302 response")
            return

        # Convert the bytes value to a string
        result_url = location.decode('utf-8')
        self.logger.info(f"Redirecting to result page: {result_url}")

        # Request the redirected result page and reuse the common parser
        yield scrapy.Request(
            url=result_url,
            #headers=self._get_headers(),
            callback=self.parse_page_common
        )
    def parse_page_common(self, response):
        need_next = False
        # Extract all ssbox nodes (each ssbox is one record)
        ssboxes = response.xpath('//div[@class="ssbox"]')
        for ssbox in ssboxes:
            # 1. Extract the link and text from the h3
            h3_span = ssbox.xpath('.//div[@class="title"]/h3/span')
            category = (h3_span.xpath('text()').get() or '').strip() if h3_span else ''

            # The a tag under the h3 (title link)
            h3_a = ssbox.xpath('.//div[@class="title"]/h3/a')
            # Title text (e.g. "Vixen.2025.05")
            title_text = (h3_a.xpath('text()').get() or '').strip() if h3_a else None
            # Title link (e.g. "/hash/34c71bf8ddff9c797dab7ee1af83894fee13ac67.html")
            title_href = h3_a.xpath('@href').get() if h3_a else None
            # If the link is relative, join it into a full URL against the site domain
            full_title_href = response.urljoin(title_href) if title_href else None

            # 2. Extract the file name from slist (optional, keep if needed)
            # File name (e.g. "vixen.25.05.09....mp4")
            file_name = ssbox.xpath('.//div[@class="slist"]/ul/li/text()').get()
            # Strip the size text after the file name (e.g. "8.3 GB"), keep only the name
            if file_name:
                file_name = file_name.split('&nbsp;')[0].strip()  # Split and keep the file-name part

            # 3. Extract the info in sbar
            sbar = ssbox.xpath('.//div[@class="sbar"]')
            # Magnet link: href of the a tag inside sbar
            magnet_href = sbar.xpath('.//a/@href').get() if sbar else None
            # Added date (e.g. "2025-06-13")
            add_time = sbar.xpath('.//span[contains(text(), "添加时间:")]/b/text()').get() if sbar else None
            # Size (e.g. "39.5 GB")
            size = sbar.xpath('.//span[contains(text(), "大小:")]/b/text()').get() if sbar else None
            # Heat (e.g. "435")
            heat = sbar.xpath('.//span[contains(text(), "热度:")]/b/text()').get() if sbar else None
            # Last download time (optional)
            last_download = sbar.xpath('.//span[contains(text(), "最近下载:")]/b/text()').get() if sbar else None

            size_gb = parse_size(size)
            if size_gb < self.min_size:
                continue

            item = ClmIndexItem()
            item['item_type'] = ITEM_TYPE_CLM_INDEX
            item['category'] = category
            item['title'] = title_text
            item['href'] = full_title_href
            item['magnet_href'] = magnet_href
            item['size_text'] = size
            item['size_gb'] = size_gb
            item['heat'] = int(heat) if heat else 0
            item['add_date'] = add_time
            item['last_down_date'] = last_download
            yield item

        if self.debug:
            return

        # Parse the next-page link
        pager = response.xpath('//div[@class="pager"]')
        if pager:
            total_text = pager.xpath('.//span[contains(text(), "")]/text()').get() or ''
            # Locate the "下一页" (next page) anchor by its text, to avoid matching other anchors
            next_page_a = pager.xpath('.//a[contains(text(), "下一页")]').get()
            # Extract its href attribute
            next_page_href = pager.xpath('.//a[contains(text(), "下一页")]/@href').get()

            # Check whether there is another page
            if next_page_href and next_page_href != '#':
                # Build the full URL (relative to absolute)
                next_page_url = response.urljoin(next_page_href)
                self.logger.info(f'{total_text}, found next page: {next_page_url}')
                # Recursively request the next page
                yield scrapy.Request(
                    url=next_page_url,
                    callback=self.parse_page_common,
                    dont_filter=True  # Allow duplicate requests so URL parameters are not filtered out
                )
            else:
                # When the href is '#' or missing, there are no more pages
                self.logger.info(f'All pages fetched, stopping pagination. {total_text}')

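One way to run the new spider with its arguments, as a sketch (the module path scrapy_proj.spiders.clm_spider is assumed and not shown in the diff; `scrapy crawl clm -a keywords=... -a min_size=...` passes the same arguments):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings
    from scrapy_proj.spiders.clm_spider import ClmSpider  # module path assumed

    process = CrawlerProcess(get_project_settings())
    # keywords overrides default_keywords; min_size drops results smaller than 2 GB
    process.crawl(ClmSpider, keywords='vixen,tushy', min_size='2.0', debug='false')
    process.start()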
View File

@@ -24,6 +24,8 @@ def parse_size(size_text):
            return round(value / 1024 / 1024, 2)
        elif unit.lower() == 'gb' or unit.lower() == 'g':
            return round(value, 2)
        elif unit.lower() == 'tb' or unit.lower() == 't':
            return round(value * 1024, 2)
        else:
            return 0.0
    except Exception:
@@ -42,22 +44,6 @@ def parse_size_format(size_text: str):
        # Parse the size
        return parse_size( parts[0].strip() ), format_part
    except Exception as e:
        return 0.0, "unknown format"
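Illustrative results for the new TB branch, based on the code above (GB is the base unit parse_size returns):

    parse_size('39.5 GB')   # -> 39.5
    parse_size('1.2 TB')    # -> 1228.8  (1.2 * 1024)
    parse_size('no size')   # -> 0.0     (unrecognised unit or format)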