From 8db1a71d04a5f9bba92a58fbd92f320c74a9a3b9 Mon Sep 17 00:00:00 2001
From: sophon
Date: Fri, 18 Jul 2025 16:54:22 +0800
Subject: [PATCH] Add clm spider and supporting DB/extension changes

---
 scrapy_proj/scrapy_proj/comm/comm_def.py              |   6 +-
 .../db_wapper/spider_db_handler.py                    |  39 ++++
 .../scrapy_proj/db_wapper/sqlite_base.py              |  31 ++-
 .../scrapy_proj/extensions/failure_monitor.py         |  30 ++-
 .../scrapy_proj/extensions/stats_extension.py         |   3 +-
 scrapy_proj/scrapy_proj/items.py                      |  17 +-
 scrapy_proj/scrapy_proj/pipelines.py                  |   6 +-
 scrapy_proj/scrapy_proj/settings.py                   |   2 +
 scrapy_proj/scrapy_proj/spiders/clm.py                | 184 ++++++++++++++++++
 scrapy_proj/scrapy_proj/utils/utils.py                |  18 +-
 10 files changed, 306 insertions(+), 30 deletions(-)
 create mode 100644 scrapy_proj/scrapy_proj/spiders/clm.py

diff --git a/scrapy_proj/scrapy_proj/comm/comm_def.py b/scrapy_proj/scrapy_proj/comm/comm_def.py
index b0cf360..bed2398 100644
--- a/scrapy_proj/scrapy_proj/comm/comm_def.py
+++ b/scrapy_proj/scrapy_proj/comm/comm_def.py
@@ -10,10 +10,14 @@ SPIDER_NAME_SIS = 'sis'
 SPIDER_NAME_U3C3 = 'u3c3'
 SPIDER_NAME_IAFD = 'iafd'
 SPIDER_NAME_PBOX = 'pbox'
+SPIDER_NAME_CLM = 'clm'
 
 ITEM_TYPE_LIST = 'list'
 ITEM_TYPE_STUDIO = 'studio'
 ITEM_TYPE_MOVIE_INDEX = 'movie_index'
 ITEM_TYPE_ACTOR_INDEX = 'actor_index'
 ITEM_TYPE_MOVIE_DETAIL = 'movie_detail'
-ITEM_TYPE_ACTOR_DETAIL = 'actor_detail'
\ No newline at end of file
+ITEM_TYPE_ACTOR_DETAIL = 'actor_detail'
+
+ITEM_TYPE_CLM_KEYWORDS = 'keywords'
+ITEM_TYPE_CLM_INDEX = 'index'
\ No newline at end of file
diff --git a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
index 87e779b..ec22c71 100644
--- a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
+++ b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
@@ -71,6 +71,45 @@ class U3C3DBHandler(SQLiteDBHandler):
         self.conn.commit()
 
 
+@register_handler(comm.SPIDER_NAME_CLM)
+class ClmDBHandler(SQLiteDBHandler):
+    def __init__(self, db_path=default_dbpath):
+        super().__init__(db_path)
+        self.tbl_name_clm_index = 'clm_index'
+        self.tbl_name_clm_keywords = 'clm_keywords'
+
+    def insert_item(self, item):
+        if item['item_type'] == comm.ITEM_TYPE_CLM_INDEX:
+            self.insert_or_update_common(item, self.tbl_name_clm_index, uniq_key='href', exists_do_nothing=True)
+        elif item['item_type'] == comm.ITEM_TYPE_CLM_KEYWORDS:
+            self.insert_or_update_common(item, self.tbl_name_clm_keywords, uniq_key=None, exists_do_nothing=True)
+        else:
+            logging.error(f"unknown item type: {item['item_type']}")
+
+        return item
+
+
+    def _create_tables(self):
+        # Create the clm_index table
+        self.cursor.execute('''
+            CREATE TABLE clm_index (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                category TEXT,
+                title TEXT,
+                href TEXT UNIQUE,
+                magnet_href TEXT,
+                size_text TEXT,
+                size_gb REAL,
+                heat INTEGER default 0,
+                add_date TEXT,
+                last_down_date TEXT,
+                created_at TEXT DEFAULT (datetime('now', 'localtime')),
+                updated_at TEXT DEFAULT (datetime('now', 'localtime'))
+            );
+        ''')
+        self.conn.commit()
+
+
 @register_handler(comm.SPIDER_NAME_IAFD)
 class IAFDDBHandler(SQLiteDBHandler):
     def __init__(self, db_path=shared_db_path):
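Note that insert_item() above routes ITEM_TYPE_CLM_KEYWORDS items into clm_keywords, while _create_tables() only creates clm_index, so keyword inserts will fail unless that table already exists elsewhere. A minimal sketch of the missing table, assuming its columns follow ClmKeyWordsItem (the helper name _create_keywords_table is hypothetical, not part of the patch):

    # Hypothetical companion to _create_tables(); column names assumed from ClmKeyWordsItem.
    def _create_keywords_table(self):
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS clm_keywords (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                words TEXT,
                created_at TEXT DEFAULT (datetime('now', 'localtime'))
            );
        ''')
        self.conn.commit()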
diff --git a/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py b/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py
index 8b69417..fc5ce13 100644
--- a/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py
+++ b/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py
@@ -80,6 +80,20 @@ class SQLiteDBHandler:
         values = list(processed_data.values())
         placeholders = ', '.join(['?' for _ in values])
 
+        # No unique key: do a plain insert
+        if uniq_key is None:
+            sql = f'''
+                INSERT INTO {tbl_name} ({columns})
+                VALUES ({placeholders})
+            '''
+            self.cursor.execute(sql, values)
+            self.conn.commit()
+            # Fetch the id of the inserted row
+            self.cursor.execute("SELECT last_insert_rowid()")
+            record_id = self.cursor.fetchone()[0]
+            return record_id
+
+        # Unique key present: resolve conflicts via ON CONFLICT
         if exists_do_nothing:
             conflict_clause = f'ON CONFLICT ({uniq_key}) DO NOTHING'
         else:
@@ -112,7 +126,20 @@ class SQLiteDBHandler:
         values = list(processed_data.values())
         placeholders = ', '.join(['?' for _ in values])
 
-        # 先尝试插入数据
+        # No unique key: do a plain insert
+        if uniq_key is None:
+            sql = f'''
+                INSERT INTO {tbl_name} ({columns})
+                VALUES ({placeholders})
+            '''
+            self.cursor.execute(sql, values)
+            self.conn.commit()
+            # Fetch the id of the inserted row
+            self.cursor.execute("SELECT last_insert_rowid()")
+            record_id = self.cursor.fetchone()[0]
+            return record_id
+
+        # Unique key present: try the insert and handle conflicts
         try:
             sql = f'''
                 INSERT INTO {tbl_name} ({columns})
@@ -120,7 +147,7 @@
             '''
             self.cursor.execute(sql, values)
             self.conn.commit()
-        except sqlite3.IntegrityError:  # 唯一键冲突,执行更新操作
+        except sqlite3.IntegrityError:  # unique key conflict
             if not exists_do_nothing:
                 update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key])
                 update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key]
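For reference, a standalone sketch of what the new uniq_key-is-None branch does, next to the ON CONFLICT path used when a unique key is given (plain sqlite3; table and column names are illustrative only):

    import sqlite3

    conn = sqlite3.connect(':memory:')
    cur = conn.cursor()
    cur.execute('CREATE TABLE clm_keywords (id INTEGER PRIMARY KEY AUTOINCREMENT, words TEXT)')

    # uniq_key=None: plain INSERT, then read back the generated id
    cur.execute('INSERT INTO clm_keywords (words) VALUES (?)', ('vixen,tushy',))
    conn.commit()
    cur.execute('SELECT last_insert_rowid()')
    record_id = cur.fetchone()[0]  # same value as cur.lastrowid

    # uniq_key='href' with exists_do_nothing=True would instead emit roughly:
    #   INSERT INTO clm_index (...) VALUES (...) ON CONFLICT (href) DO NOTHING
    conn.close()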
{request.url}") + self.calculate_failure(spider) + + ''' + Sent when a spider callback generates an error (i.e. raises an exception). + https://docs.scrapy.org/en/latest/topics/signals.html#request-failed + ''' + def spider_error(self, failure, response, spider): + # 忽略302重定向导致的失败(核心过滤逻辑) + if response.status in [301, 302, 307, 308]: + spider.logger.info(f"忽略302重定向:{response.url}") + return # 直接返回,不处理该“失败” + self.calculate_failure(spider) + + def calculate_failure(self, spider): self.consecutive_failures += 1 self.failed_requests += 1 self.total_requests += 1 @@ -53,7 +71,7 @@ class FailureMonitorExtension: self.crawler.engine.close_spider(spider, 'consecutive_failures_exceeded') # 检查失败率 - if self.total_requests > 0 and self.failure_rate_threshold > 0: + if self.total_requests >= self.min_requests and self.failure_rate_threshold > 0: current_failure_rate = self.failed_requests / self.total_requests if current_failure_rate >= self.failure_rate_threshold: spider.logger.error(f"失败率超过阈值 ({current_failure_rate:.2%} > {self.failure_rate_threshold:.2%}),停止爬虫") diff --git a/scrapy_proj/scrapy_proj/extensions/stats_extension.py b/scrapy_proj/scrapy_proj/extensions/stats_extension.py index b5f0f31..5eb78e5 100644 --- a/scrapy_proj/scrapy_proj/extensions/stats_extension.py +++ b/scrapy_proj/scrapy_proj/extensions/stats_extension.py @@ -21,11 +21,12 @@ class StatsExtension: def from_crawler(cls, crawler): interval = crawler.settings.getint('STATS_EXPORT_INTERVAL', 600) script_path = crawler.settings.get('STATS_EXPORT_SCRIPT') + flag_send_msg = crawler.settings.getbool('STATS_PUSH_MSG', True) if interval <= 0: raise NotConfigured - ext = cls(crawler.stats, interval, script_path) + ext = cls(crawler.stats, interval, script_path if flag_send_msg else None) crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened) crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed) return ext diff --git a/scrapy_proj/scrapy_proj/items.py b/scrapy_proj/scrapy_proj/items.py index 3c03cf8..d722366 100644 --- a/scrapy_proj/scrapy_proj/items.py +++ b/scrapy_proj/scrapy_proj/items.py @@ -134,4 +134,19 @@ class PBoxMovItem(scrapy.Item): actor_index_list = scrapy.Field() mov_tags_list = scrapy.Field() mov_alt_list = scrapy.Field() - \ No newline at end of file + +class ClmKeyWordsItem(scrapy.Item): + item_type = scrapy.Field() + words = scrapy.Field() + +class ClmIndexItem(scrapy.Item): + item_type = scrapy.Field() + category = scrapy.Field() + title = scrapy.Field() + href = scrapy.Field() + magnet_href = scrapy.Field() + size_text = scrapy.Field() + size_gb = scrapy.Field() + heat = scrapy.Field() + add_date = scrapy.Field() + last_down_date = scrapy.Field() \ No newline at end of file diff --git a/scrapy_proj/scrapy_proj/pipelines.py b/scrapy_proj/scrapy_proj/pipelines.py index a89029f..93d0f3c 100644 --- a/scrapy_proj/scrapy_proj/pipelines.py +++ b/scrapy_proj/scrapy_proj/pipelines.py @@ -36,12 +36,12 @@ class SQLitePipeline(): spider_name = spider.name.lower() handler = self.db_handlers.get(spider_name) - if not handler: - raise ValueError(f"未找到 Spider {spider_name} 的数据库处理器") - # 转换为单行JSON格式 #item_json = json.dumps(dict(item), ensure_ascii=False, separators=(',', ':')) #spider.logger.debug(f"spider name: {spider_name}, item: {item_json}") + if not handler: + raise ValueError(f"未找到 Spider {spider_name} 的数据库处理器") + handler.insert_item(item) diff --git a/scrapy_proj/scrapy_proj/settings.py b/scrapy_proj/scrapy_proj/settings.py index 0af6dd9..e557d65 100644 --- 
diff --git a/scrapy_proj/scrapy_proj/settings.py b/scrapy_proj/scrapy_proj/settings.py
index 0af6dd9..e557d65 100644
--- a/scrapy_proj/scrapy_proj/settings.py
+++ b/scrapy_proj/scrapy_proj/settings.py
@@ -65,6 +65,7 @@ EXTENSIONS = {
 EXT_FAIL_MONI_MAX_CONSECUTIVE_FAILURES = 100  # 连续10次失败后退出
 EXT_FAIL_MONI_RATE_THRESHOLD = 0.6  # 失败率超过30%时退出
 EXT_FAIL_MONI_FAILURE_TIME_WINDOW = 300  # 时间窗口为300秒
+EXT_FAIL_MONI_MIN_REQUESTS = 10  # minimum requests in the window before the failure rate is checked, so a single failure cannot stop the spider
 
 
 # 配置拦截检测和重试参数
@@ -76,6 +77,7 @@
 BASE_SPIDER_RETRY_DELAY = 5
 BASE_SPIDER_CLOSE_ON_MAX_RETRIES = False
 
 # 配置统计导出参数
+STATS_PUSH_MSG = True
 STATS_EXPORT_INTERVAL = 1800  # 每10分钟导出一次
 STATS_EXPORT_SCRIPT = '/root/projects/resources/scrapy_proj/scrapy_proj/extensions/push_to_wecom.sh'  # 本地shell脚本路径
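The added STATS_PUSH_MSG and EXT_FAIL_MONI_MIN_REQUESTS settings can also be overridden per run; a hedged sketch using Scrapy's standard CrawlerProcess API (the override values are arbitrary, and the 'clm' spider arguments refer to the new spider added below):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    settings.set('EXT_FAIL_MONI_MIN_REQUESTS', 20)   # require a larger sample before rate checks
    settings.set('STATS_PUSH_MSG', False)            # export stats without pushing messages

    process = CrawlerProcess(settings)
    process.crawl('clm', keywords='vixen,tushy', min_size='2.0', debug='true')
    process.start()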
diff --git a/scrapy_proj/scrapy_proj/spiders/clm.py b/scrapy_proj/scrapy_proj/spiders/clm.py
new file mode 100644
index 0000000..543e441
--- /dev/null
+++ b/scrapy_proj/scrapy_proj/spiders/clm.py
@@ -0,0 +1,184 @@
+from datetime import datetime
+import scrapy
+from urllib.parse import urljoin, quote_plus
+from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime
+from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
+from scrapy_proj.items import ClmIndexItem, ClmKeyWordsItem
+from scrapy_proj.comm.comm_def import SPIDER_NAME_CLM, ITEM_TYPE_CLM_INDEX, ITEM_TYPE_CLM_KEYWORDS
+
+default_keywords = [
+    'vixen', 'tushy', 'tushyraw', 'blacked', 'blackedraw', 'deeper',  # vixen group
+    'Cuck4K', 'Daddy4k', 'Loan4k', 'Dyke4K', 'Rim4k', 'Pie4k', 'Ignore4K', 'Daddy4k', 'Stuck4k', 'Tutor4k',  # VIP 4K
+    'anal-angels', 'Anal-Beauty', 'Beauty4k', 'creampie-angels', 'Beauty-Angels', 'FirstBGG', 'FuckStudies', 'OhMyHoles', 'X-Angels',  # Teen Mega World
+    'BBCPie', 'Tiny4k', 'Cum4K', 'Anal4K', 'Exotic4K', 'Facials4k', 'Holed', 'Lubed', 'Mom4K', 'passion hd',  # Fuck You Cash
+    'Naughty Office', 'Naughty Americans', 'Naughty America', 'Naughty Weddings',  # Naughty America (Network)
+    'MyFamilyPies', 'StepSiblingsCaught', 'nubilesporn',  # Nubiles Porn (Network)
+    'Real Wife Stories', 'brazzers',  # Brazzers
+    'teenpies', 'shoplyfter',  # TeamSkeet (Network)
+    'BangBus', 'BangBros',  # BangBros
+    'nfbusty', 'NubileFilms',  # Nubile Films
+    'DDFBusty',  # DDF Network
+    'AdultTime', 'BurningAngel',  # Adult Time (Network)
+    'AnalVids',  # Anal Vids
+    'LegalPorno',
+    'Pornworld',  # Pornbox
+    'WowGirls',  # Wow (Network)
+    'x-art',  # Malibu Media
+    'VIPissy',  # VIPissy Cash
+    'Japan AV Blu-Ray',  # japan
+    'siterip',  # siterip
+    'NewMFX',  # Brazil
+    'Wicked',  # Wicked
+    'Swallowed',  # Sticky Dollars
+    'ManyVids',  # ManyVids
+    'AnalOverdose',  # PervCity
+]
+
+class ClmSpider(BaseSpider):
+    name = SPIDER_NAME_CLM
+    allowed_domains = ["clmclm.com"]
+    search_url = 'https://www.clmclm.com/search'
+
+    def __init__(self, debug='False', keywords=None, min_size=None, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.debug = str(debug).lower() in ('true', '1')
+        self.logger.info(f"debug mode: {self.debug}")
+
+        self.keywords = keywords
+        self.min_size = float(min_size) if min_size else 1.0
+
+    # Entry point, triggered by the base class
+    def custom_start_requests(self):
+        list_words = self.keywords.split(',') if self.keywords else default_keywords
+
+        item = ClmKeyWordsItem()
+        item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
+        item['words'] = self.keywords
+        yield item
+
+        for word in list_words:
+            encoded_keyword = quote_plus(word.strip())
+
+            # Build the POST form data
+            form_data = {
+                #'csrf_token': self.csrf_token,
+                'search': encoded_keyword
+            }
+
+            # Send the search POST request
+            yield scrapy.FormRequest(
+                url=self.search_url,
+                method='POST',
+                formdata=form_data,
+                #headers=self._get_headers(),
+                # Do not follow redirects automatically; handle the 302 manually
+                meta={'dont_redirect': True, 'handle_httpstatus_list': [302]},
+                callback=self.handle_redirect
+            )
+
+    # Handle the 302 returned for the POST
+    def handle_redirect(self, response):
+        """Handle the 302 redirect: read the Location header and fetch the result page."""
+        # Read the redirect target from the response headers
+        location = response.headers.get('Location', None)
+        if not location:
+            self.logger.error("no Location header found in the 302 response")
+            return
+
+        # Convert bytes to str
+        result_url = location.decode('utf-8')
+        self.logger.info(f"redirected to result page: {result_url}")
+
+        # Fetch the redirected result page and parse it with the common parser
+        yield scrapy.Request(
+            url=result_url,
+            #headers=self._get_headers(),
+            callback=self.parse_page_common
+        )
+
+
+    def parse_page_common(self, response):
+
+        need_next = False
+        # Extract all ssbox nodes (each ssbox is one record)
+        ssboxes = response.xpath('//div[@class="ssbox"]')
+
+        for ssbox in ssboxes:
+            # 1. Extract the link and text from the h3
+            h3_span = ssbox.xpath('.//div[@class="title"]/h3/span')
+            category = (h3_span.xpath('text()').get() or '').strip() if h3_span else ''
+            # The a tag under h3 (title link)
+            h3_a = ssbox.xpath('.//div[@class="title"]/h3/a')
+            # Title text (e.g. "Vixen.2025.05")
+            title_text = (h3_a.xpath('text()').get() or '').strip() if h3_a else None
+            # Title href (e.g. "/hash/34c71bf8ddff9c797dab7ee1af83894fee13ac67.html")
+            title_href = h3_a.xpath('@href').get() if h3_a else None
+            # Join relative hrefs into a full URL
+            full_title_href = response.urljoin(title_href) if title_href else None
+
+            # 2. Extract the file name from slist (optional, keep as needed)
+            # File name (e.g. "vixen.25.05.09....mp4")
+            file_name = ssbox.xpath('.//div[@class="slist"]/ul/li/text()').get()
+            # Strip the trailing size text (e.g. "8.3 GB") and keep only the file name
+            if file_name:
+                file_name = file_name.split(' ')[0].strip()
+
+            # 3. Extract the info in sbar
+            sbar = ssbox.xpath('.//div[@class="sbar"]')
+            # Magnet link (href of the a tag inside sbar)
+            magnet_href = sbar.xpath('.//a/@href').get() if sbar else None
+            # Added date (e.g. "2025-06-13")
+            add_time = sbar.xpath('.//span[contains(text(), "添加时间:")]/b/text()').get() if sbar else None
+            # Size (e.g. "39.5 GB")
+            size = sbar.xpath('.//span[contains(text(), "大小:")]/b/text()').get() if sbar else None
+            # Heat (e.g. "435")
+            heat = sbar.xpath('.//span[contains(text(), "热度:")]/b/text()').get() if sbar else None
+            # Last download date (optional)
+            last_download = sbar.xpath('.//span[contains(text(), "最近下载:")]/b/text()').get() if sbar else None
+
+            size_gb = parse_size(size)
+            if size_gb < self.min_size:
+                continue
+
+            item = ClmIndexItem()
+            item['item_type'] = ITEM_TYPE_CLM_INDEX
+            item['category'] = category
+            item['title'] = title_text
+            item['href'] = full_title_href
+            item['magnet_href'] = magnet_href
+            item['size_text'] = size
+            item['size_gb'] = size_gb
+            item['heat'] = int(heat) if heat else 0
+            item['add_date'] = add_time
+            item['last_down_date'] = last_download
+
+            yield item
+
+        if self.debug:
+            return
+
+        # Parse the next-page link
+        pager = response.xpath('//div[@class="pager"]')
+        if pager:
+            total_text = pager.xpath('.//span[contains(text(), "共")]/text()').get(default='')
+
+            # Locate the "下一页" (next page) a tag by its text, to avoid matching other links
+            next_page_a = pager.xpath('.//a[contains(text(), "下一页")]').get()
+            # Extract its href attribute
+            next_page_href = pager.xpath('.//a[contains(text(), "下一页")]/@href').get()
+
+            # Check whether there is a next page
+            if next_page_href and next_page_href != '#':
+                # Join into an absolute URL
+                next_page_url = response.urljoin(next_page_href)
+                self.logger.info(f'{total_text}, found next page: {next_page_url}')
+                # Request the next page
+                yield scrapy.Request(
+                    url=next_page_url,
+                    callback=self.parse_page_common,
+                    dont_filter=True  # allow duplicate requests (avoid the dupe filter dropping pagination URLs)
+                )
+            else:
+                # href is '#' or missing: no more pages
+                self.logger.info(f'all pages fetched, stop paging. {total_text}')
+
\ No newline at end of file
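A hedged sketch of how parse_page_common() can be exercised offline with a fake response; the HTML is trimmed to the fields the XPath expressions above expect, and a stub object stands in for a fully initialised spider so nothing is assumed about BaseSpider's constructor (requires the scrapy_proj package to be importable):

    import logging
    import types

    from scrapy.http import HtmlResponse

    from scrapy_proj.spiders.clm import ClmSpider

    html = '''
    <div class="ssbox">
      <div class="title"><h3><span>[影视] </span><a href="/hash/abc.html">Vixen.2025.05</a></h3></div>
      <div class="sbar">
        <a href="magnet:?xt=urn:btih:abc">magnet</a>
        <span>添加时间:<b>2025-06-13</b></span>
        <span>大小:<b>39.5 GB</b></span>
        <span>热度:<b>435</b></span>
      </div>
    </div>
    '''

    # Stub with just the attributes parse_page_common() touches.
    stub = types.SimpleNamespace(debug=True, min_size=1.0, logger=logging.getLogger('clm-test'))
    response = HtmlResponse(url='https://www.clmclm.com/search-result.html', body=html, encoding='utf-8')

    items = list(ClmSpider.parse_page_common(stub, response))
    assert items[0]['size_gb'] == 39.5 and items[0]['heat'] == 435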
diff --git a/scrapy_proj/scrapy_proj/utils/utils.py b/scrapy_proj/scrapy_proj/utils/utils.py
index 349f40a..07c6003 100644
--- a/scrapy_proj/scrapy_proj/utils/utils.py
+++ b/scrapy_proj/scrapy_proj/utils/utils.py
@@ -24,6 +24,8 @@ def parse_size(size_text):
             return round(value / 1024 / 1024, 2)
         elif unit.lower() == 'gb' or unit.lower() == 'g':
             return round(value, 2)
+        elif unit.lower() == 'tb' or unit.lower() == 't':
+            return round(value * 1024, 2)
         else:
             return 0.0
     except Exception:
@@ -41,22 +43,6 @@ def parse_size_format(size_text: str):
 
         # 解析大小
         return parse_size( parts[0].strip() ), format_part
-
-        size_part = parts[0].strip()
-        match = re.search(r'(\d+\.\d+|\d+)\s*([A-Za-z]+)', size_part)
-
-        if not match:
-            return 0.0, format_part
-
-        value, unit = match.groups()
-        value = float(value)
-
-        if unit.lower() == 'mb' or unit.lower() == 'm':
-            return round(value / 1024, 2), format_part
-        elif unit.lower() == 'gb' or unit.lower() == 'g':
-            return round(value, 2), format_part
-        else:
-            return 0.0, format_part
 
     except Exception as e:
         return 0.0, "未知格式"
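A quick sanity check of the TB branch added to parse_size() above; only behaviour visible in this diff is asserted, and the short unit form is assumed to be parsed by the same regex as the other units:

    from scrapy_proj.utils.utils import parse_size

    assert parse_size('39.5 GB') == 39.5
    assert parse_size('1.5 TB') == 1536.0   # new branch: 1.5 * 1024, rounded to 2 decimals
    assert parse_size('2 T') == 2048.0      # short unit form handled by the same branch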