modify scripts

2025-07-18 16:54:22 +08:00
parent eeb879a293
commit 8db1a71d04
10 changed files with 306 additions and 30 deletions

View File

@@ -10,6 +10,7 @@ SPIDER_NAME_SIS = 'sis'
SPIDER_NAME_U3C3 = 'u3c3'
SPIDER_NAME_IAFD = 'iafd'
SPIDER_NAME_PBOX = 'pbox'
SPIDER_NAME_CLM = 'clm'

ITEM_TYPE_LIST = 'list'
ITEM_TYPE_STUDIO = 'studio'
@@ -17,3 +18,6 @@ ITEM_TYPE_MOVIE_INDEX = 'movie_index'
ITEM_TYPE_ACTOR_INDEX = 'actor_index'
ITEM_TYPE_MOVIE_DETAIL = 'movie_detail'
ITEM_TYPE_ACTOR_DETAIL = 'actor_detail'
ITEM_TYPE_CLM_KEYWORDS = 'keywords'
ITEM_TYPE_CLM_INDEX = 'index'

View File

@@ -71,6 +71,45 @@ class U3C3DBHandler(SQLiteDBHandler):
        self.conn.commit()

@register_handler(comm.SPIDER_NAME_CLM)
class ClmDBHandler(SQLiteDBHandler):
    def __init__(self, db_path=default_dbpath):
        super().__init__(db_path)
        self.tbl_name_clm_index = 'clm_index'
        self.tbl_name_clm_keywords = 'clm_keywords'

    def insert_item(self, item):
        if item['item_type'] == comm.ITEM_TYPE_CLM_INDEX:
            self.insert_or_update_common(item, self.tbl_name_clm_index, uniq_key='href', exists_do_nothing=True)
        elif item['item_type'] == comm.ITEM_TYPE_CLM_KEYWORDS:
            self.insert_or_update_common(item, self.tbl_name_clm_keywords, uniq_key=None, exists_do_nothing=True)
        else:
            logging.error(f"unknown item type: {item['item_type']}")
        return item

    def _create_tables(self):
        # Create the clm_index table
        self.cursor.execute(f'''
            CREATE TABLE clm_index (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                category TEXT,
                title TEXT,
                href TEXT UNIQUE,
                magnet_href TEXT,
                size_text TEXT,
                size_gb REAL,
                heat INTEGER default 0,
                add_date TEXT,
                last_down_date TEXT,
                created_at TEXT DEFAULT (datetime('now', 'localtime')),
                updated_at TEXT DEFAULT (datetime('now', 'localtime'))
            );
        ''')
        self.conn.commit()

@register_handler(comm.SPIDER_NAME_IAFD)
class IAFDDBHandler(SQLiteDBHandler):
    def __init__(self, db_path=shared_db_path):

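Note that insert_item also writes to clm_keywords, but only the clm_index DDL appears in this hunk. A minimal sketch of a companion statement inside _create_tables, assuming the keyword items map straight onto columns (the real schema is not part of this commit, so the column names here are illustrative):

        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS clm_keywords (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                item_type TEXT,
                words TEXT,
                created_at TEXT DEFAULT (datetime('now', 'localtime'))
            );
        ''')
        self.conn.commit()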
View File

@@ -80,6 +80,20 @@ class SQLiteDBHandler:
        values = list(processed_data.values())
        placeholders = ', '.join(['?' for _ in values])

        # No unique key: insert directly
        if uniq_key is None:
            sql = f'''
                INSERT INTO {tbl_name} ({columns})
                VALUES ({placeholders})
            '''
            self.cursor.execute(sql, values)
            self.conn.commit()
            # Fetch the ID of the row just inserted
            self.cursor.execute("SELECT last_insert_rowid()")
            record_id = self.cursor.fetchone()[0]
            return record_id

        # Unique key present: handle the conflict
        if exists_do_nothing:
            conflict_clause = f'ON CONFLICT ({uniq_key}) DO NOTHING'
        else:
@@ -112,7 +126,20 @@ class SQLiteDBHandler:
        values = list(processed_data.values())
        placeholders = ', '.join(['?' for _ in values])

        # No unique key: insert directly
        if uniq_key is None:
            sql = f'''
                INSERT INTO {tbl_name} ({columns})
                VALUES ({placeholders})
            '''
            self.cursor.execute(sql, values)
            self.conn.commit()
            # Fetch the ID of the row just inserted
            self.cursor.execute("SELECT last_insert_rowid()")
            record_id = self.cursor.fetchone()[0]
            return record_id

        # Unique key present: handle the conflict
        try:
            sql = f'''
                INSERT INTO {tbl_name} ({columns})
@@ -120,7 +147,7 @@ class SQLiteDBHandler:
            '''
            self.cursor.execute(sql, values)
            self.conn.commit()
        except sqlite3.IntegrityError:  # unique-key conflict
            if not exists_do_nothing:
                update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key])
                update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key]

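A quick illustration of the two call modes this change enables (the arguments mirror ClmDBHandler.insert_item above; the handler and row variables are placeholders):

    # Keyed insert: rows with an existing href are skipped (ON CONFLICT ... DO NOTHING)
    handler.insert_or_update_common(index_item, 'clm_index', uniq_key='href', exists_do_nothing=True)
    # Keyless insert: always appends a new row and returns last_insert_rowid()
    new_id = handler.insert_or_update_common(keyword_item, 'clm_keywords', uniq_key=None)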
View File

@@ -4,11 +4,12 @@ from scrapy.exceptions import NotConfigured
import time

class FailureMonitorExtension:
    def __init__(self, crawler, max_consecutive_failures, failure_rate_threshold, time_window, min_requests):
        self.crawler = crawler
        self.max_consecutive_failures = max_consecutive_failures
        self.failure_rate_threshold = failure_rate_threshold
        self.time_window = time_window  # seconds
        self.min_requests = min_requests

        self.consecutive_failures = 0
        self.total_requests = 0
@@ -21,16 +22,17 @@ class FailureMonitorExtension:
        max_consecutive = crawler.settings.getint('EXT_FAIL_MONI_MAX_CONSECUTIVE_FAILURES', 100)
        failure_rate = crawler.settings.getfloat('EXT_FAIL_MONI_RATE_THRESHOLD', 0.5)
        time_window = crawler.settings.getint('EXT_FAIL_MONI_FAILURE_TIME_WINDOW', 60)
        min_requests = crawler.settings.getint('EXT_FAIL_MONI_MIN_REQUESTS', 10)

        if max_consecutive <= 0 and failure_rate <= 0:
            raise NotConfigured

        ext = cls(crawler, max_consecutive, failure_rate, time_window, min_requests)

        # Register the signal handlers
        crawler.signals.connect(ext.request_succeeded, signal=signals.response_received)
        crawler.signals.connect(ext.request_dropped, signal=signals.request_dropped)
        crawler.signals.connect(ext.spider_error, signal=signals.spider_error)

        return ext
@@ -40,7 +42,23 @@ class FailureMonitorExtension:
        self.request_times.append(time.time())
        self._cleanup_old_requests()  # Drop requests that fall outside the time window

    '''Sent when a Request, scheduled by the engine to be downloaded later, is rejected by the scheduler.'''
    def request_dropped(self, request, spider):
        spider.logger.warning(f"request_dropped on url {request.url}")
        self.calculate_failure(spider)

    '''
    Sent when a spider callback generates an error (i.e. raises an exception).
    https://docs.scrapy.org/en/latest/topics/signals.html#request-failed
    '''
    def spider_error(self, failure, response, spider):
        # Ignore failures caused by redirect responses (core filtering logic)
        if response.status in [301, 302, 307, 308]:
            spider.logger.info(f"Ignoring 302 redirect: {response.url}")
            return  # Do not count this as a failure
        self.calculate_failure(spider)

    def calculate_failure(self, spider):
        self.consecutive_failures += 1
        self.failed_requests += 1
        self.total_requests += 1
@@ -53,7 +71,7 @@ class FailureMonitorExtension:
            self.crawler.engine.close_spider(spider, 'consecutive_failures_exceeded')

        # Check the failure rate
        if self.total_requests >= self.min_requests and self.failure_rate_threshold > 0:
            current_failure_rate = self.failed_requests / self.total_requests
            if current_failure_rate >= self.failure_rate_threshold:
                spider.logger.error(f"Failure rate exceeded the threshold ({current_failure_rate:.2%} >= {self.failure_rate_threshold:.2%}); stopping the spider")

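The new min_requests guard changes when the failure-rate check can fire. A standalone sketch of the decision (default values mirror the settings below; the return labels are illustrative, only 'consecutive_failures_exceeded' appears in the diff):

    def should_close(consecutive, failed, total,
                     max_consecutive=100, rate_threshold=0.6, min_requests=10):
        # Too many consecutive failures always closes the spider
        if max_consecutive > 0 and consecutive >= max_consecutive:
            return 'consecutive_failures_exceeded'
        # The rate check only applies once enough requests have been seen in the window
        if total >= min_requests and rate_threshold > 0 and failed / total >= rate_threshold:
            return 'failure_rate_exceeded'
        return None

    should_close(consecutive=3, failed=3, total=3)    # None: 3 < min_requests, an early burst no longer stops the crawl
    should_close(consecutive=3, failed=9, total=12)   # 'failure_rate_exceeded': 9/12 = 75% >= 60%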
View File

@@ -21,11 +21,12 @@ class StatsExtension:
    def from_crawler(cls, crawler):
        interval = crawler.settings.getint('STATS_EXPORT_INTERVAL', 600)
        script_path = crawler.settings.get('STATS_EXPORT_SCRIPT')
        flag_send_msg = crawler.settings.getbool('STATS_PUSH_MSG', True)

        if interval <= 0:
            raise NotConfigured

        ext = cls(crawler.stats, interval, script_path if flag_send_msg else None)
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

View File

@@ -135,3 +135,18 @@ class PBoxMovItem(scrapy.Item):
    mov_tags_list = scrapy.Field()
    mov_alt_list = scrapy.Field()

class ClmKeyWordsItem(scrapy.Item):
    item_type = scrapy.Field()
    words = scrapy.Field()

class ClmIndexItem(scrapy.Item):
    item_type = scrapy.Field()
    category = scrapy.Field()
    title = scrapy.Field()
    href = scrapy.Field()
    magnet_href = scrapy.Field()
    size_text = scrapy.Field()
    size_gb = scrapy.Field()
    heat = scrapy.Field()
    add_date = scrapy.Field()
    last_down_date = scrapy.Field()

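For orientation, the path these items take through the pieces touched in this commit (a summary of the hunks above, not new code):

    # ClmSpider yields ClmKeyWordsItem / ClmIndexItem
    #   -> SQLitePipeline.process_item() looks up db_handlers['clm']
    #   -> ClmDBHandler.insert_item() routes ITEM_TYPE_CLM_INDEX to clm_index (uniq_key='href')
    #      and ITEM_TYPE_CLM_KEYWORDS to clm_keywords (uniq_key=None)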
View File

@@ -36,12 +36,12 @@ class SQLitePipeline():
        spider_name = spider.name.lower()
        handler = self.db_handlers.get(spider_name)
        if not handler:
            raise ValueError(f"No database handler found for spider {spider_name}")

        # Convert to single-line JSON
        #item_json = json.dumps(dict(item), ensure_ascii=False, separators=(',', ':'))
        #spider.logger.debug(f"spider name: {spider_name}, item: {item_json}")

        handler.insert_item(item)

View File

@@ -65,6 +65,7 @@ EXTENSIONS = {
EXT_FAIL_MONI_MAX_CONSECUTIVE_FAILURES = 100  # Exit after 100 consecutive failures
EXT_FAIL_MONI_RATE_THRESHOLD = 0.6  # Exit when the failure rate exceeds 60%
EXT_FAIL_MONI_FAILURE_TIME_WINDOW = 300  # Time window of 300 seconds
EXT_FAIL_MONI_MIN_REQUESTS = 10  # Only check the failure rate once the window holds at least this many requests, so a single failure cannot stop the spider

# Block-detection and retry settings
@@ -76,6 +77,7 @@ BASE_SPIDER_RETRY_DELAY = 5
BASE_SPIDER_CLOSE_ON_MAX_RETRIES = False

# Stats export settings
STATS_PUSH_MSG = True
STATS_EXPORT_INTERVAL = 1800  # Export every 30 minutes
STATS_EXPORT_SCRIPT = '/root/projects/resources/scrapy_proj/scrapy_proj/extensions/push_to_wecom.sh'  # Path to the local shell script

View File

@@ -0,0 +1,184 @@
from datetime import datetime
import scrapy
from urllib.parse import urljoin, quote_plus
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
from scrapy_proj.items import ClmIndexItem, ClmKeyWordsItem
from scrapy_proj.comm.comm_def import SPIDER_NAME_CLM, ITEM_TYPE_CLM_INDEX, ITEM_TYPE_CLM_KEYWORDS
default_keywords = [
    'vixen', 'tushy', 'tushyraw', 'blacked', 'blackedraw', 'deeper',  # Vixen group
    'Cuck4K', 'Daddy4k', 'Loan4k', 'Dyke4K', 'Rim4k', 'Pie4k', 'Ignore4K', 'Daddy4k', 'Stuck4k', 'Tutor4k',  # VIP 4K
    'anal-angels', 'Anal-Beauty', 'Beauty4k', 'creampie-angels', 'Beauty-Angels', 'FirstBGG', 'FuckStudies', 'OhMyHoles', 'X-Angels',  # Teen Mega World
    'BBCPie', 'Tiny4k', 'Cum4K', 'Anal4K', 'Exotic4K', 'Facials4k', 'Holed', 'Lubed', 'Mom4K', 'passion hd',  # Fuck You Cash
    'Naughty Office', 'Naughty Americans', 'Naughty America', 'Naughty Weddings',  # Naughty America (Network)
    'MyFamilyPies', 'StepSiblingsCaught', 'nubilesporn',  # Nubiles Porn (Network)
    'Real Wife Stories', 'brazzers',  # Brazzers
    'teenpies', 'shoplyfter',  # TeamSkeet (Network)
    'BangBus', 'BangBros',  # BangBros
    'nfbusty', 'NubileFilms',  # Nubile Films
    'DDFBusty',  # DDF Network
    'AdultTime', 'BurningAngel',  # Adult Time (Network)
    'AnalVids',  # Anal Vids
    'LegalPorno',
    'Pornworld',  # Pornbox
    'WowGirls',  # Wow (Network)
    'x-art',  # Malibu Media
    'VIPissy',  # VIPissy Cash
    'Japan AV Blu-Ray',  # Japan
    'siterip',  # siterip
    'NewMFX',  # Brazil
    'Wicked',  # Wicked
    'Swallowed',  # Sticky Dollars
    'ManyVids',  # ManyVids
    'AnalOverdose',  # PervCity
]
class ClmSpider(BaseSpider):
    name = SPIDER_NAME_CLM
    allowed_domains = ["clmclm.com"]
    search_url = 'https://www.clmclm.com/search'

    def __init__(self, debug='False', keywords=None, min_size=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.debug = str(debug).lower() in ('true', '1')
        self.logger.info(f"debug mode: {self.debug}")
        self.keywords = keywords
        self.min_size = float(min_size) if min_size else 1.0

    # Entry point, triggered by the base class
    def custom_start_requests(self):
        list_words = self.keywords.split(',') if self.keywords else default_keywords

        item = ClmKeyWordsItem()
        item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
        item['words'] = self.keywords
        yield item

        for item in list_words:
            encoded_keyword = quote_plus(item.strip())
            # Build the POST form data
            form_data = {
                #'csrf_token': self.csrf_token,
                'search': encoded_keyword
            }
            # Send the search POST request
            yield scrapy.FormRequest(
                url=self.search_url,
                method='POST',
                formdata=form_data,
                #headers=self._get_headers(),
                # Do not follow the redirect automatically; handle the 302 manually
                meta={'dont_redirect': True, 'handle_httpstatus_list': [302]},
                callback=self.handle_redirect
            )

    # Handle the 302 returned by the POST
    def handle_redirect(self, response):
        """Handle the 302 redirect: read the Location header and request the result page."""
        # Get the redirect target from the response headers
        location = response.headers.get('Location', None)
        if not location:
            self.logger.error("No Location header found in the 302 response")
            return

        # Convert the bytes value to a string
        result_url = location.decode('utf-8')
        self.logger.info(f"Redirecting to result page: {result_url}")

        # Request the redirected result page and reuse the common parser
        yield scrapy.Request(
            url=result_url,
            #headers=self._get_headers(),
            callback=self.parse_page_common
        )
    def parse_page_common(self, response):
        need_next = False
        # Extract all ssbox nodes (each ssbox is one record)
        ssboxes = response.xpath('//div[@class="ssbox"]')
        for ssbox in ssboxes:
            # 1. Extract the link and text from the h3
            h3_span = ssbox.xpath('.//div[@class="title"]/h3/span')
            category = (h3_span.xpath('text()').get() or '').strip() if h3_span else ''

            # The a tag under the h3 (title link)
            h3_a = ssbox.xpath('.//div[@class="title"]/h3/a')
            # Title text (e.g. "Vixen.2025.05")
            title_text = (h3_a.xpath('text()').get() or '').strip() if h3_a else None
            # Title link (e.g. "/hash/34c71bf8ddff9c797dab7ee1af83894fee13ac67.html")
            title_href = h3_a.xpath('@href').get() if h3_a else None
            # If the link is relative, join it into a full URL against the site domain
            full_title_href = response.urljoin(title_href) if title_href else None

            # 2. Extract the file name from slist (optional, keep if needed)
            # File name (e.g. "vixen.25.05.09....mp4")
            file_name = ssbox.xpath('.//div[@class="slist"]/ul/li/text()').get()
            # Strip the size text after the file name (e.g. "8.3 GB"), keep only the name
            if file_name:
                file_name = file_name.split('&nbsp;')[0].strip()  # Split and keep the file-name part

            # 3. Extract the info in sbar
            sbar = ssbox.xpath('.//div[@class="sbar"]')
            # Magnet link: href of the a tag inside sbar
            magnet_href = sbar.xpath('.//a/@href').get() if sbar else None
            # Added date (e.g. "2025-06-13")
            add_time = sbar.xpath('.//span[contains(text(), "添加时间:")]/b/text()').get() if sbar else None
            # Size (e.g. "39.5 GB")
            size = sbar.xpath('.//span[contains(text(), "大小:")]/b/text()').get() if sbar else None
            # Heat (e.g. "435")
            heat = sbar.xpath('.//span[contains(text(), "热度:")]/b/text()').get() if sbar else None
            # Last download time (optional)
            last_download = sbar.xpath('.//span[contains(text(), "最近下载:")]/b/text()').get() if sbar else None

            size_gb = parse_size(size)
            if size_gb < self.min_size:
                continue

            item = ClmIndexItem()
            item['item_type'] = ITEM_TYPE_CLM_INDEX
            item['category'] = category
            item['title'] = title_text
            item['href'] = full_title_href
            item['magnet_href'] = magnet_href
            item['size_text'] = size
            item['size_gb'] = size_gb
            item['heat'] = int(heat) if heat else 0
            item['add_date'] = add_time
            item['last_down_date'] = last_download
            yield item

        if self.debug:
            return

        # Parse the next-page link
        pager = response.xpath('//div[@class="pager"]')
        if pager:
            total_text = pager.xpath('.//span[contains(text(), "")]/text()').get() or ''
            # Locate the "下一页" (next page) anchor by its text, to avoid matching other anchors
            next_page_a = pager.xpath('.//a[contains(text(), "下一页")]').get()
            # Extract its href attribute
            next_page_href = pager.xpath('.//a[contains(text(), "下一页")]/@href').get()

            # Check whether there is another page
            if next_page_href and next_page_href != '#':
                # Build the full URL (relative to absolute)
                next_page_url = response.urljoin(next_page_href)
                self.logger.info(f'{total_text}, found next page: {next_page_url}')
                # Recursively request the next page
                yield scrapy.Request(
                    url=next_page_url,
                    callback=self.parse_page_common,
                    dont_filter=True  # Allow duplicate requests so URL parameters are not filtered out
                )
            else:
                # When the href is '#' or missing, there are no more pages
                self.logger.info(f'All pages fetched, stopping pagination. {total_text}')

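One way to run the new spider with its arguments, as a sketch (the module path scrapy_proj.spiders.clm_spider is assumed and not shown in the diff; `scrapy crawl clm -a keywords=... -a min_size=...` passes the same arguments):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings
    from scrapy_proj.spiders.clm_spider import ClmSpider  # module path assumed

    process = CrawlerProcess(get_project_settings())
    # keywords overrides default_keywords; min_size drops results smaller than 2 GB
    process.crawl(ClmSpider, keywords='vixen,tushy', min_size='2.0', debug='false')
    process.start()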
View File

@@ -24,6 +24,8 @@ def parse_size(size_text):
            return round(value / 1024 / 1024, 2)
        elif unit.lower() == 'gb' or unit.lower() == 'g':
            return round(value, 2)
        elif unit.lower() == 'tb' or unit.lower() == 't':
            return round(value * 1024, 2)
        else:
            return 0.0
    except Exception:
@@ -42,22 +44,6 @@ def parse_size_format(size_text: str):
        # Parse the size
        return parse_size( parts[0].strip() ), format_part
    except Exception as e:
        return 0.0, "unknown format"
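Illustrative results for the new TB branch, based on the code above (GB is the base unit parse_size returns):

    parse_size('39.5 GB')   # -> 39.5
    parse_size('1.2 TB')    # -> 1228.8  (1.2 * 1024)
    parse_size('no size')   # -> 0.0     (unrecognised unit or format)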