# resources/scrapy_proj/scrapy_proj/spiders/clm_spider.py
from datetime import datetime
import scrapy
from urllib.parse import urljoin, quote_plus
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
from scrapy_proj.items import ClmIndexItem, ClmKeyWordsItem
from scrapy_proj.comm.comm_def import SPIDER_NAME_CLM, ITEM_TYPE_CLM_INDEX, ITEM_TYPE_CLM_KEYWORDS
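
# Overview: this spider POSTs each keyword to the clmclm.com search endpoint,
# manually follows the resulting 302 redirect, scrapes every "ssbox" result
# block on the results page, and paginates via the "下一页" (next page) link.
# Entries smaller than min_size (in GB) are skipped.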
default_keywords = [
'vixen', 'tushy', 'tushyraw', 'blacked', 'blackedraw', 'deeper', # vixen group
'Cuck4K', 'Daddy4k', 'Loan4k', 'Dyke4K', 'Rim4k', 'Pie4k', 'Ignore4K', 'Stuck4k', 'Tutor4k', # VIP 4K
'anal-angels', 'Anal-Beauty', 'Beauty4k', 'creampie-angels', 'Beauty-Angels', 'FirstBGG', 'FuckStudies', 'OhMyHoles', 'X-Angels', # Teen Mega World
'BBCPie', 'Tiny4k', 'Cum4K', 'Anal4K', 'Exotic4K', 'Facials4k', 'Holed', 'Lubed', 'Mom4K', 'passion hd', # Fuck You Cash
'Naughty Office', 'Naughty Americans', 'Naughty America', 'Naughty Weddings', # Naughty America (Network)
'MyFamilyPies', 'StepSiblingsCaught', 'nubilesporn', # Nubiles Porn (Network)
'Real Wife Stories', 'brazzers', # Brazzers
'teenpies', 'shoplyfter', # TeamSkeet (Network)
'BangBus', 'BangBros', # BangBros
'nfbusty', 'NubileFilms', # Nubile Films
'DDFBusty', # DDF Network
'AdultTime', 'BurningAngel', # Adult Time (Network)
'AnalVids', # Anal Vids
'LegalPorno',
'Pornworld', # Pornbox
'WowGirls', # Wow (Network)
'x-art', # Malibu Media
'VIPissy', # VIPissy Cash
'Japan AV Blu-Ray', # japan
'siterip', # siterip
'NewMFX', # Brazil
'Wicked', # Wicked
'Swallowed', # Sticky Dollars
'ManyVids', # ManyVids
'AnalOverdose', # PervCity
]
class ClmSpider(BaseSpider):
name = SPIDER_NAME_CLM
allowed_domains = ["clmclm.com"]
search_url = 'https://www.clmclm.com/search'
def __init__(self, debug='False', keywords=None, min_size=None, *args, **kwargs):
super().__init__(*args, **kwargs)
self.debug = str(debug).lower() in ('true', '1')
self.logger.info(f"debug mode: {self.debug}")
self.keywords = keywords
self.min_size = float(min_size) if min_size else 1.0
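# Example invocation (assumes the standard Scrapy CLI; the spider name below is a
# placeholder for whatever SPIDER_NAME_CLM resolves to):
#   scrapy crawl clm -a keywords=vixen,tushy -a min_size=2.0 -a debug=true
# Without -a keywords the default_keywords list above is used; min_size defaults to 1.0 GB.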
# Entry point, triggered by a method in the base class
def custom_start_requests(self):
list_words = self.keywords.split(',') if self.keywords else default_keywords
item = ClmKeyWordsItem()
item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
item['words'] = self.keywords if self.keywords else 'default keywords'
yield item
for keyword in list_words:
encoded_keyword = quote_plus(keyword.strip())
# Build the POST form data
form_data = {
#'csrf_token': self.csrf_token,
'search': encoded_keyword
}
# Issue the search POST request
yield scrapy.FormRequest(
url=self.search_url,
method='POST',
formdata=form_data,
#headers=self._get_headers(),
# Do not follow the redirect automatically; handle the 302 manually
meta={'dont_redirect': True, 'handle_httpstatus_list': [302]},
callback=self.handle_redirect
)
# Handle the 302 returned by the POST request
def handle_redirect(self, response):
"""处理302重定向获取location并访问结果页面"""
# 从响应头获取重定向地址
location = response.headers.get('Location', None)
if not location:
self.logger.error("未找到302重定向地址")
return
# Decode the header value to a string
result_url = location.decode('utf-8')
self.logger.info(f"重定向到结果页: {result_url}")
# 访问重定向后的结果页面,使用之前的解析方法
yield scrapy.Request(
url=result_url,
#headers=self._get_headers(),
callback=self.parse_page_common
)
def parse_page_common(self, response):
need_next = False
# Extract all ssbox nodes (each ssbox corresponds to one result entry)
ssboxes = response.xpath('//div[@class="ssbox"]')
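# Based on the selectors below, each result block is assumed to look roughly like:
#   <div class="ssbox">
#     <div class="title"><h3><span>CATEGORY</span> <a href="/hash/....html">TITLE</a></h3></div>
#     <div class="slist"><ul><li>FILENAME SIZE</li></ul></div>
#     <div class="sbar"><a href="magnet:?...">...</a><span>添加时间:<b>DATE</b></span> ...</div>
#   </div>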
for ssbox in ssboxes:
# 1. Extract the link and title text from the h3
h3_span = ssbox.xpath('.//div[@class="title"]/h3/span')
category = (h3_span.xpath('text()').get() or '').strip() if h3_span else ''
# The a tag under the h3 (title link)
h3_a = ssbox.xpath('.//div[@class="title"]/h3/a')
# Title text (e.g. "Vixen.2025.05")
title_text = (h3_a.xpath('text()').get() or '').strip() if h3_a else None
# Title href (e.g. "/hash/34c71bf8ddff9c797dab7ee1af83894fee13ac67.html")
title_href = h3_a.xpath('@href').get() if h3_a else None
# If the link is relative, join it into a full URL using the site domain
full_title_href = response.urljoin(title_href) if title_href else None
# 2. Extract the file name from slist (optional; keep or drop as needed)
# File name (e.g. "vixen.25.05.09....mp4")
file_name = ssbox.xpath('.//div[@class="slist"]/ul/li/text()').get()
# Strip the trailing size text (e.g. "8.3 GB") and keep only the file name
if file_name:
file_name = file_name.split(' ')[0].strip()  # Split and keep the file-name part
# 3. Extract the info from sbar
sbar = ssbox.xpath('.//div[@class="sbar"]')
# Magnet link: the href of the a tag inside sbar
magnet_href = sbar.xpath('.//a/@href').get() if sbar else None
# Added date (e.g. "2025-06-13")
add_time = sbar.xpath('.//span[contains(text(), "添加时间:")]/b/text()').get() if sbar else None
# Size (e.g. "39.5 GB")
size = sbar.xpath('.//span[contains(text(), "大小:")]/b/text()').get() if sbar else None
# Heat (e.g. "435")
heat = sbar.xpath('.//span[contains(text(), "热度:")]/b/text()').get() if sbar else None
# Last download date (optional)
last_download = sbar.xpath('.//span[contains(text(), "最近下载:")]/b/text()').get() if sbar else None
size_gb = parse_size(size)
if size_gb < self.min_size:
continue
item = ClmIndexItem()
item['item_type'] = ITEM_TYPE_CLM_INDEX
item['category'] = category
item['title'] = title_text
item['href'] = full_title_href
item['magnet_href'] = magnet_href
item['size_text'] = size
item['size_gb'] = size_gb
item['heat'] = int(heat) if heat else 0
item['add_date'] = add_time
item['last_down_date'] = last_download
yield item
if self.debug:
return
# Parse the next-page link
pager = response.xpath('//div[@class="pager"]')
if pager:
total_text = pager.xpath('.//span/text()').get() or ''
# Locate the "下一页" (next page) link by its text to avoid matching other a tags,
# then extract its href attribute
next_page_href = pager.xpath('.//a[contains(text(), "下一页")]/@href').get()
# Check whether there is a next page
if next_page_href and next_page_href != '#':
# Build the full URL (convert the relative path to an absolute one)
next_page_url = response.urljoin(next_page_href)
self.logger.info(f'{total_text}, found next page: {next_page_url}')
# Recursively request the next page
yield scrapy.Request(
url=next_page_url,
callback=self.parse_page_common,
dont_filter=True  # Allow repeated requests so pagination URLs are not dropped by the duplicate filter
)
else:
# When href is "#" or missing, there are no more pages
self.logger.info(f'All pages fetched; stopping pagination. {total_text}')