# resources/scrapy_proj/scrapy_proj/spiders/clm_spider.py
from datetime import datetime
import scrapy
from urllib.parse import urljoin, quote_plus
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
from scrapy_proj.items import ClmIndexItem, ClmKeyWordsItem
from scrapy_proj.comm.comm_def import SPIDER_NAME_CLM, ITEM_TYPE_CLM_INDEX, ITEM_TYPE_CLM_KEYWORDS
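
# Overview: this spider POSTs each keyword to the clmclm.com search endpoint,
# manually follows the resulting 302 redirect, scrapes every "ssbox" result
# block on the results page, and paginates via the "下一页" (next page) link.
# Entries smaller than min_size (in GB) are skipped.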
default_keywords = [
'vixen', 'tushy', 'tushyraw', 'blacked', 'blackedraw', 'deeper', # vixen group
'Cuck4K', 'Daddy4k', 'Loan4k', 'Dyke4K', 'Rim4k', 'Pie4k', 'Ignore4K', 'Stuck4k', 'Tutor4k', # VIP 4K
'anal-angels', 'Anal-Beauty', 'Beauty4k', 'creampie-angels', 'Beauty-Angels', 'FirstBGG', 'FuckStudies', 'OhMyHoles', 'X-Angels', # Teen Mega World
'BBCPie', 'Tiny4k', 'Cum4K', 'Anal4K', 'Exotic4K', 'Facials4k', 'Holed', 'Lubed', 'Mom4K', 'passion hd', # Fuck You Cash
'Naughty Office', 'Naughty Americans', 'Naughty America', 'Naughty Weddings', # Naughty America (Network)
'MyFamilyPies', 'StepSiblingsCaught', 'nubilesporn', # Nubiles Porn (Network)
'Real Wife Stories', 'brazzers', # Brazzers
'teenpies', 'shoplyfter', # TeamSkeet (Network)
'BangBus', 'BangBros', # BangBros
'nfbusty', 'NubileFilms', # Nubile Films
'DDFBusty', # DDF Network
'AdultTime', 'BurningAngel', # Adult Time (Network)
'AnalVids', # Anal Vids
'LegalPorno',
'Pornworld', # Pornbox
'WowGirls', # Wow (Network)
'x-art', # Malibu Media
'VIPissy', # VIPissy Cash
'Japan AV Blu-Ray', # japan
'siterip', # siterip
'NewMFX', # Brazil
'Wicked', # Wicked
'Swallowed', # Sticky Dollars
'ManyVids', # ManyVids
'AnalOverdose', # PervCity
]
class ClmSpider(BaseSpider):
name = SPIDER_NAME_CLM
allowed_domains = ["clmclm.com"]
search_url = 'https://www.clmclm.com/search'
def __init__(self, debug='False', keywords=None, min_size=None, *args, **kwargs):
super().__init__(*args, **kwargs)
self.debug = str(debug).lower() in ('true', '1')
self.logger.info(f"debug mode: {self.debug}")
self.keywords = keywords
self.min_size = float(min_size) if min_size else 1.0
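# Example invocation (assumes the standard Scrapy CLI; the spider name below is a
# placeholder for whatever SPIDER_NAME_CLM resolves to):
#   scrapy crawl clm -a keywords=vixen,tushy -a min_size=2.0 -a debug=true
# Without -a keywords the default_keywords list above is used; min_size defaults to 1.0 GB.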
# Entry point, triggered by a method in the base class
def custom_start_requests(self):
list_words = self.keywords.split(',') if self.keywords else default_keywords
item = ClmKeyWordsItem()
item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
item['words'] = self.keywords if self.keywords else 'default keywords'
yield item
for keyword in list_words:
encoded_keyword = quote_plus(keyword.strip())
# Build the POST form data
form_data = {
#'csrf_token': self.csrf_token,
'search': encoded_keyword
}
# Issue the search POST request
yield scrapy.FormRequest(
url=self.search_url,
method='POST',
formdata=form_data,
#headers=self._get_headers(),
# Do not follow the redirect automatically; handle the 302 manually
meta={'dont_redirect': True, 'handle_httpstatus_list': [302]},
callback=self.handle_redirect
)
# Handle the 302 returned by the POST request
def handle_redirect(self, response):
"""处理302重定向获取location并访问结果页面"""
# 从响应头获取重定向地址
location = response.headers.get('Location', None)
if not location:
self.logger.error("未找到302重定向地址")
return
# Decode the header value to a string
result_url = location.decode('utf-8')
self.logger.info(f"重定向到结果页: {result_url}")
# 访问重定向后的结果页面,使用之前的解析方法
yield scrapy.Request(
url=result_url,
#headers=self._get_headers(),
callback=self.parse_page_common
)
def parse_page_common(self, response):
need_next = False
# Extract all ssbox nodes (each ssbox corresponds to one result entry)
ssboxes = response.xpath('//div[@class="ssbox"]')
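# Based on the selectors below, each result block is assumed to look roughly like:
#   <div class="ssbox">
#     <div class="title"><h3><span>CATEGORY</span> <a href="/hash/....html">TITLE</a></h3></div>
#     <div class="slist"><ul><li>FILENAME SIZE</li></ul></div>
#     <div class="sbar"><a href="magnet:?...">...</a><span>添加时间:<b>DATE</b></span> ...</div>
#   </div>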
for ssbox in ssboxes:
# 1. Extract the link and title text from the h3
h3_span = ssbox.xpath('.//div[@class="title"]/h3/span')
category = (h3_span.xpath('text()').get() or '').strip() if h3_span else ''
# The a tag under the h3 (title link)
h3_a = ssbox.xpath('.//div[@class="title"]/h3/a')
# Title text (e.g. "Vixen.2025.05")
title_text = (h3_a.xpath('text()').get() or '').strip() if h3_a else None
# Title href (e.g. "/hash/34c71bf8ddff9c797dab7ee1af83894fee13ac67.html")
title_href = h3_a.xpath('@href').get() if h3_a else None
# If the link is relative, join it into a full URL using the site domain
full_title_href = response.urljoin(title_href) if title_href else None
# 2. Extract the file name from slist (optional; keep or drop as needed)
# File name (e.g. "vixen.25.05.09....mp4")
file_name = ssbox.xpath('.//div[@class="slist"]/ul/li/text()').get()
# Strip the trailing size text (e.g. "8.3 GB") and keep only the file name
if file_name:
file_name = file_name.split(' ')[0].strip()  # Split and keep the file-name part
# 3. Extract the info from sbar
sbar = ssbox.xpath('.//div[@class="sbar"]')
# Magnet link: the href of the a tag inside sbar
magnet_href = sbar.xpath('.//a/@href').get() if sbar else None
# Added date (e.g. "2025-06-13")
add_time = sbar.xpath('.//span[contains(text(), "添加时间:")]/b/text()').get() if sbar else None
# Size (e.g. "39.5 GB")
size = sbar.xpath('.//span[contains(text(), "大小:")]/b/text()').get() if sbar else None
# Heat (e.g. "435")
heat = sbar.xpath('.//span[contains(text(), "热度:")]/b/text()').get() if sbar else None
# Last download date (optional)
last_download = sbar.xpath('.//span[contains(text(), "最近下载:")]/b/text()').get() if sbar else None
size_gb = parse_size(size)
if size_gb < self.min_size:
continue
item = ClmIndexItem()
item['item_type'] = ITEM_TYPE_CLM_INDEX
item['category'] = category
item['title'] = title_text
item['href'] = full_title_href
item['magnet_href'] = magnet_href
item['size_text'] = size
item['size_gb'] = size_gb
item['heat'] = int(heat) if heat else 0
item['add_date'] = add_time
item['last_down_date'] = last_download
yield item
if self.debug:
return
# Parse the next-page link
pager = response.xpath('//div[@class="pager"]')
if pager:
total_text = pager.xpath('.//span/text()').get() or ''
# Locate the "下一页" (next page) link by its text to avoid matching other a tags,
# then extract its href attribute
next_page_href = pager.xpath('.//a[contains(text(), "下一页")]/@href').get()
# Check whether there is a next page
if next_page_href and next_page_href != '#':
# Build the full URL (convert the relative path to an absolute one)
next_page_url = response.urljoin(next_page_href)
self.logger.info(f'{total_text}, found next page: {next_page_url}')
# Recursively request the next page
yield scrapy.Request(
url=next_page_url,
callback=self.parse_page_common,
dont_filter=True  # Allow repeated requests so pagination URLs are not dropped by the duplicate filter
)
else:
# When href is "#" or missing, there are no more pages
self.logger.info(f'All pages fetched; stopping pagination. {total_text}')