from datetime import datetime
import scrapy
from urllib.parse import urljoin, quote_plus
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
from scrapy_proj.items import ClmIndexItem, ClmKeyWordsItem
from scrapy_proj.comm.comm_def import SPIDER_NAME_CLM, ITEM_TYPE_CLM_INDEX, ITEM_TYPE_CLM_KEYWORDS

default_keywords = [
    'vixen', 'tushy', 'tushyraw', 'blacked', 'blackedraw', 'deeper',  # Vixen group
    'Cuck4K', 'Daddy4k', 'Loan4k', 'Dyke4K', 'Rim4k', 'Pie4k', 'Ignore4K', 'Stuck4k', 'Tutor4k',  # VIP 4K
    'anal-angels', 'Anal-Beauty', 'Beauty4k', 'creampie-angels', 'Beauty-Angels', 'FirstBGG', 'FuckStudies', 'OhMyHoles', 'X-Angels',  # Teen Mega World
    'BBCPie', 'Tiny4k', 'Cum4K', 'Anal4K', 'Exotic4K', 'Facials4k', 'Holed', 'Lubed', 'Mom4K', 'passion hd',  # Fuck You Cash
    'Naughty Office', 'Naughty Americans', 'Naughty America', 'Naughty Weddings',  # Naughty America (Network)
    'MyFamilyPies', 'StepSiblingsCaught', 'nubilesporn',  # Nubiles Porn (Network)
    'Real Wife Stories', 'brazzers',  # Brazzers
    'teenpies', 'shoplyfter',  # TeamSkeet (Network)
    'BangBus', 'BangBros',  # BangBros
    'nfbusty', 'NubileFilms',  # Nubile Films
    'DDFBusty',  # DDF Network
    'AdultTime', 'BurningAngel',  # Adult Time (Network)
    'AnalVids',  # Anal Vids
    'LegalPorno',
    'Pornworld',  # Pornbox
    'WowGirls',  # Wow (Network)
    'x-art',  # Malibu Media
    'VIPissy',  # VIPissy Cash
    'Japan AV Blu-Ray',  # Japan
    'siterip',  # siterip
    'NewMFX',  # Brazil
    'Wicked',  # Wicked
    'Swallowed',  # Sticky Dollars
    'ManyVids',  # ManyVids
    'AnalOverdose',  # PervCity
]

class ClmSpider(BaseSpider):
    name = SPIDER_NAME_CLM
    allowed_domains = ["clmclm.com"]
    search_url = 'https://www.clmclm.com/search'

    def __init__(self, debug='False', keywords=None, min_size=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.debug = str(debug).lower() in ('true', '1')
        self.logger.info(f"debug mode: {self.debug}")

        # Comma-separated search keywords; falls back to default_keywords when not given
        self.keywords = keywords
        # Minimum result size in GB; smaller results are skipped in parse_page_common()
        self.min_size = float(min_size) if min_size else 1.0

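    # Example invocation (illustrative; the actual spider name is whatever
    # SPIDER_NAME_CLM resolves to in scrapy_proj.comm.comm_def):
    #   scrapy crawl <spider_name> -a keywords="vixen,tushy" -a min_size=2.0 -a debug=true
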
    # Entry point, triggered by the base class
    def custom_start_requests(self):
        list_words = self.keywords.split(',') if self.keywords else default_keywords

        item = ClmKeyWordsItem()
        item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
        item['words'] = self.keywords if self.keywords else 'default keywords'
        yield item

        for keyword in list_words:
            encoded_keyword = quote_plus(keyword.strip())

            # Build the POST form data
            form_data = {
                #'csrf_token': self.csrf_token,
                'search': encoded_keyword
            }

            # Issue the search POST request
            yield scrapy.FormRequest(
                url=self.search_url,
                method='POST',
                formdata=form_data,
                #headers=self._get_headers(),
                # Do not follow redirects automatically; handle the 302 manually
                meta={'dont_redirect': True, 'handle_httpstatus_list': [302]},
                callback=self.handle_redirect
            )

    # Handle the 302 returned by the search POST
    def handle_redirect(self, response):
        """Handle the 302 redirect: read the Location header and fetch the result page."""
        # Get the redirect target from the response headers
        location = response.headers.get('Location', None)
        if not location:
            self.logger.error("302 response without a Location header")
            return

        # Header values are bytes; decode and resolve against the current URL
        result_url = response.urljoin(location.decode('utf-8'))
        self.logger.info(f"Redirected to result page: {result_url}")

        # Fetch the redirected result page and parse it with the common parser
        yield scrapy.Request(
            url=result_url,
            #headers=self._get_headers(),
            callback=self.parse_page_common
        )

    def parse_page_common(self, response):
        """Parse a search result page: yield one ClmIndexItem per result, then follow pagination."""
        need_next = False
        # Extract all ssbox nodes (each ssbox is one result entry)
        ssboxes = response.xpath('//div[@class="ssbox"]')

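        # Rough shape of one result block, reconstructed from the selectors below
        # (illustrative only; the actual markup on clmclm.com may differ):
        #   <div class="ssbox">
        #     <div class="title"><h3><span>CATEGORY</span> <a href="/hash/....html">TITLE</a></h3></div>
        #     <div class="slist"><ul><li>FILE_NAME SIZE</li> ...</ul></div>
        #     <div class="sbar">
        #       <a href="magnet:?xt=...">...</a>
        #       <span>添加时间:<b>2025-06-13</b></span>
        #       <span>大小:<b>39.5 GB</b></span>
        #       <span>热度:<b>435</b></span>
        #       <span>最近下载:<b>...</b></span>
        #     </div>
        #   </div>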
        for ssbox in ssboxes:
            # 1. Extract the link and text from the h3
            h3_span = ssbox.xpath('.//div[@class="title"]/h3/span')
            category = h3_span.xpath('text()').get('').strip() if h3_span else ''
            # The a tag under the h3 (title link)
            h3_a = ssbox.xpath('.//div[@class="title"]/h3/a')
            # Title text (e.g. "Vixen.2025.05")
            title_text = h3_a.xpath('text()').get('').strip() if h3_a else None
            # Title link (e.g. "/hash/34c71bf8ddff9c797dab7ee1af83894fee13ac67.html")
            title_href = h3_a.xpath('@href').get() if h3_a else None
            # If the link is relative, join it into a full URL against the current page
            full_title_href = response.urljoin(title_href) if title_href else None

            # 2. Extract the file name from slist (optional; keep or drop as needed)
            # File name (e.g. "vixen.25.05.09....mp4")
            file_name = ssbox.xpath('.//div[@class="slist"]/ul/li/text()').get()
            # Strip the trailing size text (e.g. "8.3 GB"), keeping only the file name
            if file_name:
                file_name = file_name.split(' ')[0].strip()  # split and keep the name part

            # 3. Extract the fields from sbar
            sbar = ssbox.xpath('.//div[@class="sbar"]')
            # Magnet link (href of the a tag inside sbar)
            magnet_href = sbar.xpath('.//a/@href').get() if sbar else None
            # Added date (e.g. "2025-06-13")
            add_time = sbar.xpath('.//span[contains(text(), "添加时间:")]/b/text()').get() if sbar else None
            # Size (e.g. "39.5 GB")
            size = sbar.xpath('.//span[contains(text(), "大小:")]/b/text()').get() if sbar else None
            # Heat / popularity (e.g. "435")
            heat = sbar.xpath('.//span[contains(text(), "热度:")]/b/text()').get() if sbar else None
            # Last download time (optional)
            last_download = sbar.xpath('.//span[contains(text(), "最近下载:")]/b/text()').get() if sbar else None

            # Skip results below the configured size threshold (in GB)
            size_gb = parse_size(size)
            if size_gb < self.min_size:
                continue

            item = ClmIndexItem()
            item['item_type'] = ITEM_TYPE_CLM_INDEX
            item['category'] = category
            item['title'] = title_text
            item['href'] = full_title_href
            item['magnet_href'] = magnet_href
            item['size_text'] = size
            item['size_gb'] = size_gb
            item['heat'] = int(heat) if heat else 0
            item['add_date'] = add_time
            item['last_down_date'] = last_download

            yield item

        # In debug mode only the first result page is processed
        if self.debug:
            return

        # Parse the next-page link
        pager = response.xpath('//div[@class="pager"]')
        if pager:
            total_text = pager.xpath('.//span[contains(text(), "共")]/text()').get() or ''

            # Locate the "下一页" (next page) link by its text, to avoid matching other a tags,
            # and take its href attribute
            next_page_href = pager.xpath('.//a[contains(text(), "下一页")]/@href').get()

            # Is there another page?
            if next_page_href and next_page_href != '#':
                # Join into a full URL (relative path -> absolute)
                next_page_url = response.urljoin(next_page_href)
                self.logger.info(f'{total_text}, found next page: {next_page_url}')
                # Recursively request the next page
                yield scrapy.Request(
                    url=next_page_url,
                    callback=self.parse_page_common,
                    dont_filter=True  # allow duplicate requests (prevent filtering due to URL parameters)
                )
            else:
                # href is "#" or missing: no more pages
                self.logger.info(f'All pages fetched, stopping pagination. {total_text}')
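
# A minimal local-run sketch for ad-hoc debugging (not part of the spider itself;
# assumes the scrapy_proj settings module is importable from the working directory):
#
# if __name__ == "__main__":
#     from scrapy.crawler import CrawlerProcess
#     from scrapy.utils.project import get_project_settings
#
#     process = CrawlerProcess(get_project_settings())
#     process.crawl(ClmSpider, debug='true', keywords='vixen,tushy', min_size='2.0')
#     process.start()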