resources/scrapy_proj/scrapy_proj/spiders/clm_spider.py

from datetime import datetime
import scrapy
import sys
from urllib.parse import urljoin, quote_plus
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime, load_json_file
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
from scrapy_proj.items import ClmIndexItem, ClmKeyWordsItem
from scrapy_proj.comm.comm_def import SPIDER_NAME_CLM, ITEM_TYPE_CLM_INDEX, ITEM_TYPE_CLM_KEYWORDS
from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler, ClmDBHandler
db_clm = ClmDBHandler()
default_keywords_file = 'scrapy_proj/data/clm_keywords.json'
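

# A rough sketch of the crawl flow, inferred from the methods below:
#   1. custom_start_requests POSTs each keyword from the DB to the site's /search endpoint.
#   2. The site answers with a 302; handle_redirect reads the Location header manually.
#   3. In update mode, parse_page_by_date first switches the results to the date-sorted view.
#   4. parse_page_common walks the result pages, yields one ClmIndexItem per entry and
#      follows the next-page link until the stop conditions are hit.
#   5. The reload/fix modes only import keywords or re-fetch missing titles, respectively.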
class ClmSpider(BaseSpider):
    name = SPIDER_NAME_CLM
    allowed_domains = ["clmclm.com"]
    search_url = 'https://www.clmclm.com/search'

    def __init__(self, debug='False', min_size=None, begin=None, mod='all', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.debug = str(debug).lower() in ('true', '1')
        self.update_mod = False
        self.run_task = True
        self.fix_title = False
        # Update mode: requires mod == 'update' together with a begin date
        self.begin = parse_date_to_datetime(begin) if begin else None
        self.min_size = float(min_size) if min_size else 1.0
        self.keywords_file = kwargs.get('file_path') or default_keywords_file
        self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")
        # Mode switches, passed in via the `mod` argument
        if mod.lower() == 'update' and self.begin:
            self.update_mod = True
        elif mod.lower() == 'reload':
            self.reload_keywords()
            self.run_task = False
        elif mod.lower() == 'fix':
            self.fix_title = True
            self.run_task = False
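
    # Typical invocations (a sketch; the real spider name is the value of SPIDER_NAME_CLM,
    # shown here as a placeholder, and `begin` must be in a format parse_date_to_datetime accepts):
    #   scrapy crawl <SPIDER_NAME_CLM>                                    # full crawl
    #   scrapy crawl <SPIDER_NAME_CLM> -a mod=update -a begin=2025-06-01  # incremental update
    #   scrapy crawl <SPIDER_NAME_CLM> -a mod=reload                      # import keywords only
    #   scrapy crawl <SPIDER_NAME_CLM> -a mod=fix                         # re-fetch empty titles
    #   scrapy crawl <SPIDER_NAME_CLM> -a debug=1 -a min_size=2.5         # small test run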

    # Reload the keyword list
    def reload_keywords(self):
        self.load_keywords_from_file()
        self.load_keywords_from_db()
        self.logger.info("reload keywords done")

    # Entry point, triggered by the base class
    def custom_start_requests(self):
        if self.fix_title:
            data = db_clm.get_empty_title()
            if data:
                self.logger.info(f"rows to be fixed: {len(data)}")
                for row in data:
                    url = row['href']
                    # Request the detail page to re-fetch the missing title
                    yield scrapy.Request(
                        url=url,
                        callback=self.parse_page_detail,
                        meta={'url': url},
                        dont_filter=True  # allow duplicates so the dedup filter does not drop the request
                    )
            else:
                self.logger.warning("no data.")
        # Initialization-only modes stop here; no crawling needed
        if not self.run_task:
            return
        #tmp_query_str = f" groups='actress' and tags not like '%vixen%' "
        tmp_query_str = " 1=1 "
        if self.debug:
            keywords = db_clm.get_key_words(limit=5, query_str=tmp_query_str)
        else:
            #keywords = db_clm.get_key_words(groups='actress', tags='vixen')
            keywords = db_clm.get_key_words(query_str=tmp_query_str)
        for item in keywords:
            words_id = item['id']
            words = item['words']
            encoded_keyword = quote_plus(words.strip())
            # Build the POST form data
            form_data = {
                #'csrf_token': self.csrf_token,
                'search': encoded_keyword
            }
            # Send the search POST request
            yield scrapy.FormRequest(
                url=self.search_url,
                method='POST',
                formdata=form_data,
                #headers=self._get_headers(),
                # Do not auto-follow the redirect; handle the 302 manually
                meta={'dont_redirect': True, 'handle_httpstatus_list': [302], 'words_id': words_id, 'words': words},
                callback=self.handle_redirect
            )
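
    # Note on the redirect handling below (standard Scrapy behaviour): meta['dont_redirect']=True
    # together with handle_httpstatus_list=[302] lets the raw 302 response reach the callback
    # instead of being followed by the redirect middleware, and the Location header arrives as
    # bytes, hence the explicit decode.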
    # Handle the 302 returned by the search POST
    def handle_redirect(self, response):
        """Handle the 302 redirect: read Location and request the result page."""
        # Get the redirect target from the response headers
        location = response.headers.get('Location', None)
        if not location:
            self.logger.error("302 redirect Location not found")
            return
        # Decode the header bytes into a string
        result_url = location.decode('utf-8')
        # Visit the redirected result page; if date-sorted results are required,
        # go through an extra hop to find the sort link
        if self.update_mod and self.begin:
            self.logger.info(f"Redirected to result page: {result_url}, looking for the date-sorted link")
            yield scrapy.Request(
                url=result_url,
                #headers=self._get_headers(),
                callback=self.parse_page_by_date,
                meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')}
            )
        else:
            self.logger.info(f"Redirected to result page: {result_url}, full crawl")
            yield scrapy.Request(
                url=result_url,
                #headers=self._get_headers(),
                callback=self.parse_page_common,
                meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')}
            )
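
    # Update-mode helper: switch the result list to the "添加时间" (added-time) sort order
    # before parsing, so the date-based stop condition in parse_page_common can end
    # pagination early.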
    def parse_page_by_date(self, response):
        # Find the link for the "添加时间" (added-time) sort order:
        # a CSS selector matching the <a> tag that contains that text
        add_time_link = response.css(
            'div.sortbar a:contains("添加时间")::attr(href)'
        ).get()
        result_url = response.url
        if add_time_link:
            # Build the absolute URL (response.url is the current page, used to resolve the relative path)
            result_url = urljoin(response.url, add_time_link)
            self.logger.info(f"Found the date-sorted result page, requesting: {result_url}")
        else:
            self.logger.warning(f"Date-sort link not found, falling back to the original URL: {result_url}")
        yield scrapy.Request(
            url=result_url,
            #headers=self._get_headers(),
            callback=self.parse_page_common,
            meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')}
        )

    def parse_page_common(self, response):
        need_next = False
        # Extract all ssbox nodes (one ssbox per result entry)
        ssboxes = response.xpath('//div[@class="ssbox"]')
        if not ssboxes:
            self.logger.warning(f"Failed to parse page. url: {response.url}")
            return
        for ssbox in ssboxes:
            # 1. Extract the link and text from the h3
            h3_span = ssbox.xpath('.//div[@class="title"]/h3/span')
            category = (h3_span.xpath('text()').get() or '').strip() if h3_span else ''
            # The <a> tag under h3 (title link)
            h3_a = ssbox.xpath('.//div[@class="title"]/h3/a')
            # Title text (e.g. "Vixen.2025.05")
            #title_text = h3_a.xpath('text()').get().strip() if h3_a else None
            title_text = extract_text_from_element(h3_a, use_title=True)
            # Title href (e.g. "/hash/34c71bf8ddff9c797dab7ee1af83894fee13ac67.html")
            title_href = h3_a.xpath('@href').get() if h3_a else None
            # Resolve a relative link into an absolute URL against the site domain
            full_title_href = response.urljoin(title_href) if title_href else None
            # 2. Extract the file name from slist (optional; keep or drop as needed)
            # File name (e.g. "vixen.25.05.09....mp4")
            file_name = ssbox.xpath('.//div[@class="slist"]/ul/li/text()').get()
            # Strip the trailing size text (e.g. "8.3 GB"), keeping only the file name
            if file_name:
                file_name = file_name.split(' ')[0].strip()  # split and keep the name part
            # 3. Extract fields from sbar; the selectors below match the site's Chinese labels
            sbar = ssbox.xpath('.//div[@class="sbar"]')
            # Magnet link (href of the <a> tag inside sbar)
            magnet_href = sbar.xpath('.//a/@href').get() if sbar else None
            # Added date (e.g. "2025-06-13")
            add_time = sbar.xpath('.//span[contains(text(), "添加时间:")]/b/text()').get() if sbar else None
            # Size (e.g. "39.5 GB")
            size = sbar.xpath('.//span[contains(text(), "大小:")]/b/text()').get() if sbar else None
            size_gb = parse_size(size)
            # Popularity (e.g. "435")
            heat = sbar.xpath('.//span[contains(text(), "热度:")]/b/text()').get() if sbar else None
            # Last download time (optional)
            last_download = sbar.xpath('.//span[contains(text(), "最近下载:")]/b/text()').get() if sbar else None
            item = ClmIndexItem()
            item['item_type'] = ITEM_TYPE_CLM_INDEX
            item['category'] = category
            item['title'] = title_text
            item['href'] = full_title_href
            item['magnet_href'] = magnet_href
            item['size_text'] = size
            item['size_gb'] = size_gb
            item['heat'] = int(heat) if heat else 0
            item['add_date'] = add_time
            item['last_down_date'] = last_download
            item['key_words_id'] = response.meta.get('words_id', 0)
            item['key_words'] = response.meta.get('words', '')
            item['is_update'] = False
            # Decide whether to keep paginating: stop only when every item on the page
            # is dated before the begin date (or carries a bogus future date)
            up_date = parse_date_to_datetime(item['add_date'])
            #self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
            if up_date and self.begin and (up_date < self.begin or up_date > datetime.now()):
                pass
            else:
                need_next = True
            # Skip files that are too small
            if size_gb < self.min_size:
                continue
            yield item
        # Parse the next-page link
        next_page_url = None
        total_text = ''
        pager = response.xpath('//div[@class="pager"]')
        if pager:
            total_text = pager.xpath('.//span[contains(text(), "")]/text()').get()
            # Locate the "下一页" (next page) <a> tag by its text, to avoid matching other links
            next_page_a = pager.xpath('.//a[contains(text(), "下一页")]').get()
            next_page_href = pager.xpath('.//a[contains(text(), "下一页")]/@href').get()
            # Check whether there is a next page
            if next_page_href and next_page_href != '#':
                # Resolve the relative href into an absolute URL
                next_page_url = response.urljoin(next_page_href)
        if self.debug:
            self.logger.info(f'Debug mode: stop paginating. {total_text}. url: {response.url}')
            return
        elif not need_next or not next_page_url:
            total_rows = db_clm.get_count_by_keywords_id(response.meta.get('words_id', 0))
            curr_words = response.meta.get('words', '')
            self.logger.info(f'Stop paginating. update mode: {self.update_mod}. {total_text}, {total_rows} rows in total. key words: ({curr_words}), url: {response.url}')
            return
        self.logger.debug(f'{total_text}, found next page: {next_page_url}')
        # Recursively request the next page
        yield scrapy.Request(
            url=next_page_url,
            callback=self.parse_page_common,
            meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')},
            dont_filter=True  # allow duplicates so the dedup filter does not drop the request
        )
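
    # Fix mode: re-visit a detail page whose title is missing in the DB and emit an
    # update-only item (is_update=True) carrying just the title and href.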
    def parse_page_detail(self, response):
        # Match the h2 text under the div whose class is 'bt_title'
        title_xpath = response.xpath('//div[@class="bt_title"]/h2/text()').get()
        item = ClmIndexItem()
        item['item_type'] = ITEM_TYPE_CLM_INDEX
        item['title'] = title_xpath
        item['href'] = response.meta['url']
        item['is_update'] = True
        yield item

    # Import the configured keywords (from a JSON file) into the database
    def load_keywords_from_file(self):
        self.logger.info(f"load keywords from file: {self.keywords_file}")
        json_data, err = load_json_file(self.keywords_file)
        if not json_data:
            self.logger.warning(f"load file error. {err}")
            return
        total_lines = 0
        for group, items in json_data.items():
            total_lines += len(items)
            self.logger.info(f"load group ({group}), {len(items)} items.")
            for item in items:
                words_item = ClmKeyWordsItem()
                words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
                words_item['words'] = item
                words_item['groups'] = group
                words_item['tags'] = ''
                words_item['index_count'] = 0
                db_clm.insert_item(words_item)
                self.logger.debug(f"insert item: {item}: {group}")
        self.logger.info(f"loaded {self.keywords_file} successfully, total lines: {total_lines}")

    # Load the actor list from other data sources and import it into the database
    def load_keywords_from_db(self):
        db_comm = IAFDDBHandler()
        all_likes = {
            'vixen': ['vixen.com', 'Vixen Video'],
            'tushy': ['tushy.com', 'tushyraw.com', 'Tushy', 'Tushy Raw'],
            'blacked': ['blacked.com', 'Blacked', 'blackedraw.com', 'Blacked Raw'],
            'x-art': ['x-art.com', 'X-art'],
            'nfbusty': ['nfbusty.com']
        }
        # Build lookup structures first
        all_key_group = {}
        all_keys = []
        for group, keys in all_likes.items():
            for key in keys:
                all_key_group[key] = group
                all_keys.append(key)
        # Query the database and transform the data
        actor_tags = {}
        total_lines = 0
        results = db_comm.get_iafd_actors(names=all_keys, tbl='stu')
        for dist, actors in results.items():
            self.logger.info(f"dist: {dist}, actors count: {len(actors)}")
            total_lines += len(actors)
            for actor in actors:
                #self.logger.debug(f"get {dist} : {actor['name']}, {actor['href']}")
                actor_name = actor['name']
                current_tag = all_key_group.get(dist, '')
                if actor_name not in actor_tags:
                    actor_tags[actor_name] = set()  # a set de-duplicates automatically
                if current_tag:
                    actor_tags[actor_name].add(current_tag)  # set.add ignores duplicates
        self.logger.info(f"total actors in iafd: {len(actor_tags)}, total lines: {total_lines}")
        # Query another table and merge its results
        load_results = db_comm.get_lord_actors()
        if load_results:
            self.logger.info(f"total actors in thelordofporn: {len(load_results)}")
            for row in load_results:
                actor_name = row['name']
                if actor_name not in actor_tags:
                    actor_tags[actor_name] = set()  # a set de-duplicates automatically
                actor_tags[actor_name].add('thelordofporn')  # set.add ignores duplicates
        self.logger.info(f"after merge, total actors: {len(actor_tags)}")
        for actor, tags_set in actor_tags.items():
            tag_str = ','.join(tags_set)  # sets are iterable directly, no conversion needed
            self.logger.debug(f"actor: {actor}, tags: {tag_str}")
            words_item = ClmKeyWordsItem()
            words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
            words_item['words'] = actor
            words_item['groups'] = 'actress'
            words_item['tags'] = tag_str
            words_item['index_count'] = 0
            db_clm.insert_item(words_item)
            self.logger.debug(f"insert item: {words_item}")