import sys
from datetime import datetime
from urllib.parse import urljoin, quote_plus

import scrapy

from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime, load_json_file
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
from scrapy_proj.items import ClmIndexItem, ClmKeyWordsItem
from scrapy_proj.comm.comm_def import SPIDER_NAME_CLM, ITEM_TYPE_CLM_INDEX, ITEM_TYPE_CLM_KEYWORDS
from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler, ClmDBHandler


db_clm = ClmDBHandler()
default_keywords_file = 'scrapy_proj/data/clm_keywords.json'


class ClmSpider(BaseSpider):
    name = SPIDER_NAME_CLM
    allowed_domains = ["clmclm.com"]
    search_url = 'https://www.clmclm.com/search'

    def __init__(self, debug='False', min_size=None, begin=None, mod='all', *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.debug = str(debug).lower() in ('true', '1')
        self.update_mod = False
        self.run_task = True
        self.fix_title = False

        # Update mode: only enabled when mod == 'update' and a start date is supplied.
        self.begin = parse_date_to_datetime(begin) if begin else None
        self.min_size = float(min_size) if min_size else 1.0
        self.keywords_file = kwargs.get('file_path') or default_keywords_file

        self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")

        # Select the run mode.
        if mod.lower() == 'update' and self.begin:
            self.update_mod = True
        elif mod.lower() == 'reload':
            self.reload_keywords()
            self.run_task = False
        elif mod.lower() == 'fix':
            self.fix_title = True
            self.run_task = False
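
    # Typical invocations (a sketch; "clm" stands in for whatever name SPIDER_NAME_CLM
    # actually resolves to, and the begin format is whatever parse_date_to_datetime accepts):
    #   scrapy crawl clm                                    # full crawl of all keywords
    #   scrapy crawl clm -a mod=update -a begin=2025-06-01  # incremental crawl since a date
    #   scrapy crawl clm -a mod=reload                      # reload keywords into the DB, then exit
    #   scrapy crawl clm -a mod=fix                         # backfill missing titles, then exit
    #   scrapy crawl clm -a debug=1 -a min_size=2.5         # debug run, skip files below 2.5 GB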

    # Reload the keyword list.
    def reload_keywords(self):
        self.load_keywords_from_file()
        self.load_keywords_from_db()
        self.logger.info("reload keywords db succ!")

    # Entry point, triggered by the base class.
    def custom_start_requests(self):
        if self.fix_title:
            data = db_clm.get_empty_title()
            if data:
                self.logger.info(f"rows to be fixed: {len(data)}")
                for row in data:
                    url = row['href']
                    # Fetch the detail page to recover the missing title.
                    yield scrapy.Request(
                        url=url,
                        callback=self.parse_page_detail,
                        meta={'url': url},
                        dont_filter=True  # allow duplicate requests (avoid URL-based filtering)
                    )
            else:
                self.logger.warning("no data.")

        # Maintenance-only modes stop here; no crawl is performed.
        if not self.run_task:
            return

        # tmp_query_str = " groups='actress' and tags not like '%vixen%' "
        tmp_query_str = " 1=1 "
        if self.debug:
            keywords = db_clm.get_key_words(limit=5, query_str=tmp_query_str)
        else:
            # keywords = db_clm.get_key_words(groups='actress', tags='vixen')
            keywords = db_clm.get_key_words(query_str=tmp_query_str)

        for item in keywords:
            words_id = item['id']
            words = item['words']
            encoded_keyword = quote_plus(words.strip())

            # Build the POST form data.
            form_data = {
                # 'csrf_token': self.csrf_token,
                'search': encoded_keyword
            }

            # Send the search POST request.
            yield scrapy.FormRequest(
                url=self.search_url,
                method='POST',
                formdata=form_data,
                # headers=self._get_headers(),
                # Do not follow redirects automatically; handle the 302 manually.
                meta={'dont_redirect': True, 'handle_httpstatus_list': [302], 'words_id': words_id, 'words': words},
                callback=self.handle_redirect
            )

    # Handle the 302 response returned by the search POST.
    def handle_redirect(self, response):
        """Handle the 302 redirect: read the Location header and fetch the result page."""
        # Get the redirect target from the response headers.
        location = response.headers.get('Location', None)
        if not location:
            self.logger.error("no 302 Location header found")
            return

        # Header values are bytes in Scrapy; decode to a string.
        result_url = location.decode('utf-8')

        # Fetch the redirected result page; when results must be sorted by date,
        # hop through the listing once more to find the date-sorted link first.
        if self.update_mod and self.begin:
            self.logger.info(f"redirected to result page: {result_url}, looking for the date-sorted link")
            yield scrapy.Request(
                url=result_url,
                # headers=self._get_headers(),
                callback=self.parse_page_by_date,
                meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')}
            )
        else:
            self.logger.info(f"redirected to result page: {result_url}, full crawl")
            yield scrapy.Request(
                url=result_url,
                # headers=self._get_headers(),
                callback=self.parse_page_common,
                meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')}
            )

    def parse_page_by_date(self, response):
        # Find the link behind the "添加时间" (added date) sort option.
        # The CSS selector matches the <a> tag in the sort bar whose text contains that label.
        add_time_link = response.css(
            'div.sortbar a:contains("添加时间")::attr(href)'
        ).get()

        result_url = response.url
        if add_time_link:
            # Build the absolute URL (response.url is the current page, used to resolve relative paths).
            result_url = urljoin(response.url, add_time_link)
            self.logger.info(f"found the date-sorted result page, requesting: {result_url}")
        else:
            self.logger.warning(f"date-sorted link not found, falling back to: {result_url}")

        yield scrapy.Request(
            url=result_url,
            # headers=self._get_headers(),
            callback=self.parse_page_common,
            meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')}
        )

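    # Assumed result-page skeleton, reconstructed from the selectors used below
    # (a sketch, not an authoritative description of the site's markup):
    #   <div class="ssbox">
    #     <div class="title"><h3><span>category</span><a href="/hash/....html">title</a></h3></div>
    #     <div class="slist"><ul><li>file name + size</li></ul></div>
    #     <div class="sbar">
    #       <a href="magnet:...">...</a>
    #       <span>添加时间: <b>2025-06-13</b></span> <span>大小: <b>39.5 GB</b></span>
    #       <span>热度: <b>435</b></span> <span>最近下载: <b>...</b></span>
    #     </div>
    #   </div>
    #   <div class="pager"> ... <span>共 N 条</span> <a href="...">下一页</a> ... </div>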
    def parse_page_common(self, response):
        need_next = False
        # Extract every ssbox node (each ssbox is one result row).
        ssboxes = response.xpath('//div[@class="ssbox"]')
        if not ssboxes:
            self.logger.warning(f"failed to parse page. url: {response.url}")
            return

        for ssbox in ssboxes:
            # 1. Link and text inside the h3.
            h3_span = ssbox.xpath('.//div[@class="title"]/h3/span')
            category = (h3_span.xpath('text()').get() or '').strip() if h3_span else ''
            # The <a> tag under the h3 (title link).
            h3_a = ssbox.xpath('.//div[@class="title"]/h3/a')
            # Title text (e.g. "Vixen.2025.05").
            # title_text = h3_a.xpath('text()').get().strip() if h3_a else None
            title_text = extract_text_from_element(h3_a, use_title=True)
            # Title link (e.g. "/hash/34c71bf8ddff9c797dab7ee1af83894fee13ac67.html").
            title_href = h3_a.xpath('@href').get() if h3_a else None
            # Resolve relative links into absolute URLs (adjust to the site's domain if needed).
            full_title_href = response.urljoin(title_href) if title_href else None

            # 2. File name from the slist block (optional; keep or drop as needed).
            # File name (e.g. "vixen.25.05.09....mp4").
            file_name = ssbox.xpath('.//div[@class="slist"]/ul/li/text()').get()
            # Strip the trailing size text (e.g. "8.3 GB"), keeping only the file name.
            if file_name:
                file_name = file_name.split(' ')[0].strip()  # split and keep the name part

            # 3. Fields from the sbar block; the Chinese labels match the site's UI text.
            sbar = ssbox.xpath('.//div[@class="sbar"]')
            # Magnet link (href of the <a> tag inside sbar).
            magnet_href = sbar.xpath('.//a/@href').get() if sbar else None
            # Added date (e.g. "2025-06-13").
            add_time = sbar.xpath('.//span[contains(text(), "添加时间:")]/b/text()').get() if sbar else None
            # Size (e.g. "39.5 GB").
            size = sbar.xpath('.//span[contains(text(), "大小:")]/b/text()').get() if sbar else None
            size_gb = parse_size(size)
            # Popularity (e.g. "435").
            heat = sbar.xpath('.//span[contains(text(), "热度:")]/b/text()').get() if sbar else None
            # Last download time (optional).
            last_download = sbar.xpath('.//span[contains(text(), "最近下载:")]/b/text()').get() if sbar else None

            item = ClmIndexItem()
            item['item_type'] = ITEM_TYPE_CLM_INDEX
            item['category'] = category
            item['title'] = title_text
            item['href'] = full_title_href
            item['magnet_href'] = magnet_href
            item['size_text'] = size
            item['size_gb'] = size_gb
            item['heat'] = int(heat) if heat else 0
            item['add_date'] = add_time
            item['last_down_date'] = last_download
            item['key_words_id'] = response.meta.get('words_id', 0)
            item['key_words'] = response.meta.get('words', '')
            item['is_update'] = False

            # Decide whether to keep paginating: stop only once every row on the page
            # has a date earlier than the start date (or an implausible future date).
            up_date = parse_date_to_datetime(item['add_date'])
            # self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
            if up_date and self.begin and (up_date < self.begin or up_date > datetime.now()):
                pass
            else:
                need_next = True

            # Skip files that are too small.
            if size_gb < self.min_size:
                continue

            yield item

        # Parse the next-page link.
        next_page_url = None
        total_text = ''
        pager = response.xpath('//div[@class="pager"]')
        if pager:
            total_text = pager.xpath('.//span[contains(text(), "共")]/text()').get()
            # Locate the "下一页" (next page) <a> tag by its text to avoid matching other links.
            next_page_a = pager.xpath('.//a[contains(text(), "下一页")]').get()
            next_page_href = pager.xpath('.//a[contains(text(), "下一页")]/@href').get()
            # Check whether there is a next page.
            if next_page_href and next_page_href != '#':
                # Build the absolute URL (relative path -> absolute).
                next_page_url = response.urljoin(next_page_href)

        if self.debug:
            self.logger.info(f'debug mode, stop paginating. {total_text}. url: {response.url}')
            return
        elif not need_next or not next_page_url:
            total_rows = db_clm.get_count_by_keywords_id(response.meta.get('words_id', 0))
            curr_words = response.meta.get('words', '')
            self.logger.info(f'stop paginating. update mode: {self.update_mod}. {total_text}, {total_rows} rows in total. key words: ({curr_words}), url: {response.url}')
            return

        self.logger.debug(f'{total_text}, found next page: {next_page_url}')
        # Request the next page recursively.
        yield scrapy.Request(
            url=next_page_url,
            callback=self.parse_page_common,
            meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')},
            dont_filter=True  # allow duplicate requests (avoid URL-parameter-based filtering)
        )

    def parse_page_detail(self, response):
        # Text of the <h2> under the div with class 'bt_title'.
        title_text = response.xpath('//div[@class="bt_title"]/h2/text()').get()

        item = ClmIndexItem()
        item['item_type'] = ITEM_TYPE_CLM_INDEX
        item['title'] = title_text
        item['href'] = response.meta['url']
        item['is_update'] = True

        yield item

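    # The keywords file loaded below is assumed to map a group name to a list of search
    # terms, roughly like this (a hypothetical minimal example, not an actual file from the repo):
    #   {
    #       "studio": ["Vixen", "Tushy"],
    #       "misc": ["some other keyword"]
    #   }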
    # Import the configured keywords into the database.
    def load_keywords_from_file(self):
        self.logger.info(f"load keywords from file: {self.keywords_file}")
        json_data, err = load_json_file(self.keywords_file)
        if not json_data:
            self.logger.warning(f"load file error. {err}")
            return

        total_lines = 0
        for group, items in json_data.items():
            total_lines += len(items)
            self.logger.info(f"load group ({group}), {len(items)} items.")
            for item in items:
                words_item = ClmKeyWordsItem()
                words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
                words_item['words'] = item
                words_item['groups'] = group
                words_item['tags'] = ''
                words_item['index_count'] = 0
                db_clm.insert_item(words_item)
                self.logger.debug(f"insert item: {item}: {group}")

        self.logger.info(f"load {self.keywords_file} succ, total lines: {total_lines}")

    # Import the actress list obtained from other data sources into the database.
    def load_keywords_from_db(self):
        db_comm = IAFDDBHandler()
        all_likes = {
            'vixen': ['vixen.com', 'Vixen Video'],
            'tushy': ['tushy.com', 'tushyraw.com', 'Tushy', 'Tushy Raw'],
            'blacked': ['blacked.com', 'Blacked', 'blackedraw.com', 'Blacked Raw'],
            'x-art': ['x-art.com', 'X-art'],
            'nfbusty': ['nfbusty.com']
        }
        # Re-shape the mapping: key -> group, plus a flat list of keys.
        all_key_group = {}
        all_keys = []
        for group, keys in all_likes.items():
            for key in keys:
                all_key_group[key] = group
                all_keys.append(key)

        # Query the database and convert the results.
        actor_tags = {}
        total_lines = 0
        results = db_comm.get_iafd_actors(names=all_keys, tbl='stu')
        for dist, actors in results.items():
            self.logger.info(f"dist: {dist}, actors count: {len(actors)}")
            total_lines += len(actors)
            for actor in actors:
                # self.logger.debug(f"get {dist} : {actor['name']}, {actor['href']}")
                actor_name = actor['name']
                current_tag = all_key_group.get(dist, '')
                if actor_name not in actor_tags:
                    actor_tags[actor_name] = set()  # a set deduplicates automatically
                if current_tag:
                    actor_tags[actor_name].add(current_tag)  # set.add ignores duplicates
        self.logger.info(f"total actors in iafd: {len(actor_tags)}, total lines: {total_lines}")

        # Query another table and merge the results.
        load_results = db_comm.get_lord_actors()
        if load_results:
            self.logger.info(f"total actors in thelordofporn: {len(load_results)}")
            for row in load_results:
                actor_name = row['name']
                if actor_name not in actor_tags:
                    actor_tags[actor_name] = set()  # a set deduplicates automatically
                actor_tags[actor_name].add('thelordofporn')  # set.add ignores duplicates

        self.logger.info(f"after merge, total actors: {len(actor_tags)}")
        for actor, tags_set in actor_tags.items():
            tag_str = ','.join(tags_set)  # sets are iterable, no conversion needed
            self.logger.debug(f"actor: {actor}, tags: {tag_str}")
            words_item = ClmKeyWordsItem()
            words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
            words_item['words'] = actor
            words_item['groups'] = 'actress'
            words_item['tags'] = tag_str
            words_item['index_count'] = 0
            db_clm.insert_item(words_item)
            self.logger.debug(f"insert item: {words_item}")