modify scripts

2025-07-20 13:40:45 +08:00
parent 5c01084095
commit 34728b7868
3 changed files with 154 additions and 86 deletions

View File

@@ -358,7 +358,7 @@ class IAFDDBHandler(SQLiteDBHandler):
placeholders=', '.join(['?'] * len(names))
)
logging.info(f'{insert_sql}')
logging.debug(f'{insert_sql}')
self.cursor.execute(insert_sql, names)
self.conn.commit() # commit the temp-table data
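Note: the hunk above only drops the generated statement from info to debug logging; the surrounding pattern builds one `?` placeholder per value and binds `names` at execute time. Below is a minimal, self-contained sketch of that binding pattern using plain sqlite3 — the table and column names are illustrative, and the handler's real insert_sql is not shown in this hunk.

import sqlite3

# illustrative data; the real handler formats its own insert_sql
names = ['alice', 'bob', 'carol']
conn = sqlite3.connect(':memory:')
cursor = conn.cursor()
cursor.execute('CREATE TABLE people (name TEXT)')
cursor.executemany('INSERT INTO people (name) VALUES (?)', [(n,) for n in names])
cursor.execute('CREATE TEMP TABLE tmp_names (name TEXT)')

# one '?' per value, mirroring placeholders=', '.join(['?'] * len(names)) above
placeholders = ', '.join(['?'] * len(names))
insert_sql = f'INSERT INTO tmp_names (name) SELECT name FROM people WHERE name IN ({placeholders})'
cursor.execute(insert_sql, names)  # values are bound as parameters, never string-interpolated
conn.commit()                      # commit the temp-table rows, as the handler does
print(cursor.execute('SELECT COUNT(*) FROM tmp_names').fetchone()[0])  # 3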

View File

@@ -1,41 +1,52 @@
from datetime import datetime
import scrapy
import sys
from urllib.parse import urljoin, quote_plus
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime, load_json_file
from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
from scrapy_proj.items import ClmIndexItem, ClmKeyWordsItem
from scrapy_proj.comm.comm_def import SPIDER_NAME_CLM, ITEM_TYPE_CLM_INDEX, ITEM_TYPE_CLM_KEYWORDS
from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler, ClmDBHandler
db_clm = ClmDBHandler()
default_keywords_file = 'scrapy_proj/data/clm_keywords.json'
class ClmSpider(BaseSpider):
name = SPIDER_NAME_CLM
allowed_domains = ["clmclm.com"]
search_url = 'https://www.clmclm.com/search'
def __init__(self, debug='False', keywords=None, min_size=None, *args, **kwargs):
def __init__(self, debug='False', min_size=None, begin=None, mod='all', *args, **kwargs):
super().__init__(*args, **kwargs)
self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
self.logger.info(f"debug mod: {self.debug}")
self.keywords = keywords
self.min_size = float(min_size) if min_size else 1.0
self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
self.update_mod = False
self.run_task = True
self.fix_title = False
# add a magic keyword
if keywords and keywords.lower() == 'reload' :
self.initDB()
self.init_load_actors_from_others()
self.run_task = False
self.logger.info(f"reload keywords db succ!")
# add an update mode: requires mod == 'update' together with a begin time
self.begin = parse_date_to_datetime(begin) if begin else None
self.min_size = float(min_size) if min_size else 1.0
self.keywords_file = kwargs.get('file_path') if kwargs.get('file_path') else default_keywords_file
self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")
# add a magic keyword
if keywords and keywords.lower() == 'fix' :
if mod.lower() == 'update' and self.begin:
self.update_mod = True
elif mod.lower() == 'reload' :
self.reload_keywords()
self.run_task = False
elif mod.lower() == 'fix' :
self.fix_title = True
self.run_task = False
# reload the keywords
def reload_keywords(self):
self.load_keywords_from_file()
self.load_keywords_from_db()
self.logger.info(f"reload keywords db succ!")
# entry point, triggered by the base class
def custom_start_requests(self):
if self.fix_title:
@@ -54,10 +65,12 @@ class ClmSpider(BaseSpider):
else:
self.logger.warning(f"no data.")
# initialization-only tasks: no need to continue
if not self.run_task:
return
tmp_query_str = f" groups='actress' and tags not like '%vixen%' "
#tmp_query_str = f" groups='actress' and tags not like '%vixen%' "
tmp_query_str = f" 1=1 "
if self.debug:
keywords = db_clm.get_key_words(limit =5, query_str = tmp_query_str)
else:
@@ -97,9 +110,41 @@ class ClmSpider(BaseSpider):
# convert to a string and handle the encoding
result_url = location.decode('utf-8')
self.logger.info(f"redirected to result page: {result_url}")
# visit the redirected result page with the existing parse method
# visit the redirected result page; if sort-by-time is requested, hop through an intermediate link first
if self.update_mod and self.begin :
self.logger.info(f"redirected to result page: {result_url}, looking for the sort-by-time link")
yield scrapy.Request(
url=result_url,
#headers=self._get_headers(),
callback=self.parse_page_by_date,
meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')}
)
else:
self.logger.info(f"重定向到结果页: {result_url}, 全量拉取")
yield scrapy.Request(
url=result_url,
#headers=self._get_headers(),
callback=self.parse_page_common,
meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')}
)
def parse_page_by_date(self, response):
# 解析"添加时间"对应的链接
# 使用CSS选择器定位包含"添加时间"文本的a标签
add_time_link = response.css(
'div.sortbar a:contains("添加时间")::attr(href)'
).get()
result_url = response.url
if add_time_link:
# build the full URL (response.url is the current page URL, used to resolve the relative path)
result_url = urljoin(response.url, add_time_link)
self.logger.info(f"got the sort-by-time result page, requesting: {result_url}")
else:
self.logger.warning(f"no '添加时间' (sort by added time) link found, using the original URL: {result_url}")
yield scrapy.Request(
url=result_url,
#headers=self._get_headers(),
@@ -107,12 +152,13 @@ class ClmSpider(BaseSpider):
meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')}
)
def parse_page_common(self, response):
need_next = False
# extract all ssbox nodes (each ssbox is one record)
ssboxes = response.xpath('//div[@class="ssbox"]')
if not ssboxes:
self.logger.warning(f"failed to parse the page. url: {response.url}")
return
for ssbox in ssboxes:
# 1. extract the link and text from the h3
@@ -143,15 +189,12 @@
add_time = sbar.xpath('.//span[contains(text(), "添加时间:")]/b/text()').get() if sbar else None
# size (e.g. "39.5 GB")
size = sbar.xpath('.//span[contains(text(), "大小:")]/b/text()').get() if sbar else None
size_gb = parse_size(size)
# heat (e.g. "435")
heat = sbar.xpath('.//span[contains(text(), "热度:")]/b/text()').get() if sbar else None
# last download time (optional)
last_download = sbar.xpath('.//span[contains(text(), "最近下载:")]/b/text()').get() if sbar else None
size_gb = parse_size(size)
if size_gb < self.min_size:
continue
item = ClmIndexItem()
item['item_type'] = ITEM_TYPE_CLM_INDEX
item['category'] = category
@@ -167,38 +210,51 @@ class ClmSpider(BaseSpider):
item['key_words'] = response.meta.get('words', '')
item['is_update'] = False
# decide whether to keep paging: stop only when every item on the page is dated before the begin date
up_date = parse_date_to_datetime(item['add_date'])
#self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()):
pass
else:
need_next = True
# skip files that are too small
if size_gb < self.min_size:
continue
yield item
if self.debug:
return
# parse the next-page link
next_page_url = None
total_text = ''
pager = response.xpath('//div[@class="pager"]')
if pager:
total_text = pager.xpath('.//span[contains(text(), "")]/text()').get()
# locate the "下一页" (next page) <a> tag by its text, to avoid matching other <a> tags
next_page_a = pager.xpath('.//a[contains(text(), "下一页")]').get()
# extract the href attribute
next_page_href = pager.xpath('.//a[contains(text(), "下一页")]/@href').get()
# check whether there is a next page
if next_page_href and next_page_href != '#':
# build the full URL (relative path to absolute)
next_page_url = response.urljoin(next_page_href)
self.logger.debug(f'{total_text}, found next page: {next_page_url}')
# recursively request the next page
yield scrapy.Request(
url=next_page_url,
callback=self.parse_page_common,
meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')},
dont_filter=True # allow repeated requests, so the URL is not dropped by the dedup filter
)
else:
# when href is '#' or missing, there are no more pages
total_rows = db_clm.get_count_by_keywords_id(response.meta.get('words_id', 0))
curr_words = response.meta.get('words', '')
self.logger.info(f'all pages fetched, stop paging. {total_text}, {total_rows} records in total. key words: ({curr_words}), url: {response.url}')
next_page_url = response.urljoin(next_page_href)
if self.debug:
self.logger.info(f'debug mode, stop paging. {total_text}. url: {response.url}')
return
elif not need_next or not next_page_url:
total_rows = db_clm.get_count_by_keywords_id(response.meta.get('words_id', 0))
curr_words = response.meta.get('words', '')
self.logger.info(f'stop paging. update mode: {self.update_mod}. {total_text}, {total_rows} records in total. key words: ({curr_words}), url: {response.url}')
return
self.logger.debug(f'{total_text}, found next page: {next_page_url}')
# recursively request the next page
yield scrapy.Request(
url=next_page_url,
callback=self.parse_page_common,
meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')},
dont_filter=True # allow repeated requests, so the URL is not dropped by the dedup filter
)
def parse_page_detail(self, response):
# match the h2 text under the div whose class is 'bt_title'
@@ -214,48 +270,31 @@ class ClmSpider(BaseSpider):
# import the specified keywords into the database
def initDB(self):
default_keywords = [
{'vixen group' : ['vixen', 'tushy', 'tushyraw', 'blacked', 'blackedraw', 'deeper']},
{'VIP 4K' : ['Cuck4K', 'Daddy4k', 'Loan4k', 'Dyke4K', 'Rim4k', 'Pie4k', 'Ignore4K', 'Daddy4k', 'Stuck4k', 'Tutor4k']},
{'Teen Mega World' : ['anal-angels', 'Anal-Beauty', 'Beauty4k', 'creampie-angels', 'Beauty-Angels', 'FirstBGG', 'FuckStudies', 'OhMyHoles', 'X-Angels']},
{'Fuck You Cash' : ['BBCPie', 'Tiny4k', 'Cum4K', 'Anal4K', 'Exotic4K', 'Facials4k', 'Holed', 'Lubed', 'Mom4K', 'passion hd']},
{'Naughty America (Network)' : ['Naughty Office', 'Naughty Americans', 'Naughty America', 'Naughty Weddings']},
{'Nubiles Porn (Network)' : ['MyFamilyPies', 'StepSiblingsCaught', 'nubilesporn']},
{'Brazzers' : ['Real Wife Stories', 'brazzers']},
{'TeamSkeet (Network)' : ['teenpies', 'shoplyfter']},
{'BangBros' : ['BangBus', 'BangBros']},
{'Nubile Films' : ['nfbusty', 'NubileFilms']},
{'DDF Network' : ['DDFBusty']},
{'Adult Time (Network)' : ['AdultTime', 'BurningAngel']},
{'Anal Vids' : ['AnalVids']},
{'LegalPorno' : ['LegalPorno']},
{'Pornbox' : ['Pornworld']},
{'Wow (Network)' : ['WowGirls']},
{'Malibu Media' : ['x-art']},
{'VIPissy Cash' : ['VIPissy']},
{'japan Blu-Ray' : ['Japan AV Blu-Ray']},
{'siterip' : ['siterip']},
{'Brazil' : ['NewMFX']},
{'Wicked' : ['Wicked']},
{'Sticky Dollars' : ['Swallowed']},
{'ManyVids' : ['ManyVids']},
{'PervCity' : ['AnalOverdose']}
]
for row in default_keywords:
for group, items in row.items():
for item in items:
words_item = ClmKeyWordsItem()
words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
words_item['words'] = item
words_item['groups'] = group
words_item['tags'] = ''
words_item['index_count'] = 0
db_clm.insert_item(words_item)
self.logger.debug(f"insert item: {item}: {group}")
def load_keywords_from_file(self):
self.logger.info(f"load keywords from file: {self.keywords_file}")
json_data, err = load_json_file(self.keywords_file)
if not json_data:
self.logger.warning(f"load file error. {err}")
return
total_lines = 0
for group, items in json_data.items():
total_lines += len(items)
self.logger.info(f"load group ({group}), {len(items)} items.")
for item in items:
words_item = ClmKeyWordsItem()
words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
words_item['words'] = item
words_item['groups'] = group
words_item['tags'] = ''
words_item['index_count'] = 0
db_clm.insert_item(words_item)
self.logger.debug(f"insert item: {item}: {group}")
self.logger.info(f"load {self.keywords_file} succ, total lines: {total_lines}")
# load the actor list from other data sources and import it into the database
def init_load_actors_from_others(self):
def load_keywords_from_db(self):
db_comm = IAFDDBHandler()
all_likes = {
'vixen' : ['vixen.com', 'Vixen Video'],
@@ -292,7 +331,7 @@ class ClmSpider(BaseSpider):
# query another data table for the results
load_results = db_comm.get_lord_actors()
if load_results:
self.logger.info(f"total actors in lord: {len(load_results)}")
self.logger.info(f"total actors in thelordofporn: {len(load_results)}")
for row in load_results:
actor_name = row['name']
if actor_name not in actor_tags:
@@ -302,7 +341,7 @@ class ClmSpider(BaseSpider):
self.logger.info(f"after merge, total actors: {len(actor_tags)}")
for actor, tags_set in actor_tags.items():
tag_str = ','.join(tags_set) # a set is directly iterable, no extra conversion needed
self.logger.info(f"actor: {actor}, tags: {tag_str}")
self.logger.debug(f"actor: {actor}, tags: {tag_str}")
words_item = ClmKeyWordsItem()
words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
words_item['words'] = actor
@@ -310,4 +349,4 @@ class ClmSpider(BaseSpider):
words_item['tags'] = tag_str
words_item['index_count'] = 0
db_clm.insert_item(words_item)
#self.logger.debug(f"insert item: {words_item}")
self.logger.debug(f"insert item: {words_item}")

View File

@@ -1,6 +1,35 @@
import re
import json
import os
from datetime import datetime, timezone
def load_json_file(file_path):
# check whether the file exists
if not os.path.exists(file_path):
return None, f"{file_path} does not exist."
# check that it is a regular file (not a directory path)
if not os.path.isfile(file_path):
return None, f"{file_path} is not a file."
try:
# read the file content
with open(file_path, 'r', encoding='utf-8') as f:
try:
# parse the JSON data
json_data = json.load(f)
return json_data, None
except json.JSONDecodeError as e:
return None, f"JSON格式解析失败 - {e}"
except Exception as e:
return None, f"读取文件内容时发生异常 - {e}"
except PermissionError:
return None, f"错误: 没有权限读取文件 - {file_path}"
except Exception as e:
return None, f"错误: 打开文件时发生异常 - {e}"
return None, "未知错误"
'''convert a timestamp (in ms) to a date'''
def format_timestamp(ts, is_ms=True):
if not ts: