diff --git a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
index f9dfe1b..10c9aa5 100644
--- a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
+++ b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
@@ -358,7 +358,7 @@ class IAFDDBHandler(SQLiteDBHandler):
             placeholders=', '.join(['?'] * len(names))
         )
-        logging.info(f'{insert_sql}')
+        logging.debug(f'{insert_sql}')
         self.cursor.execute(insert_sql, names)
         self.conn.commit()  # 提交临时表数据
 
diff --git a/scrapy_proj/scrapy_proj/spiders/clm_spider.py b/scrapy_proj/scrapy_proj/spiders/clm_spider.py
index 4610d11..7969480 100644
--- a/scrapy_proj/scrapy_proj/spiders/clm_spider.py
+++ b/scrapy_proj/scrapy_proj/spiders/clm_spider.py
@@ -1,41 +1,52 @@
 from datetime import datetime
 import scrapy
+import sys
 from urllib.parse import urljoin, quote_plus
-from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime
+from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime, load_json_file
 from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
 from scrapy_proj.items import ClmIndexItem, ClmKeyWordsItem
 from scrapy_proj.comm.comm_def import SPIDER_NAME_CLM, ITEM_TYPE_CLM_INDEX, ITEM_TYPE_CLM_KEYWORDS
 from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler, ClmDBHandler
 
 db_clm = ClmDBHandler()
+default_keywords_file = 'scrapy_proj/data/clm_keywords.json'
 
 
 class ClmSpider(BaseSpider):
     name = SPIDER_NAME_CLM
     allowed_domains = ["clmclm.com"]
     search_url = 'https://www.clmclm.com/search'
 
-    def __init__(self, debug='False', keywords=None, min_size=None, *args, **kwargs):
+    def __init__(self, debug='False', min_size=None, begin=None, mod='all', *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
-        self.logger.info(f"debug mod: {self.debug}")
-        self.keywords = keywords
-        self.min_size = float(min_size) if min_size else 1.0
+        self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
+        self.update_mod = False
         self.run_task = True
         self.fix_title = False
 
-        # 增加一个暗号
-        if keywords and keywords.lower() == 'reload' :
-            self.initDB()
-            self.init_load_actors_from_others()
-            self.run_task = False
-            self.logger.info(f"reload keywords db succ!")
+        # 增加一个更新模式,需要传入 mod == update 并且有 开始时间
+        self.begin = parse_date_to_datetime(begin) if begin else None
+        self.min_size = float(min_size) if min_size else 1.0
+        self.keywords_file = kwargs.get('file_path') if kwargs.get('file_path') else default_keywords_file
+
+        self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")
 
         # 增加一个暗号
-        if keywords and keywords.lower() == 'fix' :
+        if mod.lower() == 'update' and self.begin:
+            self.update_mod = True
+        elif mod.lower() == 'reload' :
+            self.reload_keywords()
+            self.run_task = False
+        elif mod.lower() == 'fix' :
             self.fix_title = True
             self.run_task = False
 
+    # 重新加载关键词
+    def reload_keywords(self):
+        self.load_keywords_from_file()
+        self.load_keywords_from_db()
+        self.logger.info(f"reload keywords db succ!")
+
     # 入口函数,由基类的方法触发
     def custom_start_requests(self):
         if self.fix_title:
@@ -54,10 +65,12 @@ class ClmSpider(BaseSpider):
             else:
                 self.logger.warning(f"no data.")
 
+        # 一些初始化等等的任务,不需要继续执行
         if not self.run_task:
             return
 
-        tmp_query_str = f" groups='actress' and tags not like '%vixen%' "
+        #tmp_query_str = f" groups='actress' and tags not like '%vixen%' "
+        tmp_query_str = f" 1=1 "
         if self.debug:
             keywords = db_clm.get_key_words(limit =5, query_str = tmp_query_str)
         else:
@@ -97,9 +110,41 @@ class ClmSpider(BaseSpider):
 
         # 转换为字符串并处理编码
         result_url = location.decode('utf-8')
-        self.logger.info(f"重定向到结果页: {result_url}")
 
-        # 访问重定向后的结果页面,使用之前的解析方法
+        # 访问重定向后的结果页面,如果要求按时间排序,需要中转一下链接
+        if self.update_mod and self.begin :
+            self.logger.info(f"重定向到结果页: {result_url}, 继续寻找按时间排序的链接")
+            yield scrapy.Request(
+                url=result_url,
+                #headers=self._get_headers(),
+                callback=self.parse_page_by_date,
+                meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')}
+            )
+        else:
+            self.logger.info(f"重定向到结果页: {result_url}, 全量拉取")
+            yield scrapy.Request(
+                url=result_url,
+                #headers=self._get_headers(),
+                callback=self.parse_page_common,
+                meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')}
+            )
+
+    def parse_page_by_date(self, response):
+        # 解析"添加时间"对应的链接
+        # 使用CSS选择器定位包含"添加时间"文本的a标签
+        add_time_link = response.css(
+            'div.sortbar a:contains("添加时间")::attr(href)'
+        ).get()
+
+        result_url = response.url
+        if add_time_link:
+            # 拼接完整URL(response.url为当前页面URL,用于补全相对路径)
+            result_url = urljoin(response.url, add_time_link)
+            self.logger.info(f"获取到按时间排序的结果页,开始请求: {result_url}")
+
+        else:
+            self.logger.warning(f"未找到添加时间对应的链接,使用原来的地址: {result_url}")
+
         yield scrapy.Request(
             url=result_url,
             #headers=self._get_headers(),
@@ -107,12 +152,13 @@ class ClmSpider(BaseSpider):
             meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')}
         )
 
-
     def parse_page_common(self, response):
-        need_next = False
 
         # 提取所有 ssbox 节点(每个 ssbox 对应一条数据)
         ssboxes = response.xpath('//div[@class="ssbox"]')
+        if not ssboxes:
+            self.logger.warning(f"无法解析页面。 url: {response.url}")
+            return
 
         for ssbox in ssboxes:
             # 1. 提取 h3 中的链接和文本
@@ -143,15 +189,12 @@ class ClmSpider(BaseSpider):
             add_time = sbar.xpath('.//span[contains(text(), "添加时间:")]/b/text()').get() if sbar else None
             # 大小(如 "39.5 GB")
             size = sbar.xpath('.//span[contains(text(), "大小:")]/b/text()').get() if sbar else None
+            size_gb = parse_size(size)
             # 热度(如 "435")
             heat = sbar.xpath('.//span[contains(text(), "热度:")]/b/text()').get() if sbar else None
             # 最近下载时间(可选提取)
             last_download = sbar.xpath('.//span[contains(text(), "最近下载:")]/b/text()').get() if sbar else None
 
-            size_gb = parse_size(size)
-            if size_gb < self.min_size:
-                continue
-
             item = ClmIndexItem()
             item['item_type'] = ITEM_TYPE_CLM_INDEX
             item['category'] = category
@@ -167,38 +210,51 @@ class ClmSpider(BaseSpider):
             item['key_words'] = response.meta.get('words', '')
             item['is_update'] = False
 
+            # 判断是否还要翻页,只有满足所有页面的数据,日期均小于开始日期时,停止翻页
+            up_date = parse_date_to_datetime(item['add_date'])
+            #self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
+            if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()):
+                pass
+            else:
+                need_next = True
+
+            # 太小的文件不要
+            if size_gb < self.min_size:
+                continue
+
             yield item
 
-            if self.debug:
-                return
-
         # 解析下一页链接
+        next_page_url = None
+        total_text = ''
         pager = response.xpath('//div[@class="pager"]')
         if pager:
             total_text = pager.xpath('.//span[contains(text(), "共")]/text()').get()
-            # 定位“下一页”的a标签(通过文本定位,避免混淆其他a标签)
             next_page_a = pager.xpath('.//a[contains(text(), "下一页")]').get()
-            # 提取href属性
             next_page_href = pager.xpath('.//a[contains(text(), "下一页")]/@href').get()
-            # 判断是否还有下一页
             if next_page_href and next_page_href != '#':
                 # 拼接完整URL(相对路径转绝对路径)
-                next_page_url = response.urljoin(next_page_href)
-                self.logger.debug(f'{total_text}, 发现下一页:{next_page_url}')
-                # 递归请求下一页
-                yield scrapy.Request(
-                    url=next_page_url,
-                    callback=self.parse_page_common,
-                    meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')},
-                    dont_filter=True  # 允许重复请求(防止因URL参数被过滤)
-                )
-            else:
-                # 当href为#或不存在时,说明已无下一页
-                total_rows = db_clm.get_count_by_keywords_id(response.meta.get('words_id', 0))
-                curr_words = response.meta.get('words', '')
-                self.logger.info(f'已获取完所有页面,停止翻页. {total_text}, 共 {total_rows} 条记录。 key words: ({curr_words}), url: {response.url}')
+                next_page_url = response.urljoin(next_page_href)
+
+        if self.debug:
+            self.logger.info(f'debug模式下停止翻页. {total_text}. url: {response.url}')
+            return
+        elif not need_next or not next_page_url:
+            total_rows = db_clm.get_count_by_keywords_id(response.meta.get('words_id', 0))
+            curr_words = response.meta.get('words', '')
+            self.logger.info(f'停止翻页. 更新模式: {self.update_mod}. {total_text}, 共 {total_rows} 条记录。 key words: ({curr_words}), url: {response.url}')
+            return
+
+        self.logger.debug(f'{total_text}, 发现下一页:{next_page_url}')
+        # 递归请求下一页
+        yield scrapy.Request(
+            url=next_page_url,
+            callback=self.parse_page_common,
+            meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')},
+            dont_filter=True  # 允许重复请求(防止因URL参数被过滤)
+        )
 
     def parse_page_detail(self, response):
         # 匹配 class 为 'bt_title' 的 div 下的 h2 标签文本
@@ -214,48 +270,31 @@ class ClmSpider(BaseSpider):
 
     # 指定的关键词,导入到数据库
-    def initDB(self):
-        default_keywords = [
-            {'vixen group' : ['vixen', 'tushy', 'tushyraw', 'blacked', 'blackedraw', 'deeper']},
-            {'VIP 4K' : ['Cuck4K', 'Daddy4k', 'Loan4k', 'Dyke4K', 'Rim4k', 'Pie4k', 'Ignore4K', 'Daddy4k', 'Stuck4k', 'Tutor4k']},
-            {'Teen Mega World' : ['anal-angels', 'Anal-Beauty', 'Beauty4k', 'creampie-angels', 'Beauty-Angels', 'FirstBGG', 'FuckStudies', 'OhMyHoles', 'X-Angels']},
-            {'Fuck You Cash' : ['BBCPie', 'Tiny4k', 'Cum4K', 'Anal4K', 'Exotic4K', 'Facials4k', 'Holed', 'Lubed', 'Mom4K', 'passion hd']},
-            {'Naughty America (Network)' : ['Naughty Office', 'Naughty Americans', 'Naughty America', 'Naughty Weddings']},
-            {'Nubiles Porn (Network)' : ['MyFamilyPies', 'StepSiblingsCaught', 'nubilesporn']},
-            {'Brazzers' : ['Real Wife Stories', 'brazzers']},
-            {'TeamSkeet (Network)' : ['teenpies', 'shoplyfter']},
-            {'BangBros' : ['BangBus', 'BangBros']},
-            {'Nubile Films' : ['nfbusty', 'NubileFilms']},
-            {'DDF Network' : ['DDFBusty']},
-            {'Adult Time (Network)' : ['AdultTime', 'BurningAngel']},
-            {'Anal Vids' : ['AnalVids']},
-            {'LegalPorno' : ['LegalPorno']},
-            {'Pornbox' : ['Pornworld']},
-            {'Wow (Network)' : ['WowGirls']},
-            {'Malibu Media' : ['x-art']},
-            {'VIPissy Cash' : ['VIPissy']},
-            {'japan Blu-Ray' : ['Japan AV Blu-Ray']},
-            {'siterip' : ['siterip']},
-            {'Brazil' : ['NewMFX']},
-            {'Wicked' : ['Wicked']},
-            {'Sticky Dollars' : ['Swallowed']},
-            {'ManyVids' : ['ManyVids']},
-            {'PervCity' : ['AnalOverdose']}
-        ]
-        for row in default_keywords:
-            for group, items in row.items():
-                for item in items:
-                    words_item = ClmKeyWordsItem()
-                    words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
-                    words_item['words'] = item
-                    words_item['groups'] = group
-                    words_item['tags'] = ''
-                    words_item['index_count'] = 0
-                    db_clm.insert_item(words_item)
-                    self.logger.debug(f"insert item: {item}: {group}")
+    def load_keywords_from_file(self):
+        self.logger.info(f"load keywords from file: {self.keywords_file}")
+        json_data, err = load_json_file(self.keywords_file)
+        if not json_data:
+            self.logger.warning(f"load file error. {err}")
+            return
+
+        total_lines = 0
+        for group, items in json_data.items():
+            total_lines += len(items)
+            self.logger.info(f"load group ({group}), {len(items)} items.")
+            for item in items:
+                words_item = ClmKeyWordsItem()
+                words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
+                words_item['words'] = item
+                words_item['groups'] = group
+                words_item['tags'] = ''
+                words_item['index_count'] = 0
+                db_clm.insert_item(words_item)
+                self.logger.debug(f"insert item: {item}: {group}")
+
+        self.logger.info(f"load {self.keywords_file} succ, total lines: {total_lines}")
 
     # 从其他数据源获取到演员列表,导入到数据库
-    def init_load_actors_from_others(self):
+    def load_keywords_from_db(self):
         db_comm = IAFDDBHandler()
         all_likes = {
             'vixen' : ['vixen.com', 'Vixen Video'],
@@ -292,7 +331,7 @@ class ClmSpider(BaseSpider):
         # 查询另一个数据表,获取结果
         load_results = db_comm.get_lord_actors()
         if load_results:
-            self.logger.info(f"total actors in lord: {len(load_results)}")
+            self.logger.info(f"total actors in thelordofporn: {len(load_results)}")
             for row in load_results:
                 actor_name = row['name']
                 if actor_name not in actor_tags:
@@ -302,7 +341,7 @@ class ClmSpider(BaseSpider):
         self.logger.info(f"after merge, total actors: {len(actor_tags)}")
         for actor, tags_set in actor_tags.items():
             tag_str = ','.join(tags_set)  # set直接支持迭代,无需额外转换
-            self.logger.info(f"actor: {actor}, tags: {tag_str}")
+            self.logger.debug(f"actor: {actor}, tags: {tag_str}")
             words_item = ClmKeyWordsItem()
             words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
             words_item['words'] = actor
@@ -310,4 +349,4 @@ class ClmSpider(BaseSpider):
             words_item['tags'] = tag_str
             words_item['index_count'] = 0
             db_clm.insert_item(words_item)
-            #self.logger.debug(f"insert item: {words_item}")
+            self.logger.debug(f"insert item: {words_item}")
diff --git a/scrapy_proj/scrapy_proj/utils/utils.py b/scrapy_proj/scrapy_proj/utils/utils.py
index 07c6003..6c671f3 100644
--- a/scrapy_proj/scrapy_proj/utils/utils.py
+++ b/scrapy_proj/scrapy_proj/utils/utils.py
@@ -1,6 +1,35 @@
 import re
+import json
+import os
 from datetime import datetime, timezone
 
+def load_json_file(file_path):
+    # 检查文件是否存在
+    if not os.path.exists(file_path):
+        return None, f"{file_path} not exists."
+
+    # 检查是否是文件(避免目录路径)
+    if not os.path.isfile(file_path):
+        return None, f"{file_path} is not file."
+
+    try:
+        # 读取文件内容
+        with open(file_path, 'r', encoding='utf-8') as f:
+            try:
+                # 解析JSON数据
+                json_data = json.load(f)
+                return json_data, None
+            except json.JSONDecodeError as e:
+                return None, f"JSON格式解析失败 - {e}"
+            except Exception as e:
+                return None, f"读取文件内容时发生异常 - {e}"
+    except PermissionError:
+        return None, f"错误: 没有权限读取文件 - {file_path}"
+    except Exception as e:
+        return None, f"错误: 打开文件时发生异常 - {e}"
+
+    return None, "未知错误"
+
 '''timestamp(ms) 转为日期'''
 def format_timestamp(ts, is_ms=True):
     if not ts: