From e6e76805424c6e3d56faeb2f9681137fba54b63c Mon Sep 17 00:00:00 2001 From: sophon Date: Sat, 19 Jul 2025 23:59:47 +0800 Subject: [PATCH] modify scripts --- .../db_wapper/spider_db_handler.py | 22 +- scrapy_proj/scrapy_proj/items.py | 1 + scrapy_proj/scrapy_proj/spiders/clm_spider.py | 249 ++++++++++-------- 3 files changed, 166 insertions(+), 106 deletions(-) diff --git a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py index 3f459d1..f9dfe1b 100644 --- a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py +++ b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py @@ -84,13 +84,17 @@ class ClmDBHandler(SQLiteDBHandler): if item['item_type'] == comm.ITEM_TYPE_CLM_INDEX: self.insert_index(item) elif item['item_type'] == comm.ITEM_TYPE_CLM_KEYWORDS: - self.insert_or_update_common(item, self.tbl_name_clm_keywords, uniq_key=None, exists_do_nothing=False) + self.insert_or_update_common(item, self.tbl_name_clm_keywords, uniq_key='words', exists_do_nothing=False) else: logging.error(f"unkown item.") return item def insert_index(self, item): + if item['is_update']: # 仅更新 + self.insert_or_update_common(item, self.tbl_name_clm_index, uniq_key='href', exists_do_nothing=False) + return + row_id = self.insert_or_update_common(item, self.tbl_name_clm_index, uniq_key='href', exists_do_nothing=True) if row_id: lnk_data = {} @@ -106,6 +110,19 @@ class ClmDBHandler(SQLiteDBHandler): else: logging.warning(f"insert index error: {item}") + def get_empty_title(self): + try: + self.cursor.execute(f"SELECT id, href FROM {self.tbl_name_clm_index} WHERE title='' ") + return [dict(row) for row in self.cursor.fetchall()] + except sqlite3.Error as e: + logging.error(f"查询 href 失败: {e}") + return None + + def get_count_by_keywords_id(self, key_words_id): + self.cursor.execute(f"SELECT count(*) as cnt from {self.tbl_name_words_index} WHERE words_id = ?", (key_words_id,)) + row = self.cursor.fetchone() + return row[0] if row else None + # 按条件查询 href 列表 def get_key_words(self, **filters): try: @@ -128,6 +145,9 @@ class ClmDBHandler(SQLiteDBHandler): else: params.append(filters[key]) + if "query_str" in filters: + sql += f" AND {filters['query_str']} " + if "order_by" in filters: # 注意:这里 order by 后面直接跟字段名,不能用占位符,否则会被当作字符串处理 sql += f" ORDER BY {filters['order_by']} " diff --git a/scrapy_proj/scrapy_proj/items.py b/scrapy_proj/scrapy_proj/items.py index 73ac33a..a42ebfa 100644 --- a/scrapy_proj/scrapy_proj/items.py +++ b/scrapy_proj/scrapy_proj/items.py @@ -155,6 +155,7 @@ class ClmIndexItem(scrapy.Item): last_down_date = scrapy.Field() key_words_id = scrapy.Field() key_words = scrapy.Field() + is_update = scrapy.Field() class ClmKeywordsIndexItem(scrapy.Item): item_type = scrapy.Field() diff --git a/scrapy_proj/scrapy_proj/spiders/clm_spider.py b/scrapy_proj/scrapy_proj/spiders/clm_spider.py index c96c9e6..6a7450d 100644 --- a/scrapy_proj/scrapy_proj/spiders/clm_spider.py +++ b/scrapy_proj/scrapy_proj/spiders/clm_spider.py @@ -8,35 +8,6 @@ from scrapy_proj.comm.comm_def import SPIDER_NAME_CLM, ITEM_TYPE_CLM_INDEX, ITEM from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler, ClmDBHandler db_clm = ClmDBHandler() -db_comm = IAFDDBHandler() - -default_keywords = [ - {'vixen group' : ['vixen', 'tushy', 'tushyraw', 'blacked', 'blackedraw', 'deeper']}, - {'VIP 4K' : ['Cuck4K', 'Daddy4k', 'Loan4k', 'Dyke4K', 'Rim4k', 'Pie4k', 'Ignore4K', 'Daddy4k', 'Stuck4k', 'Tutor4k']}, - {'Teen Mega World' : ['anal-angels', 'Anal-Beauty', 'Beauty4k', 'creampie-angels', 'Beauty-Angels', 'FirstBGG', 'FuckStudies', 'OhMyHoles', 'X-Angels']}, - {'Fuck You Cash' : ['BBCPie', 'Tiny4k', 'Cum4K', 'Anal4K', 'Exotic4K', 'Facials4k', 'Holed', 'Lubed', 'Mom4K', 'passion hd']}, - {'Naughty America (Network)' : ['Naughty Office', 'Naughty Americans', 'Naughty America', 'Naughty Weddings']}, - {'Nubiles Porn (Network)' : ['MyFamilyPies', 'StepSiblingsCaught', 'nubilesporn']}, - {'Brazzers' : ['Real Wife Stories', 'brazzers']}, - {'TeamSkeet (Network)' : ['teenpies', 'shoplyfter']}, - {'BangBros' : ['BangBus', 'BangBros']}, - {'Nubile Films' : ['nfbusty', 'NubileFilms']}, - {'DDF Network' : ['DDFBusty']}, - {'Adult Time (Network)' : ['AdultTime', 'BurningAngel']}, - {'Anal Vids' : ['AnalVids']}, - {'LegalPorno' : ['LegalPorno']}, - {'Pornbox' : ['Pornworld']}, - {'Wow (Network)' : ['WowGirls']}, - {'Malibu Media' : ['x-art']}, - {'VIPissy Cash' : ['VIPissy']}, - {'japan Blu-Ray' : ['Japan AV Blu-Ray']}, - {'siterip' : ['siterip']}, - {'Brazil' : ['NewMFX']}, - {'Wicked' : ['Wicked']}, - {'Sticky Dollars' : ['Swallowed']}, - {'ManyVids' : ['ManyVids']}, - {'PervCity' : ['AnalOverdose']} -] class ClmSpider(BaseSpider): name = SPIDER_NAME_CLM @@ -51,6 +22,7 @@ class ClmSpider(BaseSpider): self.keywords = keywords self.min_size = float(min_size) if min_size else 1.0 self.run_task = True + self.fix_title = False # 增加一个暗号 if keywords and keywords.lower() == 'reload' : @@ -59,86 +31,38 @@ class ClmSpider(BaseSpider): self.run_task = False self.logger.info(f"reload keywords db succ!") - # 指定的关键词,导入到数据库 - def initDB(self): - for row in default_keywords: - for group, items in row.items(): - for item in items: - words_item = ClmKeyWordsItem() - words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS - words_item['words'] = item - words_item['groups'] = group - words_item['tags'] = '' - words_item['index_count'] = 0 - db_clm.insert_item(words_item) - self.logger.debug(f"insert item: {item}: {group}") - - # 从其他数据源获取到演员列表,导入到数据库 - def init_load_actors_from_others(self): - all_likes = { - 'vixen' : ['vixen.com', 'Vixen Video'], - 'tushy' : ['tushy.com', 'tushyraw.com', 'Tushy', 'Tushy Raw'], - 'blacked' : ['blacked.com', 'Blacked', 'blackedraw.com', 'Blacked Raw'], - 'x-art' : ['x-art.com', 'X-art'], - 'nfbusty' : ['nfbusty.com'] - } - # 先转换个格式 - all_key_group = {} - all_keys = [] - for group, keys in all_likes.items(): - for key in keys: - all_key_group[key] = group - all_keys.append(key) - - # 查询数据库,并转换数据 - actor_tags = {} - total_lines = 0 - results = db_comm.get_iafd_actors(names=all_keys, tbl='stu') - for dist, actors in results.items(): - self.logger.info(f"dist: {dist}, actors count: {len(actors)}") - total_lines += len(actors) - for actor in actors : - #self.logger.debug(f"get {dist} : {actor['name']}, {actor['href']}") - actor_name = actor['name'] - current_tag = all_key_group.get(dist, '') - if actor_name not in actor_tags: - actor_tags[actor_name] = set() # 用set自动去重 - if current_tag: - actor_tags[actor_name].add(current_tag) # set的add方法,重复值会自动忽略 - self.logger.info(f"total actors in iafd: {len(actor_tags)}, total lines: {total_lines}") - - # 查询另一个数据表,获取结果 - load_results = db_comm.get_lord_actors() - if load_results: - self.logger.info(f"total actors in lord: {len(load_results)}") - for row in load_results: - actor_name = row['name'] - if actor_name not in actor_tags: - actor_tags[actor_name] = set() # 用set自动去重 - actor_tags[actor_name].add('thelordofporn') # set的add方法,重复值会自动忽略 - - self.logger.info(f"after merge, total actors: {len(actor_tags)}") - for actor, tags_set in actor_tags.items(): - tag_str = ','.join(tags_set) # set直接支持迭代,无需额外转换 - self.logger.info(f"actor: {actor}, tags: {tag_str}") - words_item = ClmKeyWordsItem() - words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS - words_item['words'] = actor - words_item['groups'] = 'actress' - words_item['tags'] = tag_str - words_item['index_count'] = 0 - db_clm.insert_item(words_item) - #self.logger.debug(f"insert item: {words_item}") + # 增加一个暗号 + if keywords and keywords.lower() == 'fix' : + self.fix_title = True + self.run_task = False # 入口函数,由基类的方法触发 def custom_start_requests(self): + if self.fix_title: + data = db_clm.get_empty_title() + if data: + self.logger.info(f"rows to be fixed: {len(data)}") + for row in data: + url = row['href'] + # 递归请求下一页 + yield scrapy.Request( + url=url, + callback=self.parse_page_detail, + meta={'url': url}, + dont_filter=True # 允许重复请求(防止因URL参数被过滤) + ) + else: + self.logger.warning(f"no data.") + if not self.run_task: return + tmp_query_str = f" groups='actress' and tags not like '%vixen%' " if self.debug: - keywords = db_clm.get_key_words(limit =5) + keywords = db_clm.get_key_words(limit =5, query_str = tmp_query_str) else: - keywords = db_clm.get_key_words(groups='actress', tags='vixen') + #keywords = db_clm.get_key_words(groups='actress', tags='vixen') + keywords = db_clm.get_key_words(query_str = tmp_query_str) for item in keywords: words_id = item['id'] @@ -197,7 +121,8 @@ class ClmSpider(BaseSpider): # h3 下的 a 标签(标题链接) h3_a = ssbox.xpath('.//div[@class="title"]/h3/a') # 标题文本(如 "Vixen.2025.05") - title_text = h3_a.xpath('text()').get().strip() if h3_a else None + #title_text = h3_a.xpath('text()').get().strip() if h3_a else None + title_text = extract_text_from_element(h3_a, use_title=True) # 标题链接(如 "/hash/34c71bf8ddff9c797dab7ee1af83894fee13ac67.html") title_href = h3_a.xpath('@href').get() if h3_a else None # 若链接是相对路径,可拼接成完整URL(根据网站域名调整) @@ -240,6 +165,7 @@ class ClmSpider(BaseSpider): item['last_down_date'] = last_download item['key_words_id'] = response.meta.get('words_id', 0) item['key_words'] = response.meta.get('words', '') + item['is_update'] = False yield item @@ -249,7 +175,7 @@ class ClmSpider(BaseSpider): # 解析下一页链接 pager = response.xpath('//div[@class="pager"]') if pager: - total_text = pager.xpath('.//span[contains(text(), "共")]/text()').get() if sbar else '' + total_text = pager.xpath('.//span[contains(text(), "共")]/text()').get() # 定位“下一页”的a标签(通过文本定位,避免混淆其他a标签) next_page_a = pager.xpath('.//a[contains(text(), "下一页")]').get() @@ -270,5 +196,118 @@ class ClmSpider(BaseSpider): ) else: # 当href为#或不存在时,说明已无下一页 - self.logger.info(f'已获取完所有页面,停止翻页. {total_text}') - \ No newline at end of file + total_rows = db_clm.get_count_by_keywords_id(response.meta.get('words_id', 0)) + curr_words = response.meta.get('words', '') + self.logger.info(f'已获取完所有页面,停止翻页. {total_text}, 共 {total_rows} 条记录。 key words: ({curr_words}), url: {response.url}') + + def parse_page_detail(self, response): + # 匹配 class 为 'bt_title' 的 div 下的 h2 标签文本 + title_xpath = response.xpath('//div[@class="bt_title"]/h2/text()').get() + + item = ClmIndexItem() + item['item_type'] = ITEM_TYPE_CLM_INDEX + item['title'] = title_xpath + item['href'] = response.meta['url'] + item['is_update'] = True + + yield item + + + # 指定的关键词,导入到数据库 + def initDB(self): + default_keywords = [ + {'vixen group' : ['vixen', 'tushy', 'tushyraw', 'blacked', 'blackedraw', 'deeper']}, + {'VIP 4K' : ['Cuck4K', 'Daddy4k', 'Loan4k', 'Dyke4K', 'Rim4k', 'Pie4k', 'Ignore4K', 'Daddy4k', 'Stuck4k', 'Tutor4k']}, + {'Teen Mega World' : ['anal-angels', 'Anal-Beauty', 'Beauty4k', 'creampie-angels', 'Beauty-Angels', 'FirstBGG', 'FuckStudies', 'OhMyHoles', 'X-Angels']}, + {'Fuck You Cash' : ['BBCPie', 'Tiny4k', 'Cum4K', 'Anal4K', 'Exotic4K', 'Facials4k', 'Holed', 'Lubed', 'Mom4K', 'passion hd']}, + {'Naughty America (Network)' : ['Naughty Office', 'Naughty Americans', 'Naughty America', 'Naughty Weddings']}, + {'Nubiles Porn (Network)' : ['MyFamilyPies', 'StepSiblingsCaught', 'nubilesporn']}, + {'Brazzers' : ['Real Wife Stories', 'brazzers']}, + {'TeamSkeet (Network)' : ['teenpies', 'shoplyfter']}, + {'BangBros' : ['BangBus', 'BangBros']}, + {'Nubile Films' : ['nfbusty', 'NubileFilms']}, + {'DDF Network' : ['DDFBusty']}, + {'Adult Time (Network)' : ['AdultTime', 'BurningAngel']}, + {'Anal Vids' : ['AnalVids']}, + {'LegalPorno' : ['LegalPorno']}, + {'Pornbox' : ['Pornworld']}, + {'Wow (Network)' : ['WowGirls']}, + {'Malibu Media' : ['x-art']}, + {'VIPissy Cash' : ['VIPissy']}, + {'japan Blu-Ray' : ['Japan AV Blu-Ray']}, + {'siterip' : ['siterip']}, + {'Brazil' : ['NewMFX']}, + {'Wicked' : ['Wicked']}, + {'Sticky Dollars' : ['Swallowed']}, + {'ManyVids' : ['ManyVids']}, + {'PervCity' : ['AnalOverdose']} + ] + for row in default_keywords: + for group, items in row.items(): + for item in items: + words_item = ClmKeyWordsItem() + words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS + words_item['words'] = item + words_item['groups'] = group + words_item['tags'] = '' + words_item['index_count'] = 0 + db_clm.insert_item(words_item) + self.logger.debug(f"insert item: {item}: {group}") + + # 从其他数据源获取到演员列表,导入到数据库 + def init_load_actors_from_others(self): + db_comm = IAFDDBHandler() + all_likes = { + 'vixen' : ['vixen.com', 'Vixen Video'], + 'tushy' : ['tushy.com', 'tushyraw.com', 'Tushy', 'Tushy Raw'], + 'blacked' : ['blacked.com', 'Blacked', 'blackedraw.com', 'Blacked Raw'], + 'x-art' : ['x-art.com', 'X-art'], + 'nfbusty' : ['nfbusty.com'] + } + # 先转换个格式 + all_key_group = {} + all_keys = [] + for group, keys in all_likes.items(): + for key in keys: + all_key_group[key] = group + all_keys.append(key) + + # 查询数据库,并转换数据 + actor_tags = {} + total_lines = 0 + results = db_comm.get_iafd_actors(names=all_keys, tbl='stu') + for dist, actors in results.items(): + self.logger.info(f"dist: {dist}, actors count: {len(actors)}") + total_lines += len(actors) + for actor in actors : + #self.logger.debug(f"get {dist} : {actor['name']}, {actor['href']}") + actor_name = actor['name'] + current_tag = all_key_group.get(dist, '') + if actor_name not in actor_tags: + actor_tags[actor_name] = set() # 用set自动去重 + if current_tag: + actor_tags[actor_name].add(current_tag) # set的add方法,重复值会自动忽略 + self.logger.info(f"total actors in iafd: {len(actor_tags)}, total lines: {total_lines}") + + # 查询另一个数据表,获取结果 + load_results = db_comm.get_lord_actors() + if load_results: + self.logger.info(f"total actors in lord: {len(load_results)}") + for row in load_results: + actor_name = row['name'] + if actor_name not in actor_tags: + actor_tags[actor_name] = set() # 用set自动去重 + actor_tags[actor_name].add('thelordofporn') # set的add方法,重复值会自动忽略 + + self.logger.info(f"after merge, total actors: {len(actor_tags)}") + for actor, tags_set in actor_tags.items(): + tag_str = ','.join(tags_set) # set直接支持迭代,无需额外转换 + self.logger.info(f"actor: {actor}, tags: {tag_str}") + words_item = ClmKeyWordsItem() + words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS + words_item['words'] = actor + words_item['groups'] = 'actress' + words_item['tags'] = tag_str + words_item['index_count'] = 0 + db_clm.insert_item(words_item) + #self.logger.debug(f"insert item: {words_item}")