def get_iafd_actors(
    self,
    names: List[str],
    tbl: str = 'stu'
) -> Dict[str, List[Dict[str, str]]]:
    """
    Query the female performers associated with the given studios or
    distributors, using a two-step temporary-table approach to keep the
    intermediate join set small:

      1. Filter the target studios/distributors and their movies into a
         temporary table (a small set).
      2. Join that temporary table against the performer tables to collect
         female performers, grouped by studio/distributor name.

    :param names: studio/distributor names to match exactly (``d.name IN …``).
    :param tbl:   ``'stu'`` to search ``iafd_studios`` (join on
                  ``studio_id``); any other value searches
                  ``iafd_distributors`` (join on ``distributor_id``).
    :return: mapping of studio/distributor name -> list of
             ``{'name': ..., 'href': ...}`` dicts, performers ordered by
             name; ``{}`` when ``names`` is empty or the query fails.
    """
    # Table and join column come from a fixed whitelist, so the .format()
    # below cannot inject arbitrary SQL; user-supplied values only ever go
    # through '?' placeholders.
    use_studio = tbl.lower() == 'stu'
    tbl_name = 'iafd_studios' if use_studio else 'iafd_distributors'
    join_key = 'studio_id' if use_studio else 'distributor_id'
    if not names:
        return {}

    final_result: Dict[str, List[Dict[str, str]]] = {}

    try:
        # --------------------------------------------------------------
        # Step 1: stage the target distributors and their movies in a
        # temporary table (visible only to this connection; dropped
        # automatically when the connection closes).
        # --------------------------------------------------------------
        # Drop any leftover temp table first to avoid a name clash.
        self.cursor.execute("DROP TABLE IF EXISTS temp_distributor_movies")
        self.cursor.execute("""
            CREATE TEMPORARY TABLE temp_distributor_movies (
                distributor_id INTEGER,
                distributor_name TEXT,
                movie_id INTEGER,
                PRIMARY KEY (distributor_id, movie_id)
            )
        """)

        # Filter the distributors first, then join their movies, inserting
        # the (small) result set into the temp table.
        insert_sql = """
            INSERT INTO temp_distributor_movies (distributor_id, distributor_name, movie_id)
            SELECT
                d.id AS distributor_id,
                d.name AS distributor_name,
                m.id AS movie_id
            FROM
                {tbl_name} d
            INNER JOIN
                iafd_movies m ON d.id = m.{join_key}
            WHERE
                d.name IN ({placeholders})
        """.format(
            tbl_name=tbl_name,
            join_key=join_key,
            placeholders=', '.join(['?'] * len(names))
        )

        # Lazy %-style arg instead of a redundant f-string wrapper.
        logging.info('%s', insert_sql)

        self.cursor.execute(insert_sql, names)
        self.conn.commit()  # persist the staged rows for step 2

        # --------------------------------------------------------------
        # Step 2: join the small staged set against the performer tables.
        # --------------------------------------------------------------
        query_sql = """
            SELECT
                t.distributor_name,
                p.name AS performer_name,
                p.href AS performer_href
            FROM
                temp_distributor_movies t
            INNER JOIN
                iafd_performers_movies pm ON t.movie_id = pm.movie_id
            INNER JOIN
                iafd_performers p ON pm.performer_id = p.id
            WHERE
                p.gender = 'Woman'
            ORDER BY
                t.distributor_name, p.name
        """

        self.cursor.execute(query_sql)
        rows = self.cursor.fetchall()

        # Group results by distributor/studio name.
        # NOTE(review): row access by column name assumes the connection
        # uses sqlite3.Row (or similar) as row_factory — confirm in the
        # base handler.
        for row in rows:
            performer = {
                'name': row['performer_name'],
                'href': row['performer_href'],
            }
            final_result.setdefault(row['distributor_name'], []).append(performer)

        # Proactively drop the temp table (optional: it would be dropped
        # automatically when the connection closes).
        self.cursor.execute("DROP TABLE IF EXISTS temp_distributor_movies")

    except sqlite3.Error as e:
        # Was print(); use logging like every other error path in this file.
        logging.error(f"查询失败:{e}")
        return {}

    return final_result
# Seed the keywords table with the hard-coded default keyword groups.
def initDB(self):
    """Import the predefined keyword groups into the CLM keywords table."""
    for row in default_keywords:
        for group, items in row.items():
            for item in items:
                words_item = ClmKeyWordsItem()
                words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
                words_item['words'] = item
                words_item['groups'] = group
                words_item['tags'] = ''
                words_item['index_count'] = 0
                db_clm.insert_item(words_item)
                self.logger.debug(f"insert item: {item}: {group}")

# Pull actress names from the other data sources and import them into
# the CLM keywords table.
def init_load_actors_from_others(self):
    """
    Import actress names gathered from other sources (the IAFD tables and
    the "lord" table) into the CLM keywords table, tagging each name with
    the studio group(s) it was found under.
    """
    # Studio/distributor aliases, keyed by the tag we want to record.
    all_likes = {
        'vixen': ['vixen.com', 'Vixen Video'],
        'tushy': ['tushy.com', 'tushyraw.com', 'Tushy', 'Tushy Raw'],
        'blacked': ['blacked.com', 'Blacked', 'blackedraw.com', 'Blacked Raw'],
        'x-art': ['x-art.com', 'X-art'],
        'nfbusty': ['nfbusty.com'],
    }
    # Invert to alias -> tag, and collect the flat alias list for the query.
    all_key_group = {}
    all_keys = []
    for group, keys in all_likes.items():
        for key in keys:
            all_key_group[key] = group
            all_keys.append(key)

    # Query the IAFD tables and merge into: actor name -> set of tags
    # (the set deduplicates repeated tags automatically).
    actor_tags = {}
    total_lines = 0
    results = db_comm.get_iafd_actors(names=all_keys, tbl='stu')
    for dist, actors in results.items():
        self.logger.info(f"dist: {dist}, actors count: {len(actors)}")
        total_lines += len(actors)
        for actor in actors:
            actor_name = actor['name']
            current_tag = all_key_group.get(dist, '')
            tags = actor_tags.setdefault(actor_name, set())
            if current_tag:
                tags.add(current_tag)
    self.logger.info(f"total actors in iafd: {len(actor_tags)}, total lines: {total_lines}")

    # Merge in the names from the other table, under a fixed tag.
    load_results = db_comm.get_lord_actors()
    if load_results:
        self.logger.info(f"total actors in lord: {len(load_results)}")
        for row in load_results:
            actor_tags.setdefault(row['name'], set()).add('thelordofporn')

    self.logger.info(f"after merge, total actors: {len(actor_tags)}")
    for actor, tags_set in actor_tags.items():
        # sorted() keeps the stored tag string stable across runs;
        # plain set iteration order varies with hash randomization.
        tag_str = ','.join(sorted(tags_set))
        self.logger.info(f"actor: {actor}, tags: {tag_str}")
        words_item = ClmKeyWordsItem()
        words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
        words_item['words'] = actor
        words_item['groups'] = 'actress'
        words_item['tags'] = tag_str
        words_item['index_count'] = 0
        db_clm.insert_item(words_item)