modify scripts

This commit is contained in:
2025-07-19 17:17:35 +08:00
parent 19353a830c
commit 6522970dcb
2 changed files with 178 additions and 5 deletions

View File

@ -3,6 +3,7 @@ import sqlite3
import json
import logging
from datetime import datetime
from typing import List, Dict
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, default_dbpath, shared_db_path
import scrapy_proj.comm.comm_def as comm
@ -83,7 +84,7 @@ class ClmDBHandler(SQLiteDBHandler):
if item['item_type'] == comm.ITEM_TYPE_CLM_INDEX:
self.insert_index(item)
elif item['item_type'] == comm.ITEM_TYPE_CLM_KEYWORDS:
self.insert_or_update_common(item, self.tbl_name_clm_keywords, uniq_key=None, exists_do_nothing=True)
self.insert_or_update_common(item, self.tbl_name_clm_keywords, uniq_key=None, exists_do_nothing=False)
else:
logging.error(f"unkown item.")
@ -115,13 +116,14 @@ class ClmDBHandler(SQLiteDBHandler):
"id": " AND id = ?",
"words": " AND words LIKE ?",
"groups": " AND groups LIKE ?",
"tags": " AND tags LIKE ?",
"start_id": " AND id > ?",
}
for key, condition in conditions.items():
if key in filters:
sql += condition
if key == "words" or key == 'groups':
if key == "words" or key == 'groups' or key == 'tags':
params.append(f"%{filters[key]}%")
else:
params.append(filters[key])
@ -280,6 +282,110 @@ class IAFDDBHandler(SQLiteDBHandler):
logging.error(f"查询 href 失败: {e}")
return None
# 按条件查询 href 列表
def get_iafd_actors(
    self,
    names: List[str],
    tbl = 'stu'
) -> Dict[str, List[Dict[str, str]]]:
    """Return the female performers linked to the named studios or distributors.

    The lookup runs in two steps so that the large join only touches a
    small working set:

    1. The matching studios/distributors and their movies are written to
       the temporary table ``temp_distributor_movies``.
    2. That temporary table is joined with ``iafd_performers_movies`` and
       ``iafd_performers`` to collect performers whose ``gender`` is
       ``'Woman'``.

    Args:
        names: Names matched against the ``name`` column with ``IN (...)``.
        tbl: ``'stu'`` (case-insensitive) selects ``iafd_studios`` joined on
            ``iafd_movies.studio_id``; any other value selects
            ``iafd_distributors`` joined on ``iafd_movies.distributor_id``.

    Returns:
        A dict mapping each matched studio/distributor name to a list of
        ``{'name': ..., 'href': ...}`` dicts, ordered by performer name.
        There is one entry per matching (movie, performer) row, so the same
        performer can appear several times under one name. An empty dict is
        returned when ``names`` is empty or when a ``sqlite3.Error`` occurs.
    """
    is_studio = tbl.lower() == 'stu'
    tbl_name = 'iafd_studios' if is_studio else 'iafd_distributors'
    join_key = 'studio_id' if is_studio else 'distributor_id'

    if not names:
        return {}

    # Materialize once so the placeholder count and the bound parameters
    # always agree, whatever kind of iterable the caller passed.
    names = list(names)

    final_result: Dict[str, List[Dict[str, str]]] = {}

    try:
        # Step 1: stage the matching studios/distributors and their movies.
        # Drop any leftover table from an earlier call on this connection.
        self.cursor.execute("DROP TABLE IF EXISTS temp_distributor_movies")
        self.cursor.execute("""
            CREATE TEMPORARY TABLE temp_distributor_movies (
                distributor_id INTEGER,
                distributor_name TEXT,
                movie_id INTEGER,
                PRIMARY KEY (distributor_id, movie_id)
            )
        """)

        # Table and column names come from the fixed choices above, never
        # from caller input; the names themselves are bound as parameters.
        insert_sql = """
            INSERT INTO temp_distributor_movies (distributor_id, distributor_name, movie_id)
            SELECT
                d.id AS distributor_id,
                d.name AS distributor_name,
                m.id AS movie_id
            FROM
                {tbl_name} d
            INNER JOIN
                iafd_movies m ON d.id = m.{join_key}
            WHERE
                d.name IN ({placeholders})
        """.format(
            tbl_name=tbl_name,
            join_key=join_key,
            placeholders=', '.join(['?'] * len(names))
        )
        logging.info(f'{insert_sql}')
        self.cursor.execute(insert_sql, names)
        self.conn.commit()

        # Step 2: join the staged movies with the performer tables.
        query_sql = """
            SELECT
                t.distributor_name,
                p.name AS performer_name,
                p.href AS performer_href
            FROM
                temp_distributor_movies t
            INNER JOIN
                iafd_performers_movies pm ON t.movie_id = pm.movie_id
            INNER JOIN
                iafd_performers p ON pm.performer_id = p.id
            WHERE
                p.gender = 'Woman'
            ORDER BY
                t.distributor_name, p.name
        """
        self.cursor.execute(query_sql)

        # Group the rows by studio/distributor name.
        # NOTE(review): rows are read by column name, which assumes the
        # connection uses a name-addressable row factory such as sqlite3.Row.
        for row in self.cursor.fetchall():
            final_result.setdefault(row['distributor_name'], []).append({
                'name': row['performer_name'],
                'href': row['performer_href']
            })
    except sqlite3.Error as e:
        logging.error(f"查询失败:{e}")
        return {}
    finally:
        # Clean up on success and on failure alike, so a failed call does
        # not leave the staging table behind on a long-lived connection.
        try:
            self.cursor.execute("DROP TABLE IF EXISTS temp_distributor_movies")
        except sqlite3.Error as drop_err:
            logging.warning(f"failed to drop temp_distributor_movies: {drop_err}")

    return final_result
@register_handler(comm.SPIDER_NAME_PBOX)
class PboxDBHandler(SQLiteDBHandler):

View File

@ -50,9 +50,16 @@ class ClmSpider(BaseSpider):
self.keywords = keywords
self.min_size = float(min_size) if min_size else 1.0
self.run_task = True
#self.initDB()
# 增加一个暗号
if keywords and keywords.lower() == 'reload' :
self.initDB()
self.init_load_actors_from_others()
self.run_task = False
self.logger.info(f"reload keywords db succ!")
# 指定的关键词,导入到数据库
def initDB(self):
for row in default_keywords:
for group, items in row.items():
@ -63,15 +70,75 @@ class ClmSpider(BaseSpider):
words_item['groups'] = group
words_item['tags'] = ''
words_item['index_count'] = 0
db_tools.insert_item(words_item)
db_clm.insert_item(words_item)
self.logger.debug(f"insert item: {item}: {group}")
# 从其他数据源获取到演员列表,导入到数据库
def init_load_actors_from_others(self):
    """Import actress names from other data sources as keyword rows.

    Two sources are merged into one ``name -> set of tags`` mapping:

    * ``db_comm.get_iafd_actors`` for the studio names listed below; each
      actress is tagged with the group her studio name belongs to.
    * ``db_comm.get_lord_actors``; each returned name is tagged
      ``'thelordofporn'``.

    Every merged name is then written through ``db_clm.insert_item`` as a
    ``ClmKeyWordsItem`` in group ``'actress'``, with its tags joined by
    commas. Tags are sorted before joining so that repeated reloads produce
    the same ``tags`` string; iterating a set of strings directly can yield
    a different order on every interpreter run.
    """
    # Tag (group) -> studio names as they are looked up in the IAFD tables.
    all_likes = {
        'vixen' : ['vixen.com', 'Vixen Video'],
        'tushy' : ['tushy.com', 'tushyraw.com', 'Tushy', 'Tushy Raw'],
        'blacked' : ['blacked.com', 'Blacked', 'blackedraw.com', 'Blacked Raw'],
        'x-art' : ['x-art.com', 'X-art'],
        'nfbusty' : ['nfbusty.com']
    }

    # Invert the mapping: studio name -> tag, plus the flat list to query.
    all_key_group = {
        key: group
        for group, keys in all_likes.items()
        for key in keys
    }
    all_keys = [key for keys in all_likes.values() for key in keys]

    # Collect tags per actress from the IAFD studio data.
    actor_tags = {}
    total_lines = 0
    results = db_comm.get_iafd_actors(names=all_keys, tbl='stu')
    for dist, actors in results.items():
        self.logger.info(f"dist: {dist}, actors count: {len(actors)}")
        total_lines += len(actors)
        # The tag depends only on the studio, so look it up once per studio.
        current_tag = all_key_group.get(dist, '')
        for actor in actors:
            tags = actor_tags.setdefault(actor['name'], set())
            if current_tag:
                tags.add(current_tag)
    self.logger.info(f"total actors in iafd: {len(actor_tags)}, total lines: {total_lines}")

    # Merge in the second source.
    load_results = db_comm.get_lord_actors()
    if load_results:
        self.logger.info(f"total actors in lord: {len(load_results)}")
        for row in load_results:
            actor_tags.setdefault(row['name'], set()).add('thelordofporn')
    self.logger.info(f"after merge, total actors: {len(actor_tags)}")

    # Write one keyword row per actress.
    for actor, tags_set in actor_tags.items():
        tag_str = ','.join(sorted(tags_set))
        self.logger.info(f"actor: {actor}, tags: {tag_str}")
        words_item = ClmKeyWordsItem()
        words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
        words_item['words'] = actor
        words_item['groups'] = 'actress'
        words_item['tags'] = tag_str
        words_item['index_count'] = 0
        db_clm.insert_item(words_item)
# 入口函数,由基类的方法触发
def custom_start_requests(self):
if not self.run_task:
return
if self.debug:
keywords = db_clm.get_key_words(limit =5)
else:
keywords = db_clm.get_key_words()
keywords = db_clm.get_key_words(groups='actress', tags='vixen')
for item in keywords:
words_id = item['id']