modify scripts
This commit is contained in:
@ -3,6 +3,7 @@ import sqlite3
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import List, Dict
|
||||
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, default_dbpath, shared_db_path
|
||||
import scrapy_proj.comm.comm_def as comm
|
||||
|
||||
@ -83,7 +84,7 @@ class ClmDBHandler(SQLiteDBHandler):
|
||||
if item['item_type'] == comm.ITEM_TYPE_CLM_INDEX:
|
||||
self.insert_index(item)
|
||||
elif item['item_type'] == comm.ITEM_TYPE_CLM_KEYWORDS:
|
||||
self.insert_or_update_common(item, self.tbl_name_clm_keywords, uniq_key=None, exists_do_nothing=True)
|
||||
self.insert_or_update_common(item, self.tbl_name_clm_keywords, uniq_key=None, exists_do_nothing=False)
|
||||
else:
|
||||
logging.error(f"unkown item.")
|
||||
|
||||
@ -115,13 +116,14 @@ class ClmDBHandler(SQLiteDBHandler):
|
||||
"id": " AND id = ?",
|
||||
"words": " AND words LIKE ?",
|
||||
"groups": " AND groups LIKE ?",
|
||||
"tags": " AND tags LIKE ?",
|
||||
"start_id": " AND id > ?",
|
||||
}
|
||||
|
||||
for key, condition in conditions.items():
|
||||
if key in filters:
|
||||
sql += condition
|
||||
if key == "words" or key == 'groups':
|
||||
if key == "words" or key == 'groups' or key == 'tags':
|
||||
params.append(f"%{filters[key]}%")
|
||||
else:
|
||||
params.append(filters[key])
|
||||
@ -280,6 +282,110 @@ class IAFDDBHandler(SQLiteDBHandler):
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return None
|
||||
|
||||
# Query female performers (name and href) grouped by studio / distributor name
|
||||
def get_iafd_actors(
    self,
    names: List[str],
    tbl: str = 'stu'
) -> Dict[str, List[Dict[str, str]]]:
    """
    Look up the female performers credited in movies of the given studios
    or distributors, in two steps that go through a temporary table.

    Step 1: select the matching studios/distributors together with their
            movies and store that (small) set in a temporary table.
    Step 2: join the temporary table with the performer tables and keep
            only rows whose performer gender is 'Woman'.

    Args:
        names: Exact studio/distributor names to match (``name IN (...)``).
        tbl: ``'stu'`` (case-insensitive) selects ``iafd_studios`` joined on
            ``iafd_movies.studio_id``; any other value selects
            ``iafd_distributors`` joined on ``iafd_movies.distributor_id``.

    Returns:
        A dict mapping each matched studio/distributor name to a list of
        ``{'name': ..., 'href': ...}`` dicts, ordered by performer name and
        without duplicates. Returns ``{}`` when ``names`` is empty or when
        a ``sqlite3.Error`` occurs (the error is logged).
    """
    use_studios = tbl.lower() == 'stu'
    tbl_name = 'iafd_studios' if use_studios else 'iafd_distributors'
    join_key = 'studio_id' if use_studios else 'distributor_id'
    if not names:
        return {}

    # sqlite3 needs a sequence for positional parameters.
    params = list(names)

    # Result container: studio/distributor name -> list of performers.
    final_result: Dict[str, List[Dict[str, str]]] = {}

    try:
        # --------------------------
        # Step 1: temp table with the target studios/distributors and
        # their movies.
        # --------------------------
        # Drop a possibly left-over temp table first to avoid a conflict.
        self.cursor.execute("DROP TABLE IF EXISTS temp_distributor_movies")
        # Temporary tables are private to the current connection.
        self.cursor.execute("""
            CREATE TEMPORARY TABLE temp_distributor_movies (
                distributor_id INTEGER,
                distributor_name TEXT,
                movie_id INTEGER,
                PRIMARY KEY (distributor_id, movie_id)
            )
        """)

        # Only identifiers chosen above (tbl_name / join_key) and '?'
        # placeholders are formatted into the SQL; the names themselves
        # are bound as parameters.
        insert_sql = """
            INSERT INTO temp_distributor_movies (distributor_id, distributor_name, movie_id)
            SELECT
                d.id AS distributor_id,
                d.name AS distributor_name,
                m.id AS movie_id
            FROM
                {tbl_name} d
            INNER JOIN
                iafd_movies m ON d.id = m.{join_key}
            WHERE
                d.name IN ({placeholders})
        """.format(
            tbl_name=tbl_name,
            join_key=join_key,
            placeholders=', '.join(['?'] * len(params))
        )

        logging.info("%s", insert_sql)

        self.cursor.execute(insert_sql, params)
        self.conn.commit()  # commit the temp-table rows

        # --------------------------
        # Step 2: join the temp table with the performer tables.
        # DISTINCT collapses a performer who appears in several movies of
        # the same studio/distributor into a single entry.
        # --------------------------
        query_sql = """
            SELECT DISTINCT
                t.distributor_name,
                p.name AS performer_name,
                p.href AS performer_href
            FROM
                temp_distributor_movies t
            INNER JOIN
                iafd_performers_movies pm ON t.movie_id = pm.movie_id
            INNER JOIN
                iafd_performers p ON pm.performer_id = p.id
            WHERE
                p.gender = 'Woman'
            ORDER BY
                t.distributor_name, p.name
        """

        self.cursor.execute(query_sql)

        # Group rows by studio/distributor name. Rows are accessed by
        # column name, so the connection must yield mapping-like rows.
        for row in self.cursor.fetchall():
            final_result.setdefault(row['distributor_name'], []).append({
                'name': row['performer_name'],
                'href': row['performer_href']
            })

    except sqlite3.Error as e:
        logging.error(f"查询失败:{e}")
        return {}
    finally:
        # Always remove the temp table, also on the error path, so a failed
        # call does not leave it behind in the connection.
        try:
            self.cursor.execute("DROP TABLE IF EXISTS temp_distributor_movies")
        except sqlite3.Error as e:
            logging.warning(f"drop temp table failed: {e}")

    return final_result
|
||||
|
||||
|
||||
@register_handler(comm.SPIDER_NAME_PBOX)
|
||||
class PboxDBHandler(SQLiteDBHandler):
|
||||
|
||||
@ -50,9 +50,16 @@ class ClmSpider(BaseSpider):
|
||||
|
||||
self.keywords = keywords
|
||||
self.min_size = float(min_size) if min_size else 1.0
|
||||
self.run_task = True
|
||||
|
||||
#self.initDB()
|
||||
# 增加一个暗号
|
||||
if keywords and keywords.lower() == 'reload' :
|
||||
self.initDB()
|
||||
self.init_load_actors_from_others()
|
||||
self.run_task = False
|
||||
self.logger.info(f"reload keywords db succ!")
|
||||
|
||||
# 指定的关键词,导入到数据库
|
||||
def initDB(self):
|
||||
for row in default_keywords:
|
||||
for group, items in row.items():
|
||||
@ -63,15 +70,75 @@ class ClmSpider(BaseSpider):
|
||||
words_item['groups'] = group
|
||||
words_item['tags'] = ''
|
||||
words_item['index_count'] = 0
|
||||
db_tools.insert_item(words_item)
|
||||
db_clm.insert_item(words_item)
|
||||
self.logger.debug(f"insert item: {item}: {group}")
|
||||
|
||||
# 从其他数据源获取到演员列表,导入到数据库
|
||||
def init_load_actors_from_others(self):
    """
    Collect actor names from other data sources and import them into the
    keywords table as items of group 'actress'.

    Two sources are merged:
      1. ``db_comm.get_iafd_actors`` for a fixed list of studio names; each
         actor is tagged with the group the studio name belongs to.
      2. ``db_comm.get_lord_actors``; each actor is tagged 'thelordofporn'.

    One keyword item per actor is then written through
    ``db_clm.insert_item`` with the actor's tags joined by ','.
    """
    all_likes = {
        'vixen' : ['vixen.com', 'Vixen Video'],
        'tushy' : ['tushy.com', 'tushyraw.com', 'Tushy', 'Tushy Raw'],
        'blacked' : ['blacked.com', 'Blacked', 'blackedraw.com', 'Blacked Raw'],
        'x-art' : ['x-art.com', 'X-art'],
        'nfbusty' : ['nfbusty.com']
    }
    # Invert the mapping: studio name -> group tag.
    all_key_group = {
        key: group
        for group, keys in all_likes.items()
        for key in keys
    }
    all_keys = list(all_key_group)

    # Query the database and build actor name -> set of tags.
    actor_tags = {}
    total_lines = 0
    results = db_comm.get_iafd_actors(names=all_keys, tbl='stu')
    for dist, actors in results.items():
        self.logger.info(f"dist: {dist}, actors count: {len(actors)}")
        total_lines += len(actors)
        # The tag depends only on the studio, so look it up once per studio.
        current_tag = all_key_group.get(dist, '')
        for actor in actors:
            # A set de-duplicates tags for actors seen under several studios.
            tags = actor_tags.setdefault(actor['name'], set())
            if current_tag:
                tags.add(current_tag)
    self.logger.info(f"total actors in iafd: {len(actor_tags)}, total lines: {total_lines}")

    # Merge in the actors from the second data source.
    load_results = db_comm.get_lord_actors()
    if load_results:
        self.logger.info(f"total actors in lord: {len(load_results)}")
        for row in load_results:
            actor_tags.setdefault(row['name'], set()).add('thelordofporn')

    self.logger.info(f"after merge, total actors: {len(actor_tags)}")
    for actor, tags_set in actor_tags.items():
        # Sort before joining: set iteration order is not stable between
        # runs, which would make the stored tag string change on every
        # reload even when the tags themselves did not.
        tag_str = ','.join(sorted(tags_set))
        self.logger.info(f"actor: {actor}, tags: {tag_str}")
        words_item = ClmKeyWordsItem()
        words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
        words_item['words'] = actor
        words_item['groups'] = 'actress'
        words_item['tags'] = tag_str
        words_item['index_count'] = 0
        db_clm.insert_item(words_item)
|
||||
|
||||
# 入口函数,由基类的方法触发
|
||||
def custom_start_requests(self):
|
||||
if not self.run_task:
|
||||
return
|
||||
|
||||
if self.debug:
|
||||
keywords = db_clm.get_key_words(limit =5)
|
||||
else:
|
||||
keywords = db_clm.get_key_words()
|
||||
keywords = db_clm.get_key_words(groups='actress', tags='vixen')
|
||||
|
||||
for item in keywords:
|
||||
words_id = item['id']
|
||||
|
||||
Reference in New Issue
Block a user