modify scripts
This commit is contained in:
@ -3,6 +3,7 @@ import sqlite3
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from typing import List, Dict
|
||||||
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, default_dbpath, shared_db_path
|
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, default_dbpath, shared_db_path
|
||||||
import scrapy_proj.comm.comm_def as comm
|
import scrapy_proj.comm.comm_def as comm
|
||||||
|
|
||||||
@ -83,7 +84,7 @@ class ClmDBHandler(SQLiteDBHandler):
|
|||||||
if item['item_type'] == comm.ITEM_TYPE_CLM_INDEX:
|
if item['item_type'] == comm.ITEM_TYPE_CLM_INDEX:
|
||||||
self.insert_index(item)
|
self.insert_index(item)
|
||||||
elif item['item_type'] == comm.ITEM_TYPE_CLM_KEYWORDS:
|
elif item['item_type'] == comm.ITEM_TYPE_CLM_KEYWORDS:
|
||||||
self.insert_or_update_common(item, self.tbl_name_clm_keywords, uniq_key=None, exists_do_nothing=True)
|
self.insert_or_update_common(item, self.tbl_name_clm_keywords, uniq_key=None, exists_do_nothing=False)
|
||||||
else:
|
else:
|
||||||
logging.error(f"unkown item.")
|
logging.error(f"unkown item.")
|
||||||
|
|
||||||
@ -115,13 +116,14 @@ class ClmDBHandler(SQLiteDBHandler):
|
|||||||
"id": " AND id = ?",
|
"id": " AND id = ?",
|
||||||
"words": " AND words LIKE ?",
|
"words": " AND words LIKE ?",
|
||||||
"groups": " AND groups LIKE ?",
|
"groups": " AND groups LIKE ?",
|
||||||
|
"tags": " AND tags LIKE ?",
|
||||||
"start_id": " AND id > ?",
|
"start_id": " AND id > ?",
|
||||||
}
|
}
|
||||||
|
|
||||||
for key, condition in conditions.items():
|
for key, condition in conditions.items():
|
||||||
if key in filters:
|
if key in filters:
|
||||||
sql += condition
|
sql += condition
|
||||||
if key == "words" or key == 'groups':
|
if key == "words" or key == 'groups' or key == 'tags':
|
||||||
params.append(f"%{filters[key]}%")
|
params.append(f"%{filters[key]}%")
|
||||||
else:
|
else:
|
||||||
params.append(filters[key])
|
params.append(filters[key])
|
||||||
@ -280,6 +282,110 @@ class IAFDDBHandler(SQLiteDBHandler):
|
|||||||
logging.error(f"查询 href 失败: {e}")
|
logging.error(f"查询 href 失败: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Query the female performers linked to the given studios / distributors
|
||||||
|
def get_iafd_actors(
    self,
    names: List[str],
    tbl: str = 'stu'
) -> Dict[str, List[Dict[str, str]]]:
    """
    Return the female performers linked to the given studios or distributors.

    The lookup runs in two steps so that the join against the performer
    tables only touches a small set of movies:

    1. Select the requested studios/distributors together with their movies
       into a temporary table.
    2. Join that temporary table with ``iafd_performers_movies`` and
       ``iafd_performers`` and keep rows whose ``gender`` is ``'Woman'``.

    Args:
        names: Exact studio/distributor names to look up.
        tbl: ``'stu'`` (case-insensitive) queries ``iafd_studios`` via
            ``iafd_movies.studio_id``; any other value queries
            ``iafd_distributors`` via ``iafd_movies.distributor_id``.

    Returns:
        A dict mapping each matched studio/distributor name to a list of
        ``{'name': ..., 'href': ...}`` dicts, ordered by performer name.
        Each performer appears once per studio/distributor. An empty dict
        is returned when ``names`` is empty or a database error occurs.
    """
    # Both identifiers come from this fixed two-entry whitelist, never from
    # caller data, so formatting them into the SQL text below is safe.
    tbl_name = 'iafd_studios' if tbl.lower() == 'stu' else 'iafd_distributors'
    join_key = 'studio_id' if tbl.lower() == 'stu' else 'distributor_id'
    if not names:
        return {}

    # sqlite3 needs a sequence for positional parameters.
    params = list(names)
    final_result: Dict[str, List[Dict[str, str]]] = {}

    try:
        # --------------------------
        # Step 1: temp table holding the target studios/distributors and
        # their movies
        # --------------------------
        # Drop a possibly left-over temp table first to avoid a name clash.
        self.cursor.execute("DROP TABLE IF EXISTS temp_distributor_movies")
        # A temporary table is visible to the current connection only.
        self.cursor.execute("""
            CREATE TEMPORARY TABLE temp_distributor_movies (
                distributor_id INTEGER,
                distributor_name TEXT,
                movie_id INTEGER,
                PRIMARY KEY (distributor_id, movie_id)
            )
        """)

        # Filter the studios/distributors first, then attach their movies.
        insert_sql = """
            INSERT INTO temp_distributor_movies (distributor_id, distributor_name, movie_id)
            SELECT
                d.id AS distributor_id,
                d.name AS distributor_name,
                m.id AS movie_id
            FROM
                {tbl_name} d
            INNER JOIN
                iafd_movies m ON d.id = m.{join_key}
            WHERE
                d.name IN ({placeholders})
        """.format(
            tbl_name=tbl_name,
            join_key=join_key,
            placeholders=', '.join(['?'] * len(params))
        )

        logging.info(f'{insert_sql}')

        self.cursor.execute(insert_sql, params)
        self.conn.commit()  # persist the temp-table rows before querying

        # --------------------------
        # Step 2: join the small temp table with the performer tables
        # --------------------------
        # DISTINCT: a performer who appears in several movies of the same
        # studio/distributor would otherwise be listed once per movie.
        query_sql = """
            SELECT DISTINCT
                t.distributor_name,
                p.name AS performer_name,
                p.href AS performer_href
            FROM
                temp_distributor_movies t
            INNER JOIN
                iafd_performers_movies pm ON t.movie_id = pm.movie_id
            INNER JOIN
                iafd_performers p ON pm.performer_id = p.id
            WHERE
                p.gender = 'Woman'
            ORDER BY
                t.distributor_name, p.name
        """

        self.cursor.execute(query_sql)
        rows = self.cursor.fetchall()

        # Group the rows by studio/distributor name.
        for row in rows:
            final_result.setdefault(row['distributor_name'], []).append({
                'name': row['performer_name'],
                'href': row['performer_href']
            })

    except sqlite3.Error as e:
        logging.error(f"查询失败:{e}")
        return {}
    finally:
        # Clean up on success and on failure alike; a failing cleanup must
        # not mask the result or the original error.
        try:
            self.cursor.execute("DROP TABLE IF EXISTS temp_distributor_movies")
        except sqlite3.Error as e:
            logging.warning(f"drop temp table failed: {e}")

    return final_result
|
||||||
|
|
||||||
|
|
||||||
@register_handler(comm.SPIDER_NAME_PBOX)
|
@register_handler(comm.SPIDER_NAME_PBOX)
|
||||||
class PboxDBHandler(SQLiteDBHandler):
|
class PboxDBHandler(SQLiteDBHandler):
|
||||||
|
|||||||
@ -50,9 +50,16 @@ class ClmSpider(BaseSpider):
|
|||||||
|
|
||||||
self.keywords = keywords
|
self.keywords = keywords
|
||||||
self.min_size = float(min_size) if min_size else 1.0
|
self.min_size = float(min_size) if min_size else 1.0
|
||||||
|
self.run_task = True
|
||||||
|
|
||||||
#self.initDB()
|
# 增加一个暗号
|
||||||
|
if keywords and keywords.lower() == 'reload' :
|
||||||
|
self.initDB()
|
||||||
|
self.init_load_actors_from_others()
|
||||||
|
self.run_task = False
|
||||||
|
self.logger.info(f"reload keywords db succ!")
|
||||||
|
|
||||||
|
# 指定的关键词,导入到数据库
|
||||||
def initDB(self):
|
def initDB(self):
|
||||||
for row in default_keywords:
|
for row in default_keywords:
|
||||||
for group, items in row.items():
|
for group, items in row.items():
|
||||||
@ -63,15 +70,75 @@ class ClmSpider(BaseSpider):
|
|||||||
words_item['groups'] = group
|
words_item['groups'] = group
|
||||||
words_item['tags'] = ''
|
words_item['tags'] = ''
|
||||||
words_item['index_count'] = 0
|
words_item['index_count'] = 0
|
||||||
db_tools.insert_item(words_item)
|
db_clm.insert_item(words_item)
|
||||||
self.logger.debug(f"insert item: {item}: {group}")
|
self.logger.debug(f"insert item: {item}: {group}")
|
||||||
|
|
||||||
|
# 从其他数据源获取到演员列表,导入到数据库
|
||||||
|
def init_load_actors_from_others(self):
    """
    Import actress names from other data sources as keyword rows.

    Actresses are collected from two places: the IAFD studio tables (via
    ``db_comm.get_iafd_actors``) and ``db_comm.get_lord_actors``. Each
    actress is written through ``db_clm.insert_item`` as a keyword item in
    group ``'actress'``, with ``tags`` set to the comma-separated list of
    sources she was found in.
    """
    # tag -> studio names (as stored in the IAFD studio table) that map to it
    all_likes = {
        'vixen' : ['vixen.com', 'Vixen Video'],
        'tushy' : ['tushy.com', 'tushyraw.com', 'Tushy', 'Tushy Raw'],
        'blacked' : ['blacked.com', 'Blacked', 'blackedraw.com', 'Blacked Raw'],
        'x-art' : ['x-art.com', 'X-art'],
        'nfbusty' : ['nfbusty.com']
    }
    # Invert the mapping: studio name -> tag.
    all_key_group = {
        key: group
        for group, keys in all_likes.items()
        for key in keys
    }
    all_keys = list(all_key_group)

    # Query the database and collect the set of tags for every actress.
    actor_tags = {}
    total_lines = 0
    results = db_comm.get_iafd_actors(names=all_keys, tbl='stu')
    for dist, actors in results.items():
        self.logger.info(f"dist: {dist}, actors count: {len(actors)}")
        total_lines += len(actors)
        current_tag = all_key_group.get(dist, '')
        for actor in actors:
            # An actress is registered even when her studio has no tag.
            tags = actor_tags.setdefault(actor['name'], set())
            if current_tag:
                tags.add(current_tag)
    self.logger.info(f"total actors in iafd: {len(actor_tags)}, total lines: {total_lines}")

    # Merge in the actresses from the second data source.
    load_results = db_comm.get_lord_actors()
    if load_results:
        self.logger.info(f"total actors in lord: {len(load_results)}")
        for row in load_results:
            actor_tags.setdefault(row['name'], set()).add('thelordofporn')

    self.logger.info(f"after merge, total actors: {len(actor_tags)}")
    for actor, tags_set in actor_tags.items():
        # Sort so the stored tag string is identical on every run; plain set
        # iteration order changes with string-hash randomisation.
        tag_str = ','.join(sorted(tags_set))
        self.logger.info(f"actor: {actor}, tags: {tag_str}")
        words_item = ClmKeyWordsItem()
        words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
        words_item['words'] = actor
        words_item['groups'] = 'actress'
        words_item['tags'] = tag_str
        words_item['index_count'] = 0
        db_clm.insert_item(words_item)
|
||||||
|
#self.logger.debug(f"insert item: {words_item}")
|
||||||
|
|
||||||
# 入口函数,由基类的方法触发
|
# 入口函数,由基类的方法触发
|
||||||
def custom_start_requests(self):
|
def custom_start_requests(self):
|
||||||
|
if not self.run_task:
|
||||||
|
return
|
||||||
|
|
||||||
if self.debug:
|
if self.debug:
|
||||||
keywords = db_clm.get_key_words(limit =5)
|
keywords = db_clm.get_key_words(limit =5)
|
||||||
else:
|
else:
|
||||||
keywords = db_clm.get_key_words()
|
keywords = db_clm.get_key_words(groups='actress', tags='vixen')
|
||||||
|
|
||||||
for item in keywords:
|
for item in keywords:
|
||||||
words_id = item['id']
|
words_id = item['id']
|
||||||
|
|||||||
Reference in New Issue
Block a user