This repository has been archived on 2026-01-07. You can view files and clone it, but cannot push or open issues or pull requests.
Files
resources/scrapy_proj/scrapy_proj/tools/db_tools.py
2025-07-28 19:34:14 +08:00

234 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import sqlite3
import json
import logging
from datetime import datetime
from typing import List, Dict
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, default_dbpath, shared_db_path, test_db_path
import scrapy_proj.comm.comm_def as comm
import scrapy_proj.items as items_def
from scrapy_proj.utils.utils import pretty_json_simple, is_valid_url
class IAFDDBHandler(SQLiteDBHandler):
    """SQLite handler that carries the table names used by the IAFD queries."""

    def __init__(self, db_path=shared_db_path):
        """Initialise the base handler and register the table-name attributes.

        Args:
            db_path: Database path forwarded unchanged to the base class
                constructor; defaults to ``shared_db_path``.
        """
        super().__init__(db_path)

        # Performer tables.
        self.tbl_name_performers = 'iafd_performers'
        self.tbl_name_alias = 'iafd_performer_aliases'
        self.tbl_name_performer_urls = 'iafd_performer_urls'

        # Movie tables and performer/movie relations.
        self.tbl_name_movies = 'iafd_movies'
        self.tbl_name_performer_movies = 'iafd_performers_movies'
        # The spelling of this attribute and of the table name is kept
        # exactly as-is: both are referenced by name outside this block.
        self.tbl_name_moives_appear_in = 'iafd_movies_appers_in'

        # Studio / distributor tables.
        self.tbl_name_studio = 'iafd_studios'
        self.tbl_name_dist = 'iafd_distributors'

        # Metadata and auxiliary tables.
        self.tbl_name_ethnic = 'iafd_meta_ethnic'
        self.tbl_name_thelordofporn_actress = 'thelordofporn_actress'
# Fetch the performers in view_iafd_performers_movies whose actual movie count does not match the recorded movies_cnt.
def get_performers_needed_update(self, limit=None):
    """Return performers whose recorded and actual movie counts disagree.

    Queries ``view_iafd_performers_movies`` for rows where
    ``actual_movies_cnt`` differs from ``movies_cnt``.

    Args:
        limit: Optional maximum number of rows to return. ``None`` (the
            default) means no limit.

    Returns:
        A list of ``{'href': ..., 'name': ...}`` dicts, one per matching
        row. If the query raises ``sqlite3.Error`` the error is logged and
        an empty list is returned.
    """
    sql = """
        SELECT href, name FROM view_iafd_performers_movies where actual_movies_cnt != movies_cnt
    """
    params = ()
    if limit is not None:
        # Bind the limit as a parameter instead of formatting it into the
        # SQL text.
        sql += " LIMIT ?"
        params = (limit,)

    try:
        self.cursor.execute(sql, params)
        # Read from self.cursor: a bare ``cursor`` name is not defined in
        # this scope and would raise NameError, which the handler below
        # does not catch.
        return [
            {'href': href, 'name': name}
            for href, name in self.cursor.fetchall()
        ]
    except sqlite3.Error as e:
        logging.error(f"查询 href 失败: {e}")
        return []
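# Build a per-performer movie-count statistics table, used to check whether the performer/movie counts aggregated from the movie lists equal the movie counts scraped from the performer list.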
def check_and_create_stat_table(self, taskid = 0):
    """(Re)build the statistics table ``iafd_tmp_performers_stat_<taskid>``.

    First makes sure three helper indexes exist, then drops any existing
    statistics table for ``taskid`` and recreates it from a query that
    yields, for every row of ``iafd_performers``: ``id``, ``href``,
    ``name``, ``movies_cnt``, ``actor_movie_count`` (rows in
    ``iafd_performers_movies`` for that performer) and
    ``director_movie_count`` (rows in ``iafd_movies`` whose ``director_id``
    is that performer).

    Args:
        taskid: Suffix appended to the table name; defaults to ``0``.

    Returns:
        None. ``sqlite3.Error`` is caught and logged as a warning.
    """
    try:
        # Create each helper index only when sqlite_master does not list it.
        index_defs = [
            ("idx_iafd_performers_movies_performer_id",
             "CREATE INDEX idx_iafd_performers_movies_performer_id ON iafd_performers_movies (performer_id);"),
            ("idx_iafd_movies_director_id",
             "CREATE INDEX idx_iafd_movies_director_id ON iafd_movies (director_id);"),
            ("idx_iafd_performers_id",
             "CREATE INDEX idx_iafd_performers_id ON iafd_performers (id);"),
        ]
        for index_name, create_index_sql in index_defs:
            self.cursor.execute(
                "SELECT name FROM sqlite_master WHERE type='index' AND name=?",
                (index_name,),
            )
            if not self.cursor.fetchone():
                self.cursor.execute(create_index_sql)
                logging.info(f"Index {index_name} created successfully.")
            else:
                logging.info(f"Index {index_name} already exists.")

        table_name = f"iafd_tmp_performers_stat_{taskid}"
        # SQLite cannot bind an identifier through a "?" placeholder, so the
        # table name has to be part of the SQL text. Quote it (doubling any
        # embedded double quote) so an unusual taskid cannot alter the
        # statement.
        quoted_name = '"' + table_name.replace('"', '""') + '"'

        # Drop a previous copy so the table is rebuilt from current data.
        self.cursor.execute(f"DROP TABLE IF EXISTS {quoted_name}")
        self.conn.commit()

        create_table_sql = f"""
            CREATE table {quoted_name} AS
            SELECT
                id,
                href,
                name,
                movies_cnt,
                SUM(CASE WHEN role = 'actor' THEN movie_count ELSE 0 END) AS actor_movie_count,
                SUM(CASE WHEN role = 'director' THEN movie_count ELSE 0 END) AS director_movie_count
            FROM (
                SELECT
                    p.id,
                    p.href,
                    p.name,
                    p.movies_cnt,
                    COUNT(apm.movie_id) AS movie_count,
                    'actor' AS role
                FROM
                    iafd_performers p
                LEFT JOIN
                    iafd_performers_movies apm ON p.id = apm.performer_id
                GROUP BY
                    p.id, p.href, p.name, p.movies_cnt
                UNION ALL
                SELECT
                    p.id,
                    p.href,
                    p.name,
                    p.movies_cnt,
                    COUNT(im.id) AS movie_count,
                    'director' AS role
                FROM
                    iafd_performers p
                LEFT JOIN
                    iafd_movies im ON p.id = im.director_id
                GROUP BY
                    p.id, p.href, p.name, p.movies_cnt
            ) combined
            GROUP BY
                id, href, name, movies_cnt;
        """
        self.cursor.execute(create_table_sql)
        logging.info(f"table {table_name} created successfully.")

        # Commit the new table; the connection itself is left open.
        self.conn.commit()
    except sqlite3.Error as e:
        logging.warning(f"An error occurred: {e}")
# Recompute the `uncensored` flag of the rows in javbus_movies.
def reset_movies_uncensored(self, check_and_do = 0):
    """Recompute ``javbus_movies.uncensored`` from related ``href`` values.

    A movie is collected when the ``href`` of any linked actor, label,
    series or studio contains the text ``uncensored``. The matching movie
    ids are written to the temporary table ``temp_movies_to_update`` and
    the totals are logged.

    Args:
        check_and_do: When falsy (default ``0``) the method only collects
            and logs the counts. When truthy it sets ``uncensored`` to 0 on
            every row of ``javbus_movies``, then to 1 on the collected
            rows, and commits.

    Returns:
        None. On ``sqlite3.Error`` the connection is rolled back and the
        error is logged.
    """
    try:
        logging.info("创建临时表以便于保存待更新记录")
        self.cursor.execute("""
            CREATE TEMPORARY TABLE IF NOT EXISTS temp_movies_to_update (
                movie_id INTEGER PRIMARY KEY
            )
        """)
        # Clear rows left behind by an earlier run on this connection.
        self.cursor.execute("DELETE FROM temp_movies_to_update")

        logging.info("开始收集需要更新的影片ID...")
        # One statement gathers every movie that matches through any of the
        # joined tables (actors, labels, series, studios).
        self.cursor.execute("""
            INSERT OR IGNORE INTO temp_movies_to_update (movie_id)
            SELECT DISTINCT m.id
            FROM javbus_movies m
            -- 连接演员表
            LEFT JOIN javbus_actors_movies am ON m.id = am.movie_id
            LEFT JOIN javbus_actors a ON am.actor_id = a.id
            -- 连接标签/系列/工作室表
            LEFT JOIN javbus_labels l ON m.label_id = l.id
            LEFT JOIN javbus_series s ON m.series_id = s.id
            LEFT JOIN javbus_studios st ON m.studio_id = st.id
            -- 筛选条件任一表的href包含'uncensored'
            WHERE a.href LIKE '%uncensored%'
            OR l.href LIKE '%uncensored%'
            OR s.href LIKE '%uncensored%'
            OR st.href LIKE '%uncensored%'
        """)

        total_count = self.cursor.execute(
            "SELECT COUNT(*) FROM temp_movies_to_update"
        ).fetchone()[0]
        total_movies = self.cursor.execute(
            "SELECT COUNT(*) FROM javbus_movies"
        ).fetchone()[0]
        logging.info(f"共收集到 {total_count} 部需要更新的影片, 共有 {total_movies} 部影片")

        if check_and_do:
            # Step 1: reset the flag on every movie.
            logging.info("开始将所有影片的uncensored设为默认值0...")
            self.cursor.execute("UPDATE javbus_movies SET uncensored = 0")
            logging.info(f"已将 {self.cursor.rowcount} 条记录的uncensored设为0")

            # Step 2: raise the flag on the collected movies only.
            logging.info("开始将匹配的影片的uncensored设为1...")
            self.cursor.execute("""
                UPDATE javbus_movies
                SET uncensored = 1
                WHERE id IN (SELECT movie_id FROM temp_movies_to_update)
            """)
            logging.info(f"已将 {self.cursor.rowcount} 条记录的uncensored设为1")
            self.conn.commit()
        else:
            logging.info("check完毕本次忽略更新。。。")
        logging.info("任务执行完成!")
    except sqlite3.Error as e:
        self.conn.rollback()
        # Log once, with a message that describes this operation (the
        # previous text was copied from an insert routine and the same
        # exception was logged twice).
        logging.error("Error resetting uncensored flags: %s", e)
# Recompute the movies_cnt column of the actors table from the actor/movie link table.
def reset_actor_movies(self, check_and_do = 0):
    """Recompute ``movies_cnt`` for every row of the actors table.

    Adds the ``movies_cnt`` column to ``self.tbl_name_actors`` when it is
    missing, counts the rows per ``actor_id`` in
    ``self.tbl_name_actor_movie`` into a temporary table, and writes those
    counts back (0 for actors without any linked movie).

    Args:
        check_and_do: Accepted for signature compatibility; this method
            does not read it and always performs the update.

    Returns:
        None. On ``sqlite3.Error`` the connection is rolled back and the
        error is logged.
    """
    # NOTE(review): tbl_name_actors and tbl_name_actor_movie are not
    # assigned in the __init__ visible in this file; presumably they are
    # provided elsewhere (base class or caller) -- confirm.
    try:
        # Add the movies_cnt column only when the table does not have it.
        self.cursor.execute(f"PRAGMA table_info({self.tbl_name_actors});")
        columns = [row[1] for row in self.cursor.fetchall()]
        if 'movies_cnt' not in columns:
            add_field_sql = f"""
                ALTER TABLE {self.tbl_name_actors} ADD COLUMN movies_cnt INTEGER DEFAULT 0 NOT NULL;
            """
            self.cursor.execute(add_field_sql)
            logging.info("成功添加movies_cnt字段")
        else:
            logging.info("movies_cnt字段已存在跳过添加")

        # Make sure the link table is indexed on actor_id.
        self.cursor.execute(f"""
            CREATE INDEX IF NOT EXISTS idx_actor_movie_actor_id
            ON {self.tbl_name_actor_movie}(actor_id);
        """)

        # A temp table from an earlier call on the same connection would
        # make the CREATE statements below fail with "already exists" and
        # the update would be skipped. Dropping the table also drops its
        # index.
        self.cursor.execute("DROP TABLE IF EXISTS temp.temp_actor_counts;")

        # Per-actor movie counts.
        self.cursor.execute(f"""
            CREATE TEMPORARY TABLE temp_actor_counts AS
            SELECT actor_id, COUNT(movie_id) AS cnt
            FROM {self.tbl_name_actor_movie}
            GROUP BY actor_id;
        """)
        self.cursor.execute("CREATE INDEX idx_temp_actor_id ON temp_actor_counts(actor_id);")

        # Write the counts back; COALESCE yields 0 for actors that have no
        # row in the temp table.
        self.cursor.execute(f"""
            UPDATE {self.tbl_name_actors}
            SET movies_cnt = COALESCE((
                SELECT cnt FROM temp_actor_counts
                WHERE actor_id = {self.tbl_name_actors}.id
            ), 0); -- 使用COALESCE处理没有影片的演员
        """)
        updated_rows = self.cursor.rowcount
        logging.info(f"成功更新{updated_rows}个演员的影片数量")
        self.conn.commit()
        logging.info("任务执行完成!")
    except sqlite3.Error as e:
        self.conn.rollback()
        logging.error("Error updating actor movie_cnt: %s", e)