234 lines
10 KiB
Python
234 lines
10 KiB
Python
import os
|
||
import sqlite3
|
||
import json
|
||
import logging
|
||
from datetime import datetime
|
||
from typing import List, Dict
|
||
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, default_dbpath, shared_db_path, test_db_path
|
||
import scrapy_proj.comm.comm_def as comm
|
||
import scrapy_proj.items as items_def
|
||
from scrapy_proj.utils.utils import pretty_json_simple, is_valid_url
|
||
|
||
class IAFDDBHandler(SQLiteDBHandler):
    """SQLite helper for the IAFD tables (performers, movies and their links).

    Every method runs its SQL through ``self.cursor`` and commits or rolls
    back through ``self.conn``.
    NOTE(review): both attributes are presumably created by
    ``SQLiteDBHandler.__init__`` -- confirm against the base class.
    """

    def __init__(self, db_path=shared_db_path):
        """Open the database via the base class and record the table names.

        Args:
            db_path: Database path forwarded unchanged to ``SQLiteDBHandler``.
        """
        super().__init__(db_path)
        self.tbl_name_performers = 'iafd_performers'
        self.tbl_name_movies = 'iafd_movies'
        self.tbl_name_performer_movies = 'iafd_performers_movies'
        self.tbl_name_alias = 'iafd_performer_aliases'
        # NOTE(review): the attribute name ("moives") and the table name
        # ("appers") both look misspelled; they are kept as-is because other
        # code or the existing schema may depend on the exact spelling.
        self.tbl_name_moives_appear_in = 'iafd_movies_appers_in'
        self.tbl_name_studio = 'iafd_studios'
        self.tbl_name_dist = 'iafd_distributors'
        self.tbl_name_performer_urls = 'iafd_performer_urls'
        self.tbl_name_ethnic = 'iafd_meta_ethnic'
        self.tbl_name_thelordofporn_actress = 'thelordofporn_actress'

    def get_performers_needed_update(self, limit=None):
        """Return performers whose two movie counters disagree.

        Selects ``href`` and ``name`` from ``view_iafd_performers_movies`` for
        every row where ``actual_movies_cnt != movies_cnt``.

        Args:
            limit: Optional maximum number of rows; ``None`` means no limit.

        Returns:
            A list of ``{'href': ..., 'name': ...}`` dicts. An empty list is
            returned when the query raises ``sqlite3.Error`` (the error is
            logged, not re-raised).
        """
        sql = (
            "SELECT href, name FROM view_iafd_performers_movies "
            "WHERE actual_movies_cnt != movies_cnt"
        )
        params = ()
        if limit is not None:
            # Bind the limit as a parameter rather than formatting it into
            # the SQL text.
            sql += " LIMIT ?"
            params = (limit,)

        try:
            self.cursor.execute(sql, params)
            # The rows must be fetched from self.cursor; a bare ``cursor``
            # name does not exist in this scope.
            return [
                {'href': row[0], 'name': row[1]}
                for row in self.cursor.fetchall()
            ]
        except sqlite3.Error as e:
            logging.error(f"查询 href 失败: {e}")
            return []

    def check_and_create_stat_table(self, taskid=0):
        """Ensure helper indexes exist and rebuild the per-task stat table.

        The table ``iafd_tmp_performers_stat_<taskid>`` holds one row per
        performer with ``id``, ``href``, ``name``, ``movies_cnt`` plus two
        aggregated counters: ``actor_movie_count`` (rows in
        ``iafd_performers_movies``) and ``director_movie_count`` (rows in
        ``iafd_movies`` where the performer is the director). It allows the
        aggregated numbers to be compared with the stored ``movies_cnt``.

        An existing table with the same name is dropped and recreated.

        Args:
            taskid: Suffix used to build the table name.

        Returns:
            None. ``sqlite3.Error`` is logged as a warning and swallowed.
        """
        try:
            # Create each supporting index only if it is not there yet, and
            # log which of the two cases happened.
            indexes = [
                ("idx_iafd_performers_movies_performer_id",
                 "CREATE INDEX idx_iafd_performers_movies_performer_id ON iafd_performers_movies (performer_id);"),
                ("idx_iafd_movies_director_id",
                 "CREATE INDEX idx_iafd_movies_director_id ON iafd_movies (director_id);"),
                ("idx_iafd_performers_id",
                 "CREATE INDEX idx_iafd_performers_id ON iafd_performers (id);"),
            ]
            for index_name, create_index_sql in indexes:
                self.cursor.execute(
                    "SELECT name FROM sqlite_master WHERE type='index' AND name=?",
                    (index_name,),
                )
                if not self.cursor.fetchone():
                    self.cursor.execute(create_index_sql)
                    logging.info(f"Index {index_name} created successfully.")
                else:
                    logging.info(f"Index {index_name} already exists.")

            view_name = f"iafd_tmp_performers_stat_{taskid}"
            # Identifiers cannot be bound with "?" placeholders, so the name
            # is embedded as a double-quoted identifier with any embedded
            # quote doubled.
            quoted_name = '"' + view_name.replace('"', '""') + '"'

            # Drop a leftover table from a previous run before rebuilding it.
            self.cursor.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
                (view_name,),
            )
            if self.cursor.fetchone():
                self.cursor.execute(f"DROP TABLE {quoted_name}")
                self.conn.commit()

            create_view_sql = f"""
            CREATE table {quoted_name} AS
            SELECT
                id,
                href,
                name,
                movies_cnt,
                SUM(CASE WHEN role = 'actor' THEN movie_count ELSE 0 END) AS actor_movie_count,
                SUM(CASE WHEN role = 'director' THEN movie_count ELSE 0 END) AS director_movie_count
            FROM (
                SELECT
                    p.id,
                    p.href,
                    p.name,
                    p.movies_cnt,
                    COUNT(apm.movie_id) AS movie_count,
                    'actor' AS role
                FROM
                    iafd_performers p
                LEFT JOIN
                    iafd_performers_movies apm ON p.id = apm.performer_id
                GROUP BY
                    p.id, p.href, p.name, p.movies_cnt

                UNION ALL

                SELECT
                    p.id,
                    p.href,
                    p.name,
                    p.movies_cnt,
                    COUNT(im.id) AS movie_count,
                    'director' AS role
                FROM
                    iafd_performers p
                LEFT JOIN
                    iafd_movies im ON p.id = im.director_id
                GROUP BY
                    p.id, p.href, p.name, p.movies_cnt
            ) combined
            GROUP BY
                id, href, name, movies_cnt;
            """
            self.cursor.execute(create_view_sql)
            logging.info(f"table {view_name} created successfully.")

            # Commit the changes (the connection itself stays open).
            self.conn.commit()
        except sqlite3.Error as e:
            logging.warning(f"An error occurred: {e}")

    def reset_movies_uncensored(self, check_and_do=0):
        """Recompute the ``uncensored`` flag on ``javbus_movies``.

        A movie is collected when the ``href`` of any linked actor, label,
        series or studio contains ``uncensored``. The matching ids are staged
        in the temporary table ``temp_movies_to_update`` and the totals are
        logged.

        Args:
            check_and_do: When truthy, reset ``uncensored`` to 0 on every
                movie, set it to 1 on the collected ones and commit. When
                falsy, only the counts are logged and nothing is updated.

        Returns:
            None. On ``sqlite3.Error`` the transaction is rolled back and the
            error is logged.
        """
        try:
            logging.info("创建临时表以便于保存待更新记录")
            self.cursor.execute("""
                CREATE TEMPORARY TABLE IF NOT EXISTS temp_movies_to_update (
                    movie_id INTEGER PRIMARY KEY
                )
            """)
            # Clear the staging table in case an earlier call left rows in it.
            self.cursor.execute("DELETE FROM temp_movies_to_update")

            logging.info("开始收集需要更新的影片ID...")
            # One statement both evaluates all the conditions and stages the
            # matching movie ids.
            self.cursor.execute("""
                INSERT OR IGNORE INTO temp_movies_to_update (movie_id)
                SELECT DISTINCT m.id
                FROM javbus_movies m
                -- 连接演员表
                LEFT JOIN javbus_actors_movies am ON m.id = am.movie_id
                LEFT JOIN javbus_actors a ON am.actor_id = a.id
                -- 连接标签/系列/工作室表
                LEFT JOIN javbus_labels l ON m.label_id = l.id
                LEFT JOIN javbus_series s ON m.series_id = s.id
                LEFT JOIN javbus_studios st ON m.studio_id = st.id
                -- 筛选条件:任一表的href包含'uncensored'
                WHERE a.href LIKE '%uncensored%'
                OR l.href LIKE '%uncensored%'
                OR s.href LIKE '%uncensored%'
                OR st.href LIKE '%uncensored%'
            """)

            total_count = self.cursor.execute(
                "SELECT COUNT(*) FROM temp_movies_to_update"
            ).fetchone()[0]
            total_movies = self.cursor.execute(
                "SELECT COUNT(*) FROM javbus_movies"
            ).fetchone()[0]
            logging.info(f"共收集到 {total_count} 部需要更新的影片, 共有 {total_movies} 部影片")

            if check_and_do:
                # Step 1: reset every movie to the default value 0.
                logging.info("开始将所有影片的uncensored设为默认值0...")
                self.cursor.execute("UPDATE javbus_movies SET uncensored = 0")
                logging.info(f"已将 {self.cursor.rowcount} 条记录的uncensored设为0")

                # Step 2: flag the staged movies with 1.
                logging.info("开始将匹配的影片的uncensored设为1...")
                self.cursor.execute("""
                    UPDATE javbus_movies
                    SET uncensored = 1
                    WHERE id IN (SELECT movie_id FROM temp_movies_to_update)
                """)
                logging.info(f"已将 {self.cursor.rowcount} 条记录的uncensored设为1")

                self.conn.commit()
            else:
                logging.info("check完毕,本次忽略更新。。。")

            logging.info("任务执行完成!")

        except sqlite3.Error as e:
            self.conn.rollback()
            # Logged once; the former extra "Error inserting movie" message
            # described an operation this method does not perform.
            logging.error(f"query error: {e}")

    def reset_actor_movies(self, check_and_do=0):
        """Recompute ``movies_cnt`` for every actor from the link table.

        Adds the ``movies_cnt`` column to the actors table when it is missing,
        makes sure the link table is indexed on ``actor_id``, aggregates the
        per-actor movie counts into the temporary table ``temp_actor_counts``
        and copies them into the actors table (0 for actors without movies).

        NOTE(review): ``self.tbl_name_actors`` and
        ``self.tbl_name_actor_movie`` are not assigned in this class's
        ``__init__``; presumably they come from the base class -- confirm,
        otherwise this method raises ``AttributeError``.

        Args:
            check_and_do: Accepted for signature compatibility; the method
                body never reads it.

        Returns:
            None. On ``sqlite3.Error`` the transaction is rolled back and the
            error is logged.
        """
        try:
            # Add the movies_cnt column only when the table lacks it.
            self.cursor.execute(f"PRAGMA table_info({self.tbl_name_actors});")
            columns = [row[1] for row in self.cursor.fetchall()]

            if 'movies_cnt' not in columns:
                add_field_sql = f"""
                    ALTER TABLE {self.tbl_name_actors} ADD COLUMN movies_cnt INTEGER DEFAULT 0 NOT NULL;
                """
                self.cursor.execute(add_field_sql)
                logging.info("成功添加movies_cnt字段")
            else:
                logging.info("movies_cnt字段已存在,跳过添加")

            # Make sure the link table has an index on actor_id.
            self.cursor.execute(f"""
                CREATE INDEX IF NOT EXISTS idx_actor_movie_actor_id
                ON {self.tbl_name_actor_movie}(actor_id);
            """)

            # A temporary table lives as long as the connection, so remove
            # the one left by an earlier call; otherwise the CREATE below
            # fails with "table temp_actor_counts already exists". Dropping
            # the table also drops its index.
            self.cursor.execute("DROP TABLE IF EXISTS temp_actor_counts;")

            # Stage the per-actor counts in a temporary table.
            self.cursor.execute(f"""
                CREATE TEMPORARY TABLE temp_actor_counts AS
                SELECT actor_id, COUNT(movie_id) AS cnt
                FROM {self.tbl_name_actor_movie}
                GROUP BY actor_id;
            """)

            # Index the staging table for the correlated lookup below.
            self.cursor.execute("CREATE INDEX idx_temp_actor_id ON temp_actor_counts(actor_id);")

            # Copy the counts into the main table.
            self.cursor.execute(f"""
                UPDATE {self.tbl_name_actors}
                SET movies_cnt = COALESCE((
                    SELECT cnt FROM temp_actor_counts
                    WHERE actor_id = {self.tbl_name_actors}.id
                ), 0); -- 使用COALESCE处理没有影片的演员
            """)
            updated_rows = self.cursor.rowcount
            logging.info(f"成功更新{updated_rows}个演员的影片数量")

            self.conn.commit()
            logging.info("任务执行完成!")

        except sqlite3.Error as e:
            self.conn.rollback()
            logging.error("Error updating actor movie_cnt: %s", e)
|