modify scripts

This commit is contained in:
2025-07-28 19:34:14 +08:00
parent e9eacc127c
commit a04f02ec6d
4 changed files with 776 additions and 877 deletions

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,9 @@
import os
import re
import sqlite3
import logging
from datetime import datetime
from typing import List, Dict, Optional, Any
home_dir = os.path.expanduser("~")
global_share_data_dir = f'{home_dir}/sharedata'
@ -49,6 +51,7 @@ class SQLiteDBHandler(metaclass=SingletonMeta): # 应用单例元类
self.lower_sqlite_version = True
self.initialized = True # 标记初始化完成
self._column_cache = {} # 缓存表字段信息,避免重复查询
def __del__(self):
try:
@ -64,14 +67,15 @@ class SQLiteDBHandler(metaclass=SingletonMeta): # 应用单例元类
raise NotImplementedError("子类必须实现 insert_item 方法")
def get_table_columns_and_defaults(self, tbl_name):
"""获取表的字段信息(含默认值),并缓存结果"""
if tbl_name in self._column_cache:
return self._column_cache[tbl_name]
try:
self.cursor.execute(f"PRAGMA table_info({tbl_name})")
columns = self.cursor.fetchall()
column_info = {}
for col in columns:
col_name = col[1]
default_value = col[4]
column_info[col_name] = default_value
column_info = {col[1]: col[4] for col in columns} # col[1]是字段名col[4]是默认值
self._column_cache[tbl_name] = column_info # 缓存结果
return column_info
except sqlite3.Error as e:
logging.error(f"Error getting table columns: {e}")
@ -262,3 +266,322 @@ class SQLiteDBHandler(metaclass=SingletonMeta): # 应用单例元类
def get_stat(self):
    """Return statistics for this handler; this implementation reports none (empty dict)."""
    return dict()
def _validate_fields(self, tbl_name: str, fields: List[str]) -> List[str]:
    """Filter a SELECT field list down to entries that name a real column.

    Args:
        tbl_name: Table whose columns are looked up through
            ``get_table_columns_and_defaults``.
        fields: Requested fields, either a bare column name or
            ``"<column> as <alias>"`` (the ``as`` keyword is matched
            case-insensitively).

    Returns:
        The accepted entries, unchanged and in their original order. An empty
        list when the column lookup yields nothing.
    """
    known_columns = self.get_table_columns_and_defaults(tbl_name)
    if not known_columns:
        return []

    accepted = []
    for field in fields:
        # For "column as alias" only the column part is checked against the table.
        aliased = re.match(r'^(\w+)\s+as\s+\w+$', field, re.IGNORECASE)
        column = aliased.group(1) if aliased else field
        if column not in known_columns:
            logging.warning(f"无效查询字段: 表={tbl_name}, 字段={field}")
            continue
        accepted.append(field)
    return accepted
def _validate_filter_fields(self, tbl_name: str, filters: Dict[str, Any],
                            condition_mapping: Dict[str, str]) -> Dict[str, Any]:
    """Drop filter entries whose (mapped) field is not a column of the table.

    Args:
        tbl_name: Table whose columns are looked up through
            ``get_table_columns_and_defaults``.
        filters: Filter dict keyed by ``<field>`` or ``<field>__<condition>``.
            The special keys ``order_by`` and ``limit`` are passed through
            without any column check.
        condition_mapping: Maps a filter field name to the real column name;
            unmapped names are used as-is.

    Returns:
        A new dict holding only the accepted entries (original keys and
        values). An empty dict when the column lookup yields nothing.
    """
    known_columns = self.get_table_columns_and_defaults(tbl_name)
    if not known_columns:
        return {}

    accepted = {}
    for key, value in filters.items():
        # Ordering and row limit are handled separately by the query builder.
        if key in ("order_by", "limit"):
            accepted[key] = value
            continue

        # Everything before the first "__" is the field; the rest is the condition type.
        base_name = key.split("__", 1)[0]
        column = condition_mapping.get(base_name, base_name)
        if column in known_columns:
            accepted[key] = value
        else:
            logging.warning(f"无效过滤字段: 表={tbl_name}, 字段={base_name} (映射后={column})")
    return accepted
def _validate_order_fields(self, tbl_name: str, allowed_order_fields: List[str]) -> List[str]:
    """Keep only the order-by entries whose column exists in the table.

    Args:
        tbl_name: Table whose columns are looked up through
            ``get_table_columns_and_defaults``.
        allowed_order_fields: Entries such as ``"id"``, ``"id DESC"`` or
            ``"name ASC"``; only the first whitespace-separated token is
            treated as the column name.

    Returns:
        The accepted entries, unchanged and in their original order. An empty
        list when the column lookup yields nothing.
    """
    column_info = self.get_table_columns_and_defaults(tbl_name)
    if not column_info:
        return []

    valid_order_fields = []
    for field in allowed_order_fields:
        tokens = field.split()
        # An empty or whitespace-only entry has no tokens; treat it as invalid
        # instead of letting tokens[0] raise IndexError.
        if tokens and tokens[0] in column_info:
            valid_order_fields.append(field)
        else:
            logging.warning(f"无效排序字段: 表={tbl_name}, 字段={field}")
    return valid_order_fields
def generic_query(
    self,
    table_name: str,
    fields: List[str],
    filters: Dict[str, Any],
    condition_mapping: Optional[Dict[str, str]] = None,
    allowed_order_fields: Optional[List[str]] = None,
    simplify_single_field: bool = True
) -> Optional[List[Any]]:
    """Run a single-table SELECT built from a validated field list and filter dict.

    Args:
        table_name: Table to query (interpolated into the SQL text).
        fields: Columns to select; ``"<column> as <alias>"`` is accepted.
            Entries rejected by ``_validate_fields`` are dropped.
        filters: Keys are ``<field>`` (equality) or ``<field>__<condition>``
            with condition ``eq``, ``like`` (wrapped in ``%...%``), ``in``,
            ``not_in``, ``gt`` or ``lt``. The special keys ``order_by`` and
            ``limit`` control ordering and row count; a value of ``None``
            for either is ignored. Entries rejected by
            ``_validate_filter_fields`` are dropped.
        condition_mapping: Maps a filter field name to the real column name.
        allowed_order_fields: Accepted for backward compatibility; it is
            currently not enforced against ``order_by``.
        simplify_single_field: When exactly one field is selected, return a
            flat list of values instead of a list of one-key dicts.

    Returns:
        ``None`` when no requested field is valid or SQLite raises an error;
        ``[]`` when no row matches; otherwise a list of values (single field,
        simplified) or a list of dicts built with ``dict(row)``.
    """
    condition_mapping = condition_mapping or {}
    filters = filters or {}
    # Conditions that map directly onto a binary SQL operator.
    comparison_ops = {"eq": "=", "gt": ">", "lt": "<"}
    list_ops = {"in": "IN", "not_in": "NOT IN"}

    try:
        valid_fields = self._validate_fields(table_name, fields)
        if not valid_fields:
            logging.error(f"无有效查询字段: 表={table_name}")
            return None

        valid_filters = self._validate_filter_fields(
            table_name, filters, condition_mapping
        )

        sql = f"SELECT {', '.join(valid_fields)} FROM {table_name} WHERE 1=1"
        params = []

        for key, value in valid_filters.items():
            if key in ("order_by", "limit"):
                continue

            field_base, sep, condition_type = key.partition("__")
            if not sep:
                condition_type = "eq"
            field = condition_mapping.get(field_base, field_base)

            if condition_type in comparison_ops:
                sql += f" AND {field} {comparison_ops[condition_type]} ?"
                params.append(value)
            elif condition_type == "like":
                sql += f" AND {field} LIKE ?"
                params.append(f"%{value}%")
            elif condition_type in list_ops:
                keyword = list_ops[condition_type]
                if isinstance(value, list):
                    placeholders = ", ".join(["?"] * len(value))
                    sql += f" AND {field} {keyword} ({placeholders})"
                    params.extend(value)
                else:
                    logging.warning(f"{keyword}条件值必须是列表键: {key}")
            else:
                logging.warning(f"不支持的条件类型: {condition_type},键: {key}")

        # NOTE(review): order_by is interpolated verbatim and is not checked
        # against allowed_order_fields, so it must never carry untrusted input.
        order_by = valid_filters.get("order_by")
        if order_by:
            sql += f" ORDER BY {order_by}"

        limit = valid_filters.get("limit")
        if limit is not None:
            sql += " LIMIT ?"
            params.append(limit)

        self.cursor.execute(sql, params)
        rows = self.cursor.fetchall()
        if not rows:
            return []

        if len(valid_fields) == 1 and simplify_single_field:
            # Index by position: the only selected column is always column 0,
            # whatever the alias spelling ("as" / "AS", extra whitespace).
            return [row[0] for row in rows]
        return [dict(row) for row in rows]

    except sqlite3.Error as e:
        logging.error(f"查询失败: 表={table_name}, 错误={e}")
        return None
def generic_stats_query(self, stats_config: List[Dict[str, str]]) -> Dict[str, int]:
    """Count rows for several tables/conditions with one SELECT.

    Each config entry becomes a ``(SELECT COUNT(*) FROM <table> [WHERE ...])
    AS <alias>`` sub-query; all sub-queries are combined into a single
    statement.

    Args:
        stats_config: List of dicts with the keys
            - ``'table'``: table to count (required),
            - ``'alias'``: name of the result entry (required),
            - ``'where'``: raw SQL condition (optional).
            Entries missing ``table`` or ``alias`` are skipped with a warning.

    Returns:
        Dict mapping each alias to its count as ``int`` (``None`` becomes 0).
        An empty dict when no entry is usable, no row comes back, or SQLite
        raises an error.
    """
    try:
        count_exprs = []
        for config in stats_config:
            table, alias = config.get('table'), config.get('alias')
            if not table or not alias:
                logging.warning(f"统计项配置不完整:{config},跳过")
                continue

            condition = config.get('where')
            where_sql = f" WHERE {condition}" if condition else ""
            count_exprs.append(f"(SELECT COUNT(*) FROM {table}{where_sql}) AS {alias}")

        if not count_exprs:
            logging.warning("无有效统计项配置,返回空结果")
            return {}

        self.cursor.execute("SELECT " + ", ".join(count_exprs))
        row = self.cursor.fetchone()
        if not row:
            logging.warning("统计查询无结果")
            return {}

        # cursor.description carries the aliases in select order.
        aliases = [desc[0] for desc in self.cursor.description]
        return {
            name: (0 if value is None else int(value))
            for name, value in zip(aliases, row)
        }

    except sqlite3.Error as e:
        logging.error(f"统计查询失败: {e}")
        return {}
def generic_get_record_count(
    self,
    table_name: str,
    conditions: Optional[Dict[str, Any]] = None,
    condition_mapping: Optional[Dict[str, str]] = None
) -> int:
    """Count the rows of a table that satisfy the given conditions.

    Args:
        table_name: Table to count (interpolated into the SQL text).
        conditions: Keys are ``<field>`` (equality) or
            ``<field>__<condition>`` with condition ``eq``, ``like`` (wrapped
            in ``%...%``), ``in``, ``not_in``, ``gt`` or ``lt``. All
            conditions are AND-ed. Field names are not validated against the
            table here.
        condition_mapping: Maps a condition field name to the real column name.

    Returns:
        Number of matching rows; 0 when SQLite raises an error.
    """
    condition_mapping = condition_mapping or {}
    conditions = conditions or {}
    # Conditions that map directly onto a binary SQL operator.
    comparison_ops = {"eq": "=", "gt": ">", "lt": "<"}
    list_ops = {"in": "IN", "not_in": "NOT IN"}

    try:
        sql = f"SELECT COUNT(*) AS cnt FROM {table_name} WHERE 1=1"
        params = []

        for key, value in conditions.items():
            field_base, sep, condition_type = key.partition("__")
            if not sep:
                condition_type = "eq"
            field = condition_mapping.get(field_base, field_base)

            if condition_type in comparison_ops:
                sql += f" AND {field} {comparison_ops[condition_type]} ?"
                params.append(value)
            elif condition_type == "like":
                sql += f" AND {field} LIKE ?"
                params.append(f"%{value}%")
            elif condition_type in list_ops:
                keyword = list_ops[condition_type]
                if isinstance(value, list):
                    placeholders = ", ".join(["?"] * len(value))
                    sql += f" AND {field} {keyword} ({placeholders})"
                    params.extend(value)
                else:
                    # Same warning generic_query emits; the condition is skipped.
                    logging.warning(f"{keyword}条件值必须是列表键: {key}")
            else:
                logging.warning(f"不支持的条件类型: {condition_type},键: {key}")

        self.cursor.execute(sql, params)
        row = self.cursor.fetchone()
        return int(row[0]) if row and row[0] is not None else 0

    except sqlite3.Error as e:
        logging.error(f"记录数查询失败: 表={table_name}, 错误={e}")
        return 0

View File

@ -81,8 +81,7 @@ class IAFDSpider(BaseSpider):
if self.debug:
query_args['limit'] = 5
if self.update_mode:
query_args['is_full_data'] = 0
query_args['is_full_data'] = 404
query_args['is_full_data__in'] = [0,404]
# 读取待更新的演员列表
if self.cmd_performers in self.cmd_list:
@ -347,22 +346,22 @@ class IAFDSpider(BaseSpider):
def load_existed_actors(self):
    """Fill ``self.existed_actors`` from the performers stored in the database.

    Each row returned by ``db_tools.get_performers`` is stored under its
    ``href`` as ``{'is_full_data': ..., 'movies_cnt': ...}``. A warning is
    logged when the query returns nothing.
    """
    query_args = {}
    # Single query: the older query_performer_hrefs call was immediately
    # overwritten by this one and has been dropped.
    rows = db_tools.get_performers(**query_args)
    if not rows:
        self.logger.warning(f"get_performers empty. query args: {query_args}")
        return

    for item in rows:
        self.existed_actors[item['href']] = {
            'is_full_data': item['is_full_data'],
            'movies_cnt': item['movies_cnt'],
        }
def load_existed_movies(self):
    """Fill ``self.existed_movies`` from the movies stored in the database.

    Each row returned by ``db_tools.get_movies`` contributes its
    ``is_full_data`` value under its ``href``. A warning is logged when the
    query returns nothing.
    """
    query_args = {}
    # Single query: the older query_movie_hrefs call was immediately
    # overwritten by this one and has been dropped.
    rows = db_tools.get_movies(**query_args)
    if not rows:
        self.logger.warning(f"get_movies empty. query args: {query_args}")
        return

    for item in rows:
        self.existed_movies[item['href']] = item['is_full_data']
# 内存缓存也可以改为查询db
def need_update_movie(self, href):

View File

@ -0,0 +1,233 @@
import os
import sqlite3
import json
import logging
from datetime import datetime
from typing import List, Dict
from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, default_dbpath, shared_db_path, test_db_path
import scrapy_proj.comm.comm_def as comm
import scrapy_proj.items as items_def
from scrapy_proj.utils.utils import pretty_json_simple, is_valid_url
class IAFDDBHandler(SQLiteDBHandler):
def __init__(self, db_path=shared_db_path):
    """Initialise the base handler with ``db_path`` and record the table names used here."""
    super().__init__(db_path)

    # Performer-centred tables.
    self.tbl_name_performers = 'iafd_performers'
    self.tbl_name_performer_movies = 'iafd_performers_movies'
    self.tbl_name_performer_urls = 'iafd_performer_urls'
    self.tbl_name_alias = 'iafd_performer_aliases'

    # Movie-centred tables. The attribute and table spellings below are kept
    # exactly as they are because other code refers to them by these names.
    self.tbl_name_movies = 'iafd_movies'
    self.tbl_name_moives_appear_in = 'iafd_movies_appers_in'

    # Lookup / metadata tables.
    self.tbl_name_studio = 'iafd_studios'
    self.tbl_name_dist = 'iafd_distributors'
    self.tbl_name_ethnic = 'iafd_meta_ethnic'

    # Table belonging to a different source (thelordofporn).
    self.tbl_name_thelordofporn_actress = 'thelordofporn_actress'
# 获取 view_iafd_performers_movies 中数据 不匹配的演员信息。
def get_performers_needed_update(self, limit=None):
    """Return performers whose two movie counts disagree.

    Reads ``view_iafd_performers_movies`` and selects the rows where
    ``actual_movies_cnt != movies_cnt``.

    Args:
        limit: Optional maximum number of rows; ``None`` means no limit.

    Returns:
        A list of ``{'href': ..., 'name': ...}`` dicts; an empty list when the
        query fails or ``limit`` cannot be converted to an integer.
    """
    sql = (
        "SELECT href, name FROM view_iafd_performers_movies "
        "WHERE actual_movies_cnt != movies_cnt"
    )
    params = []
    try:
        if limit is not None:
            # Bind the limit instead of formatting it into the SQL text.
            sql += " LIMIT ?"
            params.append(int(limit))
        self.cursor.execute(sql, params)
        # The rows come from self.cursor; a bare ``cursor`` name is not defined here.
        return [{'href': row[0], 'name': row[1]} for row in self.cursor.fetchall()]
    except (sqlite3.Error, TypeError, ValueError) as e:
        logging.error(f"查询 href 失败: {e}")
        return []
# 生成一个复杂的演员电影数量的查询视图,来判断从电影列表中聚合出来的演员-影片数量,与从演员列表中抓取到的影片数量,是否相等。
def check_and_create_stat_table(self, taskid = 0):
    """(Re)build the per-task performer statistics table.

    First makes sure three supporting indexes exist, then drops any existing
    ``iafd_tmp_performers_stat_<taskid>`` table and recreates it from
    ``iafd_performers``, ``iafd_performers_movies`` and ``iafd_movies``. The
    new table has one row per performer with ``movies_cnt`` plus the number
    of linked movies as actor (``actor_movie_count``) and as director
    (``director_movie_count``).

    Args:
        taskid: Suffix of the statistics table name.

    SQLite errors are logged as warnings and not re-raised.
    """
    try:
        # Create each supporting index unless sqlite_master already lists it.
        indexes = [
            ("idx_iafd_performers_movies_performer_id",
             "CREATE INDEX idx_iafd_performers_movies_performer_id ON iafd_performers_movies (performer_id);"),
            ("idx_iafd_movies_director_id",
             "CREATE INDEX idx_iafd_movies_director_id ON iafd_movies (director_id);"),
            ("idx_iafd_performers_id",
             "CREATE INDEX idx_iafd_performers_id ON iafd_performers (id);")
        ]
        for index_name, create_index_sql in indexes:
            self.cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name=?", (index_name,))
            if not self.cursor.fetchone():
                self.cursor.execute(create_index_sql)
                logging.info(f"Index {index_name} created successfully.")
            else:
                logging.info(f"Index {index_name} already exists.")

        table_name = f"iafd_tmp_performers_stat_{taskid}"
        # A table name cannot be bound as a "?" parameter, so the identifier is
        # formatted into the statement; IF EXISTS makes a prior lookup unnecessary.
        self.cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
        self.conn.commit()

        # Two per-performer counts (as actor, as director) are computed
        # separately, stacked with UNION ALL and pivoted into two columns.
        create_table_sql = f"""
            CREATE table {table_name} AS
            SELECT
                id,
                href,
                name,
                movies_cnt,
                SUM(CASE WHEN role = 'actor' THEN movie_count ELSE 0 END) AS actor_movie_count,
                SUM(CASE WHEN role = 'director' THEN movie_count ELSE 0 END) AS director_movie_count
            FROM (
                SELECT
                    p.id,
                    p.href,
                    p.name,
                    p.movies_cnt,
                    COUNT(apm.movie_id) AS movie_count,
                    'actor' AS role
                FROM
                    iafd_performers p
                LEFT JOIN
                    iafd_performers_movies apm ON p.id = apm.performer_id
                GROUP BY
                    p.id, p.href, p.name, p.movies_cnt
                UNION ALL
                SELECT
                    p.id,
                    p.href,
                    p.name,
                    p.movies_cnt,
                    COUNT(im.id) AS movie_count,
                    'director' AS role
                FROM
                    iafd_performers p
                LEFT JOIN
                    iafd_movies im ON p.id = im.director_id
                GROUP BY
                    p.id, p.href, p.name, p.movies_cnt
            ) combined
            GROUP BY
                id, href, name, movies_cnt;
        """
        self.cursor.execute(create_table_sql)
        logging.info(f"table {table_name} created successfully.")

        self.conn.commit()
    except sqlite3.Error as e:
        logging.warning(f"An error occurred: {e}")
# 处理影片的 无码 字段
def reset_movies_uncensored(self, check_and_do = 0):
    """Recompute the ``uncensored`` flag of ``javbus_movies``.

    A movie counts as uncensored when the ``href`` of any linked actor,
    label, series or studio contains ``uncensored``. Matching movie ids are
    first collected into the temporary table ``temp_movies_to_update``.

    NOTE(review): this method works on ``javbus_*`` tables although it lives
    in the IAFD handler — confirm that this is intended.

    Args:
        check_and_do: Falsy means dry run (only collect and log the counts);
            truthy resets every movie to ``uncensored = 0`` and then sets the
            collected ones to 1.

    SQLite errors roll the transaction back and are logged, not re-raised.
    """
    try:
        logging.info("创建临时表以便于保存待更新记录")
        self.cursor.execute("""
            CREATE TEMPORARY TABLE IF NOT EXISTS temp_movies_to_update (
                movie_id INTEGER PRIMARY KEY
            )
        """)
        # Remove leftovers from an earlier run on the same connection.
        self.cursor.execute("DELETE FROM temp_movies_to_update")

        logging.info("开始收集需要更新的影片ID...")
        # One statement gathers every movie linked to an actor, label, series
        # or studio whose href contains 'uncensored'.
        self.cursor.execute("""
            INSERT OR IGNORE INTO temp_movies_to_update (movie_id)
            SELECT DISTINCT m.id
            FROM javbus_movies m
            LEFT JOIN javbus_actors_movies am ON m.id = am.movie_id
            LEFT JOIN javbus_actors a ON am.actor_id = a.id
            LEFT JOIN javbus_labels l ON m.label_id = l.id
            LEFT JOIN javbus_series s ON m.series_id = s.id
            LEFT JOIN javbus_studios st ON m.studio_id = st.id
            WHERE a.href LIKE '%uncensored%'
               OR l.href LIKE '%uncensored%'
               OR s.href LIKE '%uncensored%'
               OR st.href LIKE '%uncensored%'
        """)

        total_count = self.cursor.execute("SELECT COUNT(*) FROM temp_movies_to_update").fetchone()[0]
        total_movies = self.cursor.execute("SELECT COUNT(*) FROM javbus_movies").fetchone()[0]
        logging.info(f"共收集到 {total_count} 部需要更新的影片, 共有 {total_movies} 部影片")

        if check_and_do:
            # Step 1: reset every movie to the default value 0.
            logging.info("开始将所有影片的uncensored设为默认值0...")
            self.cursor.execute("UPDATE javbus_movies SET uncensored = 0")
            logging.info(f"已将 {self.cursor.rowcount} 条记录的uncensored设为0")

            # Step 2: flag the collected movies.
            logging.info("开始将匹配的影片的uncensored设为1...")
            self.cursor.execute("""
                UPDATE javbus_movies
                SET uncensored = 1
                WHERE id IN (SELECT movie_id FROM temp_movies_to_update)
            """)
            logging.info(f"已将 {self.cursor.rowcount} 条记录的uncensored设为1")
        else:
            logging.info("check完毕本次忽略更新。。。")

        # Commit in both modes: the temp-table writes above open an implicit
        # transaction that would otherwise stay open after a dry run.
        self.conn.commit()
        logging.info("任务执行完成!")

    except sqlite3.Error as e:
        self.conn.rollback()
        logging.error(f"reset_movies_uncensored failed: {e}")
# Recompute each actor's movies_cnt from the actor-movie link table.
def reset_actor_movies(self, check_and_do = 0):
    """Recompute ``movies_cnt`` for every actor from the actor-movie link table.

    Adds the ``movies_cnt`` column to the actors table when it is missing,
    counts the linked movies per ``actor_id`` into the temporary table
    ``temp_actor_counts`` and writes the counts back (0 for actors without
    any linked movie).

    NOTE(review): the table names are read from ``self.tbl_name_actors`` and
    ``self.tbl_name_actor_movie``, which this class's ``__init__`` does not
    set. Confirm which tables are meant; until they are configured the
    method logs an error and returns instead of raising AttributeError.

    Args:
        check_and_do: Currently unused; kept so the signature stays unchanged.

    SQLite errors roll the transaction back and are logged, not re-raised.
    """
    actors_tbl = getattr(self, 'tbl_name_actors', None)
    link_tbl = getattr(self, 'tbl_name_actor_movie', None)
    if not actors_tbl or not link_tbl:
        logging.error(
            "reset_actor_movies skipped: tbl_name_actors / tbl_name_actor_movie are not configured"
        )
        return

    try:
        # Add the movies_cnt column only when the table does not have it yet.
        self.cursor.execute(f"PRAGMA table_info({actors_tbl});")
        columns = [row[1] for row in self.cursor.fetchall()]
        if 'movies_cnt' not in columns:
            self.cursor.execute(
                f"ALTER TABLE {actors_tbl} ADD COLUMN movies_cnt INTEGER DEFAULT 0 NOT NULL;"
            )
            logging.info("成功添加movies_cnt字段")
        else:
            logging.info("movies_cnt字段已存在跳过添加")

        # Make sure the link table is indexed on actor_id.
        self.cursor.execute(f"""
            CREATE INDEX IF NOT EXISTS idx_actor_movie_actor_id
            ON {link_tbl}(actor_id);
        """)

        # Drop the temp table left by an earlier run on this connection;
        # without this the CREATE below fails the second time.
        self.cursor.execute("DROP TABLE IF EXISTS temp.temp_actor_counts;")
        self.cursor.execute(f"""
            CREATE TEMPORARY TABLE temp_actor_counts AS
            SELECT actor_id, COUNT(movie_id) AS cnt
            FROM {link_tbl}
            GROUP BY actor_id;
        """)
        self.cursor.execute("CREATE INDEX idx_temp_actor_id ON temp_actor_counts(actor_id);")

        # COALESCE turns "no row in the temp table" into a count of 0.
        self.cursor.execute(f"""
            UPDATE {actors_tbl}
            SET movies_cnt = COALESCE((
                SELECT cnt FROM temp_actor_counts
                WHERE actor_id = {actors_tbl}.id
            ), 0);
        """)
        updated_rows = self.cursor.rowcount
        logging.info(f"成功更新{updated_rows}个演员的影片数量")

        self.conn.commit()
        logging.info("任务执行完成!")

    except sqlite3.Error as e:
        self.conn.rollback()
        logging.error("Error updating actor movie_cnt: %s", e)