modify scripts

This commit is contained in:
2025-11-13 08:34:28 +08:00
parent 15c4f7b823
commit 40eae5569a
5 changed files with 581 additions and 1 deletions

11
.gitignore vendored Normal file
View File

@ -0,0 +1,11 @@
# 其他已有的忽略规则
*.pyc
__pycache__/
# 忽略环境配置文件
.env
# 忽略所有 log、data 和 result 目录
**/log/
**/data/
**/result/

View File

@ -1,7 +1,7 @@
#!/bin/bash
SRC="/volume1/docker/sharedata/stock_data/pdfs"
DST="/volume1/docker/sharedata/stock_data/em_reports_consume"
LOG="./paperless.log"
LOG="./log/paperless.log"
TARGET_UID=1000
TARGET_GID=1000
@ -16,6 +16,13 @@ if [ ! -d "$DST" ]; then
exit 1
fi
# Make sure the directory that holds $LOG exists before anything is appended
# to the log. mkdir -p also creates missing parent directories and does not
# fail when the directory is already present.
LOG_DIR=$(dirname "$LOG") # directory part of the log file path
if [ ! -d "$LOG_DIR" ]; then
mkdir -p "$LOG_DIR"
echo "$(date '+%F %T') [INFO] log目录不存在已创建: $LOG_DIR" | tee -a "$LOG"
fi
COUNT=0
for f in "$SRC"/*.pdf; do
[ -f "$f" ] || continue

View File

@ -0,0 +1,263 @@
import sqlite3
import os
import logging
import json
from datetime import datetime
import argparse
import re
# Output directory for the log file and the JSON report. It is created up
# front because the FileHandler below opens its file when it is constructed.
res_dir = './result'
os.makedirs(res_dir, exist_ok=True)

# Send INFO-and-above records to both the log file and the console.
logging.basicConfig(
    handlers=[
        logging.FileHandler(f'{res_dir}/rename_files.log'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
def preload_folders(conn, prefix):
    """Load folder paths into a dict mapping folder id -> path.

    Args:
        conn: Open sqlite3 connection to a database that has a ``folders``
            table with ``id`` and ``path`` columns.
        prefix: Optional text filter. When it is non-empty after stripping
            whitespace, only folders whose path contains it are returned
            (despite the name it is matched anywhere in the path, via
            ``LIKE '%prefix%'``; ``%`` and ``_`` inside it keep their LIKE
            wildcard meaning).

    Returns:
        dict of ``{folder_id: path}``.

    Raises:
        sqlite3.Error: Logged and re-raised when the query fails.
    """
    sql = "SELECT id, path FROM folders"
    params = ()
    if prefix and prefix.strip():
        # Bind the pattern as a parameter instead of pasting it into the SQL
        # text: a quote character in the user-supplied prefix used to break
        # the statement (or change its meaning).
        sql += " WHERE path LIKE ?"
        params = (f"%{prefix}%",)
    try:
        cursor = conn.cursor()
        cursor.execute(sql, params)
        return {folder_id: path for folder_id, path in cursor.fetchall()}
    except sqlite3.Error as e:
        logger.error(f"预加载文件夹信息失败: {str(e)}")
        raise
def preload_studios(conn):
    """Load every studio into a dict mapping studio id -> name.

    The returned dict additionally maps ``None`` to ``"UnknownStudio"``.

    Raises:
        sqlite3.Error: Logged and re-raised when the query fails.
    """
    try:
        cur = conn.cursor()
        cur.execute("SELECT id, name FROM studios")
        name_by_id = {}
        for record in cur.fetchall():
            name_by_id[record[0]] = record[1]
        # Placeholder entry for a missing studio id.
        name_by_id[None] = "UnknownStudio"
        return name_by_id
    except sqlite3.Error as e:
        logger.error(f"预加载工作室信息失败: {str(e)}")
        raise
def get_performers(conn, scene_id):
    """Return the scene's performer names, sorted by name and comma-joined.

    Falls back to ``"UnknownPerformers"`` when the scene has no performers.

    Raises:
        sqlite3.Error: Logged and re-raised when the query fails.
    """
    sql = """
        SELECT p.name
        FROM performers p
        JOIN performers_scenes ps ON p.id = ps.performer_id
        WHERE ps.scene_id = ?
        ORDER BY p.name
    """
    try:
        cur = conn.cursor()
        cur.execute(sql, (scene_id,))
        names = [record[0] for record in cur.fetchall()]
        joined = ','.join(names)
        if joined:
            return joined
        return "UnknownPerformers"
    except sqlite3.Error as e:
        logger.error(f"获取演员信息失败 (scene_id={scene_id}): {str(e)}")
        raise
def parse_date(date_str):
    """Convert a date string to ``yyyy.mm.dd``.

    The accepted input layouts are tried in order and the first one that
    parses wins. Empty or unparseable input yields ``"0000.00.00"`` (the
    latter also logs a warning).
    """
    fallback = "0000.00.00"
    if not date_str:
        return fallback

    candidate_formats = (
        "%Y-%m-%d", "%Y/%m/%d", "%d-%m-%Y", "%d/%m/%Y",
        "%Y%m%d", "%m-%d-%Y", "%m/%d/%Y",
    )
    for layout in candidate_formats:
        try:
            parsed = datetime.strptime(date_str, layout)
        except ValueError:
            continue
        return parsed.strftime("%Y.%m.%d")

    logger.warning(f"无法解析日期格式: {date_str},使用默认值")
    return fallback
def get_file_extension(basename):
    """Return the lower-cased text after the last dot, or '' if there is no dot."""
    _stem, dot, suffix = basename.rpartition('.')
    return suffix.lower() if dot else ''
def sanitize_filename(name):
    """Replace every character of ``/\\:*?"<>|`` in ``name`` with a hyphen."""
    replacements = str.maketrans({ch: '-' for ch in '/\\:*?"<>|'})
    return name.translate(replacements)
def process_scene_files(conn, mode, prefix):
    """Build the rename plan for every scene/file mapping and optionally apply it.

    Each row of ``scenes_files`` (joined with ``files`` and ``scenes``) gets a
    new basename of the form
    ``<Studio>.<yyyy.mm.dd> <performers> - <title>[ (<code>)][.<ext>]``.
    A row is skipped when file or scene data is missing, when its folder is
    not in the preloaded folder map (which is already filtered by ``prefix``),
    when its studio is unknown, or when the generated name exceeds 254
    characters.

    The plan is written to ``{res_dir}/rename_results.json``.

    Args:
        conn: Open sqlite3 connection.
        mode: ``'run'`` renames the files on disk; any other value (the CLI
            passes ``'check'``) only logs and records the plan.
        prefix: Folder filter forwarded to ``preload_folders``.

    Returns:
        list of dicts with keys ``file_id``, ``scene_id``, ``original_name``
        and ``dest_name``. Entries are recorded before the run-mode existence
        checks, so the list describes the plan, not the renames that were
        actually performed.

    Raises:
        sqlite3.Error: Logged (and rolled back in run mode) before being
            re-raised.
    """
    results = []
    try:
        # 1. Cache folders and studios in memory (two queries in total).
        folders = preload_folders(conn, prefix)
        studios = preload_studios(conn)
        logger.info(f"预加载完成 - 文件夹: {len(folders)} 个, 工作室: {len(studios)}")

        # 2. Fetch every mapping together with its file and scene in one query.
        cursor = conn.cursor()
        query = """
            SELECT
                sf.scene_id, sf.file_id,
                f.id AS file_id, f.basename, f.parent_folder_id,
                s.title, s.date as release_date, s.studio_id, s.code
            FROM scenes_files sf
            LEFT JOIN files f ON sf.file_id = f.id
            LEFT JOIN scenes s ON sf.scene_id = s.id
        """
        cursor.execute(query)
        mappings = cursor.fetchall()
        logger.info(f"共找到 {len(mappings)} 条场景-文件映射记录")

        for idx, row in enumerate(mappings, 1):
            try:
                # Unpack the joined row.
                scene_id = row[0]
                file_id = row[1]
                file_info = {
                    'id': row[2],
                    'basename': row[3],
                    'parent_folder_id': row[4]
                }
                scene_info = {
                    'title': row[5],
                    'release_date': row[6],
                    'studio_id': row[7],
                    'code': row[8]
                }

                # Skip rows that lack any piece needed to build the name.
                if not file_id or not file_info['id'] or not file_info['basename'] or not file_info['parent_folder_id']:
                    logger.debug(f"文件ID信息不完整 (scene_id={scene_id}, file_id={file_id}),跳过")
                    continue
                if not scene_id or not scene_info['title'] or not scene_info['release_date'] or not scene_info['studio_id']:
                    logger.debug(f"场景信息不完整 (scene_id={scene_id}, file_id={file_id}),跳过")
                    continue

                # 3. Resolve folder path and studio name from the caches.
                folder_path = folders.get(file_info['parent_folder_id'])
                if not folder_path:
                    logger.debug(f"文件夹ID不存在 (folder_id={file_info['parent_folder_id']}),跳过")
                    continue
                studio_name = studios.get(scene_info['studio_id'])
                if not studio_name:
                    logger.debug(f"工作室ID不存在 (studio_id={scene_info['studio_id']}),跳过")
                    continue

                # 4. Performers still need one query per scene.
                performers = get_performers(conn, scene_id)

                # 5. Assemble the new file name. basename and title were
                # validated as non-empty above, so no fallback is needed.
                original_basename = file_info['basename']
                ext = get_file_extension(original_basename)
                release_date = parse_date(scene_info['release_date'])
                title = scene_info['title']

                sanitized_studio = sanitize_filename(studio_name)
                # Performers and title are capped at 100 characters each.
                sanitized_performers = sanitize_filename(performers)[0:100]
                sanitized_title = sanitize_filename(title)[0:100]
                if scene_info.get('code'):
                    # The code becomes part of the file name, so it must be
                    # sanitized like every other component; a raw '/' in it
                    # would otherwise turn the name into a different path.
                    sanitized_code = sanitize_filename(str(scene_info['code']))
                    sanitized_title = f"{sanitized_title} ({sanitized_code})"
                # The studio part carries no whitespace at all.
                sanitized_studio = re.sub(r'\s+', '', sanitized_studio)

                if ext:
                    new_basename = f"{sanitized_studio}.{release_date} {sanitized_performers} - {sanitized_title}.{ext}"
                else:
                    new_basename = f"{sanitized_studio}.{release_date} {sanitized_performers} - {sanitized_title}"
                if len(new_basename) > 254:
                    logger.warning(f"生成的文件名过长,跳过 (file_id={file_id}): {new_basename}")
                    continue

                original_path = os.path.join(folder_path, original_basename)
                new_path = os.path.join(folder_path, new_basename)

                # Record the planned rename (in every mode).
                results.append({
                    'file_id': file_id,
                    'scene_id': scene_id,
                    'original_name': original_path,
                    'dest_name': new_path
                })
                logger.info(f"处理第 {idx}/{len(mappings)} 条: {original_path} -> {new_path}")

                # Run mode: perform the rename on disk.
                if mode == 'run':
                    if not os.path.exists(original_path):
                        logger.warning(f"文件不存在,跳过: {original_path}")
                        continue
                    # Never overwrite an existing file. This also covers the
                    # case where the file already carries the target name.
                    if os.path.exists(new_path):
                        logger.warning(f"目标文件已存在,跳过: {new_path}")
                        continue
                    if original_path != new_path:
                        os.rename(original_path, new_path)
                        # NOTE: files.basename is intentionally not updated
                        # here (the UPDATE statement is disabled), so the
                        # database keeps the old basename after the rename.
                        logger.info(f"已更新文件 (file_id={file_info['id']})")
            except Exception as e:
                # One bad record must not abort the whole batch.
                logger.error(f"处理记录失败 (scene_id={scene_id}, file_id={file_id}): {str(e)}", exc_info=True)
                if mode == 'run':
                    conn.rollback()
                continue

        # Persist the plan.
        with open(f'{res_dir}/rename_results.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        logger.info("处理完成,结果已保存到 rename_results.json")
        return results
    except sqlite3.Error as e:
        logger.error(f"数据库操作失败: {str(e)}", exc_info=True)
        if mode == 'run':
            conn.rollback()
        raise
    finally:
        if mode == 'run':
            conn.commit()
def main():
    """Command-line entry point: parse the options, open the database, process.

    Returns early (after logging an error) when the database file is missing.
    sqlite3 errors are logged here; the connection is always closed.
    """
    parser = argparse.ArgumentParser(description='电影文件重命名工具(优化版)')
    parser.add_argument(
        '--mode',
        choices=['check', 'run'],
        default='check',
        help='运行模式: check(检查) 或 run(执行)',
    )
    parser.add_argument('--db', default='movies.db', help='SQLite数据库文件路径')
    parser.add_argument('--prefix', default='', help='目录前缀,用来过滤文件路径')
    options = parser.parse_args()

    db_path = options.db
    if not os.path.exists(db_path):
        logger.error(f"数据库文件不存在: {db_path}")
        return

    connection = None
    try:
        connection = sqlite3.connect(db_path)
        logger.info(f"成功连接到数据库: {db_path}")
        process_scene_files(connection, options.mode, options.prefix)
    except sqlite3.Error as e:
        logger.error(f"数据库连接失败: {str(e)}", exc_info=True)
    finally:
        if connection:
            connection.close()
            logger.info("数据库连接已关闭")
if __name__ == "__main__":
main()

View File

@ -0,0 +1,288 @@
import sqlite3
import os
import logging
import json
from datetime import datetime
import argparse
import re
# Configure logging: INFO-and-above records go to ./result/rename_files.log
# and to the console.
# logging.FileHandler opens its file as soon as it is constructed, so the
# directory must exist before this point. Creating it only inside main() is
# too late: on a fresh checkout the module failed here with FileNotFoundError.
os.makedirs('./result', exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('./result/rename_files.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
def get_performers(conn, scene_id):
    """Return the scene's performer names, sorted by name and comma-joined.

    The result is an empty string when the scene has no performers.

    Raises:
        sqlite3.Error: Logged and re-raised when the query fails.
    """
    # A single JOIN fetches all names for the scene.
    sql = """
        SELECT p.name
        FROM performers p
        JOIN performers_scenes ps ON p.id = ps.performer_id
        WHERE ps.scene_id = ?
        ORDER BY p.name
    """
    try:
        cur = conn.cursor()
        cur.execute(sql, (scene_id,))
        names = []
        for record in cur.fetchall():
            names.append(record[0])
        return ','.join(names)
    except sqlite3.Error as e:
        logger.error(f"获取演员信息失败 (scene_id={scene_id}): {str(e)}")
        raise
def get_file_info(conn, file_id):
    """Fetch id, basename and parent folder id of one ``files`` row.

    Returns:
        dict with keys ``id``, ``basename`` and ``parent_folder_id``.

    Raises:
        ValueError: No row exists for ``file_id``.
        sqlite3.Error: Logged and re-raised when the query fails.
    """
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT id, basename, parent_folder_id
            FROM files
            WHERE id = ?
        """, (file_id,))
        record = cur.fetchone()
    except sqlite3.Error as e:
        logger.error(f"获取文件信息失败 (file_id={file_id}): {str(e)}")
        raise

    if not record:
        raise ValueError(f"未找到文件信息 (file_id={file_id})")
    return {
        'id': record[0],
        'basename': record[1],
        'parent_folder_id': record[2],
    }
def get_folder_path(conn, folder_id):
    """Return the ``path`` column of the ``folders`` row with the given id.

    Raises:
        ValueError: No row exists for ``folder_id``.
        sqlite3.Error: Logged and re-raised when the query fails.
    """
    try:
        cur = conn.cursor()
        cur.execute("SELECT path FROM folders WHERE id = ?", (folder_id,))
        record = cur.fetchone()
    except sqlite3.Error as e:
        logger.error(f"获取文件夹路径失败 (folder_id={folder_id}): {str(e)}")
        raise

    if record:
        return record[0]
    raise ValueError(f"未找到文件夹路径 (folder_id={folder_id})")
def get_scene_info(conn, scene_id):
    """Fetch title, date and studio id of one ``scenes`` row.

    Returns:
        dict with keys ``title``, ``release_date`` and ``studio_id``.

    Raises:
        ValueError: No row exists for ``scene_id``.
        sqlite3.Error: Logged and re-raised when the query fails.
    """
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT title, date as release_date, studio_id
            FROM scenes
            WHERE id = ?
        """, (scene_id,))
        record = cur.fetchone()
    except sqlite3.Error as e:
        logger.error(f"获取场景信息失败 (scene_id={scene_id}): {str(e)}")
        raise

    if not record:
        raise ValueError(f"未找到场景信息 (scene_id={scene_id})")
    return {
        'title': record[0],
        'release_date': record[1],
        'studio_id': record[2],
    }
def get_studio_name(conn, studio_id):
    """Return the studio's name, or ``"UnknownStudio"`` when the id is unknown.

    A warning is logged for an unknown id.

    Raises:
        sqlite3.Error: Logged and re-raised when the query fails.
    """
    try:
        cur = conn.cursor()
        cur.execute("SELECT name FROM studios WHERE id = ?", (studio_id,))
        record = cur.fetchone()
        if record:
            return record[0]
        logger.warning(f"未找到工作室信息 (studio_id={studio_id}),使用默认名称")
        return "UnknownStudio"
    except sqlite3.Error as e:
        logger.error(f"获取工作室信息失败 (studio_id={studio_id}): {str(e)}")
        raise
def parse_date(date_str):
    """Convert a date string to ``yyyy.mm.dd``.

    Several common layouts are tried in order; the first that parses wins.
    Empty or unparseable input yields ``"0000.00.00"`` (the latter also logs
    a warning).
    """
    if date_str:
        # Candidate layouts, tried in this order.
        for layout in (
            "%Y-%m-%d", "%Y/%m/%d", "%d-%m-%Y", "%d/%m/%Y",
            "%Y%m%d", "%m-%d-%Y", "%m/%d/%Y",
        ):
            try:
                return datetime.strptime(date_str, layout).strftime("%Y.%m.%d")
            except ValueError:
                pass
        logger.warning(f"无法解析日期格式: {date_str},使用默认值")
    return "0000.00.00"
def get_file_extension(basename):
    """Return the lower-cased text after the last dot, or '' if there is no dot."""
    dot_at = basename.rfind('.')
    if dot_at < 0:
        return ''
    return basename[dot_at + 1:].lower()
def sanitize_filename(name):
    """Replace every character of ``/\\:*?"<>|`` in ``name`` with a hyphen."""
    forbidden = '/\\:*?"<>|'
    return ''.join('-' if ch in forbidden else ch for ch in name)
def process_scene_files(conn, mode, prefix):
    """Build the rename plan for every scene/file mapping and optionally apply it.

    Each ``scenes_files`` row gets a new basename of the form
    ``<studio> - <yyyy.mm.dd> - <performers> - <title>[.<ext>]``. A row is
    skipped when its folder path does not contain ``prefix``, when the scene
    has no performers or lacks title/date/studio, or when the generated name
    exceeds 254 characters.

    The plan is written to ``./result/rename_results.json``.

    Args:
        conn: Open sqlite3 connection.
        mode: ``'run'`` renames the files on disk and updates
            ``files.basename``; any other value (the CLI passes ``'check'``)
            only logs and records the plan.
        prefix: Optional folder filter. When non-empty after stripping
            whitespace, only files whose folder path contains it
            (case-sensitive substring) are processed.

    Returns:
        list of dicts with keys ``file_id``, ``scene_id``, ``original_name``
        and ``dest_name``. Entries are recorded before the run-mode checks,
        so the list describes the plan, not the renames actually performed.

    Raises:
        sqlite3.Error: Logged (and rolled back in run mode) before being
            re-raised.
    """
    results = []
    # The option used to be accepted but silently ignored, so a "filtered"
    # run renamed the entire library. Normalize it once here.
    has_prefix = bool(prefix and prefix.strip())
    try:
        cursor = conn.cursor()
        # Load every scene/file mapping.
        cursor.execute("SELECT scene_id, file_id FROM scenes_files")
        mappings = cursor.fetchall()
        logger.debug(f"共找到 {len(mappings)} 条场景-文件映射记录")

        for idx, (scene_id, file_id) in enumerate(mappings, 1):
            logger.debug(f"处理第 {idx}/{len(mappings)} 条记录 (scene_id={scene_id}, file_id={file_id})")
            try:
                # 1. File record.
                file_info = get_file_info(conn, file_id)
                original_basename = file_info['basename']
                parent_folder_id = file_info['parent_folder_id']

                # 2. Folder path, with the prefix filter applied early so
                # filtered-out files cost no further queries.
                folder_path = get_folder_path(conn, parent_folder_id)
                if has_prefix and prefix not in folder_path:
                    logger.debug(f"文件夹路径不匹配前缀,跳过 (folder_path={folder_path})")
                    continue

                # 3. Performers; scenes without any are skipped.
                performers = get_performers(conn, scene_id)
                if not performers:
                    logger.warning(f"场景 {scene_id} 未找到演员信息,跳过")
                    continue

                # 4. Scene and studio.
                scene_info = get_scene_info(conn, scene_id)
                if not scene_info['title'] or not scene_info['release_date'] or not scene_info['studio_id']:
                    logger.warning(f"场景 {scene_id} 信息不完整,跳过")
                    continue
                title = scene_info['title']
                release_date = parse_date(scene_info['release_date'])
                studio_name = get_studio_name(conn, scene_info['studio_id'])

                # 5. Assemble the new file name; performers and title are
                # capped at 100 characters each.
                ext = get_file_extension(original_basename)
                sanitized_studio = sanitize_filename(studio_name)
                sanitized_performers = sanitize_filename(performers)[0:100]
                sanitized_title = sanitize_filename(title)[0:100]

                if ext:
                    new_basename = f"{sanitized_studio} - {release_date} - {sanitized_performers} - {sanitized_title}.{ext}"
                else:
                    new_basename = f"{sanitized_studio} - {release_date} - {sanitized_performers} - {sanitized_title}"
                if len(new_basename) > 254:
                    logger.warning(f"生成的文件名过长,跳过 (file_id={file_id}): {new_basename}")
                    continue

                original_path = os.path.join(folder_path, original_basename)
                new_path = os.path.join(folder_path, new_basename)

                # Record the planned rename (in every mode).
                results.append({
                    'file_id': file_id,
                    'scene_id': scene_id,
                    'original_name': original_path,
                    'dest_name': new_path
                })
                logger.info(f"准备重命名: {original_path} -> {new_path}")

                # Run mode: rename on disk, then update the database.
                if mode == 'run':
                    if not os.path.exists(original_path):
                        logger.warning(f"文件不存在,跳过: {original_path}")
                        continue

                    if original_path != new_path:
                        # os.rename silently replaces an existing destination
                        # on POSIX; refuse instead of destroying another file.
                        if os.path.exists(new_path):
                            logger.warning(f"目标文件已存在,跳过: {new_path}")
                            continue
                        os.rename(original_path, new_path)
                        logger.info(f"已重命名: {original_path} -> {new_path}")

                    cursor.execute(
                        "UPDATE files SET basename = ? WHERE id = ?",
                        (new_basename, file_id)
                    )
                    conn.commit()
                    logger.info(f"已更新数据库记录 (file_id={file_id})")
            except Exception as e:
                # One bad record must not abort the whole batch.
                logger.error(f"处理记录失败 (scene_id={scene_id}, file_id={file_id}): {str(e)}", exc_info=True)
                if mode == 'run':
                    conn.rollback()
                continue

        # Persist the plan.
        with open('./result/rename_results.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        logger.info("处理完成,结果已保存到 rename_results.json")
        return results
    except sqlite3.Error as e:
        logger.error(f"数据库操作失败: {str(e)}", exc_info=True)
        if mode == 'run':
            conn.rollback()
        raise
    finally:
        if mode == 'run':
            conn.commit()
def main():
    """Command-line entry point: parse the options, open the database, process.

    Returns early (after logging an error) when the database file is missing.
    sqlite3 errors are logged here; the connection is always closed.
    """
    # Command-line options.
    parser = argparse.ArgumentParser(description='电影文件重命名工具')
    parser.add_argument(
        '--mode',
        choices=['check', 'run'],
        default='check',
        help='运行模式: check(检查) 或 run(执行)',
    )
    parser.add_argument('--db', default='movies.db', help='SQLite数据库文件路径')
    parser.add_argument('--prefix', default='', help='目录的前缀,用来匹配')
    options = parser.parse_args()

    # Refuse to continue without an existing database file.
    db_path = options.db
    if not os.path.exists(db_path):
        logger.error(f"数据库文件不存在: {db_path}")
        return

    os.makedirs('./result', exist_ok=True)

    connection = None
    try:
        connection = sqlite3.connect(db_path)
        # Rows become sqlite3.Row objects, which also allow access by column name.
        connection.row_factory = sqlite3.Row
        logger.info(f"成功连接到数据库: {db_path}")

        process_scene_files(connection, options.mode, options.prefix)
    except sqlite3.Error as e:
        logger.error(f"数据库连接失败: {str(e)}", exc_info=True)
    finally:
        if connection:
            connection.close()
            logger.info("数据库连接已关闭")
if __name__ == "__main__":
main()

11
gitignore Normal file
View File

@ -0,0 +1,11 @@
# 其他已有的忽略规则
*.pyc
__pycache__/
# 忽略环境配置文件
.env
# 忽略所有 log、data 和 result 目录
**/log/
**/data/
**/result/