modify scripts

This commit is contained in:
2026-01-11 11:50:55 +08:00
parent dece263c8b
commit 2b0e1c0413
4 changed files with 161 additions and 128 deletions

View File

@ -1,73 +1,149 @@
#!/bin/bash
# File path definitions (one SOURCE_*/DEST_* variable pair per patched file).
SOURCE_PARSE_FILENAME="/usr/src/paperless/git_scripts/parse_filename.py"
DEST_PARSE_FILENAME="/usr/src/paperless/scripts/parse_filename.py"
# File replacement pairs, kept as a flat array laid out as: source file, destination file.
# Add or remove lines as needed; each line holds one "source destination" pair.
# NOTE(review): the parsers.py source listed here lives under .../scripts/, while
# SOURCE_PARSERS below points at .../git_scripts/ — confirm which location is current.
FILE_PAIRS=(
"/usr/src/paperless/scripts/parsers.py" "/usr/src/paperless/src/paperless_tesseract/parsers.py"
# Example: additional file pairs
#"/usr/src/paperless/git_scripts/parse_filename.py" "/usr/src/paperless/scripts/parse_filename.py"
# "/path/to/source/file3" "/path/to/dest/file3"
# "/path/to/source/file4" "/path/to/dest/file4"
)
SOURCE_PARSERS="/usr/src/paperless/git_scripts/parsers.py"
DEST_PARSERS="/usr/src/paperless/src/paperless_tesseract/parsers.py"
# Check whether the files exist.
# Check that all files exist (only the files needed by the replace/check operations).
# Arguments: $1 - operation name; the code below compares it against
#                 "replace", "check" and "rollback".
# NOTE(review): the `for file in ...` loop and the `if` right after it are never
# closed before the pair loop begins, and two `if [ $missing -eq 1 ]` headers share
# a single `fi`. This looks like two revisions of the function rendered together —
# confirm against the real script before relying on this text.
check_files_exist() {
local missing=0
for file in "$SOURCE_PARSE_FILENAME" "$DEST_PARSE_FILENAME" "$SOURCE_PARSERS" "$DEST_PARSERS"; do
if [ ! -f "$file" ]; then
echo "错误:文件不存在 - $file"
local pair_count=${#FILE_PAIRS[@]}
# Walk the file pairs with a step of 2 (one source file plus one destination file form a group).
for ((i=0; i<pair_count; i+=2)); do
local source="${FILE_PAIRS[$i]}"
local dest="${FILE_PAIRS[$i+1]}"
# Pick which files to verify based on the requested operation ($1).
if [ "$1" = "replace" ] || [ "$1" = "check" ]; then
if [ ! -f "$source" ]; then
echo "错误:源文件不存在 - $source"
missing=1
fi
if [ ! -f "$dest" ]; then
echo "错误:目标文件不存在 - $dest"
missing=1
fi
elif [ "$1" = "rollback" ]; then
if [ ! -f "$dest.bak" ]; then
echo "警告:备份文件不存在(未执行过替换?) - $dest.bak"
missing=1
fi
fi
done
if [ $missing -eq 1 ]; then
# The extra `!= "rollback"` test means a missing backup only produces the warning
# above; the script exits here only for the other operations.
if [ $missing -eq 1 ] && [ "$1" != "rollback" ]; then
echo "错误:关键文件缺失,无法继续执行"
exit 1
fi
}
# Show file differences.
# Show the differences for every file pair.
# Runs `diff -u <destination> <source>` and prints a heading before each comparison.
# NOTE(review): the two hard-coded comparisons (parse_filename.py / parsers.py) sit
# next to the FILE_PAIRS loop; they look like an earlier revision of this function
# shown together with the newer one — confirm which lines are live.
show_diffs() {
echo "=== 检查 parse_filename.py 差异 ==="
diff -u "$DEST_PARSE_FILENAME" "$SOURCE_PARSE_FILENAME" || true # `|| true` discards diff's non-zero exit status
local pair_count=${#FILE_PAIRS[@]}
echo "=== 开始检查文件差异 ==="
echo -e "\n=== 检查 parsers.py 差异 ==="
diff -u "$DEST_PARSERS" "$SOURCE_PARSERS" || true
# FILE_PAIRS is flat: index i is the source, index i+1 the destination.
for ((i=0; i<pair_count; i+=2)); do
local source="${FILE_PAIRS[$i]}"
local dest="${FILE_PAIRS[$i+1]}"
echo -e "\n--- 检查 $dest <-> $source 的差异 ---"
diff -u "$dest" "$source" || true # `|| true` discards diff's non-zero exit status
done
}
# Back up both hard-coded destination files, overwrite them with their sources,
# then print the resulting diff.
# Globals: SOURCE_PARSE_FILENAME, DEST_PARSE_FILENAME, SOURCE_PARSERS, DEST_PARSERS (read)
# Returns: the exit status of show_diffs
replace_files() {
  local target

  # Take every backup first, before any destination is overwritten.
  for target in "$DEST_PARSE_FILENAME" "$DEST_PARSERS"; do
    backup_file "$target"
  done

  # Overwrite the destinations with the patched sources.
  cp -f "$SOURCE_PARSE_FILENAME" "$DEST_PARSE_FILENAME"
  cp -f "$SOURCE_PARSERS" "$DEST_PARSERS"

  echo -e "\n=== 替换完成,以下是替换后的差异(应无差异) ==="
  show_diffs
}
# Back up a single file next to itself with a ".bak" suffix.
# Arguments: $1 - path of the file to back up
# Outputs:   progress messages on stdout; an error message on stderr on failure
# Returns:   0 when the backup was written, 1 when the source is missing or the
#            copy failed (no success message is printed in that case)
backup_file() {
  local file="$1"
  local backup="$file.bak"

  # Refuse early so an existing backup is not deleted for a source that cannot be copied.
  if [ ! -f "$file" ]; then
    echo "错误:待备份文件不存在 - $file" >&2
    return 1
  fi

  if [ -f "$backup" ]; then
    # Only one backup generation is kept: drop the previous one first.
    echo "提示:旧备份文件已存在,将覆盖 - $backup"
    rm -f -- "$backup"
  fi

  # -a keeps permissions, attributes and timestamps on the copy.
  if ! cp -a -- "$file" "$backup"; then
    echo "错误:备份失败 - $file" >&2
    return 1
  fi
  echo "已备份:$file -> $backup"
}
# Main logic
check_files_exist
# Replace every file pair: back up each destination, then copy its source over it.
# NOTE(review): the `if [ "$1" = "check" ]` / `elif [ "$1" = "replace" ]` headers
# inside this body have no matching `fi`, and the function appears to call itself.
# This looks like top-level dispatch code from an earlier revision rendered together
# with the function — confirm against the real script.
replace_files() {
local pair_count=${#FILE_PAIRS[@]}
echo "=== 开始替换文件(先备份目标文件) ==="
if [ "$1" = "check" ]; then
echo "=== 执行差异检查(不修改文件) ==="
# FILE_PAIRS is flat: index i is the source, index i+1 the destination.
for ((i=0; i<pair_count; i+=2)); do
local source="${FILE_PAIRS[$i]}"
local dest="${FILE_PAIRS[$i+1]}"
echo -e "\n--- 处理文件对:$source -> $dest ---"
backup_file "$dest"
cp -f "$source" "$dest"
echo "已替换:$source 覆盖 $dest"
done
echo -e "\n=== 替换完成,验证最终差异(应无差异) ==="
show_diffs
elif [ "$1" = "replace" ]; then
echo "=== 执行文件替换(先备份) ==="
replace_files
}
# Roll back a previous replacement by restoring each destination's ".bak" backup.
# Globals:   FILE_PAIRS (read) - flat array of "source destination" entries
# Outputs:   progress messages on stdout; restore errors on stderr
# Returns:   0 when every existing backup was restored (pairs without a backup are
#            skipped), 1 when at least one restore failed
rollback_files() {
  local pair_count=${#FILE_PAIRS[@]}
  local i dest backup safety
  local failed=0

  echo "=== 开始回滚替换操作 ==="

  # Destinations sit at the odd indexes of FILE_PAIRS.
  for ((i = 1; i < pair_count; i += 2)); do
    dest="${FILE_PAIRS[i]}"
    backup="$dest.bak"
    echo -e "\n--- 处理回滚:$backup -> $dest ---"

    # A pair that was never replaced has no backup: skip it and keep going
    # instead of aborting the whole rollback.
    if [ ! -f "$backup" ]; then
      echo "跳过:备份文件不存在 - $backup"
      continue
    fi

    # Keep a copy of the current file so it can be put back if the restore fails.
    # Best effort: the destination may legitimately be missing.
    safety="$dest.rollback_temp"
    cp -a -- "$dest" "$safety" 2>/dev/null || true

    if mv -f -- "$backup" "$dest"; then
      echo "已回滚:$dest 恢复为备份版本"
      rm -f -- "$safety"
    else
      echo "错误:回滚失败 - $dest" >&2
      # Put the pre-rollback copy back so the destination is not left half-restored.
      if [ -f "$safety" ]; then
        mv -f -- "$safety" "$dest"
      fi
      failed=1
    fi
  done

  echo -e "\n=== 回滚操作执行完成 ==="
  return "$failed"
}
# Main logic: dispatch on the requested sub-command.
# Arguments: $1 - one of "check", "replace" or "rollback"
# Outputs:   a banner line for the chosen operation, or the usage text
# Returns:   status of the last step of the chosen operation; any other argument
#            prints the usage text and exits the script with status 1
main() {
  local action="$1"

  if [[ "$action" == "check" ]]; then
    echo "=== 执行文件差异检查(不修改文件) ==="
    check_files_exist "check"
    show_diffs
  elif [[ "$action" == "replace" ]]; then
    echo "=== 执行文件替换操作(自动备份) ==="
    check_files_exist "replace"
    replace_files
  elif [[ "$action" == "rollback" ]]; then
    echo "=== 执行文件回滚操作(恢复备份) ==="
    check_files_exist "rollback"
    rollback_files
  else
    # Unknown or missing sub-command.
    echo "用法:$0 [check|replace|rollback]"
    echo " check - 仅检查所有文件对的差异,不做修改"
    echo " replace - 备份所有目标文件并执行替换,完成后验证差异"
    echo " rollback - 回滚替换操作(恢复 .bak 备份文件)"
    exit 1
  fi
}
# Script entry point: hand the command-line arguments to main (main reads only $1).
main "$@"

View File

@ -0,0 +1,37 @@
## 登录
### 用户名: admin
### 密码: paperless
## 需要指定用户名
### 配置好 USERMAP_UID 和 USERMAP_GID,否则可能无法执行主机映射进去的脚本。
### 详见 https://docs.paperless-ngx.com/configuration/#USERMAP_UID
## 自定义的文件名解析脚本
```Bash
# 文档
https://docs.paperless-ngx.com/advanced_usage/#file-name-handling
https://docs.paperless-ngx.com/configuration/#PAPERLESS_POST_CONSUME_SCRIPT
# 配置
environment:
PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
```
## 源码修改,可以通过在容器里执行 docker_patch.sh 脚本来完成
### 对于无法直接读取 PDF 文本内容的文档,paperless 会启动 OCR 扫描,且在复杂情况下会执行两遍,非常慢而且消耗资源,只能通过修改源码解决。
```Bash
# /usr/src/paperless/src/paperless_tesseract/parsers.py :
# force skip ocr process.
if not original_has_text:
original_has_text = True
text_original = "this is default content, as we skipped ocr process..."
self.log.warning("Cannot read text from Document, use default message.")
if skip_archive_for_text and original_has_text:
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
self.text = text_original
return
```

View File

@ -1,80 +0,0 @@
-------------------------------------------------------
------------------- paperless 无纸化pdf管理 ------------
-------------------------------------------------------
## 最好不要用命令行方式,而是使用 docker-compose.yml 来创建(需要指定后端使用的数据库以及 redis)
docker run -itd \
--name paperless \
--network devops \
--platform linux/x86_64 \
-e TZ="Asia/Shanghai" \
-v /etc/localtime:/etc/localtime:ro \
-v "$(pwd)/dockers/paperless/pdfs:/usr/src/paperless/data" \
-v "$(pwd)/dockers/paperless/db:/usr/src/paperless/db" \
-e USERMAP_UID=1000 -e USERMAP_GID=1000 \
-p 8000:8000 \
ghcr.io/paperless-ngx/paperless-ngx
# 容器创建好之后,要手动设置密码(以下两条命令二选一,目前设置的是 admin / admin)
docker compose run --rm webserver createsuperuser
python3 manage.py createsuperuser
# 已有文档,放在指定目录下,等系统自动加载(或者手工启动)
cd /path/to/paperless/src/
python3 manage.py document_consumer
# 自动解析文件名
https://docs.paperless-ngx.com/advanced_usage/#file-name-handling
https://docs.paperless-ngx.com/configuration/#PAPERLESS_POST_CONSUME_SCRIPT
environment:
PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
对于无法直接读取 PDF 文本内容的文档,paperless 会启动 OCR 扫描,且在复杂情况下会执行两遍,非常慢而且消耗资源,只能通过修改源码解决。
/usr/src/paperless/src/paperless_tesseract/parsers.py :
# force skip ocr process.
if not original_has_text:
original_has_text = True
text_original = "this is default content, as we skipped ocr process..."
self.log.warning("Cannot read text from Document, use default message.")
if skip_archive_for_text and original_has_text:
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
self.text = text_original
return
paperless 默认不会删除重复的文件,重复添加时会不停地扫描、加载、报错。最初没找到配置项,于是直接修改源码解决(现已确认有对应配置,详见 docker-compose.yml):
/usr/src/paperless/src/documents/consumer.py
def pre_check_duplicate(self):
"""
Using the MD5 of the file, check this exact file doesn't already exist
"""
with open(self.input_doc.original_file, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
existing_doc = Document.global_objects.filter(
Q(checksum=checksum) | Q(archive_checksum=checksum),
)
if existing_doc.exists():
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})."
if existing_doc.first().deleted_at is not None:
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
log_msg += " Note: existing document is in the trash."
## 修改这里,让它删除重复文件。
if settings.CONSUMER_DELETE_DUPLICATES or True:
os.unlink(self.input_doc.original_file)
self._fail(
msg,
log_msg,
)