modify scripts
This commit is contained in:
@ -1,73 +1,149 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Define file paths
|
||||
SOURCE_PARSE_FILENAME="/usr/src/paperless/git_scripts/parse_filename.py"
|
||||
DEST_PARSE_FILENAME="/usr/src/paperless/scripts/parse_filename.py"
|
||||
# Define the file replacement pairs (flat array; format: source file, destination file)
|
||||
# The functions below walk this array two elements at a time: even index = source, odd index = destination.
|
||||
# Add or remove lines as needed; each line holds one "source destination" pair
|
||||
FILE_PAIRS=(
|
||||
"/usr/src/paperless/scripts/parsers.py" "/usr/src/paperless/src/paperless_tesseract/parsers.py"
|
||||
# Example: add more file pairs
|
||||
|
||||
#"/usr/src/paperless/git_scripts/parse_filename.py" "/usr/src/paperless/scripts/parse_filename.py"
|
||||
# "/path/to/source/file3" "/path/to/dest/file3"
|
||||
|
||||
# "/path/to/source/file4" "/path/to/dest/file4"
|
||||
|
||||
)
|
||||
|
||||
|
||||
SOURCE_PARSERS="/usr/src/paperless/git_scripts/parsers.py"
|
||||
DEST_PARSERS="/usr/src/paperless/src/paperless_tesseract/parsers.py"
|
||||
|
||||
#######################################
# Verify that the files needed by the requested operation exist.
# Globals:   FILE_PAIRS (read) - flat array of alternating source and
#            destination paths
# Arguments: $1 - operation name: "check", "replace" or "rollback"
# Outputs:   one message on stdout per missing file
# Returns:   0 when the operation may proceed. Exits the script with
#            status 1 when a file needed by check/replace is missing.
#            A missing backup in rollback mode only produces a warning.
#######################################
check_files_exist() {
  local missing=0
  local pair_count=${#FILE_PAIRS[@]}
  local i src dest

  # Walk the pairs with a stride of two: source first, destination second.
  for ((i = 0; i < pair_count; i += 2)); do
    src="${FILE_PAIRS[i]}"
    dest="${FILE_PAIRS[i + 1]}"

    if [ "$1" = "replace" ] || [ "$1" = "check" ]; then
      if [ ! -f "$src" ]; then
        echo "错误:源文件不存在 - $src"
        missing=1
      fi
      if [ ! -f "$dest" ]; then
        echo "错误:目标文件不存在 - $dest"
        missing=1
      fi
    elif [ "$1" = "rollback" ]; then
      if [ ! -f "$dest.bak" ]; then
        echo "警告:备份文件不存在(未执行过替换?) - $dest.bak"
        missing=1
      fi
    fi
  done

  # Rollback tolerates missing backups; those pairs are skipped later on.
  if [ "$missing" -eq 1 ] && [ "$1" != "rollback" ]; then
    echo "错误:关键文件缺失,无法继续执行"
    exit 1
  fi
}
|
||||
|
||||
#######################################
# Print a unified diff for every configured file pair.
# Globals:   FILE_PAIRS (read) - flat array of alternating source and
#            destination paths
# Arguments: none
# Outputs:   a header per pair followed by the diff output, on stdout
# Returns:   always 0
#######################################
show_diffs() {
  local pair_count=${#FILE_PAIRS[@]}
  local i src dest

  echo "=== 开始检查文件差异 ==="

  for ((i = 0; i < pair_count; i += 2)); do
    src="${FILE_PAIRS[i]}"
    dest="${FILE_PAIRS[i + 1]}"

    echo -e "\n--- 检查 $dest <-> $src 的差异 ---"
    # diff exits non-zero when the files differ; that is an expected
    # outcome here, so it must not be treated as a failure.
    diff -u "$dest" "$src" || true
  done
}
|
||||
|
||||
# Back up and replace files (two-file variant driven by the *_PARSE_FILENAME / *_PARSERS variables).
# NOTE(review): replace_files() is defined a second time further down in this file; bash keeps the
# most recent definition of a function, so that later one is what actually runs. Confirm that this
# copy is only leftover history.
|
||||
replace_files() {
|
||||
# Back up the destination files
|
||||
backup_file "$DEST_PARSE_FILENAME"
|
||||
backup_file "$DEST_PARSERS"
|
||||
|
||||
# Perform the replacement
|
||||
cp -f "$SOURCE_PARSE_FILENAME" "$DEST_PARSE_FILENAME"
|
||||
cp -f "$SOURCE_PARSERS" "$DEST_PARSERS"
|
||||
|
||||
echo -e "\n=== 替换完成,以下是替换后的差异(应无差异) ==="
|
||||
show_diffs
|
||||
}
|
||||
|
||||
#######################################
# Back up a single file next to itself with a ".bak" suffix.
# The copy is staged under a temporary name and renamed into place, so an
# existing backup is only replaced once the new copy has been written.
# Arguments: $1 - path of the file to back up
# Outputs:   progress messages on stdout, failure message on stderr
# Returns:   0 on success, 1 when the backup could not be created
#######################################
backup_file() {
  local file="$1"
  local backup="$file.bak"
  local staging="$backup.tmp.$$"

  if [ -f "$backup" ]; then
    echo "提示:旧备份文件已存在,将覆盖 - $backup"
  fi

  # -a keeps permissions, ownership and timestamps of the original.
  if ! cp -a "$file" "$staging"; then
    rm -f "$staging"
    echo "错误:备份失败 - ${file}" >&2
    return 1
  fi
  if ! mv -f "$staging" "$backup"; then
    rm -f "$staging"
    echo "错误:备份失败 - ${file}" >&2
    return 1
  fi

  echo "已备份:$file -> $backup"
}
|
||||
|
||||
# 主逻辑
|
||||
check_files_exist
|
||||
#######################################
# Back up every destination file, then overwrite it with its source.
# A destination is left untouched when its backup could not be written.
# Globals:   FILE_PAIRS (read) - flat array of alternating source and
#            destination paths
# Calls:     backup_file, show_diffs
# Arguments: none
# Outputs:   progress messages on stdout, failures on stderr, followed by
#            the output of show_diffs
# Returns:   0 when every pair was replaced, 1 if any backup or copy failed
#######################################
replace_files() {
  local pair_count=${#FILE_PAIRS[@]}
  local i src dest
  local failed=0

  echo "=== 开始替换文件(先备份目标文件) ==="

  for ((i = 0; i < pair_count; i += 2)); do
    src="${FILE_PAIRS[i]}"
    dest="${FILE_PAIRS[i + 1]}"

    echo -e "\n--- 处理文件对:$src -> $dest ---"

    # Without a backup there is nothing to roll back to, so skip the copy.
    if ! backup_file "$dest"; then
      echo "错误:备份失败,跳过替换 - ${dest}" >&2
      failed=1
      continue
    fi
    if ! cp -f "$src" "$dest"; then
      echo "错误:替换失败 - ${dest}" >&2
      failed=1
      continue
    fi
    echo "已替换:$src 覆盖 $dest"
  done

  echo -e "\n=== 替换完成,验证最终差异(应无差异) ==="
  show_diffs
  return "$failed"
}
|
||||
|
||||
#######################################
# Roll back a previous replacement by restoring each "<dest>.bak" file
# over its destination. Pairs without a backup are skipped, not fatal.
# Globals:   FILE_PAIRS (read) - flat array of alternating source and
#            destination paths; only the destinations are used here
# Arguments: none
# Outputs:   progress messages on stdout, failures on stderr
# Returns:   0 when every existing backup was restored, 1 if a restore failed
#######################################
rollback_files() {
  local pair_count=${#FILE_PAIRS[@]}
  local i dest backup safety
  local failed=0

  echo "=== 开始回滚替换操作 ==="

  for ((i = 0; i < pair_count; i += 2)); do
    dest="${FILE_PAIRS[i + 1]}"
    backup="$dest.bak"
    safety="$dest.rollback_temp"

    echo -e "\n--- 处理回滚:$backup -> $dest ---"

    if [ ! -f "$backup" ]; then
      echo "跳过:备份文件不存在 - $backup"
      continue
    fi

    # Keep a copy of the current file so a failed restore can be undone.
    # Best effort on purpose: the destination may not exist at all.
    cp -a "$dest" "$safety" 2>/dev/null || true

    if mv -f "$backup" "$dest"; then
      echo "已回滚:$dest 恢复为备份版本"
    else
      echo "错误:回滚失败 - ${dest}" >&2
      if [ -f "$safety" ]; then
        cp -a "$safety" "$dest"
      fi
      failed=1
    fi

    rm -f "$safety"
  done

  echo -e "\n=== 回滚操作执行完成 ==="
  return "$failed"
}
|
||||
|
||||
#######################################
# Entry point: dispatch on the requested sub-command.
# Calls:     check_files_exist, show_diffs, replace_files, rollback_files
# Arguments: $1 - "check", "replace" or "rollback"
# Outputs:   a banner on stdout for a valid sub-command; usage text on
#            stderr for anything else
# Returns:   status of the last step that ran; exits the script with
#            status 1 when the sub-command is missing or unknown
#######################################
main() {
  case "$1" in
    check)
      echo "=== 执行文件差异检查(不修改文件) ==="
      check_files_exist "check"
      show_diffs
      ;;
    replace)
      echo "=== 执行文件替换操作(自动备份) ==="
      check_files_exist "replace"
      replace_files
      ;;
    rollback)
      echo "=== 执行文件回滚操作(恢复备份) ==="
      check_files_exist "rollback"
      rollback_files
      ;;
    *)
      # Usage is a diagnostic, so it goes to stderr.
      {
        echo "用法:$0 [check|replace|rollback]"
        echo "  check    - 仅检查所有文件对的差异,不做修改"
        echo "  replace  - 备份所有目标文件并执行替换,完成后验证差异"
        echo "  rollback - 回滚替换操作(恢复 .bak 备份文件)"
      } >&2
      exit 1
      ;;
  esac
}
|
||||
|
||||
# 启动主逻辑
|
||||
main "$1"
|
||||
0
docker/paperless/plugins/bak_parsers.py → docker/paperless/plugins/origin_parsers.py
Executable file → Normal file
0
docker/paperless/plugins/bak_parsers.py → docker/paperless/plugins/origin_parsers.py
Executable file → Normal file
37
docker/paperless/plugins/readme.md
Normal file
37
docker/paperless/plugins/readme.md
Normal file
@ -0,0 +1,37 @@
|
||||
## 登录
|
||||
### 用户名: admin
|
||||
### 密码: paperless
|
||||
|
||||
## 需要指定用户名
|
||||
### 配置好 USERMAP_UID 和 USERMAP_GID,否则可能无法执行主机映射进去的脚本。
|
||||
### 详见 https://docs.paperless-ngx.com/configuration/#USERMAP_UID
|
||||
|
||||
## 自定义的文件名解析脚本
|
||||
```Bash
|
||||
# 文档
|
||||
https://docs.paperless-ngx.com/advanced_usage/#file-name-handling
|
||||
https://docs.paperless-ngx.com/configuration/#PAPERLESS_POST_CONSUME_SCRIPT
|
||||
|
||||
# 配置
|
||||
environment:
|
||||
PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
|
||||
```
|
||||
|
||||
|
||||
## 源码修改,可以通过在容器里执行 docker_patch.sh 脚本来完成
|
||||
### 对于无法简单读取pdf内容的文档,paperless会启动OCR扫描,且复杂情况下会执行两遍,非常慢而且消耗资源。只能通过修改源码解决:
|
||||
```Bash
|
||||
# /usr/src/paperless/src/paperless_tesseract/parsers.py :
|
||||
|
||||
# force skip ocr process.
|
||||
if not original_has_text:
|
||||
original_has_text = True
|
||||
text_original = "this is default content, as we skipped ocr process..."
|
||||
self.log.warning("Cannot read text from Document, use default message.")
|
||||
|
||||
if skip_archive_for_text and original_has_text:
|
||||
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
|
||||
self.text = text_original
|
||||
return
|
||||
|
||||
```
|
||||
@ -1,80 +0,0 @@
|
||||
|
||||
|
||||
-------------------------------------------------------|
|
||||
------------------- paperless 无纸化pdf管理 ------------|
|
||||
-------------------------------------------------------|
|
||||
|
||||
## 最好不要用命令,使用docker-compose.yml来创建,需要指定后端使用的数据库,以及redis!
|
||||
docker run -itd \
|
||||
--name paperless \
|
||||
--network devops \
|
||||
--platform linux/x86_64 \
|
||||
-e TZ="Asia/Shanghai" \
|
||||
-v /etc/localtime:/etc/localtime:ro \
|
||||
-v "$(pwd)/dockers/paperless/pdfs:/usr/src/paperless/data" \
|
||||
-v "$(pwd)/dockers/paperless/db:/usr/src/paperless/db" \
|
||||
-e USERMAP_UID=1000 -e USERMAP_GID=1000 \
|
||||
-p 8000:8000 \
|
||||
ghcr.io/paperless-ngx/paperless-ngx
|
||||
|
||||
|
||||
# 容器创建好之后,要手动设置密码(二选一操作,目前设置的 admin / admin)
|
||||
docker compose run --rm webserver createsuperuser
|
||||
python3 manage.py createsuperuser
|
||||
|
||||
# 已有文档,放在指定目录下,等系统自动加载(或者手工启动)
|
||||
cd /path/to/paperless/src/
|
||||
python3 manage.py document_consumer
|
||||
|
||||
# 自动解析文件名
|
||||
https://docs.paperless-ngx.com/advanced_usage/#file-name-handling
|
||||
https://docs.paperless-ngx.com/configuration/#PAPERLESS_POST_CONSUME_SCRIPT
|
||||
|
||||
environment:
|
||||
PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
|
||||
|
||||
|
||||
对于无法简单读取pdf内容的文档,paperless会启动OCR扫描,且复杂情况下会执行两遍,非常慢而且消耗资源。只能通过修改源码解决:
|
||||
/usr/src/paperless/src/paperless_tesseract/parsers.py :
|
||||
|
||||
# force skip ocr process.
|
||||
if not original_has_text:
|
||||
original_has_text = True
|
||||
text_original = "this is default content, as we skipped ocr process..."
|
||||
self.log.warning("Cannot read text from Document, use default message.")
|
||||
|
||||
if skip_archive_for_text and original_has_text:
|
||||
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
|
||||
self.text = text_original
|
||||
return
|
||||
|
||||
|
||||
|
||||
paperless 默认不会删除重复的文件,这会导致如果重复添加,会不停扫描,加载,报错。没找到配置,直接修改源码解决:(已经有配置,详见 docker-compose.yml)
|
||||
|
||||
/usr/src/paperless/src/documents/consumer.py
|
||||
|
||||
def pre_check_duplicate(self):
|
||||
"""
|
||||
Using the MD5 of the file, check this exact file doesn't already exist
|
||||
"""
|
||||
with open(self.input_doc.original_file, "rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
existing_doc = Document.global_objects.filter(
|
||||
Q(checksum=checksum) | Q(archive_checksum=checksum),
|
||||
)
|
||||
if existing_doc.exists():
|
||||
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
|
||||
log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})."
|
||||
|
||||
if existing_doc.first().deleted_at is not None:
|
||||
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
|
||||
log_msg += " Note: existing document is in the trash."
|
||||
|
||||
## 修改这里,让它删除重复文件。
|
||||
if settings.CONSUMER_DELETE_DUPLICATES or True:
|
||||
os.unlink(self.input_doc.original_file)
|
||||
self._fail(
|
||||
msg,
|
||||
log_msg,
|
||||
)
|
||||
Reference in New Issue
Block a user