diff --git a/docker/paperless/plugins/docker_patch.sh b/docker/paperless/plugins/docker_patch.sh index cb1205a..ee9b5ca 100755 --- a/docker/paperless/plugins/docker_patch.sh +++ b/docker/paperless/plugins/docker_patch.sh @@ -1,73 +1,149 @@ #!/bin/bash -# 定义文件路径 -SOURCE_PARSE_FILENAME="/usr/src/paperless/git_scripts/parse_filename.py" -DEST_PARSE_FILENAME="/usr/src/paperless/scripts/parse_filename.py" +# 定义文件替换对(数组形式,格式:源文件 目标文件) +# 可按需添加/删除行,每行一组 源文件 目标文件 +FILE_PAIRS=( + "/usr/src/paperless/scripts/parsers.py" "/usr/src/paperless/src/paperless_tesseract/parsers.py" + # 示例:新增更多文件对 + #"/usr/src/paperless/git_scripts/parse_filename.py" "/usr/src/paperless/scripts/parse_filename.py" + # "/path/to/source/file3" "/path/to/dest/file3" + # "/path/to/source/file4" "/path/to/dest/file4" +) -SOURCE_PARSERS="/usr/src/paperless/git_scripts/parsers.py" -DEST_PARSERS="/usr/src/paperless/src/paperless_tesseract/parsers.py" - -# 检查文件是否存在 +# 检查所有文件是否存在(仅检查replace/check操作需要的文件) check_files_exist() { local missing=0 - for file in "$SOURCE_PARSE_FILENAME" "$DEST_PARSE_FILENAME" "$SOURCE_PARSERS" "$DEST_PARSERS"; do - if [ ! 
-f "$file" ]; then - echo "错误:文件不存在 - $file" - missing=1 + local pair_count=${#FILE_PAIRS[@]} + + # 遍历文件对(步长2:源文件、目标文件为一组) + for ((i=0; i $source 的差异 ---" + diff -u "$dest" "$source" || true # 无差异时不报错 + done } -# 备份并替换文件 -replace_files() { - # 备份目标文件 - backup_file "$DEST_PARSE_FILENAME" - backup_file "$DEST_PARSERS" - - # 执行替换 - cp -f "$SOURCE_PARSE_FILENAME" "$DEST_PARSE_FILENAME" - cp -f "$SOURCE_PARSERS" "$DEST_PARSERS" - - echo -e "\n=== 替换完成,以下是替换后的差异(应无差异) ===" - show_diffs -} - -# 备份文件(添加 .bak 后缀,保留原权限) +# 备份单个文件(添加 .bak 后缀,保留原权限) backup_file() { local file="$1" local backup="$file.bak" + if [ -f "$backup" ]; then - # 若已有备份,先删除旧备份(避免累积过多) + echo "提示:旧备份文件已存在,将覆盖 - $backup" rm -f "$backup" fi - cp -a "$file" "$backup" # -a 保留权限和属性 + + cp -a "$file" "$backup" # -a 保留权限、属性、时间戳等 echo "已备份:$file -> $backup" } -# 主逻辑 -check_files_exist - -if [ "$1" = "check" ]; then - echo "=== 执行差异检查(不修改文件) ===" +# 替换所有文件对 +replace_files() { + local pair_count=${#FILE_PAIRS[@]} + echo "=== 开始替换文件(先备份目标文件) ===" + + for ((i=0; i $dest ---" + backup_file "$dest" + cp -f "$source" "$dest" + echo "已替换:$source 覆盖 $dest" + done + + echo -e "\n=== 替换完成,验证最终差异(应无差异) ===" show_diffs -elif [ "$1" = "replace" ]; then - echo "=== 执行文件替换(先备份) ===" - replace_files -else - echo "用法:$0 [check|replace]" - echo " check - 仅检查文件差异,不做修改" - echo " replace - 备份目标文件并替换,然后显示最终差异" - exit 1 -fi \ No newline at end of file +} + +# 回滚替换操作(恢复 .bak 备份文件) +rollback_files() { + local pair_count=${#FILE_PAIRS[@]} + echo "=== 开始回滚替换操作 ===" + + for ((i=0; i $dest ---" + if [ -f "$backup" ]; then + # 先备份当前文件(防止回滚出错) + cp -a "$dest" "$dest.rollback_temp" 2>/dev/null || true + # 恢复备份文件 + mv -f "$backup" "$dest" + echo "已回滚:$dest 恢复为备份版本" + # 删除临时文件 + rm -f "$dest.rollback_temp" 2>/dev/null || true + else + echo "跳过:备份文件不存在 - $backup" + fi + done + + echo -e "\n=== 回滚操作执行完成 ===" +} + +# 主逻辑 +main() { + case "$1" in + check) + echo "=== 执行文件差异检查(不修改文件) ===" + check_files_exist "check" + show_diffs + ;; + replace) + echo 
"=== 执行文件替换操作(自动备份) ===" + check_files_exist "replace" + replace_files + ;; + rollback) + echo "=== 执行文件回滚操作(恢复备份) ===" + check_files_exist "rollback" + rollback_files + ;; + *) + echo "用法:$0 [check|replace|rollback]" + echo " check - 仅检查所有文件对的差异,不做修改" + echo " replace - 备份所有目标文件并执行替换,完成后验证差异" + echo " rollback - 回滚替换操作(恢复 .bak 备份文件)" + exit 1 + ;; + esac +} + +# 启动主逻辑 +main "$1" \ No newline at end of file diff --git a/docker/paperless/plugins/bak_parsers.py b/docker/paperless/plugins/origin_parsers.py old mode 100755 new mode 100644 similarity index 100% rename from docker/paperless/plugins/bak_parsers.py rename to docker/paperless/plugins/origin_parsers.py diff --git a/docker/paperless/plugins/readme.md b/docker/paperless/plugins/readme.md new file mode 100644 index 0000000..cc987be --- /dev/null +++ b/docker/paperless/plugins/readme.md @@ -0,0 +1,37 @@ +## 登录 +### 用户名: admin +### 密码: paperless + +## 需要指定用户名 +### 配置好 USERMAP_UID 和 USERMAP_GID,否则可能无法执行主机映射进去的脚本。 +### 详见 https://docs.paperless-ngx.com/configuration/#USERMAP_UID + +## 自定义的文件名解析脚本 +```Bash +# 文档 +https://docs.paperless-ngx.com/advanced_usage/#file-name-handling +https://docs.paperless-ngx.com/configuration/#PAPERLESS_POST_CONSUME_SCRIPT + +# 配置 +environment: + PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py" +``` + + +## 源码修改,可以通过在容器里执行 docker_patch.sh 脚本来完成 +### 对于无法简单读取pdf内容的文档,paperless会启动OCR扫描,且复杂情况下会执行两遍,非常慢而且消耗资源。只能通过修改源码解决: +```Bash +# /usr/src/paperless/src/paperless_tesseract/parsers.py : + + # force skip ocr process. + if not original_has_text: + original_has_text = True + text_original = "this is default content, as we skipped ocr process..." 
+ self.log.warning("Cannot read text from Document, use default message.") + + if skip_archive_for_text and original_has_text: + self.log.debug("Document has text, skipping OCRmyPDF entirely.") + self.text = text_original + return + +``` diff --git a/docker/paperless/plugins/redme.txt b/docker/paperless/plugins/redme.txt deleted file mode 100644 index dc3eb76..0000000 --- a/docker/paperless/plugins/redme.txt +++ /dev/null @@ -1,80 +0,0 @@ - - --------------------------------------------------------| -------------------- paperless 无纸化pdf管理 ------------| --------------------------------------------------------| - -## 最好不要用命令,使用docker-compose.yml来创建,需要制定后端使用的数据库,以及redis! -docker run -itd \ - --name paperless \ - --network devops \ - --platform linux/x86_64 \ - -e TZ="Asia/Shanghai" \ - -v /etc/localtime:/etc/localtime:ro \ - -v "$(pwd)/dockers/paperless/pdfs:/usr/src/paperless/data" \ - -v "$(pwd)/dockers/paperless/db:/usr/src/paperless/db" \ - -e USERMAP_UID=1000 -e USERMAP_GID=1000 \ - -p 8000:8000 \ - ghcr.io/paperless-ngx/paperless-ngx - - -# 容器创建好之后,要手动设置密码(二选一操作,目前设置的 admin / admin) -docker compose run --rm webserver createsuperuser -python3 manage.py createsuperuser - -# 已有文档,放在指定目录下,等系统自动加载(或者手工启动) -cd /path/to/paperless/src/ -python3 manage.py document_consumer - -# 自动解析文件名 -https://docs.paperless-ngx.com/advanced_usage/#file-name-handling -https://docs.paperless-ngx.com/configuration/#PAPERLESS_POST_CONSUME_SCRIPT - -environment: - PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py" - - -对于无法简单读取pdf内容的文档,paperless会启动OCR扫描,且复杂情况下会执行两遍,非常慢而且消耗资源。只能通过修改源码解决: -/usr/src/paperless/src/paperless_tesseract/parsers.py : - - # force skip ocr process. - if not original_has_text: - original_has_text = True - text_original = "this is default content, as we skipped ocr process..." 
- self.log.warning("Cannot read text from Document, use default message.") - - if skip_archive_for_text and original_has_text: - self.log.debug("Document has text, skipping OCRmyPDF entirely.") - self.text = text_original - return - - - -paperless 默认不会删除重复的文件,这会导致如果重复添加,会不停扫描,加载,报错。没找到配置,直接修改源码解决:(已经有配置,详见 docker-compose.yml) - -/usr/src/paperless/src/documents/consumer.py - - def pre_check_duplicate(self): - """ - Using the MD5 of the file, check this exact file doesn't already exist - """ - with open(self.input_doc.original_file, "rb") as f: - checksum = hashlib.md5(f.read()).hexdigest() - existing_doc = Document.global_objects.filter( - Q(checksum=checksum) | Q(archive_checksum=checksum), - ) - if existing_doc.exists(): - msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS - log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})." - - if existing_doc.first().deleted_at is not None: - msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH - log_msg += " Note: existing document is in the trash." - - ## 修改这里,让它删除重复文件。 - if settings.CONSUMER_DELETE_DUPLICATES or True: - os.unlink(self.input_doc.original_file) - self._fail( - msg, - log_msg, - )