Compare commits
21 Commits
31e07abf14
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| 2b0e1c0413 | |||
| dece263c8b | |||
| 00b267b651 | |||
| 0a4776479c | |||
| 6cf529541d | |||
| 2c0e3bd718 | |||
| ebae625165 | |||
| f8daffd47f | |||
| bed2de3cd1 | |||
| d1c543512e | |||
| 857339d261 | |||
| f189dcfaca | |||
| 1848510b65 | |||
| 04d76944ad | |||
| 40eae5569a | |||
| 15c4f7b823 | |||
| 17356c79f9 | |||
| 808dbaa985 | |||
| b7dffc539c | |||
| 91e7d38725 | |||
| fe153d69cc |
11
.gitignore
vendored
Normal file
11
.gitignore
vendored
Normal file
@ -0,0 +1,11 @@
|
||||
# 其他已有的忽略规则
|
||||
*.pyc
|
||||
__pycache__/
|
||||
|
||||
# 忽略环境配置文件
|
||||
.env
|
||||
|
||||
# 忽略所有 log 目录 和 data 目录
|
||||
**/log/
|
||||
**/data/
|
||||
**/result/
|
||||
@ -8,11 +8,6 @@ services:
|
||||
ports:
|
||||
- "8000:8000"
|
||||
environment:
|
||||
PAPERLESS_OCR_LANGUAGES: "" # 跳过OCR
|
||||
PAPERLESS_OCR_SKIP_ARCHIVE_FILE: "always" # 跳过创建文档存档版本的时间
|
||||
PAPERLESS_OCR_OUTPUT_TYPE: "pdf" # 尽量少修改PDF文档
|
||||
PAPERLESS_CONSUMER_POLLING: "5" # 指定轮询间隔(以秒为单位),这将导致 paperless 定期检查消费目录中的更改
|
||||
#PAPERLESS_CONSUMER_INOTIFY_DELAY: "2" # 设置消费者等待 inotify 发出的其他事件的时间(以秒为单位)
|
||||
|
||||
# 使用 SQLite 作为数据库(默认)
|
||||
PAPERLESS_DBENGINE: sqlite3
|
||||
@ -34,11 +29,22 @@ services:
|
||||
# 定义文件命名规则和存储路径
|
||||
# 作用不大,主要还是用消费后脚本,以及工作流来指定存储路径。
|
||||
# 工作流先于消费后脚本运行,因此消费后脚本里解析的document_type在工作流里无效。所以使用了文件名关键词匹配
|
||||
PAPERLESS_FILENAME_FORMAT: "{{created}}_{{document_type}}_{{correspondent}}_{{title}}.pdf"
|
||||
PAPERLESS_FILENAME_FORMAT: "{{created}}_{{document_type}}_{{correspondent}}_{{title}}"
|
||||
|
||||
# 解析文件里的关键信息,并更新。但无法更新storage path。这个字段要靠工作流才行。
|
||||
PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
|
||||
|
||||
# 自动删除重复文件
|
||||
PAPERLESS_CONSUMER_DELETE_DUPLICATES: true
|
||||
# 支持消费目录递归检索,即子目录。这样可以支持多个宿主机的目录映射到docker中
|
||||
PAPERLESS_CONSUMER_RECURSIVE: true
|
||||
|
||||
PAPERLESS_OCR_LANGUAGES: "" # 跳过OCR,并不会,只会用默认的eng来执行
|
||||
PAPERLESS_OCR_SKIP_ARCHIVE_FILE: "always" # 跳过创建文档存档版本的时间
|
||||
PAPERLESS_OCR_OUTPUT_TYPE: "pdf" # 尽量少修改PDF文档
|
||||
PAPERLESS_CONSUMER_POLLING: "5" # 指定轮询间隔(以秒为单位),这将导致 paperless 定期检查消费目录中的更改
|
||||
#PAPERLESS_CONSUMER_INOTIFY_DELAY: "2" # 设置消费者等待 inotify 发出的其他事件的时间(以秒为单位)
|
||||
|
||||
# 运行用户
|
||||
USERMAP_UID: 1000
|
||||
USERMAP_GID: 1000
|
||||
@ -46,8 +52,9 @@ services:
|
||||
volumes:
|
||||
# 存储所有数据(搜索索引、SQLite 数据库、分类模型等)的地方
|
||||
- ~/dockers/paperless/data:/usr/src/paperless/data
|
||||
# 挂载文件导入目录
|
||||
# 挂载文件导入目录,可以把多个宿主机的目录,挂到docker中,以子目录的形式存在
|
||||
- ~/dockers/paperless/consume:/usr/src/paperless/consume
|
||||
- ~/dockers/sharedata/consume:/usr/src/paperless/consume/subdir
|
||||
# 挂载文件导出目录
|
||||
- ~/dockers/paperless/export:/usr/src/paperless/export
|
||||
# 存储您的文档和缩略图的地方
|
||||
|
||||
@ -9,7 +9,8 @@ import logging
|
||||
|
||||
# Paperless 服务器信息
|
||||
PAPERLESS_URL = "http://localhost:8000/api"
|
||||
AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth 认证
|
||||
#AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth 认证, mac上用这个
|
||||
AUTH = HTTPBasicAuth("admin", "paperless") # Basic Auth 认证,NAS上用这个
|
||||
|
||||
# 日志配置
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||
|
||||
149
docker/paperless/plugins/docker_patch.sh
Executable file
149
docker/paperless/plugins/docker_patch.sh
Executable file
@ -0,0 +1,149 @@
|
||||
#!/bin/bash

# Replacement pairs as a flat array: source file followed by destination file.
# Add/remove entries as needed — one pair per line, format: source dest.
FILE_PAIRS=(
    "/usr/src/paperless/scripts/parsers.py" "/usr/src/paperless/src/paperless_tesseract/parsers.py"
    # Example: add more file pairs
    #"/usr/src/paperless/git_scripts/parse_filename.py" "/usr/src/paperless/scripts/parse_filename.py"
    # "/path/to/source/file3" "/path/to/dest/file3"
    # "/path/to/source/file4" "/path/to/dest/file4"
)
|
||||
|
||||
# 检查所有文件是否存在(仅检查replace/check操作需要的文件)
|
||||
#######################################
# Verify that the files required by the requested operation exist.
# Globals:   FILE_PAIRS (read)
# Arguments: $1 - operation name: "replace", "check" or "rollback"
# Outputs:   error/warning messages to stdout
# Exits:     1 when a required file is missing (except for rollback,
#            where missing backups only produce warnings)
#######################################
check_files_exist() {
    local missing=0
    local pair_count=${#FILE_PAIRS[@]}

    # Walk the flat pair array with stride 2 (source, dest per pair).
    for ((i=0; i<pair_count; i+=2)); do
        local source="${FILE_PAIRS[$i]}"
        local dest="${FILE_PAIRS[$i+1]}"

        # Which files matter depends on the operation being run.
        if [ "$1" = "replace" ] || [ "$1" = "check" ]; then
            if [ ! -f "$source" ]; then
                echo "错误:源文件不存在 - $source"
                missing=1
            fi
            if [ ! -f "$dest" ]; then
                echo "错误:目标文件不存在 - $dest"
                missing=1
            fi
        elif [ "$1" = "rollback" ]; then
            if [ ! -f "$dest.bak" ]; then
                echo "警告:备份文件不存在(未执行过替换?) - $dest.bak"
                missing=1
            fi
        fi
    done

    # Rollback tolerates missing backups (warn only); other ops abort.
    if [ $missing -eq 1 ] && [ "$1" != "rollback" ]; then
        echo "错误:关键文件缺失,无法继续执行"
        exit 1
    fi
}
|
||||
|
||||
# 显示所有文件对的差异
|
||||
#######################################
# Print a unified diff for every configured file pair.
# Globals:   FILE_PAIRS (read)
# Outputs:   diff output to stdout
# Returns:   always 0 (differences are not treated as errors)
#######################################
show_diffs() {
    local total=${#FILE_PAIRS[@]}
    local idx src dst
    echo "=== 开始检查文件差异 ==="

    idx=0
    while (( idx < total )); do
        src="${FILE_PAIRS[$idx]}"
        dst="${FILE_PAIRS[$((idx + 1))]}"

        echo -e "\n--- 检查 $dst <-> $src 的差异 ---"
        # diff exits non-zero when files differ; that is expected here.
        diff -u "$dst" "$src" || true
        idx=$((idx + 2))
    done
}
|
||||
|
||||
# 备份单个文件(添加 .bak 后缀,保留原权限)
|
||||
#######################################
# Back up one file as "<file>.bak", preserving permissions, ownership
# and timestamps. An existing backup is overwritten.
# Arguments: $1 - path of the file to back up
# Outputs:   status messages to stdout
#######################################
backup_file() {
    local target=$1
    local bak="${target}.bak"

    if [ -f "$bak" ]; then
        echo "提示:旧备份文件已存在,将覆盖 - $bak"
        rm -f "$bak"
    fi

    # -a keeps mode, ownership and timestamps intact.
    cp -a "$target" "$bak"
    echo "已备份:$target -> $bak"
}
|
||||
|
||||
# 替换所有文件对
|
||||
#######################################
# Replace every destination file with its source, backing up each
# destination first, then re-run the diff to verify the result.
# Globals:   FILE_PAIRS (read)
# Outputs:   progress messages and final verification diff to stdout
#######################################
replace_files() {
    local pair_count=${#FILE_PAIRS[@]}
    echo "=== 开始替换文件(先备份目标文件) ==="

    for ((i=0; i<pair_count; i+=2)); do
        local source="${FILE_PAIRS[$i]}"
        local dest="${FILE_PAIRS[$i+1]}"

        echo -e "\n--- 处理文件对:$source -> $dest ---"
        # Back up the destination before overwriting it.
        backup_file "$dest"
        cp -f "$source" "$dest"
        echo "已替换:$source 覆盖 $dest"
    done

    # After replacing, the pairs should show no differences.
    echo -e "\n=== 替换完成,验证最终差异(应无差异) ==="
    show_diffs
}
|
||||
|
||||
# 回滚替换操作(恢复 .bak 备份文件)
|
||||
#######################################
# Undo a previous replace by restoring each destination from its
# ".bak" backup. Missing backups are skipped with a message.
# Globals:   FILE_PAIRS (read)
# Outputs:   progress messages to stdout
#######################################
rollback_files() {
    local pair_count=${#FILE_PAIRS[@]}
    echo "=== 开始回滚替换操作 ==="

    for ((i=0; i<pair_count; i+=2)); do
        local dest="${FILE_PAIRS[$i+1]}"
        local backup="$dest.bak"

        echo -e "\n--- 处理回滚:$backup -> $dest ---"
        if [ -f "$backup" ]; then
            # Keep a temporary copy of the current file in case the
            # restore itself fails.
            cp -a "$dest" "$dest.rollback_temp" 2>/dev/null || true
            # Restore the backup (mv consumes the .bak file).
            mv -f "$backup" "$dest"
            echo "已回滚:$dest 恢复为备份版本"
            # Drop the temporary safety copy.
            rm -f "$dest.rollback_temp" 2>/dev/null || true
        else
            echo "跳过:备份文件不存在 - $backup"
        fi
    done

    echo -e "\n=== 回滚操作执行完成 ==="
}
|
||||
|
||||
# 主逻辑
|
||||
#######################################
# Entry point: dispatch on the first CLI argument.
# Arguments: $1 - "check", "replace" or "rollback"
# Exits:     1 on unknown/missing argument (after printing usage)
#######################################
main() {
    case "$1" in
        check)
            echo "=== 执行文件差异检查(不修改文件) ==="
            check_files_exist "check"
            show_diffs
            ;;
        replace)
            echo "=== 执行文件替换操作(自动备份) ==="
            check_files_exist "replace"
            replace_files
            ;;
        rollback)
            echo "=== 执行文件回滚操作(恢复备份) ==="
            check_files_exist "rollback"
            rollback_files
            ;;
        *)
            echo "用法:$0 [check|replace|rollback]"
            echo "  check    - 仅检查所有文件对的差异,不做修改"
            echo "  replace  - 备份所有目标文件并执行替换,完成后验证差异"
            echo "  rollback - 回滚替换操作(恢复 .bak 备份文件)"
            exit 1
            ;;
    esac
}

# Run the dispatcher; only the first argument is consumed.
main "$1"
|
||||
47
docker/paperless/plugins/em_reports_consume.sh
Normal file
47
docker/paperless/plugins/em_reports_consume.sh
Normal file
@ -0,0 +1,47 @@
|
||||
#!/bin/bash
#
# Move downloaded stock-report PDFs from the shared data folder into the
# paperless consume directory, fixing ownership and permissions on the way.
# Progress and errors are appended to $LOG.

SRC="/volume1/docker/sharedata/stock_data/pdfs"
DST="/volume1/docker/sharedata/stock_data/em_reports_consume"
LOG="/volume1/docker/projects/devops/docker/paperless/plugins/log/paperless.log"

# Ownership applied to every moved file (paperless container user).
TARGET_UID=1000
TARGET_GID=1000

# FIX: create the log directory BEFORE the first write to $LOG. The
# original script ran the directory checks (which tee -a to $LOG) before
# ensuring the log directory existed, so first-run error logging failed.
LOG_DIR=$(dirname "$LOG")
if [ ! -d "$LOG_DIR" ]; then
    mkdir -p "$LOG_DIR"
    echo "$(date '+%F %T') [INFO] log目录不存在,已创建: $LOG_DIR" | tee -a "$LOG"
fi

# Verify source and destination directories.
if [ ! -d "$SRC" ]; then
    echo "$(date '+%F %T') [ERROR] 源目录不存在: $SRC" | tee -a "$LOG"
    exit 1
fi
if [ ! -d "$DST" ]; then
    echo "$(date '+%F %T') [ERROR] 目标目录不存在: $DST" | tee -a "$LOG"
    exit 1
fi

COUNT=0
for f in "$SRC"/*.pdf; do
    # Skip the literal glob when no PDFs match.
    [ -f "$f" ] || continue

    # Copy with owner/group/mode applied, then delete the source (= move).
    # FIX: install -D needs the full destination FILE name; the original
    # passed the directory "$DST", which GNU install rejects
    # ("cannot overwrite directory").
    if install -D -o "$TARGET_UID" -g "$TARGET_GID" -m 644 "$f" "$DST/${f##*/}"; then
        rm -f "$f"
        echo "$(date '+%F %T') [OK] Moved: $f" >> "$LOG"
        COUNT=$((COUNT + 1))

        # Every 100 files, report progress to the screen and the log.
        # (Original comment said 10, but the code checked 100 — keep 100.)
        if (( COUNT % 100 == 0 )); then
            PROGRESS_MSG="$(date '+%F %T') [PROGRESS] 已移动 $COUNT 个文件"
            echo "$PROGRESS_MSG" | tee -a "$LOG"
        fi
    else
        echo "$(date '+%F %T') [FAIL] Failed: $f" >> "$LOG"
    fi
done

echo "$(date '+%F %T') [INFO] 搬运完成,共移动 $COUNT 个文件" | tee -a "$LOG"
|
||||
|
||||
472
docker/paperless/plugins/origin_parsers.py
Normal file
472
docker/paperless/plugins/origin_parsers.py
Normal file
@ -0,0 +1,472 @@
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from django.conf import settings
|
||||
from PIL import Image
|
||||
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from documents.utils import maybe_override_pixel_limit
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.models import ArchiveFileChoices
|
||||
from paperless.models import CleanChoices
|
||||
from paperless.models import ModeChoices
|
||||
|
||||
|
||||
class NoTextFoundException(Exception):
    # Raised internally by parse() when neither the sidecar nor the
    # archive PDF yielded any text; triggers the force-OCR fallback.
    pass
|
||||
|
||||
|
||||
class RtlLanguageException(Exception):
    # Marker exception for right-to-left language handling.
    # NOTE(review): not raised anywhere in this file — possibly kept for
    # compatibility with callers elsewhere; confirm before removing.
    pass
|
||||
|
||||
|
||||
class RasterisedDocumentParser(DocumentParser):
|
||||
"""
|
||||
This parser uses Tesseract to try and get some text out of a rasterised
|
||||
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
|
||||
"""
|
||||
|
||||
logging_name = "paperless.parsing.tesseract"
|
||||
|
||||
    def get_settings(self) -> OcrConfig:
        """
        This parser uses the OCR configuration settings to parse documents.

        Returns a fresh OcrConfig instance built from the current
        Paperless configuration.
        """
        return OcrConfig()
|
||||
|
||||
def get_page_count(self, document_path, mime_type):
|
||||
page_count = None
|
||||
if mime_type == "application/pdf":
|
||||
try:
|
||||
import pikepdf
|
||||
|
||||
with pikepdf.Pdf.open(document_path) as pdf:
|
||||
page_count = len(pdf.pages)
|
||||
except Exception as e:
|
||||
self.log.warning(
|
||||
f"Unable to determine PDF page count {document_path}: {e}",
|
||||
)
|
||||
return page_count
|
||||
|
||||
def extract_metadata(self, document_path, mime_type):
|
||||
result = []
|
||||
if mime_type == "application/pdf":
|
||||
import pikepdf
|
||||
|
||||
namespace_pattern = re.compile(r"\{(.*)\}(.*)")
|
||||
|
||||
pdf = pikepdf.open(document_path)
|
||||
meta = pdf.open_metadata()
|
||||
for key, value in meta.items():
|
||||
if isinstance(value, list):
|
||||
value = " ".join([str(e) for e in value])
|
||||
value = str(value)
|
||||
try:
|
||||
m = namespace_pattern.match(key)
|
||||
if m is None: # pragma: no cover
|
||||
continue
|
||||
namespace = m.group(1)
|
||||
key_value = m.group(2)
|
||||
try:
|
||||
namespace.encode("utf-8")
|
||||
key_value.encode("utf-8")
|
||||
except UnicodeEncodeError as e: # pragma: no cover
|
||||
self.log.debug(f"Skipping metadata key {key}: {e}")
|
||||
continue
|
||||
result.append(
|
||||
{
|
||||
"namespace": namespace,
|
||||
"prefix": meta.REVERSE_NS[namespace],
|
||||
"key": key_value,
|
||||
"value": value,
|
||||
},
|
||||
)
|
||||
except Exception as e:
|
||||
self.log.warning(
|
||||
f"Error while reading metadata {key}: {value}. Error: {e}",
|
||||
)
|
||||
return result
|
||||
|
||||
    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """Generate a thumbnail, preferring the OCR'ed archive PDF if set."""
        # self.archive_path is only populated after a successful parse();
        # fall back to the original document otherwise.
        return make_thumbnail_from_pdf(
            self.archive_path or document_path,
            self.tempdir,
            self.logging_group,
        )
|
||||
|
||||
def is_image(self, mime_type) -> bool:
|
||||
return mime_type in [
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
"image/tiff",
|
||||
"image/bmp",
|
||||
"image/gif",
|
||||
"image/webp",
|
||||
"image/heic",
|
||||
]
|
||||
|
||||
    def has_alpha(self, image) -> bool:
        """Return True when the image has an alpha channel (RGBA/LA modes)."""
        # img2pdf (used downstream by ocrmypdf) cannot handle alpha;
        # callers strip it via remove_alpha() when this returns True.
        with Image.open(image) as im:
            return im.mode in ("RGBA", "LA")
|
||||
|
||||
    def remove_alpha(self, image_path: str) -> Path:
        """
        Strip the alpha channel from an image using the configured
        convert binary (ImageMagick).

        Writes the result to "image-no-alpha" inside this parser's temp
        directory and returns that path; the input file is untouched.
        """
        no_alpha_image = Path(self.tempdir) / "image-no-alpha"
        run_subprocess(
            [
                settings.CONVERT_BINARY,
                "-alpha",
                "off",
                image_path,
                no_alpha_image,
            ],
            logger=self.log,
        )
        return no_alpha_image
|
||||
|
||||
    def get_dpi(self, image) -> int | None:
        """Return the horizontal DPI stored in the image metadata, or None."""
        try:
            with Image.open(image) as im:
                # "dpi" is (x, y); only the horizontal value is used.
                x, _ = im.info["dpi"]
                return round(x)
        except Exception as e:
            # Missing "dpi" key or unreadable file both land here.
            self.log.warning(f"Error while getting DPI from image {image}: {e}")
            return None
|
||||
|
||||
    def calculate_a4_dpi(self, image) -> int | None:
        """Estimate DPI by assuming the image is an A4-width page, or None."""
        try:
            with Image.open(image) as im:
                width, _ = im.size
                # divide image width by A4 width (210mm) in inches.
                dpi = int(width / (21 / 2.54))
                self.log.debug(f"Estimated DPI {dpi} based on image width {width}")
                return dpi

        except Exception as e:
            self.log.warning(f"Error while calculating DPI for image {image}: {e}")
            return None
|
||||
|
||||
    def extract_text(
        self,
        sidecar_file: Path | None,
        pdf_file: Path,
    ) -> str | None:
        """
        Extract text for a document, preferring the OCR sidecar file and
        falling back to running pdftotext on the PDF.

        Returns post-processed text, or None when the PDF does not exist
        or pdftotext fails (e.g. the file is not actually a PDF).
        """
        # When re-doing OCR, the sidecar contains ONLY the new text, not
        # the whole text, so do not utilize it in that case
        if (
            sidecar_file is not None
            and sidecar_file.is_file()
            and self.settings.mode != "redo"
        ):
            text = self.read_file_handle_unicode_errors(sidecar_file)

            if "[OCR skipped on page" not in text:
                # This happens when there's already text in the input file.
                # The sidecar file will only contain text for OCR'ed pages.
                self.log.debug("Using text from sidecar file")
                return post_process_text(text)
            else:
                self.log.debug("Incomplete sidecar file: discarding.")

        # no success with the sidecar file, try PDF

        if not Path(pdf_file).is_file():
            return None

        try:
            text = None
            # Temp file lives in self.tempdir so it is cleaned up with
            # the rest of the parser's working files.
            with tempfile.NamedTemporaryFile(
                mode="w+",
                dir=self.tempdir,
            ) as tmp:
                run_subprocess(
                    [
                        "pdftotext",
                        "-q",
                        "-layout",
                        "-enc",
                        "UTF-8",
                        pdf_file,
                        tmp.name,
                    ],
                    logger=self.log,
                )
                text = self.read_file_handle_unicode_errors(Path(tmp.name))

            return post_process_text(text)

        except Exception:
            # If pdftotext fails, fall back to OCR.
            self.log.warning(
                "Error while getting text from PDF document with pdftotext",
                exc_info=True,
            )
            # probably not a PDF file.
            return None
|
||||
|
||||
    def construct_ocrmypdf_parameters(
        self,
        input_file,
        mime_type,
        output_file,
        sidecar_file,
        *,
        safe_fallback=False,
    ):
        """
        Build the keyword-argument dict passed to ocrmypdf.ocr().

        Combines the configured OCR mode, cleaning, deskew, rotation and
        page-limit settings; for image inputs also resolves a usable DPI
        (metadata -> OCR_IMAGE_DPI setting -> A4 estimate) and strips any
        alpha channel. With safe_fallback=True, force_ocr is used
        regardless of the configured mode.

        Raises ParseError on an invalid mode or when no DPI can be
        determined for an image input.
        """
        if TYPE_CHECKING:
            assert isinstance(self.settings, OcrConfig)
        ocrmypdf_args = {
            "input_file": input_file,
            "output_file": output_file,
            # need to use threads, since this will be run in daemonized
            # processes via the task library.
            "use_threads": True,
            "jobs": settings.THREADS_PER_WORKER,
            "language": self.settings.language,
            "output_type": self.settings.output_type,
            "progress_bar": False,
        }

        # Color conversion only applies to PDF/A output types.
        if "pdfa" in ocrmypdf_args["output_type"]:
            ocrmypdf_args["color_conversion_strategy"] = (
                self.settings.color_conversion_strategy
            )

        # Map the configured mode onto mutually exclusive ocrmypdf flags.
        if self.settings.mode == ModeChoices.FORCE or safe_fallback:
            ocrmypdf_args["force_ocr"] = True
        elif self.settings.mode in {
            ModeChoices.SKIP,
            ModeChoices.SKIP_NO_ARCHIVE,
        }:
            ocrmypdf_args["skip_text"] = True
        elif self.settings.mode == ModeChoices.REDO:
            ocrmypdf_args["redo_ocr"] = True
        else:  # pragma: no cover
            raise ParseError(f"Invalid ocr mode: {self.settings.mode}")

        if self.settings.clean == CleanChoices.CLEAN:
            ocrmypdf_args["clean"] = True
        elif self.settings.clean == CleanChoices.FINAL:
            if self.settings.mode == ModeChoices.REDO:
                ocrmypdf_args["clean"] = True
            else:
                # --clean-final is not compatible with --redo-ocr
                ocrmypdf_args["clean_final"] = True

        if self.settings.deskew and self.settings.mode != ModeChoices.REDO:
            # --deskew is not compatible with --redo-ocr
            ocrmypdf_args["deskew"] = True

        if self.settings.rotate:
            ocrmypdf_args["rotate_pages"] = True
            ocrmypdf_args["rotate_pages_threshold"] = self.settings.rotate_threshold

        if self.settings.pages is not None and self.settings.pages > 0:
            ocrmypdf_args["pages"] = f"1-{self.settings.pages}"
        else:
            # sidecar is incompatible with pages
            ocrmypdf_args["sidecar"] = sidecar_file

        if self.is_image(mime_type):
            # This may be required, depending on the known information
            maybe_override_pixel_limit()

            dpi = self.get_dpi(input_file)
            a4_dpi = self.calculate_a4_dpi(input_file)

            if self.has_alpha(input_file):
                self.log.info(
                    f"Removing alpha layer from {input_file} "
                    "for compatibility with img2pdf",
                )
                # Replace the input file with the non-alpha
                ocrmypdf_args["input_file"] = self.remove_alpha(input_file)

            # DPI resolution order: image metadata, then the configured
            # OCR_IMAGE_DPI, then the A4-width estimate.
            if dpi:
                self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
                ocrmypdf_args["image_dpi"] = dpi
            elif self.settings.image_dpi is not None:
                ocrmypdf_args["image_dpi"] = self.settings.image_dpi
            elif a4_dpi:
                ocrmypdf_args["image_dpi"] = a4_dpi
            else:
                raise ParseError(
                    f"Cannot produce archive PDF for image {input_file}, "
                    f"no DPI information is present in this image and "
                    f"OCR_IMAGE_DPI is not set.",
                )
            if ocrmypdf_args["image_dpi"] < 70:  # pragma: no cover
                self.log.warning(
                    f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail",
                )

        # User-supplied args override everything assembled above.
        if self.settings.user_args is not None:
            try:
                ocrmypdf_args = {**ocrmypdf_args, **self.settings.user_args}
            except Exception as e:
                self.log.warning(
                    f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                    f"they will not be used. Error: {e}",
                )

        if (
            self.settings.max_image_pixel is not None
            and self.settings.max_image_pixel >= 0
        ):
            # Convert pixels to mega-pixels and provide to ocrmypdf
            max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0
            msg = (
                "OCR pixel limit is disabled!"
                if max_pixels_mpixels == 0
                else f"Calculated {max_pixels_mpixels} megapixels for OCR"
            )
            self.log.debug(msg)
            ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels

        return ocrmypdf_args
|
||||
|
||||
    def parse(self, document_path: Path, mime_type, file_name=None):
        """
        Parse a document: extract its text and (usually) produce an
        OCR'ed archive PDF.

        Flow: try to read existing text from a PDF; if the settings say
        an archive is not needed and text exists, stop there. Otherwise
        run OCRmyPDF, with a force-OCR fallback when the first pass finds
        no text, and finally fall back to the original text (or "") if
        everything else failed. Results are stored on self.text and
        self.archive_path.

        Raises ParseError for unrecoverable OCR failures.
        """
        # This forces tesseract to use one core per page.
        os.environ["OMP_THREAD_LIMIT"] = "1"
        # Minimum character count for extracted text to be considered real.
        VALID_TEXT_LENGTH = 50

        if mime_type == "application/pdf":
            text_original = self.extract_text(None, document_path)
            original_has_text = (
                text_original is not None and len(text_original) > VALID_TEXT_LENGTH
            )
        else:
            text_original = None
            original_has_text = False

        # If the original has text, and the user doesn't want an archive,
        # we're done here
        skip_archive_for_text = (
            self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
            or self.settings.skip_archive_file
            in {
                ArchiveFileChoices.WITH_TEXT,
                ArchiveFileChoices.ALWAYS,
            }
        )
        if skip_archive_for_text and original_has_text:
            # NOTE(review): this logs the ENTIRE extracted document text
            # at debug level — potentially huge and may leak document
            # contents into logs; consider truncating.
            self.log.debug(f"Document has text, skipping OCRmyPDF entirely. {text_original}")
            self.text = text_original
            return

        # Either no text was in the original or there should be an archive
        # file created, so OCR the file and create an archive with any
        # text located via OCR

        import ocrmypdf
        from ocrmypdf import EncryptedPdfError
        from ocrmypdf import InputFileError
        from ocrmypdf import SubprocessOutputError
        from ocrmypdf.exceptions import DigitalSignatureError

        archive_path = Path(self.tempdir) / "archive.pdf"
        sidecar_file = Path(self.tempdir) / "sidecar.txt"

        args = self.construct_ocrmypdf_parameters(
            document_path,
            mime_type,
            archive_path,
            sidecar_file,
        )

        try:
            self.log.debug(f"Calling OCRmyPDF with args: {args}")
            ocrmypdf.ocr(**args)

            # ALWAYS means "always skip the archive file": do not keep it.
            if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
                self.archive_path = archive_path

            self.text = self.extract_text(sidecar_file, archive_path)

            if not self.text:
                raise NoTextFoundException("No text was found in the original document")
        except (DigitalSignatureError, EncryptedPdfError):
            self.log.warning(
                "This file is encrypted and/or signed, OCR is impossible. Using "
                "any text present in the original file.",
            )
            if original_has_text:
                self.text = text_original
        except SubprocessOutputError as e:
            if "Ghostscript PDF/A rendering" in str(e):
                self.log.warning(
                    "Ghostscript PDF/A rendering failed, consider setting "
                    "PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
                )

            raise ParseError(
                f"SubprocessOutputError: {e!s}. See logs for more information.",
            ) from e
        except (NoTextFoundException, InputFileError) as e:
            self.log.warning(
                f"Encountered an error while running OCR: {e!s}. "
                f"Attempting force OCR to get the text.",
            )

            archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf"
            sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt"

            # Attempt to run OCR with safe settings.

            args = self.construct_ocrmypdf_parameters(
                document_path,
                mime_type,
                archive_path_fallback,
                sidecar_file_fallback,
                safe_fallback=True,
            )

            try:
                self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
                ocrmypdf.ocr(**args)

                # Don't return the archived file here, since this file
                # is bigger and blurry due to --force-ocr.

                self.text = self.extract_text(
                    sidecar_file_fallback,
                    archive_path_fallback,
                )

            except Exception as e:
                # If this fails, we have a serious issue at hand.
                raise ParseError(f"{e.__class__.__name__}: {e!s}") from e

        except Exception as e:
            # Anything else is probably serious.
            raise ParseError(f"{e.__class__.__name__}: {e!s}") from e

        # As a last resort, if we still don't have any text for any reason,
        # try to extract the text from the original document.
        if not self.text:
            if original_has_text:
                self.text = text_original
            else:
                self.log.warning(
                    f"No text was found in {document_path}, the content will be empty.",
                )
                self.text = ""
|
||||
|
||||
|
||||
def post_process_text(text):
    """
    Normalize whitespace in extracted text.

    Returns None for falsy input. Otherwise: collapses runs of
    horizontal whitespace to a single space, removes indentation after
    line breaks, strips trailing whitespace at the end, trims the
    result, and replaces NUL bytes (which break saving to postgres).
    """
    if not text:
        return None

    # Runs of horizontal whitespace (not \r or \n) become one space.
    cleaned = re.sub(r"([^\S\r\n]+)", " ", text)
    # Whitespace directly after a line break is dropped.
    cleaned = re.sub(r"([\n\r]+)([^\S\n\r]+)", "\\1", cleaned)
    # Horizontal whitespace at the very end of the text is dropped.
    cleaned = re.sub(r"([^\S\n\r]+)$", "", cleaned)

    # TODO: this needs a rework
    # replace \0 prevents issues with saving to postgres.
    # text may contain \0 when this character is present in PDF files.
    return cleaned.strip().replace("\0", " ")
|
||||
@ -1,41 +0,0 @@
|
||||
-- documents_correspondent definition
|
||||
|
||||
CREATE TABLE "documents_correspondent" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "name" varchar(128) NOT NULL, "match" varchar(256) NOT NULL, "matching_algorithm" integer unsigned NOT NULL CHECK ("matching_algorithm" >= 0), "is_insensitive" bool NOT NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, CONSTRAINT "documents_correspondent_unique_name_owner" UNIQUE ("name", "owner_id"));
|
||||
|
||||
CREATE UNIQUE INDEX "documents_correspondent_name_uniq" ON "documents_correspondent" ("name") WHERE "owner_id" IS NULL;
|
||||
CREATE INDEX "documents_correspondent_owner_id_078f7f8a" ON "documents_correspondent" ("owner_id");
|
||||
|
||||
-- documents_customfield definition
|
||||
|
||||
CREATE TABLE "documents_customfield" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "created" datetime NOT NULL, "name" varchar(128) NOT NULL, "data_type" varchar(50) NOT NULL, "extra_data" text NULL CHECK ((JSON_VALID("extra_data") OR "extra_data" IS NULL)), CONSTRAINT "documents_customfield_unique_name" UNIQUE ("name"));
|
||||
|
||||
CREATE INDEX "documents_customfield_created_501ef047" ON "documents_customfield" ("created");
|
||||
|
||||
-- documents_customfieldinstance definition
|
||||
|
||||
CREATE TABLE "documents_customfieldinstance" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "created" datetime NOT NULL, "value_text" varchar(128) NULL, "value_bool" bool NULL, "value_url" varchar(200) NULL, "value_date" date NULL, "value_int" integer NULL, "value_float" real NULL, "value_monetary" varchar(128) NULL, "document_id" integer NOT NULL REFERENCES "documents_document" ("id") DEFERRABLE INITIALLY DEFERRED, "field_id" integer NOT NULL REFERENCES "documents_customfield" ("id") DEFERRABLE INITIALLY DEFERRED, "value_document_ids" text NULL CHECK ((JSON_VALID("value_document_ids") OR "value_document_ids" IS NULL)), "value_monetary_amount" decimal GENERATED ALWAYS AS ((CAST(CASE WHEN "value_monetary" REGEXP '^\d+' THEN CAST(SUBSTR("value_monetary", 1) AS decimal) ELSE CAST(SUBSTR("value_monetary", 4) AS decimal) END AS NUMERIC))) STORED, "deleted_at" datetime NULL, "restored_at" datetime NULL, "transaction_id" char(32) NULL, "value_select" varchar(16) NULL, CONSTRAINT "documents_customfieldinstance_unique_document_field" UNIQUE ("document_id", "field_id"));
|
||||
|
||||
CREATE INDEX "documents_customfieldinstance_created_75f17f1d" ON "documents_customfieldinstance" ("created");
|
||||
CREATE INDEX "documents_customfieldinstance_document_id_610a968e" ON "documents_customfieldinstance" ("document_id");
|
||||
CREATE INDEX "documents_customfieldinstance_field_id_6c59e32f" ON "documents_customfieldinstance" ("field_id");
|
||||
|
||||
|
||||
-- documents_document definition
|
||||
|
||||
CREATE TABLE "documents_document" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "title" varchar(128) NOT NULL, "content" text NOT NULL, "created" datetime NOT NULL, "modified" datetime NOT NULL, "correspondent_id" integer NULL REFERENCES "documents_correspondent" ("id") DEFERRABLE INITIALLY DEFERRED, "checksum" varchar(32) NOT NULL UNIQUE, "added" datetime NOT NULL, "storage_type" varchar(11) NOT NULL, "filename" varchar(1024) NULL UNIQUE, "archive_serial_number" integer unsigned NULL UNIQUE CHECK ("archive_serial_number" >= 0), "document_type_id" integer NULL REFERENCES "documents_documenttype" ("id") DEFERRABLE INITIALLY DEFERRED, "mime_type" varchar(256) NOT NULL, "archive_checksum" varchar(32) NULL, "archive_filename" varchar(1024) NULL UNIQUE, "storage_path_id" integer NULL REFERENCES "documents_storagepath" ("id") DEFERRABLE INITIALLY DEFERRED, "original_filename" varchar(1024) NULL, "deleted_at" datetime NULL, "restored_at" datetime NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, "transaction_id" char(32) NULL, "page_count" integer unsigned NULL CHECK ("page_count" >= 0));
|
||||
|
||||
CREATE INDEX "documents_document_title_6b08e02a" ON "documents_document" ("title");
|
||||
CREATE INDEX "documents_document_created_bedd0818" ON "documents_document" ("created");
|
||||
CREATE INDEX "documents_document_modified_2eae15bc" ON "documents_document" ("modified");
|
||||
CREATE INDEX "documents_document_correspondent_id_6164eb0c" ON "documents_document" ("correspondent_id");
|
||||
CREATE INDEX "documents_document_added_28cfa360" ON "documents_document" ("added");
|
||||
CREATE INDEX "documents_document_document_type_id_1f88b50c" ON "documents_document" ("document_type_id");
|
||||
CREATE INDEX "documents_document_storage_path_id_07d27bdb" ON "documents_document" ("storage_path_id");
|
||||
CREATE INDEX "documents_document_owner_id_04d2b723" ON "documents_document" ("owner_id");
|
||||
|
||||
-- documents_documenttype definition
|
||||
|
||||
CREATE TABLE "documents_documenttype" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "name" varchar(128) NOT NULL, "match" varchar(256) NOT NULL, "matching_algorithm" integer unsigned NOT NULL CHECK ("matching_algorithm" >= 0), "is_insensitive" bool NOT NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, CONSTRAINT "documents_documenttype_unique_name_owner" UNIQUE ("name", "owner_id"));
|
||||
|
||||
CREATE UNIQUE INDEX "documents_documenttype_name_uniq" ON "documents_documenttype" ("name") WHERE "owner_id" IS NULL;
|
||||
CREATE INDEX "documents_documenttype_owner_id_a19f201d" ON "documents_documenttype" ("owner_id");
|
||||
@ -1,63 +0,0 @@
|
||||
我提供的文件,是 paperless 的SQLite数据库的关键表。现在我们编写它的 PAPERLESS_POST_CONSUME_SCRIPT。需求如下:
|
||||
|
||||
1, 我们提供的pdf文件格式为 {publish_date}_{report_type}_{org_sname}_{industry_name}_{stock_name}_{title}.pdf
|
||||
2,我们提取上面的各个字段,然后:
|
||||
1) report_type 对应到 documents_documenttype.name 所以我们要查询 documents_documenttype 表,如果对应的name不存在,则插入一条记录;然后得到对应的 documents_documenttype.id
|
||||
2) org_sname 对应到 documents_correspondent.name 所以我们要查询 documents_correspondent 表,如果对应的name 不存在,则插入一条记录,然后得到对应的 documents_correspondent.id
|
||||
3) 检查 documents_customfield 表是否包含 '行业' 和 '股票名称' 字段,如果不存在,则创建; 查到他们分别对应的 documents_customfield.id , 记为 stockname_id, industry_id
|
||||
3,我们开始更新数据表:
|
||||
1) 更新 documents_document 表对应的记录, created = publish_date, correspondent_id = documents_correspondent.id , document_type_id = documents_documenttype.id, title={title}
|
||||
2) 向 documents_customfieldinstance 插入两条记录,分别为 (document_id, stockname_id, stock_name) 和 (document_id, industry_id, industry_name)
|
||||
|
||||
好了,请你根据以上需求,完成这个python脚本。注意异常情况的处理,以及日志输出。如果文件名无法匹配以上的格式,则忽略,不用处理。
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Paperless makes use of the Django REST Framework standard API interface. It provides a browsable API for most of its endpoints, which you can inspect at http://<paperless-host>:<port>/api/. This also documents most of the available filters and ordering fields.
|
||||
|
||||
The API provides the following main endpoints:
|
||||
|
||||
/api/correspondents/: Full CRUD support.
|
||||
/api/custom_fields/: Full CRUD support.
|
||||
/api/documents/: Full CRUD support, except POSTing new documents. See below.
|
||||
/api/document_types/: Full CRUD support.
|
||||
/api/groups/: Full CRUD support.
|
||||
/api/logs/: Read-Only.
|
||||
/api/mail_accounts/: Full CRUD support.
|
||||
/api/mail_rules/: Full CRUD support.
|
||||
/api/profile/: GET, PATCH
|
||||
/api/share_links/: Full CRUD support.
|
||||
/api/storage_paths/: Full CRUD support.
|
||||
/api/tags/: Full CRUD support.
|
||||
/api/tasks/: Read-only.
|
||||
/api/users/: Full CRUD support.
|
||||
/api/workflows/: Full CRUD support.
|
||||
/api/search/ GET, see below.
|
||||
All of these endpoints except for the logging endpoint allow you to fetch (and edit and delete where appropriate) individual objects by appending their primary key to the path, e.g. /api/documents/454/.
|
||||
|
||||
The objects served by the document endpoint contain the following fields:
|
||||
|
||||
id: ID of the document. Read-only.
|
||||
title: Title of the document.
|
||||
content: Plain text content of the document.
|
||||
tags: List of IDs of tags assigned to this document, or empty list.
|
||||
document_type: Document type of this document, or null.
|
||||
correspondent: Correspondent of this document or null.
|
||||
created: The date time at which this document was created.
|
||||
created_date: The date (YYYY-MM-DD) at which this document was created. Optional. If also passed with created, this is ignored.
|
||||
modified: The date at which this document was last edited in paperless. Read-only.
|
||||
added: The date at which this document was added to paperless. Read-only.
|
||||
archive_serial_number: The identifier of this document in a physical document archive.
|
||||
original_file_name: Verbose filename of the original document. Read-only.
|
||||
archived_file_name: Verbose filename of the archived document. Read-only. Null if no archived document is available.
|
||||
notes: Array of notes associated with the document.
|
||||
page_count: Number of pages.
|
||||
set_permissions: Allows setting document permissions. Optional, write-only. See below.
|
||||
custom_fields: Array of custom fields & values, specified as { field: CUSTOM_FIELD_ID, value: VALUE }
|
||||
|
||||
|
||||
以上是paperless提供的api。我们现在使用 http://localhost:8000 来访问它。那么,我想对编号为19的文档进行查询,以及更新操作,应该如何写对应的python代码?
|
||||
|
||||
|
||||
@ -11,7 +11,8 @@ from requests.exceptions import RequestException
|
||||
|
||||
# Paperless 服务器信息
|
||||
PAPERLESS_URL = "http://localhost:8000/api"
|
||||
AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth 认证
|
||||
#AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth 认证, mac上用这个
|
||||
AUTH = HTTPBasicAuth("admin", "paperless") # Basic Auth 认证,NAS上用这个
|
||||
|
||||
# 日志配置
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||
@ -22,7 +23,7 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(me
|
||||
DB_PATH = "/usr/src/paperless/data/db.sqlite3"
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
enable_db = True
|
||||
enable_db = False # 标准用法,用API
|
||||
|
||||
# 正则解析文件名
|
||||
FILENAME_PATTERN = re.compile(r"(\d{4}-\d{2}-\d{2})_(.*?)_(.*?)_(.*?)_(.*?)_(.*)\.pdf")
|
||||
|
||||
484
docker/paperless/plugins/parsers.py
Executable file
484
docker/paperless/plugins/parsers.py
Executable file
@ -0,0 +1,484 @@
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from django.conf import settings
|
||||
from PIL import Image
|
||||
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from documents.utils import maybe_override_pixel_limit
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.models import ArchiveFileChoices
|
||||
from paperless.models import CleanChoices
|
||||
from paperless.models import ModeChoices
|
||||
|
||||
|
||||
class NoTextFoundException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class RtlLanguageException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class RasterisedDocumentParser(DocumentParser):
|
||||
"""
|
||||
This parser uses Tesseract to try and get some text out of a rasterised
|
||||
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
|
||||
"""
|
||||
|
||||
logging_name = "paperless.parsing.tesseract"
|
||||
|
||||
def get_settings(self) -> OcrConfig:
|
||||
"""
|
||||
This parser uses the OCR configuration settings to parse documents
|
||||
"""
|
||||
return OcrConfig()
|
||||
|
||||
def get_page_count(self, document_path, mime_type):
|
||||
page_count = None
|
||||
if mime_type == "application/pdf":
|
||||
try:
|
||||
import pikepdf
|
||||
|
||||
with pikepdf.Pdf.open(document_path) as pdf:
|
||||
page_count = len(pdf.pages)
|
||||
except Exception as e:
|
||||
self.log.warning(
|
||||
f"Unable to determine PDF page count {document_path}: {e}",
|
||||
)
|
||||
return page_count
|
||||
|
||||
def extract_metadata(self, document_path, mime_type):
|
||||
result = []
|
||||
if mime_type == "application/pdf":
|
||||
import pikepdf
|
||||
|
||||
namespace_pattern = re.compile(r"\{(.*)\}(.*)")
|
||||
|
||||
pdf = pikepdf.open(document_path)
|
||||
meta = pdf.open_metadata()
|
||||
for key, value in meta.items():
|
||||
if isinstance(value, list):
|
||||
value = " ".join([str(e) for e in value])
|
||||
value = str(value)
|
||||
try:
|
||||
m = namespace_pattern.match(key)
|
||||
if m is None: # pragma: no cover
|
||||
continue
|
||||
namespace = m.group(1)
|
||||
key_value = m.group(2)
|
||||
try:
|
||||
namespace.encode("utf-8")
|
||||
key_value.encode("utf-8")
|
||||
except UnicodeEncodeError as e: # pragma: no cover
|
||||
self.log.debug(f"Skipping metadata key {key}: {e}")
|
||||
continue
|
||||
result.append(
|
||||
{
|
||||
"namespace": namespace,
|
||||
"prefix": meta.REVERSE_NS[namespace],
|
||||
"key": key_value,
|
||||
"value": value,
|
||||
},
|
||||
)
|
||||
except Exception as e:
|
||||
self.log.warning(
|
||||
f"Error while reading metadata {key}: {value}. Error: {e}",
|
||||
)
|
||||
return result
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
return make_thumbnail_from_pdf(
|
||||
self.archive_path or document_path,
|
||||
self.tempdir,
|
||||
self.logging_group,
|
||||
)
|
||||
|
||||
def is_image(self, mime_type) -> bool:
|
||||
return mime_type in [
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
"image/tiff",
|
||||
"image/bmp",
|
||||
"image/gif",
|
||||
"image/webp",
|
||||
"image/heic",
|
||||
]
|
||||
|
||||
def has_alpha(self, image) -> bool:
|
||||
with Image.open(image) as im:
|
||||
return im.mode in ("RGBA", "LA")
|
||||
|
||||
def remove_alpha(self, image_path: str) -> Path:
|
||||
no_alpha_image = Path(self.tempdir) / "image-no-alpha"
|
||||
run_subprocess(
|
||||
[
|
||||
settings.CONVERT_BINARY,
|
||||
"-alpha",
|
||||
"off",
|
||||
image_path,
|
||||
no_alpha_image,
|
||||
],
|
||||
logger=self.log,
|
||||
)
|
||||
return no_alpha_image
|
||||
|
||||
def get_dpi(self, image) -> int | None:
|
||||
try:
|
||||
with Image.open(image) as im:
|
||||
x, _ = im.info["dpi"]
|
||||
return round(x)
|
||||
except Exception as e:
|
||||
self.log.warning(f"Error while getting DPI from image {image}: {e}")
|
||||
return None
|
||||
|
||||
def calculate_a4_dpi(self, image) -> int | None:
|
||||
try:
|
||||
with Image.open(image) as im:
|
||||
width, _ = im.size
|
||||
# divide image width by A4 width (210mm) in inches.
|
||||
dpi = int(width / (21 / 2.54))
|
||||
self.log.debug(f"Estimated DPI {dpi} based on image width {width}")
|
||||
return dpi
|
||||
|
||||
except Exception as e:
|
||||
self.log.warning(f"Error while calculating DPI for image {image}: {e}")
|
||||
return None
|
||||
|
||||
def extract_text(
|
||||
self,
|
||||
sidecar_file: Path | None,
|
||||
pdf_file: Path,
|
||||
) -> str | None:
|
||||
# When re-doing OCR, the sidecar contains ONLY the new text, not
|
||||
# the whole text, so do not utilize it in that case
|
||||
if (
|
||||
sidecar_file is not None
|
||||
and sidecar_file.is_file()
|
||||
and self.settings.mode != "redo"
|
||||
):
|
||||
text = self.read_file_handle_unicode_errors(sidecar_file)
|
||||
|
||||
if "[OCR skipped on page" not in text:
|
||||
# This happens when there's already text in the input file.
|
||||
# The sidecar file will only contain text for OCR'ed pages.
|
||||
self.log.debug("Using text from sidecar file")
|
||||
return post_process_text(text)
|
||||
else:
|
||||
self.log.debug("Incomplete sidecar file: discarding.")
|
||||
|
||||
# no success with the sidecar file, try PDF
|
||||
|
||||
if not Path(pdf_file).is_file():
|
||||
return None
|
||||
|
||||
try:
|
||||
text = None
|
||||
with tempfile.NamedTemporaryFile(
|
||||
mode="w+",
|
||||
dir=self.tempdir,
|
||||
) as tmp:
|
||||
run_subprocess(
|
||||
[
|
||||
"pdftotext",
|
||||
"-q",
|
||||
"-layout",
|
||||
"-enc",
|
||||
"UTF-8",
|
||||
pdf_file,
|
||||
tmp.name,
|
||||
],
|
||||
logger=self.log,
|
||||
)
|
||||
text = self.read_file_handle_unicode_errors(Path(tmp.name))
|
||||
|
||||
return post_process_text(text)
|
||||
|
||||
except Exception:
|
||||
# If pdftotext fails, fall back to OCR.
|
||||
self.log.warning(
|
||||
"Error while getting text from PDF document with pdftotext",
|
||||
exc_info=True,
|
||||
)
|
||||
# probably not a PDF file.
|
||||
return None
|
||||
|
||||
def construct_ocrmypdf_parameters(
|
||||
self,
|
||||
input_file,
|
||||
mime_type,
|
||||
output_file,
|
||||
sidecar_file,
|
||||
*,
|
||||
safe_fallback=False,
|
||||
):
|
||||
if TYPE_CHECKING:
|
||||
assert isinstance(self.settings, OcrConfig)
|
||||
ocrmypdf_args = {
|
||||
"input_file": input_file,
|
||||
"output_file": output_file,
|
||||
# need to use threads, since this will be run in daemonized
|
||||
# processes via the task library.
|
||||
"use_threads": True,
|
||||
"jobs": settings.THREADS_PER_WORKER,
|
||||
"language": self.settings.language,
|
||||
"output_type": self.settings.output_type,
|
||||
"progress_bar": False,
|
||||
}
|
||||
|
||||
if "pdfa" in ocrmypdf_args["output_type"]:
|
||||
ocrmypdf_args["color_conversion_strategy"] = (
|
||||
self.settings.color_conversion_strategy
|
||||
)
|
||||
|
||||
if self.settings.mode == ModeChoices.FORCE or safe_fallback:
|
||||
ocrmypdf_args["force_ocr"] = True
|
||||
elif self.settings.mode in {
|
||||
ModeChoices.SKIP,
|
||||
ModeChoices.SKIP_NO_ARCHIVE,
|
||||
}:
|
||||
ocrmypdf_args["skip_text"] = True
|
||||
elif self.settings.mode == ModeChoices.REDO:
|
||||
ocrmypdf_args["redo_ocr"] = True
|
||||
else: # pragma: no cover
|
||||
raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
|
||||
|
||||
if self.settings.clean == CleanChoices.CLEAN:
|
||||
ocrmypdf_args["clean"] = True
|
||||
elif self.settings.clean == CleanChoices.FINAL:
|
||||
if self.settings.mode == ModeChoices.REDO:
|
||||
ocrmypdf_args["clean"] = True
|
||||
else:
|
||||
# --clean-final is not compatible with --redo-ocr
|
||||
ocrmypdf_args["clean_final"] = True
|
||||
|
||||
if self.settings.deskew and self.settings.mode != ModeChoices.REDO:
|
||||
# --deskew is not compatible with --redo-ocr
|
||||
ocrmypdf_args["deskew"] = True
|
||||
|
||||
if self.settings.rotate:
|
||||
ocrmypdf_args["rotate_pages"] = True
|
||||
ocrmypdf_args["rotate_pages_threshold"] = self.settings.rotate_threshold
|
||||
|
||||
if self.settings.pages is not None and self.settings.pages > 0:
|
||||
ocrmypdf_args["pages"] = f"1-{self.settings.pages}"
|
||||
else:
|
||||
# sidecar is incompatible with pages
|
||||
ocrmypdf_args["sidecar"] = sidecar_file
|
||||
|
||||
if self.is_image(mime_type):
|
||||
# This may be required, depending on the known information
|
||||
maybe_override_pixel_limit()
|
||||
|
||||
dpi = self.get_dpi(input_file)
|
||||
a4_dpi = self.calculate_a4_dpi(input_file)
|
||||
|
||||
if self.has_alpha(input_file):
|
||||
self.log.info(
|
||||
f"Removing alpha layer from {input_file} "
|
||||
"for compatibility with img2pdf",
|
||||
)
|
||||
# Replace the input file with the non-alpha
|
||||
ocrmypdf_args["input_file"] = self.remove_alpha(input_file)
|
||||
|
||||
if dpi:
|
||||
self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
|
||||
ocrmypdf_args["image_dpi"] = dpi
|
||||
elif self.settings.image_dpi is not None:
|
||||
ocrmypdf_args["image_dpi"] = self.settings.image_dpi
|
||||
elif a4_dpi:
|
||||
ocrmypdf_args["image_dpi"] = a4_dpi
|
||||
else:
|
||||
raise ParseError(
|
||||
f"Cannot produce archive PDF for image {input_file}, "
|
||||
f"no DPI information is present in this image and "
|
||||
f"OCR_IMAGE_DPI is not set.",
|
||||
)
|
||||
if ocrmypdf_args["image_dpi"] < 70: # pragma: no cover
|
||||
self.log.warning(
|
||||
f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail",
|
||||
)
|
||||
|
||||
if self.settings.user_args is not None:
|
||||
try:
|
||||
ocrmypdf_args = {**ocrmypdf_args, **self.settings.user_args}
|
||||
except Exception as e:
|
||||
self.log.warning(
|
||||
f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
|
||||
f"they will not be used. Error: {e}",
|
||||
)
|
||||
|
||||
if (
|
||||
self.settings.max_image_pixel is not None
|
||||
and self.settings.max_image_pixel >= 0
|
||||
):
|
||||
# Convert pixels to mega-pixels and provide to ocrmypdf
|
||||
max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0
|
||||
msg = (
|
||||
"OCR pixel limit is disabled!"
|
||||
if max_pixels_mpixels == 0
|
||||
else f"Calculated {max_pixels_mpixels} megapixels for OCR"
|
||||
)
|
||||
self.log.debug(msg)
|
||||
ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
|
||||
|
||||
return ocrmypdf_args
|
||||
|
||||
def parse(self, document_path: Path, mime_type, file_name=None):
|
||||
# This forces tesseract to use one core per page.
|
||||
os.environ["OMP_THREAD_LIMIT"] = "1"
|
||||
VALID_TEXT_LENGTH = 50
|
||||
|
||||
# skip ocr process entirely to save time.
|
||||
self.text = "defautl text"
|
||||
self.log.debug("skipping reading file entirely.")
|
||||
return
|
||||
|
||||
if mime_type == "application/pdf":
|
||||
text_original = self.extract_text(None, document_path)
|
||||
original_has_text = (
|
||||
text_original is not None and len(text_original) > VALID_TEXT_LENGTH
|
||||
)
|
||||
else:
|
||||
text_original = None
|
||||
original_has_text = False
|
||||
|
||||
# If the original has text, and the user doesn't want an archive,
|
||||
# we're done here
|
||||
skip_archive_for_text = (
|
||||
self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
|
||||
or self.settings.skip_archive_file
|
||||
in {
|
||||
ArchiveFileChoices.WITH_TEXT,
|
||||
ArchiveFileChoices.ALWAYS,
|
||||
}
|
||||
)
|
||||
|
||||
# force skip ocr process.
|
||||
if not original_has_text:
|
||||
original_has_text = True
|
||||
text_original = "this is default content, as we skipped ocr process..."
|
||||
self.log.warning("Cannot read text from Document, use default message.")
|
||||
|
||||
if skip_archive_for_text and original_has_text:
|
||||
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
|
||||
self.text = text_original
|
||||
return
|
||||
|
||||
# Either no text was in the original or there should be an archive
|
||||
# file created, so OCR the file and create an archive with any
|
||||
# text located via OCR
|
||||
|
||||
import ocrmypdf
|
||||
from ocrmypdf import EncryptedPdfError
|
||||
from ocrmypdf import InputFileError
|
||||
from ocrmypdf import SubprocessOutputError
|
||||
from ocrmypdf.exceptions import DigitalSignatureError
|
||||
|
||||
archive_path = Path(self.tempdir) / "archive.pdf"
|
||||
sidecar_file = Path(self.tempdir) / "sidecar.txt"
|
||||
|
||||
args = self.construct_ocrmypdf_parameters(
|
||||
document_path,
|
||||
mime_type,
|
||||
archive_path,
|
||||
sidecar_file,
|
||||
)
|
||||
|
||||
try:
|
||||
self.log.debug(f"Calling OCRmyPDF with args: {args}")
|
||||
ocrmypdf.ocr(**args)
|
||||
|
||||
if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
|
||||
self.archive_path = archive_path
|
||||
|
||||
self.text = self.extract_text(sidecar_file, archive_path)
|
||||
|
||||
if not self.text:
|
||||
raise NoTextFoundException("No text was found in the original document")
|
||||
except (DigitalSignatureError, EncryptedPdfError):
|
||||
self.log.warning(
|
||||
"This file is encrypted and/or signed, OCR is impossible. Using "
|
||||
"any text present in the original file.",
|
||||
)
|
||||
if original_has_text:
|
||||
self.text = text_original
|
||||
except SubprocessOutputError as e:
|
||||
if "Ghostscript PDF/A rendering" in str(e):
|
||||
self.log.warning(
|
||||
"Ghostscript PDF/A rendering failed, consider setting "
|
||||
"PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
|
||||
)
|
||||
|
||||
raise ParseError(
|
||||
f"SubprocessOutputError: {e!s}. See logs for more information.",
|
||||
) from e
|
||||
except (NoTextFoundException, InputFileError) as e:
|
||||
self.log.warning(
|
||||
f"Encountered an error while running OCR: {e!s}. "
|
||||
f"Attempting force OCR to get the text.",
|
||||
)
|
||||
|
||||
archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf"
|
||||
sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt"
|
||||
|
||||
# Attempt to run OCR with safe settings.
|
||||
|
||||
args = self.construct_ocrmypdf_parameters(
|
||||
document_path,
|
||||
mime_type,
|
||||
archive_path_fallback,
|
||||
sidecar_file_fallback,
|
||||
safe_fallback=True,
|
||||
)
|
||||
|
||||
try:
|
||||
self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
|
||||
ocrmypdf.ocr(**args)
|
||||
|
||||
# Don't return the archived file here, since this file
|
||||
# is bigger and blurry due to --force-ocr.
|
||||
|
||||
self.text = self.extract_text(
|
||||
sidecar_file_fallback,
|
||||
archive_path_fallback,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
# If this fails, we have a serious issue at hand.
|
||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||
|
||||
except Exception as e:
|
||||
# Anything else is probably serious.
|
||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||
|
||||
# As a last resort, if we still don't have any text for any reason,
|
||||
# try to extract the text from the original document.
|
||||
if not self.text:
|
||||
if original_has_text:
|
||||
self.text = text_original
|
||||
else:
|
||||
self.log.warning(
|
||||
f"No text was found in {document_path}, the content will be empty.",
|
||||
)
|
||||
self.text = ""
|
||||
|
||||
|
||||
def post_process_text(text):
|
||||
if not text:
|
||||
return None
|
||||
|
||||
collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
|
||||
no_leading_whitespace = re.sub(r"([\n\r]+)([^\S\n\r]+)", "\\1", collapsed_spaces)
|
||||
no_trailing_whitespace = re.sub(r"([^\S\n\r]+)$", "", no_leading_whitespace)
|
||||
|
||||
# TODO: this needs a rework
|
||||
# replace \0 prevents issues with saving to postgres.
|
||||
# text may contain \0 when this character is present in PDF files.
|
||||
return no_trailing_whitespace.strip().replace("\0", " ")
|
||||
37
docker/paperless/plugins/readme.md
Normal file
37
docker/paperless/plugins/readme.md
Normal file
@ -0,0 +1,37 @@
|
||||
## 登陆
|
||||
### 用户名: admin
|
||||
### 密码: paperless
|
||||
|
||||
## 需要指定用户名
|
||||
### 配置好 USERMAP_UID和USERMAP_GID,否则可能无法执行主机映射进去的脚本。
|
||||
### 详见 https://docs.paperless-ngx.com/configuration/#USERMAP_UID
|
||||
|
||||
## 自定义的文件名解析脚本
|
||||
```Bash
|
||||
# 文档
|
||||
https://docs.paperless-ngx.com/advanced_usage/#file-name-handling
|
||||
https://docs.paperless-ngx.com/configuration/#PAPERLESS_POST_CONSUME_SCRIPT
|
||||
|
||||
# 配置
|
||||
environment:
|
||||
PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
|
||||
```
|
||||
|
||||
|
||||
## 源码修改,可以通过在容器里执行 docker_patch.sh 脚本来完成
|
||||
### 对于无法简单读取pdf内容的文档,paperless会启动OCR扫描,且复杂情况下会执行两遍,非常慢而且消耗资源。只能通过修改源码解决:
|
||||
```Bash
|
||||
# /usr/src/paperless/src/paperless_tesseract/parsers.py :
|
||||
|
||||
# force skip ocr process.
|
||||
if not original_has_text:
|
||||
original_has_text = True
|
||||
text_original = "this is default content, as we skipped ocr process..."
|
||||
self.log.warning("Cannot read text from Document, use default message.")
|
||||
|
||||
if skip_archive_for_text and original_has_text:
|
||||
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
|
||||
self.text = text_original
|
||||
return
|
||||
|
||||
```
|
||||
@ -1,64 +0,0 @@
|
||||
|
||||
|
||||
-------------------------------------------------------|
|
||||
------------------- paperless 无纸化pdf管理 ------------|
|
||||
-------------------------------------------------------|
|
||||
|
||||
## 最好不要用命令,使用docker-compose.yml来创建,需要指定后端使用的数据库,以及redis!
|
||||
docker run -itd \
|
||||
--name paperless \
|
||||
--network devops \
|
||||
--platform linux/x86_64 \
|
||||
-e TZ="Asia/Shanghai" \
|
||||
-v /etc/localtime:/etc/localtime:ro \
|
||||
-v "$(pwd)/dockers/paperless/pdfs:/usr/src/paperless/data" \
|
||||
-v "$(pwd)/dockers/paperless/db:/usr/src/paperless/db" \
|
||||
-e USERMAP_UID=1000 -e USERMAP_GID=1000 \
|
||||
-p 8000:8000 \
|
||||
ghcr.io/paperless-ngx/paperless-ngx
|
||||
|
||||
|
||||
# 容器创建好之后,要手动设置密码(二选一操作,目前设置的 admin / admin)
|
||||
docker compose run --rm webserver createsuperuser
|
||||
python3 manage.py createsuperuser
|
||||
|
||||
# 已有文档,放在指定目录下,等系统自动加载(或者手工启动)
|
||||
cd /path/to/paperless/src/
|
||||
python3 manage.py document_consumer
|
||||
|
||||
# 自动解析文件名
|
||||
https://docs.paperless-ngx.com/advanced_usage/#file-name-handling
|
||||
https://docs.paperless-ngx.com/configuration/#PAPERLESS_POST_CONSUME_SCRIPT
|
||||
|
||||
environment:
|
||||
PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
|
||||
|
||||
|
||||
paperless 默认不会删除重复的文件,这会导致如果重复添加,会不停扫描,加载,报错。没找到配置,直接修改源码解决:
|
||||
|
||||
/usr/src/paperless/src/documents/consumer.py
|
||||
|
||||
def pre_check_duplicate(self):
|
||||
"""
|
||||
Using the MD5 of the file, check this exact file doesn't already exist
|
||||
"""
|
||||
with open(self.input_doc.original_file, "rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
existing_doc = Document.global_objects.filter(
|
||||
Q(checksum=checksum) | Q(archive_checksum=checksum),
|
||||
)
|
||||
if existing_doc.exists():
|
||||
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
|
||||
log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})."
|
||||
|
||||
if existing_doc.first().deleted_at is not None:
|
||||
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
|
||||
log_msg += " Note: existing document is in the trash."
|
||||
|
||||
## 修改这里,让它删除重复文件。
|
||||
if settings.CONSUMER_DELETE_DUPLICATES or True:
|
||||
os.unlink(self.input_doc.original_file)
|
||||
self._fail(
|
||||
msg,
|
||||
log_msg,
|
||||
)
|
||||
281
docker/stash/scripts/batch_format_filename.py
Normal file
281
docker/stash/scripts/batch_format_filename.py
Normal file
@ -0,0 +1,281 @@
|
||||
import sqlite3
|
||||
import os
|
||||
import logging
|
||||
import json
|
||||
from datetime import datetime
|
||||
import argparse
|
||||
import re
|
||||
|
||||
res_dir = './result'
|
||||
os.makedirs(res_dir, exist_ok=True)
|
||||
|
||||
# 配置日志
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.FileHandler(f'{res_dir}/rename_files.log'),
|
||||
logging.StreamHandler()
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def preload_folders(conn, prefix):
|
||||
"""预加载所有文件夹路径到字典(folder_id -> path)"""
|
||||
sqlstr = "SELECT id, path FROM folders where 1=1 "
|
||||
if prefix and prefix.strip():
|
||||
sqlstr += f" and path like '%{prefix}%' "
|
||||
try:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(sqlstr)
|
||||
return {row[0]: row[1] for row in cursor.fetchall()}
|
||||
except sqlite3.Error as e:
|
||||
logger.error(f"预加载文件夹信息失败: {str(e)}")
|
||||
raise
|
||||
|
||||
def preload_studios(conn):
|
||||
"""预加载所有工作室名称到字典(studio_id -> name)"""
|
||||
try:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT id, name FROM studios")
|
||||
studios = {row[0]: row[1] for row in cursor.fetchall()}
|
||||
# 补充默认值(未找到的工作室)
|
||||
studios[None] = "UnknownStudio"
|
||||
return studios
|
||||
except sqlite3.Error as e:
|
||||
logger.error(f"预加载工作室信息失败: {str(e)}")
|
||||
raise
|
||||
|
||||
def get_performers(conn, scene_id):
|
||||
"""获取场景对应的演员列表(按字母序排序,逗号分隔)"""
|
||||
try:
|
||||
cursor = conn.cursor()
|
||||
query = """
|
||||
SELECT p.name
|
||||
FROM performers p
|
||||
JOIN performers_scenes ps ON p.id = ps.performer_id
|
||||
WHERE ps.scene_id = ?
|
||||
ORDER BY p.name
|
||||
"""
|
||||
cursor.execute(query, (scene_id,))
|
||||
results = cursor.fetchall()
|
||||
return ','.join([row[0] for row in results]) or "UnknownPerformers"
|
||||
except sqlite3.Error as e:
|
||||
logger.error(f"获取演员信息失败 (scene_id={scene_id}): {str(e)}")
|
||||
raise
|
||||
|
||||
def parse_date(date_str):
|
||||
"""解析日期为yyyy.mm.dd格式"""
|
||||
if not date_str:
|
||||
return "0000.00.00"
|
||||
|
||||
date_formats = [
|
||||
"%Y-%m-%d", "%Y/%m/%d", "%d-%m-%Y", "%d/%m/%Y",
|
||||
"%Y%m%d", "%m-%d-%Y", "%m/%d/%Y"
|
||||
]
|
||||
|
||||
for fmt in date_formats:
|
||||
try:
|
||||
return datetime.strptime(date_str, fmt).strftime("%Y.%m.%d")
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
logger.warning(f"无法解析日期格式: {date_str},使用默认值")
|
||||
return "0000.00.00"
|
||||
|
||||
def get_file_extension(basename):
|
||||
"""获取文件扩展名"""
|
||||
if '.' in basename:
|
||||
return basename.split('.')[-1].lower()
|
||||
return ''
|
||||
|
||||
def sanitize_filename(name):
|
||||
"""清理文件名中的非法字符"""
|
||||
invalid_chars = '/\\:*?"<>|'
|
||||
for char in invalid_chars:
|
||||
name = name.replace(char, '-')
|
||||
return name
|
||||
|
||||
def process_scene_files(conn, mode, prefix, rename_style):
|
||||
"""处理所有场景文件映射关系(优化版:合并查询+预加载缓存)"""
|
||||
results = []
|
||||
try:
|
||||
# 1. 预加载文件夹和工作室到内存字典(仅2次SQL查询)
|
||||
folders = preload_folders(conn, prefix)
|
||||
studios = preload_studios(conn)
|
||||
logger.info(f"预加载完成 - 文件夹: {len(folders)} 个, 工作室: {len(studios)} 个")
|
||||
|
||||
# 2. 一次性查询所有关联数据(1次SQL查询替代多次)
|
||||
cursor = conn.cursor()
|
||||
query = """
|
||||
SELECT
|
||||
sf.scene_id, sf.file_id,
|
||||
f.id AS file_id, f.basename, f.parent_folder_id,
|
||||
s.title, s.date as release_date, s.studio_id, s.code
|
||||
FROM scenes_files sf
|
||||
LEFT JOIN files f ON sf.file_id = f.id
|
||||
LEFT JOIN scenes s ON sf.scene_id = s.id
|
||||
"""
|
||||
cursor.execute(query)
|
||||
mappings = cursor.fetchall()
|
||||
logger.info(f"共找到 {len(mappings)} 条场景-文件映射记录")
|
||||
|
||||
for idx, row in enumerate(mappings, 1):
|
||||
try:
|
||||
# 解析合并查询的结果
|
||||
scene_id = row[0]
|
||||
file_id = row[1]
|
||||
file_info = {
|
||||
'id': row[2],
|
||||
'basename': row[3],
|
||||
'parent_folder_id': row[4]
|
||||
}
|
||||
scene_info = {
|
||||
'title': row[5],
|
||||
'release_date': row[6],
|
||||
'studio_id': row[7],
|
||||
'code': row[8]
|
||||
}
|
||||
|
||||
# 校验必要数据
|
||||
if not file_id or not file_info['id'] or not file_info['basename'] or not file_info['parent_folder_id']:
|
||||
logger.debug(f"文件ID信息不完整 (scene_id={scene_id}, file_id={file_id}),跳过")
|
||||
continue
|
||||
if not scene_id or not scene_info['title'] or not scene_info['release_date'] or not scene_info['studio_id']:
|
||||
logger.debug(f"场景信息不完整 (scene_id={scene_id}, file_id={file_id}),跳过")
|
||||
continue
|
||||
|
||||
# 3. 从内存缓存获取文件夹路径和工作室名称(无SQL查询)
|
||||
folder_path = folders.get(file_info['parent_folder_id'])
|
||||
if not folder_path:
|
||||
logger.debug(f"文件夹ID不存在 (folder_id={file_info['parent_folder_id']}),跳过")
|
||||
continue
|
||||
studio_name = studios.get(scene_info['studio_id'])
|
||||
if not studio_name:
|
||||
logger.debug(f"工作室ID不存在 (studio_id={scene_info['studio_id']}),跳过")
|
||||
continue
|
||||
|
||||
# 4. 获取演员信息(仍需单独查询,因多对多关联需排序)
|
||||
performers = get_performers(conn, scene_id)
|
||||
|
||||
# 5. 构建新文件名
|
||||
original_basename = file_info['basename'] or "unknown_file"
|
||||
ext = get_file_extension(original_basename)
|
||||
release_date = parse_date(scene_info['release_date'])
|
||||
title = scene_info['title'] or "Untitled"
|
||||
|
||||
# 清理特殊字符
|
||||
sanitized_studio = sanitize_filename(studio_name)
|
||||
sanitized_performers = sanitize_filename(performers)[0:100] # 限制长度避免过长
|
||||
sanitized_title = sanitize_filename(title)[0:100] # 限制长度避免过长
|
||||
if scene_info.get('code'):
|
||||
sanitized_title = f"{sanitized_title} ({scene_info['code']})"
|
||||
# 去掉sanitized_studio的空格,以及' " 等特殊符号
|
||||
sanitized_studio = re.sub(r'[\'"\s\-_]+', '', sanitized_studio)
|
||||
|
||||
# 拼接新文件名
|
||||
if ext:
|
||||
new_basename = f"{sanitized_studio}.{release_date} {sanitized_performers} - {sanitized_title}.{ext}"
|
||||
else:
|
||||
new_basename = f"{sanitized_studio}.{release_date} {sanitized_performers} - {sanitized_title}"
|
||||
|
||||
# 简化命名规则,适用于日本影片
|
||||
if rename_style == 'simple':
|
||||
if scene_info.get('code'):
|
||||
# code 转换成大写
|
||||
new_code = scene_info['code'].upper()
|
||||
new_basename = f"{new_code}_{release_date}.{ext}" if ext else f"{new_code}_{release_date}"
|
||||
|
||||
if len(new_basename) > 254:
|
||||
logger.warning(f"生成的文件名过长,跳过 (file_id={file_id}): {new_basename}")
|
||||
continue
|
||||
|
||||
# 构建完整路径
|
||||
original_path = os.path.join(folder_path, original_basename)
|
||||
new_path = os.path.join(folder_path, new_basename)
|
||||
|
||||
if not os.path.exists(original_path):
|
||||
logger.warning(f"文件不存在,跳过: {original_path}")
|
||||
continue
|
||||
if os.path.exists(new_path):
|
||||
logger.warning(f"目标文件已存在,跳过: {new_path}")
|
||||
continue
|
||||
if original_path == new_path: # 文件名未变化
|
||||
logger.info(f"文件名未变化,跳过 (file_id={file_id}): {original_path}")
|
||||
continue
|
||||
|
||||
# 记录结果
|
||||
result = {
|
||||
'file_id': file_id,
|
||||
'scene_id': scene_id,
|
||||
'original_name': original_path,
|
||||
'dest_name': new_path
|
||||
}
|
||||
results.append(result)
|
||||
logger.info(f"处理第 {idx}/{len(mappings)} 条: {original_path} -> {new_path}")
|
||||
|
||||
# 运行模式:执行重命名和数据库更新
|
||||
if mode == 'run':
|
||||
if not os.path.exists(original_path):
|
||||
logger.warning(f"文件不存在,跳过: {original_path}")
|
||||
continue
|
||||
if os.path.exists(new_path):
|
||||
logger.warning(f"目标文件已存在,跳过: {new_path}")
|
||||
continue
|
||||
if original_path != new_path:
|
||||
os.rename(original_path, new_path)
|
||||
#cursor.execute(
|
||||
# "UPDATE files SET basename = ? WHERE id = ?",
|
||||
# (new_basename, file_info['id'])
|
||||
#)
|
||||
#conn.commit()
|
||||
logger.info(f"已更新文件 (file_id={file_info['id']})")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"处理记录失败 (scene_id={scene_id}, file_id={file_id}): {str(e)}", exc_info=True)
|
||||
if mode == 'run':
|
||||
conn.rollback()
|
||||
continue
|
||||
|
||||
# 保存结果
|
||||
with open(f'{res_dir}/rename_results.json', 'w', encoding='utf-8') as f:
|
||||
json.dump(results, f, ensure_ascii=False, indent=2)
|
||||
logger.info(f"处理完成,结果已保存到 rename_results.json")
|
||||
return results
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logger.error(f"数据库操作失败: {str(e)}", exc_info=True)
|
||||
if mode == 'run':
|
||||
conn.rollback()
|
||||
raise
|
||||
finally:
|
||||
if mode == 'run':
|
||||
conn.commit()
|
||||
|
||||
def main():
    """CLI entry point for the optimized renamer: parse args, connect, process."""
    arg_parser = argparse.ArgumentParser(description='电影文件重命名工具(优化版)')
    arg_parser.add_argument('--mode', choices=['check', 'run'], default='check',
                            help='运行模式: check(检查) 或 run(执行)')
    arg_parser.add_argument('--db', default='movies.db', help='SQLite数据库文件路径')
    arg_parser.add_argument('--prefix', default='', help='目录前缀,用来过滤文件路径')
    arg_parser.add_argument('--rename_style', choices=['standard', 'simple'], default='standard', help='文件命名规则,标准格式和简化格式')
    opts = arg_parser.parse_args()

    # Refuse to run against a database file that does not exist.
    if not os.path.exists(opts.db):
        logger.error(f"数据库文件不存在: {opts.db}")
        return

    connection = None
    try:
        connection = sqlite3.connect(opts.db)
        logger.info(f"成功连接到数据库: {opts.db}")
        process_scene_files(connection, opts.mode, opts.prefix, opts.rename_style)
    except sqlite3.Error as exc:
        logger.error(f"数据库连接失败: {str(exc)}", exc_info=True)
    finally:
        # Always release the connection, even after a failure.
        if connection:
            connection.close()
            logger.info("数据库连接已关闭")


if __name__ == "__main__":
    main()
|
||||
288
docker/stash/scripts/format_filename.py
Normal file
288
docker/stash/scripts/format_filename.py
Normal file
@ -0,0 +1,288 @@
|
||||
import sqlite3
import os
import logging
import json
from datetime import datetime
import argparse
import re

# Ensure the log directory exists BEFORE logging is configured: the
# FileHandler below opens ./result/rename_files.log at import time, but the
# directory was previously only created inside main() (after this ran), so a
# fresh checkout crashed with FileNotFoundError on import.
os.makedirs('./result', exist_ok=True)

# Log to both a file under ./result and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('./result/rename_files.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
|
||||
|
||||
def get_performers(conn, scene_id):
    """Return the scene's performer names, name-sorted and comma-joined ('' if none)."""
    try:
        # Single JOIN fetches everything we need in one round trip.
        query = """
            SELECT p.name
            FROM performers p
            JOIN performers_scenes ps ON p.id = ps.performer_id
            WHERE ps.scene_id = ?
            ORDER BY p.name
        """
        rows = conn.cursor().execute(query, (scene_id,)).fetchall()
        return ','.join(name for (name,) in rows)
    except sqlite3.Error as e:
        logger.error(f"获取演员信息失败 (scene_id={scene_id}): {str(e)}")
        raise
|
||||
|
||||
def get_file_info(conn, file_id):
    """Look up a file row; return {'id', 'basename', 'parent_folder_id'}.

    Raises ValueError when no row matches, sqlite3.Error on DB failure.
    """
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT id, basename, parent_folder_id
            FROM files
            WHERE id = ?
        """, (file_id,))
        row = cur.fetchone()
        if row is None:
            raise ValueError(f"未找到文件信息 (file_id={file_id})")
        return dict(zip(('id', 'basename', 'parent_folder_id'), row))
    except sqlite3.Error as e:
        logger.error(f"获取文件信息失败 (file_id={file_id}): {str(e)}")
        raise
|
||||
|
||||
def get_folder_path(conn, folder_id):
    """Return the filesystem path stored for a folder id.

    Raises ValueError for an unknown id, sqlite3.Error on DB failure.
    """
    try:
        row = conn.cursor().execute(
            "SELECT path FROM folders WHERE id = ?", (folder_id,)
        ).fetchone()
        if row is None:
            raise ValueError(f"未找到文件夹路径 (folder_id={folder_id})")
        return row[0]
    except sqlite3.Error as e:
        logger.error(f"获取文件夹路径失败 (folder_id={folder_id}): {str(e)}")
        raise
|
||||
|
||||
def get_scene_info(conn, scene_id):
    """Fetch a scene's title, release date and studio id as a dict.

    Raises ValueError for an unknown scene, sqlite3.Error on DB failure.
    """
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT title, date as release_date, studio_id
            FROM scenes
            WHERE id = ?
        """, (scene_id,))
        row = cur.fetchone()
        if row is None:
            raise ValueError(f"未找到场景信息 (scene_id={scene_id})")
        return dict(zip(('title', 'release_date', 'studio_id'), row))
    except sqlite3.Error as e:
        logger.error(f"获取场景信息失败 (scene_id={scene_id}): {str(e)}")
        raise
|
||||
|
||||
def get_studio_name(conn, studio_id):
    """Return the studio's name, or the fallback "UnknownStudio" when absent."""
    try:
        row = conn.cursor().execute(
            "SELECT name FROM studios WHERE id = ?", (studio_id,)
        ).fetchone()
        if row is not None:
            return row[0]
        # Missing studio is tolerated: warn and fall back to a placeholder.
        logger.warning(f"未找到工作室信息 (studio_id={studio_id}),使用默认名称")
        return "UnknownStudio"
    except sqlite3.Error as e:
        logger.error(f"获取工作室信息失败 (studio_id={studio_id}): {str(e)}")
        raise
|
||||
|
||||
def parse_date(date_str):
    """Normalize a date string to 'yyyy.mm.dd'; unparseable/empty -> '0000.00.00'."""
    if not date_str:
        return "0000.00.00"

    # Common formats, tried in order; first match wins.
    known_formats = (
        "%Y-%m-%d", "%Y/%m/%d", "%d-%m-%Y", "%d/%m/%Y",
        "%Y%m%d", "%m-%d-%Y", "%m/%d/%Y",
    )
    for fmt in known_formats:
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y.%m.%d")
        except ValueError:
            pass

    logger.warning(f"无法解析日期格式: {date_str},使用默认值")
    return "0000.00.00"
|
||||
|
||||
def get_file_extension(basename):
    """Return the lowercase extension of ``basename`` without the dot, or ''.

    Uses os.path.splitext instead of the old ``basename.split('.')[-1]``:
    the old form wrongly reported dotfiles (e.g. '.bashrc') as having the
    extension 'bashrc'. For ordinary names ('a.MP4' -> 'mp4', 'a.tar.gz'
    -> 'gz', 'README' -> '') behavior is unchanged.
    """
    return os.path.splitext(basename)[1][1:].lower()
|
||||
|
||||
def sanitize_filename(name):
    """Replace filesystem-reserved characters in ``name`` with '-'."""
    # One translation table beats a per-character replace loop.
    reserved = '/\\:*?"<>|'
    return name.translate(str.maketrans({ch: '-' for ch in reserved}))
|
||||
|
||||
def process_scene_files(conn, mode, prefix):
    """Walk every scene<->file mapping and rename files to the standard pattern.

    For each (scene_id, file_id) pair in scenes_files, builds a new basename
    "Studio - yyyy.mm.dd - Performers - Title[.ext]" from the related DB rows.
    In 'check' mode only logs/records the planned renames; in 'run' mode also
    renames the file on disk and updates files.basename. All planned renames
    are written to ./result/rename_results.json and returned as a list.

    Args:
        conn:   open sqlite3 connection to the stash database.
        mode:   'check' (dry run) or 'run' (rename + update DB).
        prefix: NOTE(review): accepted but never used in this body — presumably
                meant to filter by folder path prefix; TODO confirm.

    Returns:
        list of dicts with file_id, scene_id, original_name, dest_name.
    """
    results = []
    try:
        cursor = conn.cursor()
        # Fetch every scene-to-file mapping up front.
        cursor.execute("SELECT scene_id, file_id FROM scenes_files")
        mappings = cursor.fetchall()
        logger.debug(f"共找到 {len(mappings)} 条场景-文件映射记录")

        for idx, (scene_id, file_id) in enumerate(mappings, 1):
            logger.debug(f"处理第 {idx}/{len(mappings)} 条记录 (scene_id={scene_id}, file_id={file_id})")

            try:
                # 1. File row: original basename and containing folder id.
                file_info = get_file_info(conn, file_id)
                original_basename = file_info['basename']
                parent_folder_id = file_info['parent_folder_id']

                # 2. Resolve the folder id to an on-disk path.
                folder_path = get_folder_path(conn, parent_folder_id)

                # 3. Performers; scenes without any are skipped entirely.
                performers = get_performers(conn, scene_id)
                if not performers:
                    performers = "UnknownPerformers"
                    logger.warning(f"场景 {scene_id} 未找到演员信息,跳过")
                    continue

                # 4. Scene + studio info; incomplete scenes are skipped.
                scene_info = get_scene_info(conn, scene_id)
                if not scene_info['title'] or not scene_info['release_date'] or not scene_info['studio_id']:
                    logger.warning(f"场景 {scene_id} 信息不完整,跳过")
                    continue
                title = scene_info['title'] or "Untitled"
                release_date = parse_date(scene_info['release_date'])
                studio_name = get_studio_name(conn, scene_info['studio_id'])

                # 5. Assemble the new basename from the sanitized parts.
                ext = get_file_extension(original_basename)
                sanitized_studio = sanitize_filename(studio_name)
                sanitized_performers = sanitize_filename(performers)[0:100]  # cap length to avoid overly long names
                sanitized_title = sanitize_filename(title)[0:100]  # cap length to avoid overly long names

                if ext:
                    new_basename = f"{sanitized_studio} - {release_date} - {sanitized_performers} - {sanitized_title}.{ext}"
                else:
                    new_basename = f"{sanitized_studio} - {release_date} - {sanitized_performers} - {sanitized_title}"

                # Keep below the common 255-byte filename limit.
                if len(new_basename) > 254:
                    logger.warning(f"生成的文件名过长,跳过 (file_id={file_id}): {new_basename}")
                    continue

                # Build the full source and destination paths.
                original_path = os.path.join(folder_path, original_basename)
                new_path = os.path.join(folder_path, new_basename)

                # Record the planned rename regardless of mode.
                result = {
                    'file_id': file_id,
                    'scene_id': scene_id,
                    'original_name': original_path,
                    'dest_name': new_path
                }
                results.append(result)

                # Dry-run visibility.
                logger.info(f"准备重命名: {original_path} -> {new_path}")

                # Only 'run' mode touches the filesystem and the database.
                if mode == 'run':
                    # Skip entries whose source file is gone.
                    if not os.path.exists(original_path):
                        logger.warning(f"文件不存在,跳过: {original_path}")
                        continue

                    # Rename on disk, then keep the DB row in sync.
                    if original_path != new_path:
                        os.rename(original_path, new_path)
                        logger.info(f"已重命名: {original_path} -> {new_path}")

                        # Commit per file so a later failure doesn't lose earlier renames.
                        cursor.execute(
                            "UPDATE files SET basename = ? WHERE id = ?",
                            (new_basename, file_id)
                        )
                        conn.commit()
                        logger.info(f"已更新数据库记录 (file_id={file_id})")

            except Exception as e:
                # One bad record must not abort the whole batch.
                logger.error(f"处理记录失败 (scene_id={scene_id}, file_id={file_id}): {str(e)}", exc_info=True)
                if mode == 'run':
                    conn.rollback()
                continue

        # Persist the full plan/result list for later inspection.
        with open('./result/rename_results.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        logger.info(f"处理完成,结果已保存到 rename_results.json")

        return results

    except sqlite3.Error as e:
        logger.error(f"数据库操作失败: {str(e)}", exc_info=True)
        if mode == 'run':
            conn.rollback()
        raise
    finally:
        # Flush any remaining writes when we actually modified the DB.
        if mode == 'run':
            conn.commit()
|
||||
|
||||
def main():
    """CLI entry point: parse args, open the DB, run the rename pass."""
    arg_parser = argparse.ArgumentParser(description='电影文件重命名工具')
    arg_parser.add_argument('--mode', choices=['check', 'run'], default='check',
                            help='运行模式: check(检查) 或 run(执行)')
    arg_parser.add_argument('--db', default='movies.db', help='SQLite数据库文件路径')
    arg_parser.add_argument('--prefix', default='', help='目录的前缀,用来匹配')
    opts = arg_parser.parse_args()

    # Refuse to run against a database file that does not exist.
    if not os.path.exists(opts.db):
        logger.error(f"数据库文件不存在: {opts.db}")
        return

    # Results (JSON + log) are written under ./result.
    os.makedirs('./result', exist_ok=True)

    connection = None
    try:
        connection = sqlite3.connect(opts.db)
        connection.row_factory = sqlite3.Row  # allow access by column name
        logger.info(f"成功连接到数据库: {opts.db}")
        process_scene_files(connection, opts.mode, opts.prefix)
    except sqlite3.Error as exc:
        logger.error(f"数据库连接失败: {str(exc)}", exc_info=True)
    finally:
        # Always release the connection, even after a failure.
        if connection:
            connection.close()
            logger.info("数据库连接已关闭")


if __name__ == "__main__":
    main()
|
||||
110
docker/stash/scripts/scrapers/JavBus/JavBus.yml
Normal file
110
docker/stash/scripts/scrapers/JavBus/JavBus.yml
Normal file
@ -0,0 +1,110 @@
|
||||
name: Javbus
|
||||
sceneByFragment:
|
||||
action: scrapeXPath
|
||||
queryURL: https://www.javbus.com/{filename}
|
||||
queryURLReplace:
|
||||
filename:
|
||||
- regex: -JG\d
|
||||
with: ""
|
||||
- regex: (.*[^a-zA-Z0-9])*([a-zA-Z-]+\d+)(.+)
|
||||
with: $2
|
||||
scraper: sceneScraper
|
||||
sceneByURL:
|
||||
- action: scrapeXPath
|
||||
url:
|
||||
- https://www.javbus.com
|
||||
- https://www.seejav.bid
|
||||
- https://www.cdnbus.lol
|
||||
- https://www.dmmbus.lol
|
||||
- https://www.seedmm.cfd
|
||||
scraper: sceneScraper
|
||||
sceneByName:
|
||||
action: scrapeXPath
|
||||
queryURL: https://www.javbus.com/search/{}&type=&parent=ce
|
||||
scraper: sceneSearch
|
||||
sceneByQueryFragment:
|
||||
action: scrapeXPath
|
||||
queryURL: "{url}"
|
||||
scraper: sceneScraper
|
||||
|
||||
performerByURL:
|
||||
- action: scrapeXPath
|
||||
url:
|
||||
- https://www.javbus.com
|
||||
- https://www.seejav.bid
|
||||
- https://www.cdnbus.lol
|
||||
- https://www.dmmbus.lol
|
||||
- https://www.seedmm.cfd
|
||||
scraper: performerScraper
|
||||
performerByName:
|
||||
action: scrapeXPath
|
||||
queryURL: https://www.javbus.com/searchstar/{}&type=&parent=ce
|
||||
scraper: performerSearch
|
||||
|
||||
xPathScrapers:
|
||||
performerSearch:
|
||||
performer:
|
||||
Name: //span[@class="mleft"]
|
||||
URLs: //*[@id="waterfall"]/div/a/@href
|
||||
performerScraper:
|
||||
performer:
|
||||
Name: //*[@id="waterfall"]/div[1]/div/div[2]/span
|
||||
Birthdate:
|
||||
selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '生日')]
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^(.*? ){1}
|
||||
with:
|
||||
Height:
|
||||
selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '身高')]
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^(.*? ){1}
|
||||
with:
|
||||
# Measurements: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '胸圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '腰圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '臀圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '罩杯')]
|
||||
Image:
|
||||
selector: //*[@id="waterfall"]/div[1]/div/div[1]/img/@src
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^
|
||||
with: https://www.javbus.com
|
||||
|
||||
sceneSearch:
|
||||
scene:
|
||||
Title: //div[@class="photo-info"]/span
|
||||
URL: //*[@id="waterfall"]/div/a/@href
|
||||
sceneScraper:
|
||||
scene:
|
||||
Title:
|
||||
selector: //div[@class="col-md-3 info"]//span[contains(text(), '識別碼')]/../span[2]/text()
|
||||
URL:
|
||||
selector: /html/head/link[@hreflang="zh"]/@href
|
||||
Date:
|
||||
selector: //div[@class="col-md-3 info"]//span[contains(text(), '發行日期')]/../text()
|
||||
Details:
|
||||
selector: //div[@class="container"]/h3/text()
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^(.*? ){1}
|
||||
with:
|
||||
Tags:
|
||||
Name: //div[@class="col-md-3 info"]//span[@class="genre"]/label/a/text()
|
||||
Performers:
|
||||
Name: //div[@class="star-name"]/a
|
||||
Director: //div[@id='video_director']/table/tbody/tr/td[@class="text"]/span/a/text()
|
||||
Image:
|
||||
selector: //div[@class="row movie"]/div[@class="col-md-9 screencap"]/a[@class="bigImage"]/img/@src
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^
|
||||
with: https://www.javbus.com
|
||||
Studio:
|
||||
Name: //div[@class="col-md-3 info"]//span[contains(text(), '發行商')]/../a/text()
|
||||
|
||||
driver:
|
||||
headers:
|
||||
- Key: User-Agent
|
||||
Value: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36
|
||||
- Key: Accept-Language
|
||||
Value: zh-cn
|
||||
# Last Updated September 17, 2025
|
||||
9
docker/stash/scripts/scrapers/JavBus/manifest
Normal file
9
docker/stash/scripts/scrapers/JavBus/manifest
Normal file
@ -0,0 +1,9 @@
|
||||
id: JavBus
|
||||
name: Javbus
|
||||
metadata: {}
|
||||
version: 5ee93a34
|
||||
date: "2025-09-17 10:48:13"
|
||||
requires: []
|
||||
source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
|
||||
files:
|
||||
- JavBus.yml
|
||||
111
docker/stash/scripts/scrapers/JavBus_en/JavBus_en.yml
Normal file
111
docker/stash/scripts/scrapers/JavBus_en/JavBus_en.yml
Normal file
@ -0,0 +1,111 @@
|
||||
name: Javbus_en
|
||||
sceneByFragment:
|
||||
action: scrapeXPath
|
||||
queryURL: https://www.javbus.com/en/{filename}
|
||||
queryURLReplace:
|
||||
filename:
|
||||
- regex: -JG\d
|
||||
with: ""
|
||||
- regex: (.*[^a-zA-Z0-9])*([a-zA-Z-]+\d+)(.+)
|
||||
with: $2
|
||||
scraper: sceneScraper
|
||||
sceneByURL:
|
||||
- action: scrapeXPath
|
||||
url:
|
||||
- https://www.javbus.com/en
|
||||
- https://www.seejav.bid
|
||||
- https://www.cdnbus.lol
|
||||
- https://www.dmmbus.lol
|
||||
- https://www.seedmm.cfd
|
||||
scraper: sceneScraper
|
||||
sceneByName:
|
||||
action: scrapeXPath
|
||||
queryURL: https://www.javbus.com/en/search/{}&type=&parent=ce
|
||||
scraper: sceneSearch
|
||||
sceneByQueryFragment:
|
||||
action: scrapeXPath
|
||||
queryURL: "{url}"
|
||||
scraper: sceneScraper
|
||||
|
||||
performerByURL:
|
||||
- action: scrapeXPath
|
||||
url:
|
||||
- https://www.javbus.com/en
|
||||
- https://www.seejav.bid
|
||||
- https://www.cdnbus.lol
|
||||
- https://www.dmmbus.lol
|
||||
- https://www.seedmm.cfd
|
||||
scraper: performerScraper
|
||||
performerByName:
|
||||
action: scrapeXPath
|
||||
queryURL: https://www.javbus.com/en/searchstar/{}&type=&parent=ce
|
||||
scraper: performerSearch
|
||||
|
||||
xPathScrapers:
|
||||
performerSearch:
|
||||
performer:
|
||||
Name: //span[@class="mleft"]
|
||||
URLs: //*[@id="waterfall"]/div/a/@href
|
||||
performerScraper:
|
||||
performer:
|
||||
Name: //*[@id="waterfall"]/div[1]/div/div[2]/span
|
||||
Birthdate:
|
||||
selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), 'D.O.B')]
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^(.*? ){1}
|
||||
with:
|
||||
Height:
|
||||
selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), 'Height')]
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^(.*? ){1}
|
||||
with:
|
||||
# Measurements: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '胸圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '腰圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '臀圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '罩杯')]
|
||||
Image:
|
||||
selector: //*[@id="waterfall"]/div[1]/div/div[1]/img/@src
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^
|
||||
with: https://www.javbus.com/en
|
||||
|
||||
sceneSearch:
|
||||
scene:
|
||||
Title: //div[@class="photo-info"]/span
|
||||
URL: //*[@id="waterfall"]/div/a/@href
|
||||
sceneScraper:
|
||||
scene:
|
||||
Title:
|
||||
selector: //div[@class="col-md-3 info"]//span[contains(text(), 'ID')]/../span[2]/text()
|
||||
URL:
|
||||
selector: /html/head/link[@hreflang="zh"]/@href
|
||||
Date:
|
||||
selector: //div[@class="col-md-3 info"]//span[contains(normalize-space(text()), 'Release Date')]/../text()
|
||||
#selector: //div[@class="col-md-3 info"]//span[contains(text(), 'Release Date')]/../text()
|
||||
Details:
|
||||
selector: //div[@class="container"]/h3/text()
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^(.*? ){1}
|
||||
with:
|
||||
Tags:
|
||||
Name: //div[@class="col-md-3 info"]//span[@class="genre"]/label/a/text()
|
||||
Performers:
|
||||
Name: //div[@class="star-name"]/a
|
||||
Director: //div[@id='video_director']/table/tbody/tr/td[@class="text"]/span/a/text()
|
||||
Image:
|
||||
selector: //div[@class="row movie"]/div[@class="col-md-9 screencap"]/a[@class="bigImage"]/img/@src
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^
|
||||
with: https://www.javbus.com/
|
||||
Studio:
|
||||
Name: //div[@class="col-md-3 info"]//span[contains(text(), 'Label')]/../a/text()
|
||||
|
||||
driver:
|
||||
headers:
|
||||
- Key: User-Agent
|
||||
Value: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36
|
||||
- Key: Accept-Language
|
||||
Value: zh-cn,en-US
|
||||
# Last Updated September 17, 2025
|
||||
9
docker/stash/scripts/scrapers/JavBus_en/manifest
Normal file
9
docker/stash/scripts/scrapers/JavBus_en/manifest
Normal file
@ -0,0 +1,9 @@
|
||||
id: JavBus_en
|
||||
name: Javbus_en
|
||||
metadata: {}
|
||||
version: b4672ccf
|
||||
date: "2025-08-01 16:01:27"
|
||||
requires: []
|
||||
source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
|
||||
files:
|
||||
- JavBus_en.yml
|
||||
Reference in New Issue
Block a user