modify scripts

2026-01-11 11:50:55 +08:00 · 2026-01-11 10:36:07 +08:00 · 2026-01-09 11:29:25 +08:00 · 2025-12-25 17:08:29 +08:00 · 2025-12-25 15:02:07 +08:00 · 2025-12-25 14:53:33 +08:00
20 changed files with 2074 additions and 185 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,11 @@
 # 其他已有的忽略规则
 *.pyc
 __pycache__/
 # 忽略环境配置文件
 .env
 # 忽略所有 log、data 和 result 目录
 **/log/
 **/data/
 **/result/
--- a/docker/paperless/docker-compose.yml
+++ b/docker/paperless/docker-compose.yml
@ -8,11 +8,6 @@ services:
    ports:
      - "8000:8000"
    environment:
      PAPERLESS_OCR_LANGUAGES: ""   # 跳过OCR
      PAPERLESS_OCR_SKIP_ARCHIVE_FILE: "always"  # 跳过创建文档存档版本的时间
      PAPERLESS_OCR_OUTPUT_TYPE: "pdf"  # 尽量少修改PDF文档
      PAPERLESS_CONSUMER_POLLING: "5" # 指定轮询间隔（以秒为单位），这将导致 paperless 定期检查消费目录中的更改
      #PAPERLESS_CONSUMER_INOTIFY_DELAY: "2" # 设置消费者等待 inotify 发出的其他事件的时间（以秒为单位）
      # 使用 SQLite 作为数据库（默认）
      PAPERLESS_DBENGINE: sqlite3
@ -34,11 +29,22 @@ services:
      # 定义文件命名规则和存储路径
      # 作用不大，主要还是用消费后脚本，以及工作流来指定存储路径。
      # 工作流先于消费后脚本运行，因此消费后脚本里解析的document_type在工作流里无效。所以使用了文件名关键词匹配
-      PAPERLESS_FILENAME_FORMAT: "{{created}}_{{document_type}}_{{correspondent}}_{{title}}.pdf"
+      PAPERLESS_FILENAME_FORMAT: "{{created}}_{{document_type}}_{{correspondent}}_{{title}}"
      # 解析文件里的关键信息，并更新。但无法更新storage path。这个字段要靠工作流才行。
      PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
      # 自动删除重复文件
      PAPERLESS_CONSUMER_DELETE_DUPLICATES: true
      # 支持消费目录递归检索，即子目录。这样可以支持多个宿主机的目录映射到docker中
      PAPERLESS_CONSUMER_RECURSIVE: true
      PAPERLESS_OCR_LANGUAGES: ""   # 跳过OCR，并不会，只会用默认的eng来执行
      PAPERLESS_OCR_SKIP_ARCHIVE_FILE: "always"  # 始终跳过创建文档存档版本，以节省处理时间
      PAPERLESS_OCR_OUTPUT_TYPE: "pdf"  # 尽量少修改PDF文档
      PAPERLESS_CONSUMER_POLLING: "5" # 指定轮询间隔（以秒为单位），这将导致 paperless 定期检查消费目录中的更改
      #PAPERLESS_CONSUMER_INOTIFY_DELAY: "2" # 设置消费者等待 inotify 发出的其他事件的时间（以秒为单位）
      # 运行用户
      USERMAP_UID: 1000
      USERMAP_GID: 1000
@ -46,8 +52,9 @@ services:
    volumes:
      # 存储所有数据（搜索索引、SQLite 数据库、分类模型等）的地方
      - ~/dockers/paperless/data:/usr/src/paperless/data
-      # 挂载文件导入目录
+      # 挂载文件导入目录，可以把多个宿主机的目录，挂到docker中，以子目录的形式存在
      - ~/dockers/paperless/consume:/usr/src/paperless/consume
      - ~/dockers/sharedata/consume:/usr/src/paperless/consume/subdir
      # 挂载文件导出目录
      - ~/dockers/paperless/export:/usr/src/paperless/export
      # 存储您的文档和缩略图的地方
--- a/docker/paperless/plugins/batch_del.py
+++ b/docker/paperless/plugins/batch_del.py
@ -9,7 +9,8 @@ import logging
 # Paperless 服务器信息
 PAPERLESS_URL = "http://localhost:8000/api"
-AUTH = HTTPBasicAuth("admin", "admin")  # Basic Auth 认证
+#AUTH = HTTPBasicAuth("admin", "admin")  # Basic Auth 认证， mac上用这个
 AUTH = HTTPBasicAuth("admin", "paperless")  # Basic Auth 认证，NAS上用这个
 # 日志配置
 logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
--- a/docker/paperless/plugins/docker_patch.sh
+++ b/docker/paperless/plugins/docker_patch.sh
@ -0,0 +1,149 @@
 #!/bin/bash
 # Replacement pairs stored as a flat array: each even index holds a source
 # file and the next (odd) index holds the destination file it overwrites.
 # Add or remove entries as needed, always as "source" followed by "destination".
 declare -a FILE_PAIRS=(
     "/usr/src/paperless/scripts/parsers.py"
     "/usr/src/paperless/src/paperless_tesseract/parsers.py"
     # Examples of further pairs:
     #"/usr/src/paperless/git_scripts/parse_filename.py" "/usr/src/paperless/scripts/parse_filename.py"
     # "/path/to/source/file3" "/path/to/dest/file3"
     # "/path/to/source/file4" "/path/to/dest/file4"
 )
 # Verify that the files needed by the requested operation are present.
 # Globals:   FILE_PAIRS (read)
 # Arguments: $1 - operation name: "replace", "check" or "rollback"
 # Outputs:   one message per missing file on stdout
 # Returns:   exits the script with status 1 when a file is missing for
 #            "replace"/"check"; for "rollback" a missing backup only warns.
 check_files_exist() {
     local operation="$1"
     local total=${#FILE_PAIRS[@]}
     local found_missing=0
     local idx=0
     local src_path dst_path

     # Walk the flat array two entries at a time (source, destination).
     while (( idx < total )); do
         src_path="${FILE_PAIRS[idx]}"
         dst_path="${FILE_PAIRS[idx + 1]}"
         case "$operation" in
             replace|check)
                 if [ ! -f "$src_path" ]; then
                     echo "错误：源文件不存在 - $src_path"
                     found_missing=1
                 fi
                 if [ ! -f "$dst_path" ]; then
                     echo "错误：目标文件不存在 - $dst_path"
                     found_missing=1
                 fi
                 ;;
             rollback)
                 if [ ! -f "${dst_path}.bak" ]; then
                     echo "警告：备份文件不存在（未执行过替换？） - ${dst_path}.bak"
                     found_missing=1
                 fi
                 ;;
         esac
         (( idx += 2 ))
     done

     # Missing backups are tolerated for rollback; anything else is fatal.
     if [ "$operation" != "rollback" ] && [ "$found_missing" -eq 1 ]; then
         echo "错误：关键文件缺失，无法继续执行"
         exit 1
     fi
 }
 # Print a unified diff (destination vs. source) for every configured pair.
 # Globals:   FILE_PAIRS (read)
 # Outputs:   headers and diff output on stdout
 # Returns:   0 - a non-zero diff status is deliberately ignored
 show_diffs() {
     local total=${#FILE_PAIRS[@]}
     local idx=0
     local replacement current

     echo "=== 开始检查文件差异 ==="
     while (( idx < total )); do
         replacement="${FILE_PAIRS[idx]}"
         current="${FILE_PAIRS[idx + 1]}"
         echo -e "\n--- 检查 $current <-> $replacement 的差异 ---"
         # diff exits non-zero when the files differ; that is not an error here.
         diff -u "$current" "$replacement" || :
         (( idx += 2 ))
     done
 }
 # Back up a single file next to itself with a ".bak" suffix, preserving
 # permissions, ownership and timestamps (cp -a).
 # Arguments: $1 - path of the file to back up
 # Outputs:   progress messages on stdout, failures on stderr
 # Returns:   0 when the backup exists afterwards, 1 when it could not be made
 backup_file() {
     local file="$1"
     local backup="$file.bak"
     local staging="$backup.tmp.$$"

     # Copy to a staging name first so a failed copy never destroys an
     # existing backup and never leaves a truncated ".bak" behind.
     if ! cp -a -- "$file" "$staging"; then
         echo "错误：备份失败 - $file" >&2
         rm -f -- "$staging"
         return 1
     fi

     if [ -f "$backup" ]; then
         echo "提示：旧备份文件已存在，将覆盖 - $backup"
     fi

     # Rename within the same directory replaces the old backup in one step.
     if ! mv -f -- "$staging" "$backup"; then
         echo "错误：备份失败 - $file" >&2
         rm -f -- "$staging"
         return 1
     fi
     echo "已备份：$file -> $backup"
 }
 # Back up every destination file and overwrite it with its source file,
 # then print the remaining differences (there should be none).
 # Globals:   FILE_PAIRS (read)
 # Outputs:   progress on stdout, failures on stderr
 # Returns:   0 on success; 1 as soon as a backup or a copy fails, so a
 #            destination is never overwritten without a backup.
 replace_files() {
     local pair_count=${#FILE_PAIRS[@]}
     local i source dest

     echo "=== 开始替换文件（先备份目标文件） ==="
     for ((i=0; i<pair_count; i+=2)); do
         source="${FILE_PAIRS[i]}"
         dest="${FILE_PAIRS[i+1]}"
         printf '\n--- 处理文件对：%s -> %s ---\n' "$source" "$dest"

         # Never touch the destination unless its backup was created.
         if ! backup_file "$dest"; then
             echo "错误：备份失败，已中止替换 - $dest" >&2
             return 1
         fi
         if ! cp -f -- "$source" "$dest"; then
             echo "错误：替换失败 - $source -> $dest" >&2
             return 1
         fi
         echo "已替换：$source 覆盖 $dest"
     done

     printf '\n=== 替换完成，验证最终差异（应无差异） ===\n'
     show_diffs
 }
 # Undo a previous replace by moving every "<dest>.bak" back over "<dest>".
 # Globals:   FILE_PAIRS (read)
 # Outputs:   progress on stdout, failures on stderr
 # Returns:   0 when every existing backup was restored (pairs without a
 #            backup are skipped), 1 when at least one restore failed.
 rollback_files() {
     local pair_count=${#FILE_PAIRS[@]}
     local i dest backup
     local failed=0

     echo "=== 开始回滚替换操作 ==="
     for ((i=0; i<pair_count; i+=2)); do
         dest="${FILE_PAIRS[i+1]}"
         backup="$dest.bak"
         printf '\n--- 处理回滚：%s -> %s ---\n' "$backup" "$dest"

         if [ ! -f "$backup" ]; then
             echo "跳过：备份文件不存在 - $backup"
             continue
         fi

         # The backup lives beside the destination, so mv is a rename: when
         # it fails the destination is left as it was and no extra safety
         # copy is needed.
         if mv -f -- "$backup" "$dest"; then
             echo "已回滚：$dest 恢复为备份版本"
         else
             echo "错误：回滚失败 - $backup -> $dest" >&2
             failed=1
         fi
     done

     printf '\n=== 回滚操作执行完成 ===\n'
     return "$failed"
 }
 # Dispatch the requested sub-command.
 # Arguments: $1 - one of: check | replace | rollback
 # Outputs:   a banner for the chosen action, or the usage text
 # Returns:   status of the last step; exits 1 after usage for anything else
 main() {
     local action="$1"

     if [ "$action" = "check" ]; then
         echo "=== 执行文件差异检查（不修改文件） ==="
         check_files_exist "check"
         show_diffs
     elif [ "$action" = "replace" ]; then
         echo "=== 执行文件替换操作（自动备份） ==="
         check_files_exist "replace"
         replace_files
     elif [ "$action" = "rollback" ]; then
         echo "=== 执行文件回滚操作（恢复备份） ==="
         check_files_exist "rollback"
         rollback_files
     else
         printf '%s\n' \
             "用法：$0 [check|replace|rollback]" \
             "  check   - 仅检查所有文件对的差异，不做修改" \
             "  replace - 备份所有目标文件并执行替换，完成后验证差异" \
             "  rollback - 回滚替换操作（恢复 .bak 备份文件）"
         exit 1
     fi
 }
 # Script entry point: hand the first command-line argument (the sub-command)
 # to main; with no argument main receives an empty string and prints usage.
 main "$1"
--- a/docker/paperless/plugins/em_reports_consume.sh
+++ b/docker/paperless/plugins/em_reports_consume.sh
@ -0,0 +1,47 @@
 #!/bin/bash
 # Move every PDF from SRC into the consume directory DST, setting the owner
 # and mode given below, and record each move in LOG.
 SRC="/volume1/docker/sharedata/stock_data/pdfs"
 DST="/volume1/docker/sharedata/stock_data/em_reports_consume"
 LOG="/volume1/docker/projects/devops/docker/paperless/plugins/log/paperless.log"
 TARGET_UID=1000
 TARGET_GID=1000

 # Make sure the log directory exists BEFORE anything is written to LOG;
 # the directory checks below already append to it on failure.
 LOG_DIR=$(dirname "$LOG")
 if [ ! -d "$LOG_DIR" ]; then
   if ! mkdir -p "$LOG_DIR"; then
     echo "$(date '+%F %T') [ERROR] 无法创建log目录: $LOG_DIR" >&2
     exit 1
   fi
   echo "$(date '+%F %T') [INFO] log目录不存在，已创建: $LOG_DIR" | tee -a "$LOG"
 fi

 # Both directories must already exist; this script does not create them.
 if [ ! -d "$SRC" ]; then
   echo "$(date '+%F %T') [ERROR] 源目录不存在: $SRC" | tee -a "$LOG"
   exit 1
 fi
 if [ ! -d "$DST" ]; then
   echo "$(date '+%F %T') [ERROR] 目标目录不存在: $DST" | tee -a "$LOG"
   exit 1
 fi

 COUNT=0
 FAILED=0
 for f in "$SRC"/*.pdf; do
   # An unmatched glob stays literal; skip it and anything that is not a file.
   [ -f "$f" ] || continue
   # "Move" = copy with the target owner/group/mode, then delete the source.
   if install -D -o "$TARGET_UID" -g "$TARGET_GID" -m 644 "$f" "$DST"; then
     if ! rm -f -- "$f"; then
       # The copy succeeded but the source is still there.
       echo "$(date '+%F %T') [WARN] 已复制但无法删除源文件: $f" >> "$LOG"
     fi
     echo "$(date '+%F %T') [OK] Moved: $f" >> "$LOG"
     COUNT=$((COUNT + 1))
     # Report progress on screen (and in the log) every 100 moved files.
     if (( COUNT % 100 == 0 )); then
       PROGRESS_MSG="$(date '+%F %T') [PROGRESS] 已移动 $COUNT 个文件"
       echo "$PROGRESS_MSG" | tee -a "$LOG"
     fi
   else
     FAILED=$((FAILED + 1))
     echo "$(date '+%F %T') [FAIL] Failed: $f" >> "$LOG"
   fi
 done
 echo "$(date '+%F %T') [INFO] 搬运完成，共移动 $COUNT 个文件" | tee -a "$LOG"
 if (( FAILED > 0 )); then
   echo "$(date '+%F %T') [WARN] 有 $FAILED 个文件移动失败，详见日志" | tee -a "$LOG"
 fi
--- a/docker/paperless/plugins/origin_parsers.py
+++ b/docker/paperless/plugins/origin_parsers.py
@ -0,0 +1,472 @@
 import os
 import re
 import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING
 from django.conf import settings
 from PIL import Image
 from documents.parsers import DocumentParser
 from documents.parsers import ParseError
 from documents.parsers import make_thumbnail_from_pdf
 from documents.utils import maybe_override_pixel_limit
 from documents.utils import run_subprocess
 from paperless.config import OcrConfig
 from paperless.models import ArchiveFileChoices
 from paperless.models import CleanChoices
 from paperless.models import ModeChoices
 class NoTextFoundException(Exception):
    """Raised when an OCR pass finishes without yielding any text."""
 class RtlLanguageException(Exception):
    """Marker exception; presumably for right-to-left language input (not raised in the code shown here)."""
 class RasterisedDocumentParser(DocumentParser):
    """
    This parser uses Tesseract to try and get some text out of a rasterised
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
    """
    logging_name = "paperless.parsing.tesseract"
    def get_settings(self) -> OcrConfig:
        """
        Build and return a fresh OcrConfig, the settings object used by this parser.
        """
        ocr_config = OcrConfig()
        return ocr_config
    def get_page_count(self, document_path, mime_type):
        """
        Return the number of pages of a PDF document.

        Returns None for any non-PDF mime type, and also when the PDF cannot
        be opened or counted (a warning is logged in that case).
        """
        if mime_type != "application/pdf":
            return None
        try:
            import pikepdf

            with pikepdf.Pdf.open(document_path) as opened:
                return len(opened.pages)
        except Exception as err:
            self.log.warning(
                f"Unable to determine PDF page count {document_path}: {err}",
            )
        return None
    def extract_metadata(self, document_path, mime_type):
        """
        Collect the XMP metadata entries of a PDF document.

        Returns a list of dicts with the keys "namespace", "prefix", "key"
        and "value"; the list is empty for any non-PDF mime type. Entries
        whose key does not have the "{namespace}name" form or cannot be
        encoded as UTF-8 are skipped; any other per-entry failure is logged
        as a warning and the entry is left out.
        """
        result = []
        if mime_type == "application/pdf":
            import pikepdf

            namespace_pattern = re.compile(r"\{(.*)\}(.*)")

            # Open through a context manager so the PDF handle is closed even
            # when reading the metadata raises.
            with pikepdf.open(document_path) as pdf:
                meta = pdf.open_metadata()
                for key, value in meta.items():
                    # List values are flattened into one space-separated string.
                    if isinstance(value, list):
                        value = " ".join([str(e) for e in value])
                    value = str(value)
                    try:
                        m = namespace_pattern.match(key)
                        if m is None:  # pragma: no cover
                            continue
                        namespace = m.group(1)
                        key_value = m.group(2)
                        try:
                            namespace.encode("utf-8")
                            key_value.encode("utf-8")
                        except UnicodeEncodeError as e:  # pragma: no cover
                            self.log.debug(f"Skipping metadata key {key}: {e}")
                            continue
                        result.append(
                            {
                                "namespace": namespace,
                                "prefix": meta.REVERSE_NS[namespace],
                                "key": key_value,
                                "value": value,
                            },
                        )
                    except Exception as e:
                        self.log.warning(
                            f"Error while reading metadata {key}: {value}. Error: {e}",
                        )
        return result
    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """
        Create a thumbnail from the archive PDF when one is set on this
        parser, otherwise from the original document.
        """
        thumbnail_source = self.archive_path or document_path
        return make_thumbnail_from_pdf(
            thumbnail_source,
            self.tempdir,
            self.logging_group,
        )
    def is_image(self, mime_type) -> bool:
        """
        Tell whether the mime type is one of the image formats this parser
        treats as an image.
        """
        image_mime_types = (
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
            "image/webp",
            "image/heic",
        )
        return mime_type in image_mime_types
    def has_alpha(self, image) -> bool:
        """
        Report whether the image's mode is "RGBA" or "LA".
        """
        with Image.open(image) as opened:
            alpha_modes = ("RGBA", "LA")
            return opened.mode in alpha_modes
    def remove_alpha(self, image_path: str) -> Path:
        """
        Run the configured convert binary with "-alpha off" on the image and
        return the path of the result, "image-no-alpha" inside the temp dir.
        """
        stripped_image = Path(self.tempdir) / "image-no-alpha"
        convert_command = [
            settings.CONVERT_BINARY,
            "-alpha",
            "off",
            image_path,
            stripped_image,
        ]
        run_subprocess(convert_command, logger=self.log)
        return stripped_image
    def get_dpi(self, image) -> int | None:
        """
        Return the rounded first value of the image's "dpi" info entry, or
        None (after logging a warning) when it cannot be read.
        """
        try:
            with Image.open(image) as opened:
                horizontal_dpi, _vertical_dpi = opened.info["dpi"]
                return round(horizontal_dpi)
        except Exception as err:
            self.log.warning(f"Error while getting DPI from image {image}: {err}")
            return None
    def calculate_a4_dpi(self, image) -> int | None:
        """
        Estimate a DPI by assuming the image is as wide as an A4 page.

        Returns None (after logging a warning) when the image cannot be read.
        """
        try:
            with Image.open(image) as opened:
                pixel_width, _pixel_height = opened.size
                # A4 is 21 cm wide; 21 / 2.54 converts that to inches.
                estimated = int(pixel_width / (21 / 2.54))
                self.log.debug(
                    f"Estimated DPI {estimated} based on image width {pixel_width}",
                )
                return estimated
        except Exception as err:
            self.log.warning(f"Error while calculating DPI for image {image}: {err}")
            return None
    def extract_text(
        self,
        sidecar_file: Path | None,
        pdf_file: Path,
    ) -> str | None:
        """
        Get the text of a document, preferring the OCR sidecar file.

        The sidecar is used when it exists, the OCR mode is not "redo" and it
        does not contain the "[OCR skipped on page" marker. Otherwise
        pdftotext is run on pdf_file. The result is passed through
        post_process_text. Returns None when pdf_file is not a file or when
        the pdftotext step raises.
        """
        # When re-doing OCR, the sidecar contains ONLY the new text, not
        # the whole text, so do not utilize it in that case
        if (
            sidecar_file is not None
            and sidecar_file.is_file()
            and self.settings.mode != "redo"
        ):
            text = self.read_file_handle_unicode_errors(sidecar_file)
            if "[OCR skipped on page" not in text:
                # No page was skipped, so the sidecar holds the whole text.
                # (Pages are skipped when the input file already has text;
                # the sidecar then only contains text for OCR'ed pages.)
                self.log.debug("Using text from sidecar file")
                return post_process_text(text)
            else:
                self.log.debug("Incomplete sidecar file: discarding.")
        # no success with the sidecar file, try PDF
        if not Path(pdf_file).is_file():
            return None
        try:
            text = None
            # pdftotext writes into a temporary file inside the parser's
            # temp dir; the file is read back before the context exits.
            with tempfile.NamedTemporaryFile(
                mode="w+",
                dir=self.tempdir,
            ) as tmp:
                run_subprocess(
                    [
                        "pdftotext",
                        "-q",
                        "-layout",
                        "-enc",
                        "UTF-8",
                        pdf_file,
                        tmp.name,
                    ],
                    logger=self.log,
                )
                text = self.read_file_handle_unicode_errors(Path(tmp.name))
            return post_process_text(text)
        except Exception:
            #  If pdftotext fails, fall back to OCR.
            self.log.warning(
                "Error while getting text from PDF document with pdftotext",
                exc_info=True,
            )
            # probably not a PDF file.
            return None
    def construct_ocrmypdf_parameters(
        self,
        input_file,
        mime_type,
        output_file,
        sidecar_file,
        *,
        safe_fallback=False,
    ):
        """
        Build the keyword-argument dict that parse() passes to ocrmypdf.ocr.

        input_file / output_file / sidecar_file are the paths handed to
        OCRmyPDF; mime_type decides whether the image-specific DPI handling
        runs. safe_fallback=True forces OCR regardless of the configured mode.

        Raises ParseError for an unknown OCR mode, or when an image has no
        usable DPI information.
        """
        if TYPE_CHECKING:
            assert isinstance(self.settings, OcrConfig)
        ocrmypdf_args = {
            "input_file": input_file,
            "output_file": output_file,
            # need to use threads, since this will be run in daemonized
            # processes via the task library.
            "use_threads": True,
            "jobs": settings.THREADS_PER_WORKER,
            "language": self.settings.language,
            "output_type": self.settings.output_type,
            "progress_bar": False,
        }
        if "pdfa" in ocrmypdf_args["output_type"]:
            ocrmypdf_args["color_conversion_strategy"] = (
                self.settings.color_conversion_strategy
            )
        # Map the configured OCR mode onto exactly one OCRmyPDF switch.
        if self.settings.mode == ModeChoices.FORCE or safe_fallback:
            ocrmypdf_args["force_ocr"] = True
        elif self.settings.mode in {
            ModeChoices.SKIP,
            ModeChoices.SKIP_NO_ARCHIVE,
        }:
            ocrmypdf_args["skip_text"] = True
        elif self.settings.mode == ModeChoices.REDO:
            ocrmypdf_args["redo_ocr"] = True
        else:  # pragma: no cover
            raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
        if self.settings.clean == CleanChoices.CLEAN:
            ocrmypdf_args["clean"] = True
        elif self.settings.clean == CleanChoices.FINAL:
            if self.settings.mode == ModeChoices.REDO:
                ocrmypdf_args["clean"] = True
            else:
                # --clean-final is not compatible with --redo-ocr
                ocrmypdf_args["clean_final"] = True
        if self.settings.deskew and self.settings.mode != ModeChoices.REDO:
            # --deskew is not compatible with --redo-ocr
            ocrmypdf_args["deskew"] = True
        if self.settings.rotate:
            ocrmypdf_args["rotate_pages"] = True
            ocrmypdf_args["rotate_pages_threshold"] = self.settings.rotate_threshold
        if self.settings.pages is not None and self.settings.pages > 0:
            ocrmypdf_args["pages"] = f"1-{self.settings.pages}"
        else:
            # sidecar is incompatible with pages
            ocrmypdf_args["sidecar"] = sidecar_file
        if self.is_image(mime_type):
            # This may be required, depending on the known information
            maybe_override_pixel_limit()
            dpi = self.get_dpi(input_file)
            a4_dpi = self.calculate_a4_dpi(input_file)
            if self.has_alpha(input_file):
                self.log.info(
                    f"Removing alpha layer from {input_file} "
                    "for compatibility with img2pdf",
                )
                # Replace the input file with the non-alpha
                ocrmypdf_args["input_file"] = self.remove_alpha(input_file)
            # DPI precedence: value stored in the image, then the configured
            # image_dpi setting, then the A4-width estimate.
            if dpi:
                self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
                ocrmypdf_args["image_dpi"] = dpi
            elif self.settings.image_dpi is not None:
                ocrmypdf_args["image_dpi"] = self.settings.image_dpi
            elif a4_dpi:
                ocrmypdf_args["image_dpi"] = a4_dpi
            else:
                raise ParseError(
                    f"Cannot produce archive PDF for image {input_file}, "
                    f"no DPI information is present in this image and "
                    f"OCR_IMAGE_DPI is not set.",
                )
            if ocrmypdf_args["image_dpi"] < 70:  # pragma: no cover
                self.log.warning(
                    f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail",
                )
        # User-supplied arguments are merged last among the settings above,
        # so they override any key already set.
        if self.settings.user_args is not None:
            try:
                ocrmypdf_args = {**ocrmypdf_args, **self.settings.user_args}
            except Exception as e:
                self.log.warning(
                    f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                    f"they will not be used. Error: {e}",
                )
        if (
            self.settings.max_image_pixel is not None
            and self.settings.max_image_pixel >= 0
        ):
            # Convert pixels to mega-pixels and provide to ocrmypdf
            max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0
            msg = (
                "OCR pixel limit is disabled!"
                if max_pixels_mpixels == 0
                else f"Calculated {max_pixels_mpixels} megapixels for OCR"
            )
            self.log.debug(msg)
            ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
        return ocrmypdf_args
    def parse(self, document_path: Path, mime_type, file_name=None):
        """
        Extract the document's text into self.text, running OCRmyPDF if needed.

        For PDFs the embedded text is read first. If it is longer than
        VALID_TEXT_LENGTH characters and the settings say to skip the archive
        for documents with text, that text is used and OCR is not run.
        Otherwise OCRmyPDF is called; when it finds no text or rejects the
        input, a second pass is made with safe_fallback=True. self.archive_path
        is set only after the first pass succeeds and only when
        skip_archive_file is not ALWAYS.

        Raises ParseError for OCR subprocess failures, for a failing fallback
        pass and for any other unexpected exception.
        """
        # This forces tesseract to use one core per page.
        os.environ["OMP_THREAD_LIMIT"] = "1"
        # Embedded text must be longer than this to count as "has text".
        VALID_TEXT_LENGTH = 50
        if mime_type == "application/pdf":
            text_original = self.extract_text(None, document_path)
            original_has_text = (
                text_original is not None and len(text_original) > VALID_TEXT_LENGTH
            )
        else:
            text_original = None
            original_has_text = False
        # If the original has text, and the user doesn't want an archive,
        # we're done here
        skip_archive_for_text = (
            self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
            or self.settings.skip_archive_file
            in {
                ArchiveFileChoices.WITH_TEXT,
                ArchiveFileChoices.ALWAYS,
            }
        )
        if skip_archive_for_text and original_has_text:
            # NOTE(review): this writes the complete extracted document text
            # into the debug log - confirm that is intended.
            self.log.debug(f"Document has text, skipping OCRmyPDF entirely. {text_original}")
            self.text = text_original
            return
        # Either no text was in the original or there should be an archive
        # file created, so OCR the file and create an archive with any
        # text located via OCR
        import ocrmypdf
        from ocrmypdf import EncryptedPdfError
        from ocrmypdf import InputFileError
        from ocrmypdf import SubprocessOutputError
        from ocrmypdf.exceptions import DigitalSignatureError
        archive_path = Path(self.tempdir) / "archive.pdf"
        sidecar_file = Path(self.tempdir) / "sidecar.txt"
        args = self.construct_ocrmypdf_parameters(
            document_path,
            mime_type,
            archive_path,
            sidecar_file,
        )
        try:
            self.log.debug(f"Calling OCRmyPDF with args: {args}")
            ocrmypdf.ocr(**args)
            if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
                self.archive_path = archive_path
            self.text = self.extract_text(sidecar_file, archive_path)
            # An empty result is routed to the force-OCR fallback below.
            if not self.text:
                raise NoTextFoundException("No text was found in the original document")
        except (DigitalSignatureError, EncryptedPdfError):
            self.log.warning(
                "This file is encrypted and/or signed, OCR is impossible. Using "
                "any text present in the original file.",
            )
            if original_has_text:
                self.text = text_original
        except SubprocessOutputError as e:
            if "Ghostscript PDF/A rendering" in str(e):
                self.log.warning(
                    "Ghostscript PDF/A rendering failed, consider setting "
                    "PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
                )
            raise ParseError(
                f"SubprocessOutputError: {e!s}. See logs for more information.",
            ) from e
        except (NoTextFoundException, InputFileError) as e:
            self.log.warning(
                f"Encountered an error while running OCR: {e!s}. "
                f"Attempting force OCR to get the text.",
            )
            archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf"
            sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt"
            # Attempt to run OCR with safe settings.
            args = self.construct_ocrmypdf_parameters(
                document_path,
                mime_type,
                archive_path_fallback,
                sidecar_file_fallback,
                safe_fallback=True,
            )
            try:
                self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
                ocrmypdf.ocr(**args)
                # Don't return the archived file here, since this file
                # is bigger and blurry due to --force-ocr.
                self.text = self.extract_text(
                    sidecar_file_fallback,
                    archive_path_fallback,
                )
            except Exception as e:
                # If this fails, we have a serious issue at hand.
                raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
        except Exception as e:
            # Anything else is probably serious.
            raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
        # As a last resort, if we still don't have any text for any reason,
        # try to extract the text from the original document.
        if not self.text:
            if original_has_text:
                self.text = text_original
            else:
                self.log.warning(
                    f"No text was found in {document_path}, the content will be empty.",
                )
                self.text = ""
 def post_process_text(text):
    """
    Normalise whitespace in extracted text.

    Runs of non-newline whitespace collapse to one space, whitespace right
    after line breaks is dropped, the result is stripped and NUL characters
    become spaces. Returns None for empty or missing input.
    """
    if not text:
        return None
    # Whitespace that is not a line break.
    cleaned = re.sub(r"[^\S\r\n]+", " ", text)
    # Keep the line break(s), drop the whitespace following them.
    cleaned = re.sub(r"([\n\r]+)[^\S\n\r]+", r"\1", cleaned)
    cleaned = re.sub(r"[^\S\n\r]+$", "", cleaned)
    # TODO: this needs a rework
    # NUL characters can appear in PDF text and cause problems when the
    # text is saved to postgres, so they are replaced with spaces.
    return cleaned.strip().replace("\0", " ")
--- a/docker/paperless/plugins/paperless.sql
+++ b/docker/paperless/plugins/paperless.sql
@ -1,41 +0,0 @@
 -- documents_correspondent definition
 CREATE TABLE "documents_correspondent" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "name" varchar(128) NOT NULL, "match" varchar(256) NOT NULL, "matching_algorithm" integer unsigned NOT NULL CHECK ("matching_algorithm" >= 0), "is_insensitive" bool NOT NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, CONSTRAINT "documents_correspondent_unique_name_owner" UNIQUE ("name", "owner_id"));
 CREATE UNIQUE INDEX "documents_correspondent_name_uniq" ON "documents_correspondent" ("name") WHERE "owner_id" IS NULL;
 CREATE INDEX "documents_correspondent_owner_id_078f7f8a" ON "documents_correspondent" ("owner_id");
 -- documents_customfield definition
 CREATE TABLE "documents_customfield" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "created" datetime NOT NULL, "name" varchar(128) NOT NULL, "data_type" varchar(50) NOT NULL, "extra_data" text NULL CHECK ((JSON_VALID("extra_data") OR "extra_data" IS NULL)), CONSTRAINT "documents_customfield_unique_name" UNIQUE ("name"));
 CREATE INDEX "documents_customfield_created_501ef047" ON "documents_customfield" ("created");
 -- documents_customfieldinstance definition
 CREATE TABLE "documents_customfieldinstance" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "created" datetime NOT NULL, "value_text" varchar(128) NULL, "value_bool" bool NULL, "value_url" varchar(200) NULL, "value_date" date NULL, "value_int" integer NULL, "value_float" real NULL, "value_monetary" varchar(128) NULL, "document_id" integer NOT NULL REFERENCES "documents_document" ("id") DEFERRABLE INITIALLY DEFERRED, "field_id" integer NOT NULL REFERENCES "documents_customfield" ("id") DEFERRABLE INITIALLY DEFERRED, "value_document_ids" text NULL CHECK ((JSON_VALID("value_document_ids") OR "value_document_ids" IS NULL)), "value_monetary_amount" decimal GENERATED ALWAYS AS ((CAST(CASE WHEN "value_monetary" REGEXP '^\d+' THEN CAST(SUBSTR("value_monetary", 1) AS decimal) ELSE CAST(SUBSTR("value_monetary", 4) AS decimal) END AS NUMERIC))) STORED, "deleted_at" datetime NULL, "restored_at" datetime NULL, "transaction_id" char(32) NULL, "value_select" varchar(16) NULL, CONSTRAINT "documents_customfieldinstance_unique_document_field" UNIQUE ("document_id", "field_id"));
 CREATE INDEX "documents_customfieldinstance_created_75f17f1d" ON "documents_customfieldinstance" ("created");
 CREATE INDEX "documents_customfieldinstance_document_id_610a968e" ON "documents_customfieldinstance" ("document_id");
 CREATE INDEX "documents_customfieldinstance_field_id_6c59e32f" ON "documents_customfieldinstance" ("field_id");
 -- documents_document definition
 CREATE TABLE "documents_document" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "title" varchar(128) NOT NULL, "content" text NOT NULL, "created" datetime NOT NULL, "modified" datetime NOT NULL, "correspondent_id" integer NULL REFERENCES "documents_correspondent" ("id") DEFERRABLE INITIALLY DEFERRED, "checksum" varchar(32) NOT NULL UNIQUE, "added" datetime NOT NULL, "storage_type" varchar(11) NOT NULL, "filename" varchar(1024) NULL UNIQUE, "archive_serial_number" integer unsigned NULL UNIQUE CHECK ("archive_serial_number" >= 0), "document_type_id" integer NULL REFERENCES "documents_documenttype" ("id") DEFERRABLE INITIALLY DEFERRED, "mime_type" varchar(256) NOT NULL, "archive_checksum" varchar(32) NULL, "archive_filename" varchar(1024) NULL UNIQUE, "storage_path_id" integer NULL REFERENCES "documents_storagepath" ("id") DEFERRABLE INITIALLY DEFERRED, "original_filename" varchar(1024) NULL, "deleted_at" datetime NULL, "restored_at" datetime NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, "transaction_id" char(32) NULL, "page_count" integer unsigned NULL CHECK ("page_count" >= 0));
 CREATE INDEX "documents_document_title_6b08e02a" ON "documents_document" ("title");
 CREATE INDEX "documents_document_created_bedd0818" ON "documents_document" ("created");
 CREATE INDEX "documents_document_modified_2eae15bc" ON "documents_document" ("modified");
 CREATE INDEX "documents_document_correspondent_id_6164eb0c" ON "documents_document" ("correspondent_id");
 CREATE INDEX "documents_document_added_28cfa360" ON "documents_document" ("added");
 CREATE INDEX "documents_document_document_type_id_1f88b50c" ON "documents_document" ("document_type_id");
 CREATE INDEX "documents_document_storage_path_id_07d27bdb" ON "documents_document" ("storage_path_id");
 CREATE INDEX "documents_document_owner_id_04d2b723" ON "documents_document" ("owner_id");
 -- documents_documenttype definition
 CREATE TABLE "documents_documenttype" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "name" varchar(128) NOT NULL, "match" varchar(256) NOT NULL, "matching_algorithm" integer unsigned NOT NULL CHECK ("matching_algorithm" >= 0), "is_insensitive" bool NOT NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, CONSTRAINT "documents_documenttype_unique_name_owner" UNIQUE ("name", "owner_id"));
 CREATE UNIQUE INDEX "documents_documenttype_name_uniq" ON "documents_documenttype" ("name") WHERE "owner_id" IS NULL;
 CREATE INDEX "documents_documenttype_owner_id_a19f201d" ON "documents_documenttype" ("owner_id");
--- a/docker/paperless/plugins/paperless.txt
+++ b/docker/paperless/plugins/paperless.txt
@ -1,63 +0,0 @@
 我提供的文件，是 paperless 的SQLite数据库的关键表。现在我们编写它的 PAPERLESS_POST_CONSUME_SCRIPT。需求如下：
 1, 我们提供的pdf文件格式为 {publish_date}_{report_type}_{org_sname}_{industry_name}_{stock_name}_{title}.pdf
 2，我们提取上面的各个字段，然后： 
  1） report_type 对应到 documents_documenttype.name 所以我们要查询 documents_documenttype 表，如果对应的name不存在，则插入一条记录；然后得到对应的 documents_documenttype.id 
  2） org_sname 对应到 documents_correspondent.name 所以我们要查询 documents_correspondent 表，如果对应的name 不存在，则插入一条记录，然后得到对应的 documents_correspondent.id 
  3） 检查 documents_customfield 表是否包含 '行业' 和 '股票名称' 字段，如果不存在，则创建； 查到他们分别对应的 documents_customfield.id , 记为 stockname_id, industry_id
 3，我们开始更新数据表：
  1） 更新 documents_document 表对应的记录， created = publish_date,  correspondent_id = documents_correspondent.id , document_type_id = documents_documenttype.id, title={title} 
  2)  向 documents_customfieldinstance 插入两条记录，分别为 (document_id, stockname_id, stock_name) 和 (document_id, industry_id, industry_name)
 好了，请你根据以上需求，完成这个python脚本。注意异常情况的处理，以及日志输出。如果文件名无法匹配以上的格式，则忽略，不用处理。
 Paperless makes use of the Django REST Framework standard API interface. It provides a browsable API for most of its endpoints, which you can inspect at http://<paperless-host>:<port>/api/. This also documents most of the available filters and ordering fields.
 The API provides the following main endpoints:
 /api/correspondents/: Full CRUD support.
 /api/custom_fields/: Full CRUD support.
 /api/documents/: Full CRUD support, except POSTing new documents. See below.
 /api/document_types/: Full CRUD support.
 /api/groups/: Full CRUD support.
 /api/logs/: Read-Only.
 /api/mail_accounts/: Full CRUD support.
 /api/mail_rules/: Full CRUD support.
 /api/profile/: GET, PATCH
 /api/share_links/: Full CRUD support.
 /api/storage_paths/: Full CRUD support.
 /api/tags/: Full CRUD support.
 /api/tasks/: Read-only.
 /api/users/: Full CRUD support.
 /api/workflows/: Full CRUD support.
 /api/search/ GET, see below.
 All of these endpoints except for the logging endpoint allow you to fetch (and edit and delete where appropriate) individual objects by appending their primary key to the path, e.g. /api/documents/454/.
 The objects served by the document endpoint contain the following fields:
 id: ID of the document. Read-only.
 title: Title of the document.
 content: Plain text content of the document.
 tags: List of IDs of tags assigned to this document, or empty list.
 document_type: Document type of this document, or null.
 correspondent: Correspondent of this document or null.
 created: The date time at which this document was created.
 created_date: The date (YYYY-MM-DD) at which this document was created. Optional. If also passed with created, this is ignored.
 modified: The date at which this document was last edited in paperless. Read-only.
 added: The date at which this document was added to paperless. Read-only.
 archive_serial_number: The identifier of this document in a physical document archive.
 original_file_name: Verbose filename of the original document. Read-only.
 archived_file_name: Verbose filename of the archived document. Read-only. Null if no archived document is available.
 notes: Array of notes associated with the document.
 page_count: Number of pages.
 set_permissions: Allows setting document permissions. Optional, write-only. See below.
 custom_fields: Array of custom fields & values, specified as { field: CUSTOM_FIELD_ID, value: VALUE }
 以上是paperless提供的api。我们现在使用 http://localhost:8000 来访问它。那么，我想对编号为19的文档进行查询，以及更新操作，应该如何写对应的python代码？
--- a/docker/paperless/plugins/parse_filename.py
+++ b/docker/paperless/plugins/parse_filename.py
@ -11,7 +11,8 @@ from requests.exceptions import RequestException
 # Paperless 服务器信息
 PAPERLESS_URL = "http://localhost:8000/api"
-AUTH = HTTPBasicAuth("admin", "admin")  # Basic Auth 认证
+#AUTH = HTTPBasicAuth("admin", "admin")  # Basic Auth 认证， mac上用这个
 AUTH = HTTPBasicAuth("admin", "paperless")  # Basic Auth 认证，NAS上用这个
 # 日志配置
 logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
@ -22,7 +23,7 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(me
 DB_PATH = "/usr/src/paperless/data/db.sqlite3"
 conn = sqlite3.connect(DB_PATH)
 cursor = conn.cursor()
-enable_db = True
+enable_db = False   # 标准用法，用API
 # 正则解析文件名
 FILENAME_PATTERN = re.compile(r"(\d{4}-\d{2}-\d{2})_(.*?)_(.*?)_(.*?)_(.*?)_(.*)\.pdf")
--- a/docker/paperless/plugins/parsers.py
+++ b/docker/paperless/plugins/parsers.py
@ -0,0 +1,484 @@
 import os
 import re
 import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING
 from django.conf import settings
 from PIL import Image
 from documents.parsers import DocumentParser
 from documents.parsers import ParseError
 from documents.parsers import make_thumbnail_from_pdf
 from documents.utils import maybe_override_pixel_limit
 from documents.utils import run_subprocess
 from paperless.config import OcrConfig
 from paperless.models import ArchiveFileChoices
 from paperless.models import CleanChoices
 from paperless.models import ModeChoices
 class NoTextFoundException(Exception):
    """Raised by the OCR path of ``parse`` when OCRmyPDF ran but no text could be extracted."""
    pass
 class RtlLanguageException(Exception):
    """Marker exception; not raised or caught anywhere in the visible part of this module.

    NOTE(review): presumably kept for right-to-left language handling - confirm before removing.
    """
    pass
 class RasterisedDocumentParser(DocumentParser):
    """
    This parser uses Tesseract to try and get some text out of a rasterised
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
    """
    logging_name = "paperless.parsing.tesseract"
    def get_settings(self) -> OcrConfig:
        """
        Build the configuration object this parser works with: a freshly
        constructed ``OcrConfig``.
        """
        ocr_config = OcrConfig()
        return ocr_config
    def get_page_count(self, document_path, mime_type):
        """
        Return the number of pages of a PDF, or None when the document is not
        a PDF or the count cannot be determined (the failure is logged as a
        warning, never raised).
        """
        if mime_type != "application/pdf":
            return None
        pages = None
        try:
            # Imported lazily so the module loads even where pikepdf is only
            # needed for PDFs.
            import pikepdf
            with pikepdf.Pdf.open(document_path) as opened:
                pages = len(opened.pages)
        except Exception as err:
            self.log.warning(
                f"Unable to determine PDF page count {document_path}: {err}",
            )
        return pages
    def extract_metadata(self, document_path, mime_type):
        """
        Collect the XMP metadata entries of a PDF.

        :param document_path: path of the document to inspect
        :param mime_type: MIME type of the document; anything other than
            "application/pdf" yields an empty list
        :return: list of dicts with the keys "namespace", "prefix", "key"
            and "value" (value is always a string)

        Entries whose key is not of the form ``{namespace}name`` or cannot be
        encoded as UTF-8 are skipped; any other per-entry failure is logged as
        a warning and the entry is dropped.
        """
        result = []
        if mime_type == "application/pdf":
            import pikepdf
            namespace_pattern = re.compile(r"\{(.*)\}(.*)")
            # Open through a context manager so the underlying file handle is
            # released even if reading the metadata raises.
            with pikepdf.open(document_path) as pdf:
                meta = pdf.open_metadata()
                for key, value in meta.items():
                    # Multi-valued entries are flattened into one string.
                    if isinstance(value, list):
                        value = " ".join([str(e) for e in value])
                    value = str(value)
                    try:
                        m = namespace_pattern.match(key)
                        if m is None:  # pragma: no cover
                            continue
                        namespace = m.group(1)
                        key_value = m.group(2)
                        try:
                            namespace.encode("utf-8")
                            key_value.encode("utf-8")
                        except UnicodeEncodeError as e:  # pragma: no cover
                            self.log.debug(f"Skipping metadata key {key}: {e}")
                            continue
                        result.append(
                            {
                                "namespace": namespace,
                                "prefix": meta.REVERSE_NS[namespace],
                                "key": key_value,
                                "value": value,
                            },
                        )
                    except Exception as e:
                        self.log.warning(
                            f"Error while reading metadata {key}: {value}. Error: {e}",
                        )
        return result
    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """
        Render the thumbnail from the archive PDF when one is set on the
        parser, otherwise from the original document.
        """
        thumbnail_source = self.archive_path or document_path
        return make_thumbnail_from_pdf(
            thumbnail_source,
            self.tempdir,
            self.logging_group,
        )
    def is_image(self, mime_type) -> bool:
        """Tell whether the MIME type is one of the image formats listed below."""
        image_mime_types = (
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
            "image/webp",
            "image/heic",
        )
        return mime_type in image_mime_types
    def has_alpha(self, image) -> bool:
        """Open the image and report whether its mode is "RGBA" or "LA"."""
        with Image.open(image) as opened:
            uses_alpha_mode = opened.mode in ("RGBA", "LA")
            return uses_alpha_mode
    def remove_alpha(self, image_path: str) -> Path:
        """
        Run the configured convert binary with "-alpha off" on the image and
        return the path of the result, written to "image-no-alpha" inside the
        parser's temporary directory.
        """
        target = Path(self.tempdir) / "image-no-alpha"
        command = [settings.CONVERT_BINARY, "-alpha", "off", image_path, target]
        run_subprocess(command, logger=self.log)
        return target
    def get_dpi(self, image) -> int | None:
        """
        Read the "dpi" entry of the image info and return its first component,
        rounded. Any failure (including a missing entry) is logged as a
        warning and yields None.
        """
        try:
            with Image.open(image) as opened:
                horizontal_dpi, _vertical_dpi = opened.info["dpi"]
                return round(horizontal_dpi)
        except Exception as err:
            self.log.warning(f"Error while getting DPI from image {image}: {err}")
            return None
    def calculate_a4_dpi(self, image) -> int | None:
        """
        Estimate a DPI value by dividing the pixel width of the image by the
        width of an A4 page in inches. Any failure is logged as a warning and
        yields None.
        """
        try:
            with Image.open(image) as opened:
                pixel_width = opened.size[0]
                # A4 is 210mm (21cm) wide; 2.54cm per inch.
                a4_width_inches = 21 / 2.54
                estimate = int(pixel_width / a4_width_inches)
                self.log.debug(f"Estimated DPI {estimate} based on image width {pixel_width}")
                return estimate
        except Exception as err:
            self.log.warning(f"Error while calculating DPI for image {image}: {err}")
            return None
    def extract_text(self, sidecar_file: Path | None, pdf_file: Path) -> str | None:
        """
        Get the text of a document, preferring the OCR sidecar file and
        falling back to running pdftotext on the PDF.

        Returns the post-processed text, or None when the PDF does not exist
        or pdftotext fails (the failure is logged as a warning).
        """
        # When re-doing OCR the sidecar holds ONLY the new text, not the whole
        # text, so it is ignored in "redo" mode.
        sidecar_usable = (
            sidecar_file is not None
            and sidecar_file.is_file()
            and self.settings.mode != "redo"
        )
        if sidecar_usable:
            sidecar_text = self.read_file_handle_unicode_errors(sidecar_file)
            if "[OCR skipped on page" in sidecar_text:
                # Pages that already had text were skipped by OCR, so the
                # sidecar only covers the OCR'ed pages.
                self.log.debug("Incomplete sidecar file: discarding.")
            else:
                self.log.debug("Using text from sidecar file")
                return post_process_text(sidecar_text)
        # The sidecar did not help, try the PDF itself.
        if not Path(pdf_file).is_file():
            return None
        try:
            extracted = None
            with tempfile.NamedTemporaryFile(mode="w+", dir=self.tempdir) as out:
                command = [
                    "pdftotext",
                    "-q",
                    "-layout",
                    "-enc",
                    "UTF-8",
                    pdf_file,
                    out.name,
                ]
                run_subprocess(command, logger=self.log)
                extracted = self.read_file_handle_unicode_errors(Path(out.name))
            return post_process_text(extracted)
        except Exception:
            # pdftotext failed (probably not a PDF file); the caller falls
            # back to OCR.
            self.log.warning(
                "Error while getting text from PDF document with pdftotext",
                exc_info=True,
            )
            return None
    def construct_ocrmypdf_parameters(
        self,
        input_file,
        mime_type,
        output_file,
        sidecar_file,
        *,
        safe_fallback=False,
    ):
        """
        Build the keyword arguments passed to ``ocrmypdf.ocr``.

        :param input_file: document to run OCR on
        :param mime_type: MIME type of the input; image types get extra DPI handling
        :param output_file: where the OCR'ed PDF is written
        :param sidecar_file: text sidecar path; only used when no page limit is configured
        :param safe_fallback: when true, OCR is forced regardless of the configured mode
        :return: dict of ocrmypdf keyword arguments
        :raises ParseError: for an unknown OCR mode, or for an image with no
            usable DPI information and no configured image DPI
        """
        if TYPE_CHECKING:
            assert isinstance(self.settings, OcrConfig)
        ocrmypdf_args = {
            "input_file": input_file,
            "output_file": output_file,
            # need to use threads, since this will be run in daemonized
            # processes via the task library.
            "use_threads": True,
            "jobs": settings.THREADS_PER_WORKER,
            "language": self.settings.language,
            "output_type": self.settings.output_type,
            "progress_bar": False,
        }
        # The colour conversion strategy is only passed for PDF/A output types.
        if "pdfa" in ocrmypdf_args["output_type"]:
            ocrmypdf_args["color_conversion_strategy"] = (
                self.settings.color_conversion_strategy
            )
        # Exactly one of force_ocr / skip_text / redo_ocr is set, by mode.
        if self.settings.mode == ModeChoices.FORCE or safe_fallback:
            ocrmypdf_args["force_ocr"] = True
        elif self.settings.mode in {
            ModeChoices.SKIP,
            ModeChoices.SKIP_NO_ARCHIVE,
        }:
            ocrmypdf_args["skip_text"] = True
        elif self.settings.mode == ModeChoices.REDO:
            ocrmypdf_args["redo_ocr"] = True
        else:  # pragma: no cover
            raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
        if self.settings.clean == CleanChoices.CLEAN:
            ocrmypdf_args["clean"] = True
        elif self.settings.clean == CleanChoices.FINAL:
            if self.settings.mode == ModeChoices.REDO:
                ocrmypdf_args["clean"] = True
            else:
                # --clean-final is not compatible with --redo-ocr
                ocrmypdf_args["clean_final"] = True
        if self.settings.deskew and self.settings.mode != ModeChoices.REDO:
            # --deskew is not compatible with --redo-ocr
            ocrmypdf_args["deskew"] = True
        if self.settings.rotate:
            ocrmypdf_args["rotate_pages"] = True
            ocrmypdf_args["rotate_pages_threshold"] = self.settings.rotate_threshold
        if self.settings.pages is not None and self.settings.pages > 0:
            ocrmypdf_args["pages"] = f"1-{self.settings.pages}"
        else:
            # sidecar is incompatible with pages
            ocrmypdf_args["sidecar"] = sidecar_file
        if self.is_image(mime_type):
            # This may be required, depending on the known information
            maybe_override_pixel_limit()
            dpi = self.get_dpi(input_file)
            a4_dpi = self.calculate_a4_dpi(input_file)
            if self.has_alpha(input_file):
                self.log.info(
                    f"Removing alpha layer from {input_file} "
                    "for compatibility with img2pdf",
                )
                # Replace the input file with the non-alpha
                ocrmypdf_args["input_file"] = self.remove_alpha(input_file)
            # DPI precedence: value embedded in the image, then the configured
            # image DPI, then the A4-width estimate.
            if dpi:
                self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
                ocrmypdf_args["image_dpi"] = dpi
            elif self.settings.image_dpi is not None:
                ocrmypdf_args["image_dpi"] = self.settings.image_dpi
            elif a4_dpi:
                ocrmypdf_args["image_dpi"] = a4_dpi
            else:
                raise ParseError(
                    f"Cannot produce archive PDF for image {input_file}, "
                    f"no DPI information is present in this image and "
                    f"OCR_IMAGE_DPI is not set.",
                )
            if ocrmypdf_args["image_dpi"] < 70:  # pragma: no cover
                self.log.warning(
                    f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail",
                )
        # User-supplied arguments are merged last among the settings above, so
        # they override any key already present.
        if self.settings.user_args is not None:
            try:
                ocrmypdf_args = {**ocrmypdf_args, **self.settings.user_args}
            except Exception as e:
                self.log.warning(
                    f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                    f"they will not be used. Error: {e}",
                )
        # Applied after the user arguments, so a configured pixel limit wins
        # over a "max_image_mpixels" given there.
        if (
            self.settings.max_image_pixel is not None
            and self.settings.max_image_pixel >= 0
        ):
            # Convert pixels to mega-pixels and provide to ocrmypdf
            max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0
            msg = (
                "OCR pixel limit is disabled!"
                if max_pixels_mpixels == 0
                else f"Calculated {max_pixels_mpixels} megapixels for OCR"
            )
            self.log.debug(msg)
            ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
        return ocrmypdf_args
    def parse(self, document_path: Path, mime_type, file_name=None):
        """
        Patched parse step: skip text extraction and OCR entirely.

        Every document gets the same placeholder as its text content and no
        archive file is produced, whatever ``document_path`` and ``mime_type``
        are. This trades searchable content for consumption speed.

        The previous body kept the complete pdftotext/OCRmyPDF pipeline after
        an unconditional ``return``; that code could never run and has been
        removed. To get text extraction or OCR back, restore this method from
        the unpatched paperless_tesseract parser.

        :param document_path: path of the document being consumed (unused)
        :param mime_type: MIME type of the document (unused)
        :param file_name: original file name (unused)
        """
        # This forces tesseract to use one core per page. Kept because it is a
        # process-wide side effect the previous version also performed before
        # returning.
        os.environ["OMP_THREAD_LIMIT"] = "1"
        # Skip the OCR process entirely to save time.
        self.text = "default text"
        self.log.debug("skipping reading file entirely.")
 def post_process_text(text):
    """
    Normalise whitespace in extracted text.

    Returns None for empty/None input. Otherwise runs of non-newline
    whitespace become one space, whitespace following a line break is
    dropped, trailing whitespace at the end is removed, the result is
    stripped and NUL characters are replaced by spaces.
    """
    if not text:
        return None
    # (pattern, replacement) pairs, applied strictly in this order.
    cleanup_steps = (
        (r"([^\S\r\n]+)", " "),
        (r"([\n\r]+)([^\S\n\r]+)", "\\1"),
        (r"([^\S\n\r]+)$", ""),
    )
    cleaned = text
    for pattern, replacement in cleanup_steps:
        cleaned = re.sub(pattern, replacement, cleaned)
    # TODO: this needs a rework
    # Text may contain \0 when that character is present in the PDF; replacing
    # it prevents issues with saving to postgres.
    stripped = cleaned.strip()
    return stripped.replace("\0", " ")
--- a/docker/paperless/plugins/readme.md
+++ b/docker/paperless/plugins/readme.md
@ -0,0 +1,37 @@
 ## 登陆
 ### 用户名： admin
 ### 密码： paperless
 ## 需要指定用户名
 ### 配置好 USERMAP_UID和USERMAP_GID，否则可能无法执行主机映射进去的脚本。
 ### 详见 https://docs.paperless-ngx.com/configuration/#USERMAP_UID
 ## 自定义的文件名解析脚本
 ```Bash
 # 文档
 https://docs.paperless-ngx.com/advanced_usage/#file-name-handling
 https://docs.paperless-ngx.com/configuration/#PAPERLESS_POST_CONSUME_SCRIPT
 # 配置
 environment:
  PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
 ```
 ## 源码修改，可以通过在容器里执行 docker_patch.sh 脚本来完成
 ### 对于无法简单读取pdf内容的文档，paperless会启动OCR扫描，且复杂情况下会执行两遍，非常慢而且消耗资源。只能通过修改源码解决：
 ```Bash
 # /usr/src/paperless/src/paperless_tesseract/parsers.py :
        # force skip ocr process.
        if not original_has_text:
            original_has_text = True
            text_original = "this is default content, as we skipped ocr process..."
            self.log.warning("Cannot read text from Document, use default message.")
        if skip_archive_for_text and original_has_text:
            self.log.debug("Document has text, skipping OCRmyPDF entirely.")
            self.text = text_original
            return
 ```
--- a/docker/paperless/plugins/redme.txt
+++ b/docker/paperless/plugins/redme.txt
@ -1,64 +0,0 @@
 -------------------------------------------------------｜
 ------------------- paperless 无纸化pdf管理  ------------｜
 -------------------------------------------------------｜
 ## 最好不要用命令，使用docker-compose.yml来创建，需要指定后端使用的数据库，以及redis！
 docker run -itd \
  --name paperless \
  --network devops \
  --platform linux/x86_64 \
  -e TZ="Asia/Shanghai"  \
  -v /etc/localtime:/etc/localtime:ro \
  -v "$(pwd)/dockers/paperless/pdfs:/usr/src/paperless/data"  \
  -v "$(pwd)/dockers/paperless/db:/usr/src/paperless/db"  \
  -e USERMAP_UID=1000 -e USERMAP_GID=1000 \
  -p 8000:8000 \
  ghcr.io/paperless-ngx/paperless-ngx
 # 容器创建好之后，要手动设置密码（二选一操作，目前设置的 admin / admin）
 docker compose run --rm webserver createsuperuser
 python3 manage.py createsuperuser
 # 已有文档，放在指定目录下，等系统自动加载(或者手工启动)
 cd /path/to/paperless/src/
 python3 manage.py document_consumer
 # 自动解析文件名
 https://docs.paperless-ngx.com/advanced_usage/#file-name-handling
 https://docs.paperless-ngx.com/configuration/#PAPERLESS_POST_CONSUME_SCRIPT
 environment:
  PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
 paperless 默认不会删除重复的文件，这会导致如果重复添加，会不停扫描，加载，报错。没找到配置，直接修改源码解决：
 /usr/src/paperless/src/documents/consumer.py
    def pre_check_duplicate(self):
        """
        Using the MD5 of the file, check this exact file doesn't already exist
        """
        with open(self.input_doc.original_file, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()
        existing_doc = Document.global_objects.filter(
            Q(checksum=checksum) | Q(archive_checksum=checksum),
        )
        if existing_doc.exists():
            msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
            log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})."
            if existing_doc.first().deleted_at is not None:
                msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
                log_msg += " Note: existing document is in the trash."
            ## 修改这里，让它删除重复文件。
            if settings.CONSUMER_DELETE_DUPLICATES or True:
                os.unlink(self.input_doc.original_file)
            self._fail(
                msg,
                log_msg,
            )
--- a/docker/stash/scripts/batch_format_filename.py
+++ b/docker/stash/scripts/batch_format_filename.py
@ -0,0 +1,281 @@
 import sqlite3
 import os
 import logging
 import json
 from datetime import datetime
 import argparse
 import re
 # Output directory for the log file and the JSON report. It is created before
 # logging is configured because the FileHandler below opens its file right away.
 res_dir = './result'
 os.makedirs(res_dir, exist_ok=True)
 # Configure logging: INFO level, written both to a file under res_dir and to the console.
 logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(f'{res_dir}/rename_files.log'),
        logging.StreamHandler()
    ]
 )
 logger = logging.getLogger(__name__)
 def preload_folders(conn, prefix):
    """Preload folder paths into a dict (folder_id -> path).

    Args:
        conn: open sqlite3 connection to a database with a ``folders`` table.
        prefix: optional substring filter; when non-blank, only folders whose
            path contains it (SQL ``LIKE '%prefix%'``) are returned. ``None``,
            empty or whitespace-only means no filtering.

    Returns:
        dict mapping folder id to folder path.

    Raises:
        sqlite3.Error: if the query fails (logged before re-raising).
    """
    sqlstr = "SELECT id, path FROM folders where 1=1 "
    params = ()
    if prefix and prefix.strip():
        # Bind the filter as a parameter instead of formatting it into the SQL
        # text: a prefix containing a quote no longer breaks (or alters) the
        # statement. '%' and '_' inside prefix still act as LIKE wildcards, as
        # they did before.
        sqlstr += " and path like ? "
        params = (f"%{prefix}%",)
    try:
        cursor = conn.cursor()
        cursor.execute(sqlstr, params)
        return {row[0]: row[1] for row in cursor.fetchall()}
    except sqlite3.Error as e:
        logger.error(f"预加载文件夹信息失败: {str(e)}")
        raise
 def preload_studios(conn):
    """Read every studio into a dict (studio_id -> name).

    The key ``None`` is always added and maps to "UnknownStudio". A
    sqlite3.Error is logged and re-raised.
    """
    try:
        cur = conn.cursor()
        cur.execute("SELECT id, name FROM studios")
        name_by_id = {}
        for record in cur.fetchall():
            name_by_id[record[0]] = record[1]
        # Fallback entry for scenes without a studio.
        name_by_id[None] = "UnknownStudio"
        return name_by_id
    except sqlite3.Error as err:
        logger.error(f"预加载工作室信息失败: {str(err)}")
        raise
 def get_performers(conn, scene_id):
    """Return the performer names of a scene, ordered by name and joined with commas.

    Yields "UnknownPerformers" when the joined string is empty. A
    sqlite3.Error is logged and re-raised.
    """
    try:
        cur = conn.cursor()
        query = """
        SELECT p.name 
        FROM performers p
        JOIN performers_scenes ps ON p.id = ps.performer_id
        WHERE ps.scene_id = ?
        ORDER BY p.name
        """
        cur.execute(query, (scene_id,))
        names = [record[0] for record in cur.fetchall()]
        joined = ','.join(names)
        if joined:
            return joined
        return "UnknownPerformers"
    except sqlite3.Error as err:
        logger.error(f"获取演员信息失败 (scene_id={scene_id}): {str(err)}")
        raise
 def parse_date(date_str):
    """Convert a date string to the yyyy.mm.dd form.

    The candidate formats are tried in order and the first one that parses
    wins. Empty input, or input matching none of them, yields "0000.00.00"
    (the latter is also logged as a warning).
    """
    if not date_str:
        return "0000.00.00"
    candidate_formats = (
        "%Y-%m-%d",
        "%Y/%m/%d",
        "%d-%m-%Y",
        "%d/%m/%Y",
        "%Y%m%d",
        "%m-%d-%Y",
        "%m/%d/%Y",
    )
    for pattern in candidate_formats:
        try:
            parsed = datetime.strptime(date_str, pattern)
            return parsed.strftime("%Y.%m.%d")
        except ValueError:
            # Not this format, try the next one.
            continue
    logger.warning(f"无法解析日期格式: {date_str}，使用默认值")
    return "0000.00.00"
 def get_file_extension(basename):
    """Return the lower-cased text after the last '.', or '' when the name has no '.'."""
    _stem, dot, suffix = basename.rpartition('.')
    if not dot:
        return ''
    return suffix.lower()
 def sanitize_filename(name):
    """Replace each of the characters / \\ : * ? " < > | in the name with '-'."""
    invalid_chars = '/\\:*?"<>|'
    replacements = str.maketrans({char: '-' for char in invalid_chars})
    return name.translate(replacements)
 def process_scene_files(conn, mode, prefix, rename_style):
    """Compute, and in 'run' mode apply, the new file name of every scene file.

    For each row of ``scenes_files`` the new base name is built as
    ``{studio}.{yyyy.mm.dd} {performers} - {title}[ ({code})][.{ext}]``, or, with
    ``rename_style == 'simple'`` and a scene code present, as
    ``{CODE}_{yyyy.mm.dd}[.{ext}]``. Rows with incomplete data, unknown
    folder/studio, a missing source file, an existing target or an unchanged
    name are skipped. All planned renames are written to
    ``{res_dir}/rename_results.json``.

    Args:
        conn: open sqlite3 connection.
        mode: 'check' only reports; 'run' also renames the files on disk.
        prefix: folder path filter passed to ``preload_folders``.
        rename_style: 'standard' or 'simple'.

    Returns:
        list of dicts with the keys file_id, scene_id, original_name, dest_name.

    Raises:
        sqlite3.Error: on database failures outside the per-row handling.
    """
    results = []
    try:
        # 1. Preload folders and studios into in-memory dicts (two SQL queries in total).
        folders = preload_folders(conn, prefix)
        studios = preload_studios(conn)
        logger.info(f"预加载完成 - 文件夹: {len(folders)} 个, 工作室: {len(studios)} 个")
        # 2. Fetch all scene/file/scene-detail data with a single joined query.
        cursor = conn.cursor()
        query = """
        SELECT 
            sf.scene_id, sf.file_id,
            f.id AS file_id, f.basename, f.parent_folder_id,
            s.title, s.date as release_date, s.studio_id, s.code
        FROM scenes_files sf
        LEFT JOIN files f ON sf.file_id = f.id
        LEFT JOIN scenes s ON sf.scene_id = s.id
        """
        cursor.execute(query)
        mappings = cursor.fetchall()
        logger.info(f"共找到 {len(mappings)} 条场景-文件映射记录")
        for idx, row in enumerate(mappings, 1):
            # Bound up front so the error handler below can always format them,
            # even if unpacking the row itself fails.
            scene_id = file_id = None
            try:
                # Unpack the joined row.
                scene_id = row[0]
                file_id = row[1]
                file_info = {
                    'id': row[2],
                    'basename': row[3],
                    'parent_folder_id': row[4]
                }
                scene_info = {
                    'title': row[5],
                    'release_date': row[6],
                    'studio_id': row[7],
                    'code': row[8]
                }
                # Validate required data.
                if not file_id or not file_info['id'] or not file_info['basename'] or not file_info['parent_folder_id']:
                    logger.debug(f"文件ID信息不完整 (scene_id={scene_id}, file_id={file_id})，跳过")
                    continue
                if not scene_id or not scene_info['title'] or not scene_info['release_date'] or not scene_info['studio_id']:
                    logger.debug(f"场景信息不完整 (scene_id={scene_id}, file_id={file_id})，跳过")
                    continue
                # 3. Resolve folder path and studio name from the in-memory caches (no SQL).
                folder_path = folders.get(file_info['parent_folder_id'])
                if not folder_path:
                    logger.debug(f"文件夹ID不存在 (folder_id={file_info['parent_folder_id']})，跳过")
                    continue
                studio_name = studios.get(scene_info['studio_id'])
                if not studio_name:
                    logger.debug(f"工作室ID不存在 (studio_id={scene_info['studio_id']})，跳过")
                    continue
                # 4. Performers still need one query per scene (many-to-many, sorted).
                performers = get_performers(conn, scene_id)
                # 5. Build the new file name.
                original_basename = file_info['basename'] or "unknown_file"
                ext = get_file_extension(original_basename)
                release_date = parse_date(scene_info['release_date'])
                title = scene_info['title'] or "Untitled"
                # Replace characters that are illegal in file names.
                sanitized_studio = sanitize_filename(studio_name)
                sanitized_performers = sanitize_filename(performers)[0:100]  # cap the length
                sanitized_title = sanitize_filename(title)[0:100]  # cap the length
                if scene_info.get('code'):
                    sanitized_title = f"{sanitized_title} ({scene_info['code']})"
                # Strip whitespace, quotes, '-' and '_' from the studio name.
                sanitized_studio = re.sub(r'[\'"\s\-_]+', '', sanitized_studio)
                # Assemble the standard-style name.
                if ext:
                    new_basename = f"{sanitized_studio}.{release_date} {sanitized_performers} - {sanitized_title}.{ext}"
                else:
                    new_basename = f"{sanitized_studio}.{release_date} {sanitized_performers} - {sanitized_title}"
                # Simplified naming rule, intended for Japanese titles.
                if rename_style == 'simple':
                    if scene_info.get('code'):
                        # Upper-case the code.
                        new_code = scene_info['code'].upper()
                        new_basename = f"{new_code}_{release_date}.{ext}" if ext else f"{new_code}_{release_date}"
                # File systems limit the name in BYTES, not characters; CJK
                # characters take several bytes each, so measure the encoded form.
                if len(os.fsencode(new_basename)) > 254:
                    logger.warning(f"生成的文件名过长，跳过 (file_id={file_id}): {new_basename}")
                    continue
                # Build the full paths.
                original_path = os.path.join(folder_path, original_basename)
                new_path = os.path.join(folder_path, new_basename)
                if not os.path.exists(original_path):
                    logger.warning(f"文件不存在，跳过: {original_path}")
                    continue
                # Must come before the "target exists" check: an already
                # correctly named file IS its own target and would otherwise
                # be reported as a conflict.
                if original_path == new_path:
                    logger.info(f"文件名未变化，跳过 (file_id={file_id}): {original_path}")
                    continue
                if os.path.exists(new_path):
                    logger.warning(f"目标文件已存在，跳过: {new_path}")
                    continue
                # Record the planned rename.
                result = {
                    'file_id': file_id,
                    'scene_id': scene_id,
                    'original_name': original_path,
                    'dest_name': new_path
                }
                results.append(result)
                logger.info(f"处理第 {idx}/{len(mappings)} 条: {original_path} -> {new_path}")
                # Run mode: rename on disk. Source/target existence and
                # "name unchanged" were already checked above.
                if mode == 'run':
                    os.rename(original_path, new_path)
                    # The database is deliberately NOT updated here; the
                    # statement below is kept disabled, as in the original:
                    #cursor.execute(
                    #    "UPDATE files SET basename = ? WHERE id = ?",
                    #    (new_basename, file_info['id'])
                    #)
                    #conn.commit()
                    logger.info(f"已更新文件 (file_id={file_info['id']})")
            except Exception as e:
                logger.error(f"处理记录失败 (scene_id={scene_id}, file_id={file_id}): {str(e)}", exc_info=True)
                if mode == 'run':
                    conn.rollback()
                continue
        # Save the report.
        with open(f'{res_dir}/rename_results.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        logger.info("处理完成，结果已保存到 rename_results.json")
        return results
    except sqlite3.Error as e:
        logger.error(f"数据库操作失败: {str(e)}", exc_info=True)
        if mode == 'run':
            conn.rollback()
        raise
    finally:
        if mode == 'run':
            conn.commit()
 def main():
    """Command-line entry point: parse the options, open the database and process all scene files.

    Returns early (after logging an error) when the database file does not
    exist. The connection is always closed; sqlite3 errors are logged, not
    raised.
    """
    arg_parser = argparse.ArgumentParser(description='电影文件重命名工具（优化版）')
    arg_parser.add_argument(
        '--mode',
        choices=['check', 'run'],
        default='check',
        help='运行模式: check(检查) 或 run(执行)',
    )
    arg_parser.add_argument('--db', default='movies.db', help='SQLite数据库文件路径')
    arg_parser.add_argument('--prefix', default='', help='目录前缀，用来过滤文件路径')
    arg_parser.add_argument(
        '--rename_style',
        choices=['standard', 'simple'],
        default='standard',
        help='文件命名规则，标准格式和简化格式',
    )
    options = arg_parser.parse_args()
    db_path = options.db
    if not os.path.exists(db_path):
        logger.error(f"数据库文件不存在: {db_path}")
        return
    connection = None
    try:
        connection = sqlite3.connect(db_path)
        logger.info(f"成功连接到数据库: {db_path}")
        process_scene_files(connection, options.mode, options.prefix, options.rename_style)
    except sqlite3.Error as err:
        logger.error(f"数据库连接失败: {str(err)}", exc_info=True)
    finally:
        if connection:
            connection.close()
            logger.info("数据库连接已关闭")
 if __name__ == "__main__":
    main()
--- a/docker/stash/scripts/format_filename.py
+++ b/docker/stash/scripts/format_filename.py
@ -0,0 +1,288 @@
 import sqlite3
 import os
 import logging
 import json
 from datetime import datetime
 import argparse
 import re
 # Configure logging: every record goes to ./result/rename_files.log and to the
 # console.
 # logging.FileHandler opens its file as soon as it is constructed, so the
 # ./result directory must exist before basicConfig() runs; without this the
 # module fails at import time with FileNotFoundError on a fresh checkout.
 os.makedirs('./result', exist_ok=True)
 logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Messages contain non-ASCII text, so do not depend on the locale's
        # default encoding for the log file.
        logging.FileHandler('./result/rename_files.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
 )
 logger = logging.getLogger(__name__)
 def get_performers(conn, scene_id):
    """Return the performers of a scene as one comma-separated string.

    Names are read from ``performers`` joined through ``performers_scenes``
    and ordered by ``p.name`` in SQL. A scene without performers yields an
    empty string. A ``sqlite3.Error`` is logged and re-raised.
    """
    # One JOIN fetches every name for the scene in a single query.
    sql = """
        SELECT p.name 
        FROM performers p
        JOIN performers_scenes ps ON p.id = ps.performer_id
        WHERE ps.scene_id = ?
        ORDER BY p.name
        """
    try:
        rows = conn.cursor().execute(sql, (scene_id,)).fetchall()
        names = []
        for record in rows:
            names.append(record[0])
        return ','.join(names)
    except sqlite3.Error as e:
        logger.error(f"获取演员信息失败 (scene_id={scene_id}): {str(e)}")
        raise
 def get_file_info(conn, file_id):
    """Look up one row of ``files`` by its id.

    Returns a dict with the keys ``id``, ``basename`` and
    ``parent_folder_id``. Raises ``ValueError`` when no row matches; a
    ``sqlite3.Error`` is logged and re-raised.
    """
    try:
        lookup = conn.cursor()
        lookup.execute("""
        SELECT id, basename, parent_folder_id 
        FROM files 
        WHERE id = ?
        """, (file_id,))
        row = lookup.fetchone()
    except sqlite3.Error as e:
        logger.error(f"获取文件信息失败 (file_id={file_id}): {str(e)}")
        raise
    if not row:
        raise ValueError(f"未找到文件信息 (file_id={file_id})")
    # Column order matches the SELECT list above.
    return dict(zip(('id', 'basename', 'parent_folder_id'), row))
 def get_folder_path(conn, folder_id):
    """Return the ``path`` column of the ``folders`` row with the given id.

    Raises ``ValueError`` when the folder id is unknown; a ``sqlite3.Error``
    is logged and re-raised.
    """
    try:
        row = conn.cursor().execute(
            "SELECT path FROM folders WHERE id = ?", (folder_id,)
        ).fetchone()
    except sqlite3.Error as e:
        logger.error(f"获取文件夹路径失败 (folder_id={folder_id}): {str(e)}")
        raise
    if row:
        return row[0]
    raise ValueError(f"未找到文件夹路径 (folder_id={folder_id})")
 def get_scene_info(conn, scene_id):
    """Fetch title, date and studio id of one scene.

    Returns a dict with the keys ``title``, ``release_date`` (the ``date``
    column) and ``studio_id``. Raises ``ValueError`` when the scene id is
    unknown; a ``sqlite3.Error`` is logged and re-raised.
    """
    try:
        lookup = conn.cursor()
        lookup.execute("""
        SELECT title, date as release_date, studio_id 
        FROM scenes 
        WHERE id = ?
        """, (scene_id,))
        row = lookup.fetchone()
    except sqlite3.Error as e:
        logger.error(f"获取场景信息失败 (scene_id={scene_id}): {str(e)}")
        raise
    if not row:
        raise ValueError(f"未找到场景信息 (scene_id={scene_id})")
    scene_title, scene_date, scene_studio = row[0], row[1], row[2]
    return {
        'title': scene_title,
        'release_date': scene_date,
        'studio_id': scene_studio,
    }
 def get_studio_name(conn, studio_id):
    """Return the name of a studio, or ``"UnknownStudio"`` if the id is unknown.

    The missing-studio case is logged as a warning rather than raised. A
    ``sqlite3.Error`` is logged and re-raised.
    """
    try:
        row = conn.cursor().execute(
            "SELECT name FROM studios WHERE id = ?", (studio_id,)
        ).fetchone()
    except sqlite3.Error as e:
        logger.error(f"获取工作室信息失败 (studio_id={studio_id}): {str(e)}")
        raise
    if row:
        return row[0]
    logger.warning(f"未找到工作室信息 (studio_id={studio_id})，使用默认名称")
    return "UnknownStudio"
 def parse_date(date_str):
    """Normalize a date string to ``yyyy.mm.dd``.

    The candidate formats are tried in the order listed, so for an ambiguous
    value the day-first patterns win over the month-first ones. Empty input
    and values matching no pattern return ``"0000.00.00"``; the latter case
    is also logged as a warning.
    """
    placeholder = "0000.00.00"
    if not date_str:
        return placeholder
    candidates = (
        "%Y-%m-%d",
        "%Y/%m/%d",
        "%d-%m-%Y",
        "%d/%m/%Y",
        "%Y%m%d",
        "%m-%d-%Y",
        "%m/%d/%Y",
    )
    for pattern in candidates:
        try:
            return datetime.strptime(date_str, pattern).strftime("%Y.%m.%d")
        except ValueError:
            # Not this layout; fall through to the next candidate.
            pass
    logger.warning(f"无法解析日期格式: {date_str}，使用默认值")
    return placeholder
 def get_file_extension(basename):
    """Return the lower-cased text after the last ``.`` in ``basename``.

    The dot itself is not included. A name without any dot yields ``''``.
    """
    _, dot, suffix = basename.rpartition('.')
    return suffix.lower() if dot else ''
 def sanitize_filename(name):
    """Replace each of the characters ``/ \\ : * ? " < > |`` in ``name`` with ``-``."""
    replacements = str.maketrans(dict.fromkeys('/\\:*?"<>|', '-'))
    return name.translate(replacements)
 def process_scene_files(conn, mode, prefix):
    """Plan, and in ``run`` mode perform, a rename of every scene file.

    For each row of ``scenes_files`` the target name is
    ``<studio> - <yyyy.mm.dd> - <performers> - <title>[.<ext>]`` inside the
    file's current folder. Records with no performers, an incomplete scene
    (missing title, date or studio id) or an over-long target name are
    skipped with a warning. A failure on one record is logged and does not
    stop the remaining records.

    Args:
        conn: open ``sqlite3`` connection.
        mode: ``'check'`` only logs and records the planned renames;
            ``'run'`` also renames the file on disk and updates
            ``files.basename``.
        prefix: directory prefix filter. Only files whose folder path starts
            with this value are processed; an empty string disables the
            filter.

    Returns:
        A list of dicts with the keys ``file_id``, ``scene_id``,
        ``original_name`` and ``dest_name``. The same list is written to
        ``./result/rename_results.json``.

    Raises:
        sqlite3.Error: when a database call outside the per-record handler
            fails (after logging and, in ``run`` mode, a rollback).
    """
    results = []
    try:
        cursor = conn.cursor()
        # Load every scene/file mapping up front.
        cursor.execute("SELECT scene_id, file_id FROM scenes_files")
        mappings = cursor.fetchall()
        logger.debug(f"共找到 {len(mappings)} 条场景-文件映射记录")
        for idx, (scene_id, file_id) in enumerate(mappings, 1):
            logger.debug(f"处理第 {idx}/{len(mappings)} 条记录 (scene_id={scene_id}, file_id={file_id})")
            try:
                # 1. File row and the folder it lives in.
                file_info = get_file_info(conn, file_id)
                original_basename = file_info['basename']
                folder_path = get_folder_path(conn, file_info['parent_folder_id'])
                # Honour the directory filter; previously the argument was
                # accepted but ignored, so every file was processed.
                if prefix and not folder_path.startswith(prefix):
                    logger.debug(f"目录不匹配前缀 {prefix}，跳过: {folder_path}")
                    continue
                # 2. Performers are mandatory for the target name.
                performers = get_performers(conn, scene_id)
                if not performers:
                    logger.warning(f"场景 {scene_id} 未找到演员信息，跳过")
                    continue
                # 3. Scene and studio details are mandatory as well.
                scene_info = get_scene_info(conn, scene_id)
                if not scene_info['title'] or not scene_info['release_date'] or not scene_info['studio_id']:
                    logger.warning(f"场景 {scene_id} 信息不完整，跳过")
                    continue
                title = scene_info['title']
                release_date = parse_date(scene_info['release_date'])
                studio_name = get_studio_name(conn, scene_info['studio_id'])
                # 4. Assemble the new base name.
                ext = get_file_extension(original_basename)
                sanitized_studio = sanitize_filename(studio_name)
                # Performers and title are capped at 100 characters each.
                sanitized_performers = sanitize_filename(performers)[0:100]
                sanitized_title = sanitize_filename(title)[0:100]
                new_basename = f"{sanitized_studio} - {release_date} - {sanitized_performers} - {sanitized_title}"
                if ext:
                    new_basename = f"{new_basename}.{ext}"
                # NOTE(review): this is a character count. Filesystems that
                # limit names by bytes can still reject long non-ASCII names;
                # that surfaces as an OSError handled by the per-record except.
                if len(new_basename) > 254:
                    logger.warning(f"生成的文件名过长，跳过 (file_id={file_id}): {new_basename}")
                    continue
                original_path = os.path.join(folder_path, original_basename)
                new_path = os.path.join(folder_path, new_basename)
                # Record the plan in both modes.
                results.append({
                    'file_id': file_id,
                    'scene_id': scene_id,
                    'original_name': original_path,
                    'dest_name': new_path
                })
                logger.info(f"准备重命名: {original_path} -> {new_path}")
                if mode != 'run':
                    continue
                if not os.path.exists(original_path):
                    logger.warning(f"文件不存在，跳过: {original_path}")
                    continue
                if original_path == new_path:
                    continue
                # os.rename silently replaces an existing destination on
                # POSIX, so refuse to clobber a different file. samefile keeps
                # case-only renames working on case-insensitive filesystems.
                if os.path.exists(new_path) and not os.path.samefile(original_path, new_path):
                    logger.warning(f"目标文件已存在，跳过: {new_path}")
                    continue
                os.rename(original_path, new_path)
                logger.info(f"已重命名: {original_path} -> {new_path}")
                try:
                    cursor.execute(
                        "UPDATE files SET basename = ? WHERE id = ?",
                        (new_basename, file_id)
                    )
                    conn.commit()
                except sqlite3.Error:
                    # Keep disk and database consistent: undo the rename when
                    # the matching database update could not be stored.
                    conn.rollback()
                    os.rename(new_path, original_path)
                    raise
                logger.info(f"已更新数据库记录 (file_id={file_id})")
            except Exception as e:
                logger.error(f"处理记录失败 (scene_id={scene_id}, file_id={file_id}): {str(e)}", exc_info=True)
                # Drop any uncommitted change left behind by this record.
                if mode == 'run':
                    conn.rollback()
                continue
        # Persist the plan / outcome for later inspection.
        with open('./result/rename_results.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        logger.info(f"处理完成，结果已保存到 rename_results.json")
        return results
    except sqlite3.Error as e:
        logger.error(f"数据库操作失败: {str(e)}", exc_info=True)
        if mode == 'run':
            conn.rollback()
        raise
    finally:
        if mode == 'run':
            conn.commit()
 def main():
    """Command-line entry point for the rename tool.

    Parses ``--mode``, ``--db`` and ``--prefix``. If the database file does
    not exist an error is logged and the function returns. Otherwise it
    ensures ``./result`` exists, opens the SQLite database with
    ``sqlite3.Row`` as row factory and calls ``process_scene_files``. A
    ``sqlite3.Error`` is logged instead of being propagated, and the
    connection is closed on every path.
    """
    cli = argparse.ArgumentParser(description='电影文件重命名工具')
    cli.add_argument(
        '--mode',
        choices=['check', 'run'],
        default='check',
        help='运行模式: check(检查) 或 run(执行)',
    )
    cli.add_argument('--db', default='movies.db', help='SQLite数据库文件路径')
    cli.add_argument('--prefix', default='', help='目录的前缀，用来匹配')
    opts = cli.parse_args()

    # Nothing to do without a database file.
    db_path = opts.db
    if not os.path.exists(db_path):
        logger.error(f"数据库文件不存在: {db_path}")
        return

    # Output directory for the results file.
    os.makedirs('./result', exist_ok=True)

    connection = None
    try:
        connection = sqlite3.connect(db_path)
        # Rows become addressable by column name as well as by index.
        connection.row_factory = sqlite3.Row
        logger.info(f"成功连接到数据库: {db_path}")
        process_scene_files(connection, opts.mode, opts.prefix)
    except sqlite3.Error as e:
        logger.error(f"数据库连接失败: {str(e)}", exc_info=True)
    finally:
        # Close whatever was opened, even when processing raised.
        if connection:
            connection.close()
            logger.info("数据库连接已关闭")
 # Run the command-line entry point only when executed as a script.
 if __name__ == "__main__":
    main()
--- a/docker/stash/scripts/scrapers/JavBus/JavBus.yml
+++ b/docker/stash/scripts/scrapers/JavBus/JavBus.yml
@ -0,0 +1,110 @@
 name: Javbus
 # Scene lookup by file name: the {filename} placeholder in queryURL is filled
 # from the file name after the queryURLReplace rules below, and the page is
 # parsed with the sceneScraper definition.
 # NOTE(review): presumably the rules run in list order before substitution;
 # confirm against the Stash scraper documentation.
 sceneByFragment:
  action: scrapeXPath
  queryURL: https://www.javbus.com/{filename}
  queryURLReplace:
    filename:
      # Remove a "-JG<digit>" marker from the name.
      - regex: -JG\d
        with: ""
      # Keep only capture group 2: a run of letters/hyphens followed by digits
      # (e.g. "ABC-123"). Group 1 is an optional prefix ending in a
      # non-alphanumeric character and group 3 is the remaining tail (at least
      # one character); both are discarded.
      - regex: (.*[^a-zA-Z0-9])*([a-zA-Z-]+\d+)(.+)
        with: $2
  scraper: sceneScraper
 sceneByURL:
  - action: scrapeXPath
    url:
      - https://www.javbus.com
      - https://www.seejav.bid
      - https://www.cdnbus.lol
      - https://www.dmmbus.lol
      - https://www.seedmm.cfd
    scraper: sceneScraper
 sceneByName:
  action: scrapeXPath
  queryURL: https://www.javbus.com/search/{}&type=&parent=ce
  scraper: sceneSearch
 sceneByQueryFragment:
  action: scrapeXPath
  queryURL: "{url}"
  scraper: sceneScraper
 performerByURL: 
  - action: scrapeXPath
    url: 
      - https://www.javbus.com
      - https://www.seejav.bid
      - https://www.cdnbus.lol
      - https://www.dmmbus.lol
      - https://www.seedmm.cfd
    scraper: performerScraper
 performerByName: 
  action: scrapeXPath
  queryURL: https://www.javbus.com/searchstar/{}&type=&parent=ce
  scraper: performerSearch
 xPathScrapers: 
  performerSearch: 
    performer:
      Name: //span[@class="mleft"]
      URLs: //*[@id="waterfall"]/div/a/@href
  performerScraper: 
    performer: 
      Name: //*[@id="waterfall"]/div[1]/div/div[2]/span
      Birthdate:
        selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '生日')]
        postProcess:
          - replace:
            - regex: ^(.*? ){1}
              with:
      Height: 
        selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '身高')]
        postProcess:
          - replace:
            - regex: ^(.*? ){1}
              with:
      # Measurements: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '胸圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '腰圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '臀圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '罩杯')]
      Image: 
        selector: //*[@id="waterfall"]/div[1]/div/div[1]/img/@src
        postProcess:
          - replace:
            - regex: ^
              with: https://www.javbus.com
  sceneSearch: 
    scene: 
      Title: //div[@class="photo-info"]/span
      URL: //*[@id="waterfall"]/div/a/@href
  sceneScraper:
    scene:
      Title:
        selector: //div[@class="col-md-3 info"]//span[contains(text(), '識別碼')]/../span[2]/text()
      URL:
        selector: /html/head/link[@hreflang="zh"]/@href
      Date:
        selector: //div[@class="col-md-3 info"]//span[contains(text(), '發行日期')]/../text()
      Details:
        selector: //div[@class="container"]/h3/text()
        postProcess:
          - replace:
            - regex: ^(.*? ){1}
              with:
      Tags:
        Name: //div[@class="col-md-3 info"]//span[@class="genre"]/label/a/text()
      Performers:
        Name: //div[@class="star-name"]/a
      Director: //div[@id='video_director']/table/tbody/tr/td[@class="text"]/span/a/text()
      Image:
        selector: //div[@class="row movie"]/div[@class="col-md-9 screencap"]/a[@class="bigImage"]/img/@src
        postProcess:
          - replace:
            - regex: ^
              with: https://www.javbus.com
      Studio:
        Name: //div[@class="col-md-3 info"]//span[contains(text(), '發行商')]/../a/text()
 driver:
  headers:
    - Key: User-Agent
      Value: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36
    - Key: Accept-Language
      Value: zh-cn
 # Last Updated September 17, 2025
--- a/docker/stash/scripts/scrapers/JavBus/manifest
+++ b/docker/stash/scripts/scrapers/JavBus/manifest
@ -0,0 +1,9 @@
 id: JavBus
 name: Javbus
 metadata: {}
 version: 5ee93a34
 date: "2025-09-17 10:48:13"
 requires: []
 source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
 files:
 - JavBus.yml
--- a/docker/stash/scripts/scrapers/JavBus_en/JavBus_en.yml
+++ b/docker/stash/scripts/scrapers/JavBus_en/JavBus_en.yml
@ -0,0 +1,111 @@
 name: Javbus_en
 # Scene lookup by file name against the /en/ pages: the {filename} placeholder
 # in queryURL is filled from the file name after the queryURLReplace rules
 # below, and the page is parsed with the sceneScraper definition.
 # NOTE(review): presumably the rules run in list order before substitution;
 # confirm against the Stash scraper documentation.
 sceneByFragment:
  action: scrapeXPath
  queryURL: https://www.javbus.com/en/{filename}
  queryURLReplace:
    filename:
      # Remove a "-JG<digit>" marker from the name.
      - regex: -JG\d
        with: ""
      # Keep only capture group 2: a run of letters/hyphens followed by digits
      # (e.g. "ABC-123"). Group 1 is an optional prefix ending in a
      # non-alphanumeric character and group 3 is the remaining tail (at least
      # one character); both are discarded.
      - regex: (.*[^a-zA-Z0-9])*([a-zA-Z-]+\d+)(.+)
        with: $2
  scraper: sceneScraper
 sceneByURL:
  - action: scrapeXPath
    url:
      - https://www.javbus.com/en
      - https://www.seejav.bid
      - https://www.cdnbus.lol
      - https://www.dmmbus.lol
      - https://www.seedmm.cfd
    scraper: sceneScraper
 sceneByName:
  action: scrapeXPath
  queryURL: https://www.javbus.com/en/search/{}&type=&parent=ce
  scraper: sceneSearch
 sceneByQueryFragment:
  action: scrapeXPath
  queryURL: "{url}"
  scraper: sceneScraper
 performerByURL: 
  - action: scrapeXPath
    url: 
      - https://www.javbus.com/en
      - https://www.seejav.bid
      - https://www.cdnbus.lol
      - https://www.dmmbus.lol
      - https://www.seedmm.cfd
    scraper: performerScraper
 performerByName: 
  action: scrapeXPath
  queryURL: https://www.javbus.com/en/searchstar/{}&type=&parent=ce
  scraper: performerSearch
 xPathScrapers: 
  performerSearch: 
    performer:
      Name: //span[@class="mleft"]
      URLs: //*[@id="waterfall"]/div/a/@href
  performerScraper: 
    performer: 
      Name: //*[@id="waterfall"]/div[1]/div/div[2]/span
      Birthdate:
        selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), 'D.O.B')]
        postProcess:
          - replace:
            - regex: ^(.*? ){1}
              with:
      Height: 
        selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), 'Height')]
        postProcess:
          - replace:
            - regex: ^(.*? ){1}
              with:
      # Measurements: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '胸圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '腰圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '臀圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '罩杯')]
      Image: 
        selector: //*[@id="waterfall"]/div[1]/div/div[1]/img/@src
        postProcess:
          - replace:
            - regex: ^
              with: https://www.javbus.com/en
  sceneSearch: 
    scene: 
      Title: //div[@class="photo-info"]/span
      URL: //*[@id="waterfall"]/div/a/@href
  sceneScraper:
    scene:
      Title:
        selector: //div[@class="col-md-3 info"]//span[contains(text(), 'ID')]/../span[2]/text()
      URL:
        selector: /html/head/link[@hreflang="zh"]/@href
      Date:
        selector: //div[@class="col-md-3 info"]//span[contains(normalize-space(text()), 'Release Date')]/../text()
        #selector: //div[@class="col-md-3 info"]//span[contains(text(), 'Release Date')]/../text()
      Details:
        selector: //div[@class="container"]/h3/text()
        postProcess:
          - replace:
            - regex: ^(.*? ){1}
              with:
      Tags:
        Name: //div[@class="col-md-3 info"]//span[@class="genre"]/label/a/text()
      Performers:
        Name: //div[@class="star-name"]/a
      Director: //div[@id='video_director']/table/tbody/tr/td[@class="text"]/span/a/text()
      Image:
        selector: //div[@class="row movie"]/div[@class="col-md-9 screencap"]/a[@class="bigImage"]/img/@src
        postProcess:
          - replace:
            - regex: ^
              with: https://www.javbus.com/
      Studio:
        Name: //div[@class="col-md-3 info"]//span[contains(text(), 'Label')]/../a/text()
 driver:
  headers:
    - Key: User-Agent
      Value: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36
    - Key: Accept-Language
      Value: zh-cn,en-US
 # Last Updated September 17, 2025
--- a/docker/stash/scripts/scrapers/JavBus_en/manifest
+++ b/docker/stash/scripts/scrapers/JavBus_en/manifest
@ -0,0 +1,9 @@
 id: JavBus_en
 name: Javbus_en
 metadata: {}
 version: b4672ccf
 date: "2025-08-01 16:01:27"
 requires: []
 source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
 files:
 - JavBus_en.yml
--- a/11
+++ b/11
@ -0,0 +1,11 @@
 # 其他已有的忽略规则
 *.pyc
 __pycache__/
 # 忽略环境配置文件
 .env
 # 忽略所有 log、data 和 result 目录
 **/log/
 **/data/
 **/result/
--- a/tools/ssh_key_push.sh
+++ b/tools/ssh_key_push.sh
@ -29,18 +29,47 @@ else
    fi
    servers=()
    while IFS= read -r line; do
        # 跳过空行和注释行
        [[ -z "$line" || "$line" =~ ^# ]] && continue
        servers+=("$line")
    done < "$file_path"
 fi
 # 推送公钥到远程服务器
 for server in "${servers[@]}"; do
 public_key=$(cat ~/.ssh/id_rsa.pub)
-    ssh "$server" "mkdir -p ~/.ssh && echo '$public_key' >> ~/.ssh/authorized_keys"
+for server in "${servers[@]}"; do
-    if [ $? -eq 0 ]; then
+    # 第一次尝试推送
    echo "正在推送公钥到 $server..."
    output=$(ssh "$server" "mkdir -p ~/.ssh && echo '$public_key' >> ~/.ssh/authorized_keys" 2>&1)
    exit_code=$?
    if [ $exit_code -eq 0 ]; then
        echo "公钥已成功推送到 $server"
        continue
    fi
    # 检测是否是主机密钥验证失败
    if echo "$output" | grep -q "Host key verification failed"; then
        echo "检测到 $server 的主机密钥已变更，正在清理旧密钥..."
        # 提取主机地址（处理 user@host 格式，取 @ 后面的部分）
        host=$(echo "$server" | cut -d'@' -f2)
        # 清理旧密钥
        cleanup_output=$(ssh-keygen -R "$host" 2>&1)
        if [ $? -ne 0 ]; then
            echo "清理 $host 旧密钥失败：$cleanup_output"
            continue
        fi
        echo "已清理 $host 的旧密钥，重新尝试推送..."
        # 重新推送
        retry_output=$(ssh "$server" "mkdir -p ~/.ssh && echo '$public_key' >> ~/.ssh/authorized_keys" 2>&1)
        retry_code=$?
        if [ $retry_code -eq 0 ]; then
            echo "公钥已成功推送到 $server"
        else
-        echo "推送公钥到 $server 时出错。"
+            echo "重新推送 $server 失败：$retry_output"
        fi
    else
        # 其他错误类型
        echo "推送 $server 失败：$output"
    fi
 done
Author	SHA1	Message	Date
sophon	2b0e1c0413	modify scripts	2026-01-11 11:50:55 +08:00
sophon	dece263c8b	modify scripts	2026-01-11 10:36:07 +08:00
sophon	00b267b651	modify scripts	2026-01-09 11:29:25 +08:00
sophon	0a4776479c	modify scripts	2025-12-25 17:08:29 +08:00
sophon	6cf529541d	modify scripts	2025-12-25 15:02:07 +08:00
sophon	2c0e3bd718	modify scripts	2025-12-25 14:53:33 +08:00
sophon	ebae625165	modify scripts	2025-12-25 14:51:09 +08:00
sophon	f8daffd47f	modify scripts	2025-12-04 11:08:31 +08:00
sophon	bed2de3cd1	modify scripts	2025-11-14 16:53:08 +08:00
sophon	d1c543512e	modify scripts	2025-11-14 14:09:42 +08:00
sophon	857339d261	modify scripts	2025-11-14 13:54:20 +08:00
sophon	f189dcfaca	modify scripts	2025-11-14 13:32:18 +08:00
sophon	1848510b65	modify scripts	2025-11-13 11:59:45 +08:00
sophon	04d76944ad	modify scripts	2025-11-13 10:00:07 +08:00
sophon	40eae5569a	modify scripts	2025-11-13 08:34:28 +08:00
sophon	15c4f7b823	modify scripts	2025-11-07 10:08:19 +08:00
sophon	17356c79f9	modify scripts	2025-11-07 09:03:35 +08:00
sophon	808dbaa985	modify scripts	2025-11-05 17:25:41 +08:00
sophon	b7dffc539c	modify scripts	2025-11-03 16:34:46 +08:00
sophon	91e7d38725	modify scripts	2025-11-03 16:21:46 +08:00
sophon	fe153d69cc	Merge branch 'master' of git.easyprompt8.com:backend/devops	2025-07-21 11:46:15 +08:00
sophon	31e07abf14	Merge branch 'master' of git.easyprompt8.com:backend/devops	2025-07-21 11:42:48 +08:00
sophon	30b315ecd0	modify scripts	2025-07-21 11:39:05 +08:00