modify scripts

2025-11-05 17:25:41 +08:00
parent b7dffc539c
commit 808dbaa985
4 changed files with 980 additions and 9 deletions
--- a/docker/paperless/docker-compose.yml
+++ b/docker/paperless/docker-compose.yml
@ -8,11 +8,6 @@ services:
    ports:
      - "8000:8000"
    environment:
-      PAPERLESS_OCR_LANGUAGES: ""   # 跳过OCR
-      PAPERLESS_OCR_SKIP_ARCHIVE_FILE: "always"  # 跳过创建文档存档版本的时间
-      PAPERLESS_OCR_OUTPUT_TYPE: "pdf"  # 尽量少修改PDF文档
-      PAPERLESS_CONSUMER_POLLING: "5" # 指定轮询间隔（以秒为单位），这将导致 paperless 定期检查消费目录中的更改
-      #PAPERLESS_CONSUMER_INOTIFY_DELAY: "2" # 设置消费者等待 inotify 发出的其他事件的时间（以秒为单位）

      # 使用 SQLite 作为数据库（默认）
      PAPERLESS_DBENGINE: sqlite3
@ -39,8 +34,16 @@ services:
      # 解析文件里的关键信息，并更新。但无法更新storage path。这个字段要靠工作流才行。
      PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"

-      # 使用mysql可以用下面这个连接， init_command=SET innodb_strict_mode=0：关键配置！在建立数据库连接时，自动执行SET innodb_strict_mode=0，关闭 InnoDB 严格模式，允许 MySQL 自动截断过长的索引（不再直接报错）。
-      PAPERLESS_DATABASE_URL: "mysql://root:root@mysql:3306/paperless?charset=utf8mb4&init_command=SET sql_mode='STRICT_TRANS_TABLES', innodb_strict_mode=0"
+      # Automatically delete duplicate files
+      # (boolean-looking values are quoted so they reach the container as strings)
+      PAPERLESS_CONSUMER_DELETE_DUPLICATES: "true"
+      # Scan the consume directory recursively, i.e. including sub-directories.
+      # This allows several host directories to be mapped into the container.
+      PAPERLESS_CONSUMER_RECURSIVE: "true"
+
+      # Intended to skip OCR, but it does not: OCR still runs with the default "eng" language
+      PAPERLESS_OCR_LANGUAGES: ""
+      PAPERLESS_OCR_SKIP_ARCHIVE_FILE: "always"  # skip creating an archived version of documents (saves time)
+      PAPERLESS_OCR_OUTPUT_TYPE: "pdf"  # modify the PDF document as little as possible
+      PAPERLESS_CONSUMER_POLLING: "5"  # polling interval in seconds; makes paperless periodically check the consume directory for changes
+      # PAPERLESS_CONSUMER_INOTIFY_DELAY: "2"  # time in seconds the consumer waits for further inotify events

      # 运行用户
      USERMAP_UID: 1000
@ -49,8 +52,9 @@ services:
    volumes:
      # 存储所有数据（搜索索引、SQLite 数据库、分类模型等）的地方
      - ~/dockers/paperless/data:/usr/src/paperless/data
-      # 挂载文件导入目录
+      # 挂载文件导入目录，可以把多个宿主机的目录，挂到docker中，以子目录的形式存在
      - ~/dockers/paperless/consume:/usr/src/paperless/consume
+      - ~/dockers/sharedata/consume:/usr/src/paperless/consume/subdir
      # 挂载文件导出目录
      - ~/dockers/paperless/export:/usr/src/paperless/export
      # 存储您的文档和缩略图的地方
--- a/docker/paperless/plugins/bak_parsers.py
+++ b/docker/paperless/plugins/bak_parsers.py
@ -0,0 +1,472 @@
+import os
+import re
+import tempfile
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from django.conf import settings
+from PIL import Image
+
+from documents.parsers import DocumentParser
+from documents.parsers import ParseError
+from documents.parsers import make_thumbnail_from_pdf
+from documents.utils import maybe_override_pixel_limit
+from documents.utils import run_subprocess
+from paperless.config import OcrConfig
+from paperless.models import ArchiveFileChoices
+from paperless.models import CleanChoices
+from paperless.models import ModeChoices
+
+
+class NoTextFoundException(Exception):
+    # Raised by RasterisedDocumentParser.parse when OCR finished without
+    # producing any text; parse catches it to retry with forced OCR.
+    pass
+
+
+class RtlLanguageException(Exception):
+    # NOTE(review): not raised or caught anywhere in this file; presumably
+    # kept for right-to-left language handling -- confirm before removing.
+    pass
+
+
+class RasterisedDocumentParser(DocumentParser):
+    """
+    This parser uses Tesseract to try and get some text out of a rasterised
+    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
+    """
+
+    logging_name = "paperless.parsing.tesseract"
+
+    def get_settings(self) -> OcrConfig:
+        """
+        Provide the OCR configuration object used by this parser.
+        """
+        ocr_config = OcrConfig()
+        return ocr_config
+
+    def get_page_count(self, document_path, mime_type):
+        """
+        Return the number of pages of a PDF document.
+
+        None is returned for any other MIME type, and also (after logging a
+        warning) when the PDF cannot be opened or inspected.
+        """
+        if mime_type != "application/pdf":
+            return None
+
+        pages = None
+        try:
+            import pikepdf
+
+            with pikepdf.Pdf.open(document_path) as document:
+                pages = len(document.pages)
+        except Exception as e:
+            self.log.warning(
+                f"Unable to determine PDF page count {document_path}: {e}",
+            )
+        return pages
+
+    def extract_metadata(self, document_path, mime_type):
+        """
+        Collect the metadata entries of a PDF document.
+
+        Returns a list of dicts with the keys "namespace", "prefix", "key"
+        and "value". The list is empty for anything that is not a PDF.
+        Entries whose key does not look like "{namespace}name" are skipped,
+        and entries that fail while being processed are logged and skipped.
+        """
+        result = []
+        if mime_type == "application/pdf":
+            import pikepdf
+
+            namespace_pattern = re.compile(r"\{(.*)\}(.*)")
+
+            # Open the PDF in a context manager so the file is closed again
+            # once the metadata has been read (get_page_count does the same).
+            with pikepdf.open(document_path) as pdf:
+                meta = pdf.open_metadata()
+                for key, value in meta.items():
+                    if isinstance(value, list):
+                        value = " ".join([str(e) for e in value])
+                    value = str(value)
+                    try:
+                        m = namespace_pattern.match(key)
+                        if m is None:  # pragma: no cover
+                            continue
+                        namespace = m.group(1)
+                        key_value = m.group(2)
+                        try:
+                            # Only keep entries that can be encoded as UTF-8.
+                            namespace.encode("utf-8")
+                            key_value.encode("utf-8")
+                        except UnicodeEncodeError as e:  # pragma: no cover
+                            self.log.debug(f"Skipping metadata key {key}: {e}")
+                            continue
+                        result.append(
+                            {
+                                "namespace": namespace,
+                                # An unknown namespace raises KeyError here,
+                                # which the handler below logs and skips.
+                                "prefix": meta.REVERSE_NS[namespace],
+                                "key": key_value,
+                                "value": value,
+                            },
+                        )
+                    except Exception as e:
+                        self.log.warning(
+                            f"Error while reading metadata {key}: {value}. Error: {e}",
+                        )
+        return result
+
+    def get_thumbnail(self, document_path, mime_type, file_name=None):
+        """
+        Build the thumbnail from the archive PDF when one is set on the
+        parser, otherwise from the original document.
+        """
+        thumbnail_source = self.archive_path or document_path
+        return make_thumbnail_from_pdf(
+            thumbnail_source,
+            self.tempdir,
+            self.logging_group,
+        )
+
+    def is_image(self, mime_type) -> bool:
+        """
+        Tell whether the MIME type is one of the image formats this parser
+        treats as an image.
+        """
+        image_mime_types = (
+            "image/png",
+            "image/jpeg",
+            "image/tiff",
+            "image/bmp",
+            "image/gif",
+            "image/webp",
+            "image/heic",
+        )
+        return mime_type in image_mime_types
+
+    def has_alpha(self, image) -> bool:
+        """
+        Report whether the image's mode is "RGBA" or "LA".
+        """
+        with Image.open(image) as opened:
+            mode = opened.mode
+        return mode == "RGBA" or mode == "LA"
+
+    def remove_alpha(self, image_path: str) -> Path:
+        """
+        Run the configured convert binary with "-alpha off" on image_path,
+        writing to "image-no-alpha" inside the parser's temporary directory.
+
+        Returns the path of that output file.
+        """
+        target = Path(self.tempdir) / "image-no-alpha"
+        command = [settings.CONVERT_BINARY, "-alpha", "off", image_path, target]
+        run_subprocess(command, logger=self.log)
+        return target
+
+    def get_dpi(self, image) -> int | None:
+        """
+        Read the first value of the image's "dpi" info entry and round it
+        to an integer.
+
+        Returns None, after logging a warning, when the image cannot be
+        opened or has no usable "dpi" entry.
+        """
+        try:
+            with Image.open(image) as opened:
+                dpi_x, _dpi_y = opened.info["dpi"]
+                return round(dpi_x)
+        except Exception as err:
+            self.log.warning(f"Error while getting DPI from image {image}: {err}")
+            return None
+
+    def calculate_a4_dpi(self, image) -> int | None:
+        """
+        Estimate a DPI value by assuming the image is as wide as an A4 page.
+
+        Returns None, after logging a warning, if the image cannot be read.
+        """
+        # A4 width (210mm) expressed in inches.
+        a4_width_inches = 21 / 2.54
+        try:
+            with Image.open(image) as opened:
+                pixel_width, _pixel_height = opened.size
+                estimate = int(pixel_width / a4_width_inches)
+                self.log.debug(
+                    f"Estimated DPI {estimate} based on image width {pixel_width}",
+                )
+                return estimate
+        except Exception as err:
+            self.log.warning(f"Error while calculating DPI for image {image}: {err}")
+            return None
+
+    def extract_text(
+        self,
+        sidecar_file: Path | None,
+        pdf_file: Path,
+    ) -> str | None:
+        # Return the text of a document, preferring the OCR sidecar file and
+        # falling back to running pdftotext on pdf_file. Any text found is
+        # passed through post_process_text. Returns None when pdf_file does
+        # not exist or pdftotext fails.
+        #
+        # When re-doing OCR, the sidecar contains ONLY the new text, not
+        # the whole text, so do not utilize it in that case
+        if (
+            sidecar_file is not None
+            and sidecar_file.is_file()
+            and self.settings.mode != "redo"
+        ):
+            text = self.read_file_handle_unicode_errors(sidecar_file)
+
+            if "[OCR skipped on page" not in text:
+                # This happens when there's already text in the input file.
+                # The sidecar file will only contain text for OCR'ed pages.
+                self.log.debug("Using text from sidecar file")
+                return post_process_text(text)
+            else:
+                self.log.debug("Incomplete sidecar file: discarding.")
+
+        # no success with the sidecar file, try PDF
+
+        if not Path(pdf_file).is_file():
+            return None
+
+        try:
+            text = None
+            # pdftotext writes its output into the temporary file by name;
+            # the content is read back inside the with block, before the
+            # temporary file is closed.
+            with tempfile.NamedTemporaryFile(
+                mode="w+",
+                dir=self.tempdir,
+            ) as tmp:
+                run_subprocess(
+                    [
+                        "pdftotext",
+                        "-q",
+                        "-layout",
+                        "-enc",
+                        "UTF-8",
+                        pdf_file,
+                        tmp.name,
+                    ],
+                    logger=self.log,
+                )
+                text = self.read_file_handle_unicode_errors(Path(tmp.name))
+
+            return post_process_text(text)
+
+        except Exception:
+            #  If pdftotext fails, fall back to OCR.
+            self.log.warning(
+                "Error while getting text from PDF document with pdftotext",
+                exc_info=True,
+            )
+            # probably not a PDF file.
+            return None
+
+    def construct_ocrmypdf_parameters(
+        self,
+        input_file,
+        mime_type,
+        output_file,
+        sidecar_file,
+        *,
+        safe_fallback=False,
+    ):
+        # Build and return the dict of keyword arguments that parse() passes
+        # to ocrmypdf.ocr(), derived from the OCR settings of this parser.
+        #
+        # input_file / output_file: source document and target archive PDF.
+        # mime_type: decides whether the image-specific handling is applied.
+        # sidecar_file: text sidecar path, only used when no page limit is set.
+        # safe_fallback: when True, force OCR regardless of the configured mode.
+        #
+        # Raises ParseError for an unknown OCR mode, or when no DPI value can
+        # be determined for an image input.
+
+        # This assertion is only seen by static type checkers.
+        if TYPE_CHECKING:
+            assert isinstance(self.settings, OcrConfig)
+        ocrmypdf_args = {
+            "input_file": input_file,
+            "output_file": output_file,
+            # need to use threads, since this will be run in daemonized
+            # processes via the task library.
+            "use_threads": True,
+            "jobs": settings.THREADS_PER_WORKER,
+            "language": self.settings.language,
+            "output_type": self.settings.output_type,
+            "progress_bar": False,
+        }
+
+        # Substring check: applies to every output type containing "pdfa".
+        if "pdfa" in ocrmypdf_args["output_type"]:
+            ocrmypdf_args["color_conversion_strategy"] = (
+                self.settings.color_conversion_strategy
+            )
+
+        # Exactly one of force_ocr / skip_text / redo_ocr is set.
+        if self.settings.mode == ModeChoices.FORCE or safe_fallback:
+            ocrmypdf_args["force_ocr"] = True
+        elif self.settings.mode in {
+            ModeChoices.SKIP,
+            ModeChoices.SKIP_NO_ARCHIVE,
+        }:
+            ocrmypdf_args["skip_text"] = True
+        elif self.settings.mode == ModeChoices.REDO:
+            ocrmypdf_args["redo_ocr"] = True
+        else:  # pragma: no cover
+            raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
+
+        if self.settings.clean == CleanChoices.CLEAN:
+            ocrmypdf_args["clean"] = True
+        elif self.settings.clean == CleanChoices.FINAL:
+            if self.settings.mode == ModeChoices.REDO:
+                ocrmypdf_args["clean"] = True
+            else:
+                # --clean-final is not compatible with --redo-ocr
+                ocrmypdf_args["clean_final"] = True
+
+        if self.settings.deskew and self.settings.mode != ModeChoices.REDO:
+            # --deskew is not compatible with --redo-ocr
+            ocrmypdf_args["deskew"] = True
+
+        if self.settings.rotate:
+            ocrmypdf_args["rotate_pages"] = True
+            ocrmypdf_args["rotate_pages_threshold"] = self.settings.rotate_threshold
+
+        if self.settings.pages is not None and self.settings.pages > 0:
+            ocrmypdf_args["pages"] = f"1-{self.settings.pages}"
+        else:
+            # sidecar is incompatible with pages
+            ocrmypdf_args["sidecar"] = sidecar_file
+
+        if self.is_image(mime_type):
+            # This may be required, depending on the known information
+            maybe_override_pixel_limit()
+
+            dpi = self.get_dpi(input_file)
+            a4_dpi = self.calculate_a4_dpi(input_file)
+
+            if self.has_alpha(input_file):
+                self.log.info(
+                    f"Removing alpha layer from {input_file} "
+                    "for compatibility with img2pdf",
+                )
+                # Replace the input file with the non-alpha
+                ocrmypdf_args["input_file"] = self.remove_alpha(input_file)
+
+            # DPI precedence: value stored in the image, then the configured
+            # image_dpi setting, then the A4-width estimate.
+            if dpi:
+                self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
+                ocrmypdf_args["image_dpi"] = dpi
+            elif self.settings.image_dpi is not None:
+                ocrmypdf_args["image_dpi"] = self.settings.image_dpi
+            elif a4_dpi:
+                ocrmypdf_args["image_dpi"] = a4_dpi
+            else:
+                raise ParseError(
+                    f"Cannot produce archive PDF for image {input_file}, "
+                    f"no DPI information is present in this image and "
+                    f"OCR_IMAGE_DPI is not set.",
+                )
+            if ocrmypdf_args["image_dpi"] < 70:  # pragma: no cover
+                self.log.warning(
+                    f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail",
+                )
+
+        if self.settings.user_args is not None:
+            try:
+                # User arguments are unpacked last, so they override any key
+                # computed above.
+                ocrmypdf_args = {**ocrmypdf_args, **self.settings.user_args}
+            except Exception as e:
+                self.log.warning(
+                    f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
+                    f"they will not be used. Error: {e}",
+                )
+
+        if (
+            self.settings.max_image_pixel is not None
+            and self.settings.max_image_pixel >= 0
+        ):
+            # Convert pixels to mega-pixels and provide to ocrmypdf
+            max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0
+            msg = (
+                "OCR pixel limit is disabled!"
+                if max_pixels_mpixels == 0
+                else f"Calculated {max_pixels_mpixels} megapixels for OCR"
+            )
+            self.log.debug(msg)
+            ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
+
+        return ocrmypdf_args
+
+    def parse(self, document_path: Path, mime_type, file_name=None):
+        # Extract the text of a document into self.text and, depending on
+        # the settings, record the OCRmyPDF output as self.archive_path.
+        #
+        # Flow: read the text already embedded in a PDF; return early when
+        # that is sufficient and no archive is wanted; otherwise run
+        # OCRmyPDF, retrying once with forced OCR when no text was found or
+        # the input was rejected. Raises ParseError for OCR failures that
+        # are not handled by one of the fallbacks.
+
+        # This forces tesseract to use one core per page.
+        os.environ["OMP_THREAD_LIMIT"] = "1"
+        # Embedded text must be longer than this many characters to count
+        # as "the original has text".
+        VALID_TEXT_LENGTH = 50
+
+        if mime_type == "application/pdf":
+            text_original = self.extract_text(None, document_path)
+            original_has_text = (
+                text_original is not None and len(text_original) > VALID_TEXT_LENGTH
+            )
+        else:
+            text_original = None
+            original_has_text = False
+
+        # If the original has text, and the user doesn't want an archive,
+        # we're done here
+        skip_archive_for_text = (
+            self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
+            or self.settings.skip_archive_file
+            in {
+                ArchiveFileChoices.WITH_TEXT,
+                ArchiveFileChoices.ALWAYS,
+            }
+        )
+        if skip_archive_for_text and original_has_text:
+            # NOTE(review): this debug message includes the complete
+            # extracted text of the document.
+            self.log.debug(f"Document has text, skipping OCRmyPDF entirely. {text_original}")
+            self.text = text_original
+            return
+
+        # Either no text was in the original or there should be an archive
+        # file created, so OCR the file and create an archive with any
+        # text located via OCR
+
+        import ocrmypdf
+        from ocrmypdf import EncryptedPdfError
+        from ocrmypdf import InputFileError
+        from ocrmypdf import SubprocessOutputError
+        from ocrmypdf.exceptions import DigitalSignatureError
+
+        archive_path = Path(self.tempdir) / "archive.pdf"
+        sidecar_file = Path(self.tempdir) / "sidecar.txt"
+
+        args = self.construct_ocrmypdf_parameters(
+            document_path,
+            mime_type,
+            archive_path,
+            sidecar_file,
+        )
+
+        try:
+            self.log.debug(f"Calling OCRmyPDF with args: {args}")
+            ocrmypdf.ocr(**args)
+
+            # The archive is only recorded when archive creation is not
+            # disabled for all documents.
+            if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
+                self.archive_path = archive_path
+
+            self.text = self.extract_text(sidecar_file, archive_path)
+
+            # Handled below by the force-OCR fallback.
+            if not self.text:
+                raise NoTextFoundException("No text was found in the original document")
+        except (DigitalSignatureError, EncryptedPdfError):
+            self.log.warning(
+                "This file is encrypted and/or signed, OCR is impossible. Using "
+                "any text present in the original file.",
+            )
+            if original_has_text:
+                self.text = text_original
+        except SubprocessOutputError as e:
+            if "Ghostscript PDF/A rendering" in str(e):
+                self.log.warning(
+                    "Ghostscript PDF/A rendering failed, consider setting "
+                    "PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
+                )
+
+            raise ParseError(
+                f"SubprocessOutputError: {e!s}. See logs for more information.",
+            ) from e
+        except (NoTextFoundException, InputFileError) as e:
+            self.log.warning(
+                f"Encountered an error while running OCR: {e!s}. "
+                f"Attempting force OCR to get the text.",
+            )
+
+            archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf"
+            sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt"
+
+            # Attempt to run OCR with safe settings.
+
+            args = self.construct_ocrmypdf_parameters(
+                document_path,
+                mime_type,
+                archive_path_fallback,
+                sidecar_file_fallback,
+                safe_fallback=True,
+            )
+
+            try:
+                self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
+                ocrmypdf.ocr(**args)
+
+                # Don't return the archived file here, since this file
+                # is bigger and blurry due to --force-ocr.
+
+                self.text = self.extract_text(
+                    sidecar_file_fallback,
+                    archive_path_fallback,
+                )
+
+            except Exception as e:
+                # If this fails, we have a serious issue at hand.
+                raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
+
+        except Exception as e:
+            # Anything else is probably serious.
+            raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
+
+        # As a last resort, if we still don't have any text for any reason,
+        # try to extract the text from the original document.
+        if not self.text:
+            if original_has_text:
+                self.text = text_original
+            else:
+                self.log.warning(
+                    f"No text was found in {document_path}, the content will be empty.",
+                )
+                self.text = ""
+
+
+def post_process_text(text):
+    """
+    Normalise whitespace in extracted text.
+
+    Returns None for empty or missing input. Otherwise runs of whitespace
+    other than line breaks are collapsed to one space, whitespace following
+    line breaks and trailing whitespace is removed, the result is stripped
+    and NUL characters are replaced with spaces.
+    """
+    if not text:
+        return None
+
+    substitutions = (
+        # collapse runs of spaces/tabs into a single space
+        (r"([^\S\r\n]+)", " "),
+        # drop whitespace that directly follows line breaks
+        (r"([\n\r]+)([^\S\n\r]+)", "\\1"),
+        # drop trailing whitespace
+        (r"([^\S\n\r]+)$", ""),
+    )
+    cleaned = text
+    for pattern, replacement in substitutions:
+        cleaned = re.sub(pattern, replacement, cleaned)
+
+    # TODO: this needs a rework
+    # replace \0 prevents issues with saving to postgres.
+    # text may contain \0 when this character is present in PDF files.
+    return cleaned.strip().replace("\0", " ")
--- a/docker/paperless/plugins/parsers.py
+++ b/docker/paperless/plugins/parsers.py
@ -0,0 +1,479 @@
+import os
+import re
+import tempfile
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from django.conf import settings
+from PIL import Image
+
+from documents.parsers import DocumentParser
+from documents.parsers import ParseError
+from documents.parsers import make_thumbnail_from_pdf
+from documents.utils import maybe_override_pixel_limit
+from documents.utils import run_subprocess
+from paperless.config import OcrConfig
+from paperless.models import ArchiveFileChoices
+from paperless.models import CleanChoices
+from paperless.models import ModeChoices
+
+
+class NoTextFoundException(Exception):
+    # NOTE(review): presumably raised by RasterisedDocumentParser.parse when
+    # OCR produced no text -- the raising code is not shown here; confirm.
+    pass
+
+
+class RtlLanguageException(Exception):
+    # NOTE(review): no use of this exception is visible here; presumably
+    # kept for right-to-left language handling -- confirm before removing.
+    pass
+
+
+class RasterisedDocumentParser(DocumentParser):
+    """
+    This parser uses Tesseract to try and get some text out of a rasterised
+    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
+    """
+
+    logging_name = "paperless.parsing.tesseract"
+
+    def get_settings(self) -> OcrConfig:
+        """
+        Provide the OCR configuration object used by this parser.
+        """
+        ocr_config = OcrConfig()
+        return ocr_config
+
+    def get_page_count(self, document_path, mime_type):
+        """
+        Return the number of pages of a PDF document.
+
+        None is returned for any other MIME type, and also (after logging a
+        warning) when the PDF cannot be opened or inspected.
+        """
+        if mime_type != "application/pdf":
+            return None
+
+        pages = None
+        try:
+            import pikepdf
+
+            with pikepdf.Pdf.open(document_path) as document:
+                pages = len(document.pages)
+        except Exception as e:
+            self.log.warning(
+                f"Unable to determine PDF page count {document_path}: {e}",
+            )
+        return pages
+
+    def extract_metadata(self, document_path, mime_type):
+        """
+        Collect the metadata entries of a PDF document.
+
+        Returns a list of dicts with the keys "namespace", "prefix", "key"
+        and "value". The list is empty for anything that is not a PDF.
+        Entries whose key does not look like "{namespace}name" are skipped,
+        and entries that fail while being processed are logged and skipped.
+        """
+        result = []
+        if mime_type == "application/pdf":
+            import pikepdf
+
+            namespace_pattern = re.compile(r"\{(.*)\}(.*)")
+
+            # Open the PDF in a context manager so the file is closed again
+            # once the metadata has been read (get_page_count does the same).
+            with pikepdf.open(document_path) as pdf:
+                meta = pdf.open_metadata()
+                for key, value in meta.items():
+                    if isinstance(value, list):
+                        value = " ".join([str(e) for e in value])
+                    value = str(value)
+                    try:
+                        m = namespace_pattern.match(key)
+                        if m is None:  # pragma: no cover
+                            continue
+                        namespace = m.group(1)
+                        key_value = m.group(2)
+                        try:
+                            # Only keep entries that can be encoded as UTF-8.
+                            namespace.encode("utf-8")
+                            key_value.encode("utf-8")
+                        except UnicodeEncodeError as e:  # pragma: no cover
+                            self.log.debug(f"Skipping metadata key {key}: {e}")
+                            continue
+                        result.append(
+                            {
+                                "namespace": namespace,
+                                # An unknown namespace raises KeyError here,
+                                # which the handler below logs and skips.
+                                "prefix": meta.REVERSE_NS[namespace],
+                                "key": key_value,
+                                "value": value,
+                            },
+                        )
+                    except Exception as e:
+                        self.log.warning(
+                            f"Error while reading metadata {key}: {value}. Error: {e}",
+                        )
+        return result
+
+    def get_thumbnail(self, document_path, mime_type, file_name=None):
+        """
+        Build the thumbnail from the archive PDF when one is set on the
+        parser, otherwise from the original document.
+        """
+        thumbnail_source = self.archive_path or document_path
+        return make_thumbnail_from_pdf(
+            thumbnail_source,
+            self.tempdir,
+            self.logging_group,
+        )
+
+    def is_image(self, mime_type) -> bool:
+        """
+        Tell whether the MIME type is one of the image formats this parser
+        treats as an image.
+        """
+        image_mime_types = (
+            "image/png",
+            "image/jpeg",
+            "image/tiff",
+            "image/bmp",
+            "image/gif",
+            "image/webp",
+            "image/heic",
+        )
+        return mime_type in image_mime_types
+
+    def has_alpha(self, image) -> bool:
+        """
+        Report whether the image's mode is "RGBA" or "LA".
+        """
+        with Image.open(image) as opened:
+            mode = opened.mode
+        return mode == "RGBA" or mode == "LA"
+
+    def remove_alpha(self, image_path: str) -> Path:
+        """
+        Run the configured convert binary with "-alpha off" on image_path,
+        writing to "image-no-alpha" inside the parser's temporary directory.
+
+        Returns the path of that output file.
+        """
+        target = Path(self.tempdir) / "image-no-alpha"
+        command = [settings.CONVERT_BINARY, "-alpha", "off", image_path, target]
+        run_subprocess(command, logger=self.log)
+        return target
+
+    def get_dpi(self, image) -> int | None:
+        """
+        Read the first value of the image's "dpi" info entry and round it
+        to an integer.
+
+        Returns None, after logging a warning, when the image cannot be
+        opened or has no usable "dpi" entry.
+        """
+        try:
+            with Image.open(image) as opened:
+                dpi_x, _dpi_y = opened.info["dpi"]
+                return round(dpi_x)
+        except Exception as err:
+            self.log.warning(f"Error while getting DPI from image {image}: {err}")
+            return None
+
+    def calculate_a4_dpi(self, image) -> int | None:
+        """
+        Estimate a DPI value by assuming the image is as wide as an A4 page.
+
+        Returns None, after logging a warning, if the image cannot be read.
+        """
+        # A4 width (210mm) expressed in inches.
+        a4_width_inches = 21 / 2.54
+        try:
+            with Image.open(image) as opened:
+                pixel_width, _pixel_height = opened.size
+                estimate = int(pixel_width / a4_width_inches)
+                self.log.debug(
+                    f"Estimated DPI {estimate} based on image width {pixel_width}",
+                )
+                return estimate
+        except Exception as err:
+            self.log.warning(f"Error while calculating DPI for image {image}: {err}")
+            return None
+
+    def extract_text(
+        self,
+        sidecar_file: Path | None,
+        pdf_file: Path,
+    ) -> str | None:
+        # Return the text of a document, preferring the OCR sidecar file and
+        # falling back to running pdftotext on pdf_file. Any text found is
+        # passed through post_process_text. Returns None when pdf_file does
+        # not exist or pdftotext fails.
+        #
+        # When re-doing OCR, the sidecar contains ONLY the new text, not
+        # the whole text, so do not utilize it in that case
+        if (
+            sidecar_file is not None
+            and sidecar_file.is_file()
+            and self.settings.mode != "redo"
+        ):
+            text = self.read_file_handle_unicode_errors(sidecar_file)
+
+            if "[OCR skipped on page" not in text:
+                # This happens when there's already text in the input file.
+                # The sidecar file will only contain text for OCR'ed pages.
+                self.log.debug("Using text from sidecar file")
+                return post_process_text(text)
+            else:
+                self.log.debug("Incomplete sidecar file: discarding.")
+
+        # no success with the sidecar file, try PDF
+
+        if not Path(pdf_file).is_file():
+            return None
+
+        try:
+            text = None
+            # pdftotext writes its output into the temporary file by name;
+            # the content is read back inside the with block, before the
+            # temporary file is closed.
+            with tempfile.NamedTemporaryFile(
+                mode="w+",
+                dir=self.tempdir,
+            ) as tmp:
+                run_subprocess(
+                    [
+                        "pdftotext",
+                        "-q",
+                        "-layout",
+                        "-enc",
+                        "UTF-8",
+                        pdf_file,
+                        tmp.name,
+                    ],
+                    logger=self.log,
+                )
+                text = self.read_file_handle_unicode_errors(Path(tmp.name))
+
+            return post_process_text(text)
+
+        except Exception:
+            #  If pdftotext fails, fall back to OCR.
+            self.log.warning(
+                "Error while getting text from PDF document with pdftotext",
+                exc_info=True,
+            )
+            # probably not a PDF file.
+            return None
+
+    def construct_ocrmypdf_parameters(
+        self,
+        input_file,
+        mime_type,
+        output_file,
+        sidecar_file,
+        *,
+        safe_fallback=False,
+    ):
+        # Build and return the dict of keyword arguments that parse() passes
+        # to ocrmypdf.ocr(), derived from the OCR settings of this parser.
+        #
+        # input_file / output_file: source document and target archive PDF.
+        # mime_type: decides whether the image-specific handling is applied.
+        # sidecar_file: text sidecar path, only used when no page limit is set.
+        # safe_fallback: when True, force OCR regardless of the configured mode.
+        #
+        # Raises ParseError for an unknown OCR mode, or when no DPI value can
+        # be determined for an image input.
+
+        # This assertion is only seen by static type checkers.
+        if TYPE_CHECKING:
+            assert isinstance(self.settings, OcrConfig)
+        ocrmypdf_args = {
+            "input_file": input_file,
+            "output_file": output_file,
+            # need to use threads, since this will be run in daemonized
+            # processes via the task library.
+            "use_threads": True,
+            "jobs": settings.THREADS_PER_WORKER,
+            "language": self.settings.language,
+            "output_type": self.settings.output_type,
+            "progress_bar": False,
+        }
+
+        # Substring check: applies to every output type containing "pdfa".
+        if "pdfa" in ocrmypdf_args["output_type"]:
+            ocrmypdf_args["color_conversion_strategy"] = (
+                self.settings.color_conversion_strategy
+            )
+
+        # Exactly one of force_ocr / skip_text / redo_ocr is set.
+        if self.settings.mode == ModeChoices.FORCE or safe_fallback:
+            ocrmypdf_args["force_ocr"] = True
+        elif self.settings.mode in {
+            ModeChoices.SKIP,
+            ModeChoices.SKIP_NO_ARCHIVE,
+        }:
+            ocrmypdf_args["skip_text"] = True
+        elif self.settings.mode == ModeChoices.REDO:
+            ocrmypdf_args["redo_ocr"] = True
+        else:  # pragma: no cover
+            raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
+
+        if self.settings.clean == CleanChoices.CLEAN:
+            ocrmypdf_args["clean"] = True
+        elif self.settings.clean == CleanChoices.FINAL:
+            if self.settings.mode == ModeChoices.REDO:
+                ocrmypdf_args["clean"] = True
+            else:
+                # --clean-final is not compatible with --redo-ocr
+                ocrmypdf_args["clean_final"] = True
+
+        if self.settings.deskew and self.settings.mode != ModeChoices.REDO:
+            # --deskew is not compatible with --redo-ocr
+            ocrmypdf_args["deskew"] = True
+
+        if self.settings.rotate:
+            ocrmypdf_args["rotate_pages"] = True
+            ocrmypdf_args["rotate_pages_threshold"] = self.settings.rotate_threshold
+
+        if self.settings.pages is not None and self.settings.pages > 0:
+            ocrmypdf_args["pages"] = f"1-{self.settings.pages}"
+        else:
+            # sidecar is incompatible with pages
+            ocrmypdf_args["sidecar"] = sidecar_file
+
+        if self.is_image(mime_type):
+            # This may be required, depending on the known information
+            maybe_override_pixel_limit()
+
+            dpi = self.get_dpi(input_file)
+            a4_dpi = self.calculate_a4_dpi(input_file)
+
+            if self.has_alpha(input_file):
+                self.log.info(
+                    f"Removing alpha layer from {input_file} "
+                    "for compatibility with img2pdf",
+                )
+                # Replace the input file with the non-alpha
+                ocrmypdf_args["input_file"] = self.remove_alpha(input_file)
+
+            # DPI precedence: value stored in the image, then the configured
+            # image_dpi setting, then the A4-width estimate.
+            if dpi:
+                self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
+                ocrmypdf_args["image_dpi"] = dpi
+            elif self.settings.image_dpi is not None:
+                ocrmypdf_args["image_dpi"] = self.settings.image_dpi
+            elif a4_dpi:
+                ocrmypdf_args["image_dpi"] = a4_dpi
+            else:
+                raise ParseError(
+                    f"Cannot produce archive PDF for image {input_file}, "
+                    f"no DPI information is present in this image and "
+                    f"OCR_IMAGE_DPI is not set.",
+                )
+            if ocrmypdf_args["image_dpi"] < 70:  # pragma: no cover
+                self.log.warning(
+                    f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail",
+                )
+
+        if self.settings.user_args is not None:
+            try:
+                # User arguments are unpacked last, so they override any key
+                # computed above.
+                ocrmypdf_args = {**ocrmypdf_args, **self.settings.user_args}
+            except Exception as e:
+                self.log.warning(
+                    f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
+                    f"they will not be used. Error: {e}",
+                )
+
+        if (
+            self.settings.max_image_pixel is not None
+            and self.settings.max_image_pixel >= 0
+        ):
+            # Convert pixels to mega-pixels and provide to ocrmypdf
+            max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0
+            msg = (
+                "OCR pixel limit is disabled!"
+                if max_pixels_mpixels == 0
+                else f"Calculated {max_pixels_mpixels} megapixels for OCR"
+            )
+            self.log.debug(msg)
+            ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
+
+        return ocrmypdf_args
+
+    def parse(self, document_path: Path, mime_type, file_name=None):
+        """
+        Populate ``self.text`` (and possibly ``self.archive_path``) for a
+        document, running OCRmyPDF only when it cannot be avoided.
+
+        For ``application/pdf`` input the embedded text is extracted first and
+        counts as usable when it is longer than ``VALID_TEXT_LENGTH``
+        characters. In this locally patched version, a document without usable
+        text is given a fixed placeholder string and is then treated as if it
+        had text. Consequently, whenever archive creation is skipped for
+        documents with text, the method returns before OCRmyPDF is imported.
+
+        Otherwise OCRmyPDF is called. If it reports missing text or an input
+        file error, a second run with ``safe_fallback=True`` is attempted; the
+        archive produced by that second run is never exposed.
+
+        :param document_path: path of the document to parse
+        :param mime_type: MIME type of the document; only
+            ``"application/pdf"`` has its text extracted up front
+        :param file_name: not used inside this method
+        :raises ParseError: on ``SubprocessOutputError``, on any failure of
+            the fallback run, and on any other unexpected exception
+        """
+        # This forces tesseract to use one core per page.
+        # Note that this modifies the environment of the whole process.
+        os.environ["OMP_THREAD_LIMIT"] = "1"
+        # Extracted text must be longer than this to count as real content.
+        VALID_TEXT_LENGTH = 50
+
+        if mime_type == "application/pdf":
+            text_original = self.extract_text(None, document_path)
+            original_has_text = (
+                text_original is not None and len(text_original) > VALID_TEXT_LENGTH
+            )
+        else:
+            # Anything that is not a PDF has no embedded text to read up front.
+            text_original = None
+            original_has_text = False
+
+        # If the original has text, and the user doesn't want an archive,
+        # we're done here
+        skip_archive_for_text = (
+            self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
+            or self.settings.skip_archive_file
+            in {
+                ArchiveFileChoices.WITH_TEXT,
+                ArchiveFileChoices.ALWAYS,
+            }
+        )
+	
+        # force skip ocr process.
+        # Local patch: a document without usable text is given a placeholder
+        # and flagged as having text, so the early return below also applies
+        # to it. From here on ``original_has_text`` is always True.
+        if not original_has_text:
+            original_has_text = True
+            text_original = "this is default content, as we skipped ocr process..."
+            self.log.warning("Cannot read text from Document, use default message.")
+
+        # Because of the patch above this branch is taken for every document
+        # whenever ``skip_archive_for_text`` is set; the debug message is then
+        # also emitted for documents that only carry the placeholder text.
+        if skip_archive_for_text and original_has_text:
+            self.log.debug("Document has text, skipping OCRmyPDF entirely.")
+            self.text = text_original
+            return
+
+        # Either no text was in the original or there should be an archive
+        # file created, so OCR the file and create an archive with any
+        # text located via OCR
+
+        # Imported here rather than at module level, so the early return
+        # above never loads ocrmypdf.
+        import ocrmypdf
+        from ocrmypdf import EncryptedPdfError
+        from ocrmypdf import InputFileError
+        from ocrmypdf import SubprocessOutputError
+        from ocrmypdf.exceptions import DigitalSignatureError
+
+        archive_path = Path(self.tempdir) / "archive.pdf"
+        sidecar_file = Path(self.tempdir) / "sidecar.txt"
+
+        args = self.construct_ocrmypdf_parameters(
+            document_path,
+            mime_type,
+            archive_path,
+            sidecar_file,
+        )
+
+        try:
+            self.log.debug(f"Calling OCRmyPDF with args: {args}")
+            ocrmypdf.ocr(**args)
+
+            # The archive is only exposed when the user has not disabled
+            # archive files entirely.
+            if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
+                self.archive_path = archive_path
+
+            self.text = self.extract_text(sidecar_file, archive_path)
+
+            # Raised deliberately so the fallback handler further down gets
+            # a chance to retry.
+            if not self.text:
+                raise NoTextFoundException("No text was found in the original document")
+        except (DigitalSignatureError, EncryptedPdfError):
+            self.log.warning(
+                "This file is encrypted and/or signed, OCR is impossible. Using "
+                "any text present in the original file.",
+            )
+            if original_has_text:
+                self.text = text_original
+        except SubprocessOutputError as e:
+            # The hint is only logged for the known Ghostscript failure; the
+            # ParseError below is raised in every case.
+            if "Ghostscript PDF/A rendering" in str(e):
+                self.log.warning(
+                    "Ghostscript PDF/A rendering failed, consider setting "
+                    "PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
+                )
+
+            raise ParseError(
+                f"SubprocessOutputError: {e!s}. See logs for more information.",
+            ) from e
+        except (NoTextFoundException, InputFileError) as e:
+            self.log.warning(
+                f"Encountered an error while running OCR: {e!s}. "
+                f"Attempting force OCR to get the text.",
+            )
+
+            # Separate output files, so the results of the first attempt are
+            # not overwritten.
+            archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf"
+            sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt"
+
+            # Attempt to run OCR with safe settings.
+
+            args = self.construct_ocrmypdf_parameters(
+                document_path,
+                mime_type,
+                archive_path_fallback,
+                sidecar_file_fallback,
+                safe_fallback=True,
+            )
+
+            try:
+                self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
+                ocrmypdf.ocr(**args)
+
+                # Don't return the archived file here, since this file
+                # is bigger and blurry due to --force-ocr.
+
+                self.text = self.extract_text(
+                    sidecar_file_fallback,
+                    archive_path_fallback,
+                )
+
+            except Exception as e:
+                # If this fails, we have a serious issue at hand.
+                raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
+
+        except Exception as e:
+            # Anything else is probably serious.
+            raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
+
+        # As a last resort, if we still don't have any text for any reason,
+        # try to extract the text from the original document.
+        # NOTE: with the "force skip ocr" patch above, ``original_has_text`` is
+        # always True at this point, so the ``else`` branch cannot be reached
+        # and the placeholder text is used instead of empty content.
+        if not self.text:
+            if original_has_text:
+                self.text = text_original
+            else:
+                self.log.warning(
+                    f"No text was found in {document_path}, the content will be empty.",
+                )
+                self.text = ""
+
+
+def post_process_text(text):
+    if not text:
+        return None
+
+    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
+    no_leading_whitespace = re.sub(r"([\n\r]+)([^\S\n\r]+)", "\\1", collapsed_spaces)
+    no_trailing_whitespace = re.sub(r"([^\S\n\r]+)$", "", no_leading_whitespace)
+
+    # TODO: this needs a rework
+    # replace \0 prevents issues with saving to postgres.
+    # text may contain \0 when this character is present in PDF files.
+    return no_trailing_whitespace.strip().replace("\0", " ")
--- a/docker/paperless/plugins/redme.txt
+++ b/docker/paperless/plugins/redme.txt
@ -34,7 +34,23 @@ environment:
  PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"


-paperless 默认不会删除重复的文件，这会导致如果重复添加，会不停扫描，加载，报错。没找到配置，直接修改源码解决：
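+对于无法直接提取出文本内容的文档（例如扫描版 PDF），paperless 会启动 OCR 扫描，且复杂情况下会执行两遍，非常慢而且消耗资源。只能通过修改源码解决（该修改依赖 docker-compose.yml 中的 PAPERLESS_OCR_SKIP_ARCHIVE_FILE: "always" 设置，否则仍会继续执行 OCR）：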
+/usr/src/paperless/src/paperless_tesseract/parsers.py :
+
+        # force skip ocr process.
+        if not original_has_text:
+            original_has_text = True
+            text_original = "this is default content, as we skipped ocr process..."
+            self.log.warning("Cannot read text from Document, use default message.")
+
+        if skip_archive_for_text and original_has_text:
+            self.log.debug("Document has text, skipping OCRmyPDF entirely.")
+            self.text = text_original
+            return
+
+
+
+paperless 默认不会删除重复的文件，这会导致如果重复添加，会不停扫描、加载、报错。现在已经有对应的配置项 PAPERLESS_CONSUMER_DELETE_DUPLICATES（详见 docker-compose.yml），优先使用配置；下面是之前没找到配置时直接修改源码的做法：

 /usr/src/paperless/src/documents/consumer.py