Compare commits

..

23 Commits

Author SHA1 Message Date
2b0e1c0413 modify scripts 2026-01-11 11:50:55 +08:00
dece263c8b modify scripts 2026-01-11 10:36:07 +08:00
00b267b651 modify scripts 2026-01-09 11:29:25 +08:00
0a4776479c modify scripts 2025-12-25 17:08:29 +08:00
6cf529541d modify scripts 2025-12-25 15:02:07 +08:00
2c0e3bd718 modify scripts 2025-12-25 14:53:33 +08:00
ebae625165 modify scripts 2025-12-25 14:51:09 +08:00
f8daffd47f modify scripts 2025-12-04 11:08:31 +08:00
bed2de3cd1 modify scripts 2025-11-14 16:53:08 +08:00
d1c543512e modify scripts 2025-11-14 14:09:42 +08:00
857339d261 modify scripts 2025-11-14 13:54:20 +08:00
f189dcfaca modify scripts 2025-11-14 13:32:18 +08:00
1848510b65 modify scripts 2025-11-13 11:59:45 +08:00
04d76944ad modify scripts 2025-11-13 10:00:07 +08:00
40eae5569a modify scripts 2025-11-13 08:34:28 +08:00
15c4f7b823 modify scripts 2025-11-07 10:08:19 +08:00
17356c79f9 modify scripts 2025-11-07 09:03:35 +08:00
808dbaa985 modify scripts 2025-11-05 17:25:41 +08:00
b7dffc539c modify scripts 2025-11-03 16:34:46 +08:00
91e7d38725 modify scripts 2025-11-03 16:21:46 +08:00
fe153d69cc Merge branch 'master' of git.easyprompt8.com:backend/devops 2025-07-21 11:46:15 +08:00
31e07abf14 Merge branch 'master' of git.easyprompt8.com:backend/devops 2025-07-21 11:42:48 +08:00
30b315ecd0 modify scripts 2025-07-21 11:39:05 +08:00
20 changed files with 2074 additions and 185 deletions

11
.gitignore vendored Normal file
View File

@ -0,0 +1,11 @@
# 其他已有的忽略规则
*.pyc
__pycache__/
# 忽略环境配置文件
.env
# 忽略所有 log 目录 和 data 目录
**/log/
**/data/
**/result/

View File

@ -8,11 +8,6 @@ services:
ports: ports:
- "8000:8000" - "8000:8000"
environment: environment:
PAPERLESS_OCR_LANGUAGES: "" # 跳过OCR
PAPERLESS_OCR_SKIP_ARCHIVE_FILE: "always" # 跳过创建文档存档版本的时间
PAPERLESS_OCR_OUTPUT_TYPE: "pdf" # 尽量少修改PDF文档
PAPERLESS_CONSUMER_POLLING: "5" # 指定轮询间隔(以秒为单位),这将导致 paperless 定期检查消费目录中的更改
#PAPERLESS_CONSUMER_INOTIFY_DELAY: "2" # 设置消费者等待 inotify 发出的其他事件的时间(以秒为单位)
# 使用 SQLite 作为数据库(默认) # 使用 SQLite 作为数据库(默认)
PAPERLESS_DBENGINE: sqlite3 PAPERLESS_DBENGINE: sqlite3
@ -34,11 +29,22 @@ services:
# 定义文件命名规则和存储路径 # 定义文件命名规则和存储路径
# 作用不大,主要还是用消费后脚本,以及工作流来指定存储路径。 # 作用不大,主要还是用消费后脚本,以及工作流来指定存储路径。
# 工作流先于消费后脚本运行因此消费后脚本里解析的document_type在工作流里无效。所以使用了文件名关键词匹配 # 工作流先于消费后脚本运行因此消费后脚本里解析的document_type在工作流里无效。所以使用了文件名关键词匹配
PAPERLESS_FILENAME_FORMAT: "{{created}}_{{document_type}}_{{correspondent}}_{{title}}.pdf" PAPERLESS_FILENAME_FORMAT: "{{created}}_{{document_type}}_{{correspondent}}_{{title}}"
# 解析文件里的关键信息并更新。但无法更新 storage path,这个字段要靠工作流才行。 # 解析文件里的关键信息并更新。但无法更新 storage path,这个字段要靠工作流才行。
PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py" PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
# 自动删除重复文件
PAPERLESS_CONSUMER_DELETE_DUPLICATES: true
# 支持消费目录递归检索即子目录。这样可以支持多个宿主机的目录映射到docker中
PAPERLESS_CONSUMER_RECURSIVE: true
PAPERLESS_OCR_LANGUAGES: "" # 设为空并不会跳过OCR,只会用默认的eng来执行
PAPERLESS_OCR_SKIP_ARCHIVE_FILE: "always" # 指定何时跳过创建文档的存档版本(always 表示始终跳过)
PAPERLESS_OCR_OUTPUT_TYPE: "pdf" # 尽量少修改PDF文档
PAPERLESS_CONSUMER_POLLING: "5" # 指定轮询间隔(以秒为单位),这将导致 paperless 定期检查消费目录中的更改
#PAPERLESS_CONSUMER_INOTIFY_DELAY: "2" # 设置消费者等待 inotify 发出的其他事件的时间(以秒为单位)
# 运行用户 # 运行用户
USERMAP_UID: 1000 USERMAP_UID: 1000
USERMAP_GID: 1000 USERMAP_GID: 1000
@ -46,8 +52,9 @@ services:
volumes: volumes:
# 存储所有数据搜索索引、SQLite 数据库、分类模型等)的地方 # 存储所有数据搜索索引、SQLite 数据库、分类模型等)的地方
- ~/dockers/paperless/data:/usr/src/paperless/data - ~/dockers/paperless/data:/usr/src/paperless/data
# 挂载文件导入目录 # 挂载文件导入目录可以把多个宿主机的目录挂到docker中以子目录的形式存在
- ~/dockers/paperless/consume:/usr/src/paperless/consume - ~/dockers/paperless/consume:/usr/src/paperless/consume
- ~/dockers/sharedata/consume:/usr/src/paperless/consume/subdir
# 挂载文件导出目录 # 挂载文件导出目录
- ~/dockers/paperless/export:/usr/src/paperless/export - ~/dockers/paperless/export:/usr/src/paperless/export
# 存储您的文档和缩略图的地方 # 存储您的文档和缩略图的地方

View File

@ -9,7 +9,8 @@ import logging
# Paperless 服务器信息 # Paperless 服务器信息
PAPERLESS_URL = "http://localhost:8000/api" PAPERLESS_URL = "http://localhost:8000/api"
AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth 认证 #AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth 认证 mac上用这个
AUTH = HTTPBasicAuth("admin", "paperless") # Basic Auth 认证NAS上用这个
# 日志配置 # 日志配置
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

View File

@ -0,0 +1,149 @@
#!/bin/bash
# File replacement pairs, stored as a flat array: source file, destination file.
# Add or remove lines as needed; each line holds one "source destination" pair.
FILE_PAIRS=(
"/usr/src/paperless/scripts/parsers.py" "/usr/src/paperless/src/paperless_tesseract/parsers.py"
# Examples of additional pairs:
#"/usr/src/paperless/git_scripts/parse_filename.py" "/usr/src/paperless/scripts/parse_filename.py"
# "/path/to/source/file3" "/path/to/dest/file3"
# "/path/to/source/file4" "/path/to/dest/file4"
)
#######################################
# Verify that the files required by the requested operation are present.
# Globals:   FILE_PAIRS (read) - flat list: source, dest, source, dest, ...
# Arguments: $1 - operation name: "replace", "check" or "rollback"
# Outputs:   one message on stdout per missing file
# Returns:   exits the shell with status 1 when a source or destination is
#            missing for replace/check; a missing backup during rollback is
#            only reported as a warning
#######################################
check_files_exist() {
  local op="$1"
  local total=${#FILE_PAIRS[@]}
  local any_missing=0
  local idx=0
  local src dst

  # Walk the flat list two entries at a time.
  while ((idx < total)); do
    src="${FILE_PAIRS[idx]}"
    dst="${FILE_PAIRS[idx + 1]}"
    case "$op" in
      replace | check)
        if ! [ -f "$src" ]; then
          echo "错误:源文件不存在 - $src"
          any_missing=1
        fi
        if ! [ -f "$dst" ]; then
          echo "错误:目标文件不存在 - $dst"
          any_missing=1
        fi
        ;;
      rollback)
        if ! [ -f "$dst.bak" ]; then
          echo "警告:备份文件不存在(未执行过替换?) - $dst.bak"
          any_missing=1
        fi
        ;;
    esac
    idx=$((idx + 2))
  done

  # Only replace/check treat a missing file as fatal.
  if [ "$op" != "rollback" ] && [ "$any_missing" -eq 1 ]; then
    echo "错误:关键文件缺失,无法继续执行"
    exit 1
  fi
}
#######################################
# Print a unified diff (destination vs. source) for every configured pair.
# Globals:   FILE_PAIRS (read)
# Outputs:   a banner, then one header line and the diff per pair, on stdout
# Returns:   0 even when files differ (diff's status is deliberately ignored)
#######################################
show_diffs() {
  local total=${#FILE_PAIRS[@]}
  local idx=0
  local src dst

  echo "=== 开始检查文件差异 ==="
  while ((idx < total)); do
    src="${FILE_PAIRS[idx]}"
    dst="${FILE_PAIRS[idx + 1]}"
    echo -e "\n--- 检查 $dst <-> $src 的差异 ---"
    # diff exits non-zero when the files differ; that is not an error here.
    diff -u "$dst" "$src" || :
    idx=$((idx + 2))
  done
}
#######################################
# Back up a single file next to itself with a ".bak" suffix, preserving
# permissions, attributes and timestamps (cp -a).
# The copy is staged in a temporary file and only then moved over any
# existing backup, so a failed copy never destroys the previous backup.
# Arguments: $1 - path of the file to back up
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   0 on success, 1 when the file is missing or the copy fails
#######################################
backup_file() {
  local file="$1"
  local backup="$file.bak"
  local staging

  if [ ! -f "$file" ]; then
    echo "错误:待备份文件不存在 - $file" >&2
    return 1
  fi

  staging=$(mktemp "$backup.XXXXXX") || {
    echo "错误:无法创建临时备份文件 - $backup" >&2
    return 1
  }

  if ! cp -a -- "$file" "$staging"; then
    echo "错误:备份失败 - $file" >&2
    rm -f -- "$staging"
    return 1
  fi

  if [ -f "$backup" ]; then
    echo "提示:旧备份文件已存在,将覆盖 - $backup"
  fi

  if ! mv -f -- "$staging" "$backup"; then
    echo "错误:备份失败 - $file" >&2
    rm -f -- "$staging"
    return 1
  fi
  echo "已备份:$file -> $backup"
}
#######################################
# Replace every destination file with its source, backing the destination
# up first, then show the remaining differences (expected: none).
# A destination is only overwritten once a ".bak" copy identical to it is
# confirmed to exist; the first failure aborts the run.
# Globals:   FILE_PAIRS (read)
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   1 on the first backup/copy failure, otherwise the status of
#            show_diffs
#######################################
replace_files() {
  local pair_count=${#FILE_PAIRS[@]}
  local i source dest

  echo "=== 开始替换文件(先备份目标文件) ==="
  for ((i = 0; i < pair_count; i += 2)); do
    source="${FILE_PAIRS[i]}"
    dest="${FILE_PAIRS[i + 1]}"
    echo -e "\n--- 处理文件对:$source -> $dest ---"

    # Never overwrite without a verified backup: check both the helper's
    # status and that the backup really matches the current destination.
    if ! backup_file "$dest" || ! cmp -s -- "$dest" "$dest.bak"; then
      echo "错误:备份失败,已中止替换 - $dest" >&2
      return 1
    fi

    if ! cp -f -- "$source" "$dest"; then
      echo "错误:替换失败 - $source -> $dest" >&2
      return 1
    fi
    echo "已替换:$source 覆盖 $dest"
  done

  echo -e "\n=== 替换完成,验证最终差异(应无差异) ==="
  show_diffs
}
#######################################
# Roll back a previous replace by moving each "<dest>.bak" over "<dest>".
# Pairs without a backup are skipped. A safety copy of the current
# destination is taken first and used to restore it if the move fails.
# Globals:   FILE_PAIRS (read)
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   0 when every existing backup was restored, 1 if any restore
#            failed
#######################################
rollback_files() {
  local pair_count=${#FILE_PAIRS[@]}
  local failed=0
  local i dest backup safety

  echo "=== 开始回滚替换操作 ==="
  for ((i = 0; i < pair_count; i += 2)); do
    dest="${FILE_PAIRS[i + 1]}"
    backup="$dest.bak"
    safety="$dest.rollback_temp"
    echo -e "\n--- 处理回滚:$backup -> $dest ---"

    if [ ! -f "$backup" ]; then
      echo "跳过:备份文件不存在 - $backup"
      continue
    fi

    # Best-effort safety copy; the destination may legitimately be absent.
    cp -a -- "$dest" "$safety" 2>/dev/null || true

    if mv -f -- "$backup" "$dest"; then
      echo "已回滚:$dest 恢复为备份版本"
    else
      echo "错误:回滚失败 - $dest" >&2
      # Put the pre-rollback content back if the failed move damaged it.
      if [ -f "$safety" ] && ! cmp -s -- "$safety" "$dest"; then
        cp -a -- "$safety" "$dest" 2>/dev/null || true
      fi
      failed=1
    fi

    rm -f -- "$safety" 2>/dev/null || true
  done
  echo -e "\n=== 回滚操作执行完成 ==="
  return "$failed"
}
#######################################
# Dispatch on the requested action: print its banner, run the file
# presence check for that action, then run the action itself.
# Arguments: $1 - "check", "replace" or "rollback"
# Outputs:   banner and action output on stdout; usage text for anything else
# Returns:   exits with status 1 after printing usage for an unknown action
#######################################
main() {
  local action="$1"
  local banner worker

  case "$action" in
    check)
      banner="=== 执行文件差异检查(不修改文件) ==="
      worker=show_diffs
      ;;
    replace)
      banner="=== 执行文件替换操作(自动备份) ==="
      worker=replace_files
      ;;
    rollback)
      banner="=== 执行文件回滚操作(恢复备份) ==="
      worker=rollback_files
      ;;
    *)
      printf '%s\n' \
        "用法:$0 [check|replace|rollback]" \
        " check - 仅检查所有文件对的差异,不做修改" \
        " replace - 备份所有目标文件并执行替换,完成后验证差异" \
        " rollback - 回滚替换操作(恢复 .bak 备份文件)"
      exit 1
      ;;
  esac

  echo "$banner"
  check_files_exist "$action"
  "$worker"
}
# Entry point: forward the first command-line argument (the action) to main.
main "$1"

View File

@ -0,0 +1,47 @@
#!/bin/bash
# Move every *.pdf from SRC into DST (the paperless consume directory),
# setting owner, group and mode on the way, and log each result to LOG.
# NOTE(review): `install -o/-g` presumably requires sufficient privileges to
# change ownership — confirm how this script is scheduled/run.
SRC="/volume1/docker/sharedata/stock_data/pdfs"
DST="/volume1/docker/sharedata/stock_data/em_reports_consume"
LOG="/volume1/docker/projects/devops/docker/paperless/plugins/log/paperless.log"
TARGET_UID=1000
TARGET_GID=1000
# Emit a progress line after this many moved files.
PROGRESS_EVERY=100

# Create the log directory first: every message below is appended to LOG,
# including the directory-check errors.
LOG_DIR=$(dirname "$LOG")
if [ ! -d "$LOG_DIR" ]; then
  if ! mkdir -p "$LOG_DIR"; then
    echo "$(date '+%F %T') [ERROR] 无法创建log目录: $LOG_DIR" >&2
    exit 1
  fi
  echo "$(date '+%F %T') [INFO] log目录不存在已创建: $LOG_DIR" | tee -a "$LOG"
fi

# Both directories must already exist.
if [ ! -d "$SRC" ]; then
  echo "$(date '+%F %T') [ERROR] 源目录不存在: $SRC" | tee -a "$LOG"
  exit 1
fi
if [ ! -d "$DST" ]; then
  echo "$(date '+%F %T') [ERROR] 目标目录不存在: $DST" | tee -a "$LOG"
  exit 1
fi

COUNT=0
for f in "$SRC"/*.pdf; do
  # Skips the literal pattern when nothing matches, and non-regular files.
  [ -f "$f" ] || continue
  # "Move" = copy with owner/group/mode applied, then delete the original.
  # The file only counts as moved when both steps succeed.
  if install -D -o "$TARGET_UID" -g "$TARGET_GID" -m 644 "$f" "$DST" \
    && rm -f "$f"; then
    echo "$(date '+%F %T') [OK] Moved: $f" >> "$LOG"
    COUNT=$((COUNT + 1))
    # Every PROGRESS_EVERY files, report progress on screen and in the log.
    if ((COUNT % PROGRESS_EVERY == 0)); then
      PROGRESS_MSG="$(date '+%F %T') [PROGRESS] 已移动 $COUNT 个文件"
      echo "$PROGRESS_MSG" | tee -a "$LOG"
    fi
  else
    echo "$(date '+%F %T') [FAIL] Failed: $f" >> "$LOG"
  fi
done
echo "$(date '+%F %T') [INFO] 搬运完成,共移动 $COUNT 个文件" | tee -a "$LOG"

View File

@ -0,0 +1,472 @@
import os
import re
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from django.conf import settings
from PIL import Image
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from documents.utils import maybe_override_pixel_limit
from documents.utils import run_subprocess
from paperless.config import OcrConfig
from paperless.models import ArchiveFileChoices
from paperless.models import CleanChoices
from paperless.models import ModeChoices
class NoTextFoundException(Exception):
    """Raised by the parser when an OCR run leaves no text to use."""


class RtlLanguageException(Exception):
    """Not raised in the visible code; presumably signals a right-to-left
    language problem — TODO confirm against callers."""
class RasterisedDocumentParser(DocumentParser):
"""
This parser uses Tesseract to try and get some text out of a rasterised
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
"""
logging_name = "paperless.parsing.tesseract"
def get_settings(self) -> OcrConfig:
    """Build and return a fresh ``OcrConfig`` for this parser."""
    ocr_config = OcrConfig()
    return ocr_config
def get_page_count(self, document_path, mime_type):
    """
    Return the number of pages of a PDF, or None.

    None is returned for any mime type other than "application/pdf" and
    when the page count cannot be read (a warning is logged in that case).
    """
    if mime_type != "application/pdf":
        return None

    try:
        import pikepdf

        with pikepdf.Pdf.open(document_path) as opened_pdf:
            return len(opened_pdf.pages)
    except Exception as e:
        self.log.warning(
            f"Unable to determine PDF page count {document_path}: {e}",
        )
    return None
def extract_metadata(self, document_path, mime_type):
    """
    Collect the XMP metadata entries of a PDF.

    Returns a list of dicts with the keys "namespace", "prefix", "key" and
    "value"; the list is empty for any mime type other than
    "application/pdf". Entries whose key does not have the
    ``{namespace}name`` form, or that cannot be encoded as UTF-8, are
    skipped; any other per-entry error is logged as a warning and the entry
    is dropped.
    """
    result = []
    if mime_type == "application/pdf":
        import pikepdf

        namespace_pattern = re.compile(r"\{(.*)\}(.*)")

        # Open the PDF in a context manager so the file handle is released
        # even if reading the metadata raises.
        with pikepdf.open(document_path) as pdf:
            meta = pdf.open_metadata()
            for key, value in meta.items():
                # Multi-valued entries are flattened into one string.
                if isinstance(value, list):
                    value = " ".join([str(e) for e in value])
                value = str(value)
                try:
                    m = namespace_pattern.match(key)
                    if m is None:  # pragma: no cover
                        continue
                    namespace = m.group(1)
                    key_value = m.group(2)
                    try:
                        namespace.encode("utf-8")
                        key_value.encode("utf-8")
                    except UnicodeEncodeError as e:  # pragma: no cover
                        self.log.debug(f"Skipping metadata key {key}: {e}")
                        continue
                    result.append(
                        {
                            "namespace": namespace,
                            "prefix": meta.REVERSE_NS[namespace],
                            "key": key_value,
                            "value": value,
                        },
                    )
                except Exception as e:
                    self.log.warning(
                        f"Error while reading metadata {key}: {value}. Error: {e}",
                    )
    return result
def get_thumbnail(self, document_path, mime_type, file_name=None):
    """
    Render a thumbnail via ``make_thumbnail_from_pdf``, using the archive
    file when ``self.archive_path`` is set and the original document
    otherwise.
    """
    thumbnail_source = self.archive_path or document_path
    return make_thumbnail_from_pdf(
        thumbnail_source,
        self.tempdir,
        self.logging_group,
    )
def is_image(self, mime_type) -> bool:
    """Tell whether ``mime_type`` is one of the image types listed below."""
    image_mime_types = (
        "image/png",
        "image/jpeg",
        "image/tiff",
        "image/bmp",
        "image/gif",
        "image/webp",
        "image/heic",
    )
    return mime_type in image_mime_types
def has_alpha(self, image) -> bool:
    """Return True when the opened image's mode is "RGBA" or "LA"."""
    with Image.open(image) as opened:
        mode = opened.mode
    return mode in ("RGBA", "LA")
def remove_alpha(self, image_path: str) -> Path:
    """
    Run the configured convert binary with ``-alpha off`` on ``image_path``
    and return the path of the result, "image-no-alpha" inside the
    parser's temporary directory.
    """
    flattened = Path(self.tempdir) / "image-no-alpha"
    command = [
        settings.CONVERT_BINARY,
        "-alpha",
        "off",
        image_path,
        flattened,
    ]
    run_subprocess(command, logger=self.log)
    return flattened
def get_dpi(self, image) -> int | None:
try:
with Image.open(image) as im:
x, _ = im.info["dpi"]
return round(x)
except Exception as e:
self.log.warning(f"Error while getting DPI from image {image}: {e}")
return None
def calculate_a4_dpi(self, image) -> int | None:
try:
with Image.open(image) as im:
width, _ = im.size
# divide image width by A4 width (210mm) in inches.
dpi = int(width / (21 / 2.54))
self.log.debug(f"Estimated DPI {dpi} based on image width {width}")
return dpi
except Exception as e:
self.log.warning(f"Error while calculating DPI for image {image}: {e}")
return None
# Return the text of a document, or None.
# The sidecar file is tried first; if it is unusable, `pdftotext` is run on
# `pdf_file` and its output is used instead. Both results go through
# post_process_text. None is returned when `pdf_file` is not a file or when
# the pdftotext step raises.
# NOTE(review): indentation is not visible in this view, so the nesting of the
# statements below cannot be confirmed from here — check the real file.
def extract_text(
self,
sidecar_file: Path | None,
pdf_file: Path,
) -> str | None:
# When re-doing OCR, the sidecar contains ONLY the new text, not
# the whole text, so do not utilize it in that case
if (
sidecar_file is not None
and sidecar_file.is_file()
and self.settings.mode != "redo"
):
text = self.read_file_handle_unicode_errors(sidecar_file)
# The "[OCR skipped on page" marker means the sidecar is incomplete.
if "[OCR skipped on page" not in text:
# This happens when there's already text in the input file.
# The sidecar file will only contain text for OCR'ed pages.
self.log.debug("Using text from sidecar file")
return post_process_text(text)
else:
self.log.debug("Incomplete sidecar file: discarding.")
# no success with the sidecar file, try PDF
if not Path(pdf_file).is_file():
return None
try:
text = None
# pdftotext writes into a temporary file created inside self.tempdir.
with tempfile.NamedTemporaryFile(
mode="w+",
dir=self.tempdir,
) as tmp:
run_subprocess(
[
"pdftotext",
"-q",
"-layout",
"-enc",
"UTF-8",
pdf_file,
tmp.name,
],
logger=self.log,
)
text = self.read_file_handle_unicode_errors(Path(tmp.name))
return post_process_text(text)
except Exception:
# If pdftotext fails, fall back to OCR.
self.log.warning(
"Error while getting text from PDF document with pdftotext",
exc_info=True,
)
# probably not a PDF file.
return None
# Build and return the keyword-argument dict that parse() passes to
# ocrmypdf.ocr(**args), derived from self.settings and the input file.
# `safe_fallback=True` forces "force_ocr" regardless of the configured mode.
# Raises ParseError for an unknown OCR mode, or when no DPI can be determined
# for an image input.
# NOTE(review): indentation is not visible in this view, so the nesting of the
# statements below cannot be confirmed from here — check the real file.
def construct_ocrmypdf_parameters(
self,
input_file,
mime_type,
output_file,
sidecar_file,
*,
safe_fallback=False,
):
if TYPE_CHECKING:
assert isinstance(self.settings, OcrConfig)
# Baseline arguments that are always passed.
ocrmypdf_args = {
"input_file": input_file,
"output_file": output_file,
# need to use threads, since this will be run in daemonized
# processes via the task library.
"use_threads": True,
"jobs": settings.THREADS_PER_WORKER,
"language": self.settings.language,
"output_type": self.settings.output_type,
"progress_bar": False,
}
if "pdfa" in ocrmypdf_args["output_type"]:
ocrmypdf_args["color_conversion_strategy"] = (
self.settings.color_conversion_strategy
)
# Translate the configured OCR mode into exactly one ocrmypdf flag.
if self.settings.mode == ModeChoices.FORCE or safe_fallback:
ocrmypdf_args["force_ocr"] = True
elif self.settings.mode in {
ModeChoices.SKIP,
ModeChoices.SKIP_NO_ARCHIVE,
}:
ocrmypdf_args["skip_text"] = True
elif self.settings.mode == ModeChoices.REDO:
ocrmypdf_args["redo_ocr"] = True
else: # pragma: no cover
raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
if self.settings.clean == CleanChoices.CLEAN:
ocrmypdf_args["clean"] = True
elif self.settings.clean == CleanChoices.FINAL:
if self.settings.mode == ModeChoices.REDO:
ocrmypdf_args["clean"] = True
else:
# --clean-final is not compatible with --redo-ocr
ocrmypdf_args["clean_final"] = True
if self.settings.deskew and self.settings.mode != ModeChoices.REDO:
# --deskew is not compatible with --redo-ocr
ocrmypdf_args["deskew"] = True
if self.settings.rotate:
ocrmypdf_args["rotate_pages"] = True
ocrmypdf_args["rotate_pages_threshold"] = self.settings.rotate_threshold
if self.settings.pages is not None and self.settings.pages > 0:
ocrmypdf_args["pages"] = f"1-{self.settings.pages}"
else:
# sidecar is incompatible with pages
ocrmypdf_args["sidecar"] = sidecar_file
if self.is_image(mime_type):
# This may be required, depending on the known information
maybe_override_pixel_limit()
dpi = self.get_dpi(input_file)
a4_dpi = self.calculate_a4_dpi(input_file)
if self.has_alpha(input_file):
self.log.info(
f"Removing alpha layer from {input_file} "
"for compatibility with img2pdf",
)
# Replace the input file with the non-alpha
ocrmypdf_args["input_file"] = self.remove_alpha(input_file)
# DPI precedence: value read from the image, then the configured
# image_dpi setting, then the A4-width estimate.
if dpi:
self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
ocrmypdf_args["image_dpi"] = dpi
elif self.settings.image_dpi is not None:
ocrmypdf_args["image_dpi"] = self.settings.image_dpi
elif a4_dpi:
ocrmypdf_args["image_dpi"] = a4_dpi
else:
raise ParseError(
f"Cannot produce archive PDF for image {input_file}, "
f"no DPI information is present in this image and "
f"OCR_IMAGE_DPI is not set.",
)
if ocrmypdf_args["image_dpi"] < 70: # pragma: no cover
self.log.warning(
f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail",
)
# User-supplied arguments are merged last, so they override the keys above.
if self.settings.user_args is not None:
try:
ocrmypdf_args = {**ocrmypdf_args, **self.settings.user_args}
except Exception as e:
self.log.warning(
f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
f"they will not be used. Error: {e}",
)
if (
self.settings.max_image_pixel is not None
and self.settings.max_image_pixel >= 0
):
# Convert pixels to mega-pixels and provide to ocrmypdf
max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0
msg = (
"OCR pixel limit is disabled!"
if max_pixels_mpixels == 0
else f"Calculated {max_pixels_mpixels} megapixels for OCR"
)
self.log.debug(msg)
ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
return ocrmypdf_args
def parse(self, document_path: Path, mime_type, file_name=None):
    """
    Extract the text of a document into ``self.text`` and, unless archive
    creation is skipped, set ``self.archive_path`` to the OCRed PDF.

    Steps:
      1. For PDFs, read any text already present with ``extract_text``.
      2. If that text is longer than 50 characters and the settings say to
         skip the archive for documents with text, use it and return.
      3. Otherwise run OCRmyPDF; on "no text"/input errors retry once with
         forced OCR (text only, the fallback archive is not kept).
      4. If there is still no text, fall back to the original text, or to
         an empty string.

    Raises:
        ParseError: for OCRmyPDF subprocess errors, a failing fallback run,
            or any other unexpected exception.
    """
    # This forces tesseract to use one core per page.
    os.environ["OMP_THREAD_LIMIT"] = "1"
    VALID_TEXT_LENGTH = 50

    if mime_type == "application/pdf":
        text_original = self.extract_text(None, document_path)
        original_has_text = (
            text_original is not None and len(text_original) > VALID_TEXT_LENGTH
        )
    else:
        text_original = None
        original_has_text = False

    # If the original has text, and the user doesn't want an archive,
    # we're done here
    skip_archive_for_text = (
        self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
        or self.settings.skip_archive_file
        in {
            ArchiveFileChoices.WITH_TEXT,
            ArchiveFileChoices.ALWAYS,
        }
    )
    if skip_archive_for_text and original_has_text:
        # Log only the size: the full document text does not belong in the
        # log (it bloats it and exposes document contents).
        self.log.debug(
            "Document has text, skipping OCRmyPDF entirely. "
            f"({len(text_original)} characters extracted)",
        )
        self.text = text_original
        return

    # Either no text was in the original or there should be an archive
    # file created, so OCR the file and create an archive with any
    # text located via OCR

    import ocrmypdf
    from ocrmypdf import EncryptedPdfError
    from ocrmypdf import InputFileError
    from ocrmypdf import SubprocessOutputError
    from ocrmypdf.exceptions import DigitalSignatureError

    archive_path = Path(self.tempdir) / "archive.pdf"
    sidecar_file = Path(self.tempdir) / "sidecar.txt"

    args = self.construct_ocrmypdf_parameters(
        document_path,
        mime_type,
        archive_path,
        sidecar_file,
    )

    try:
        self.log.debug(f"Calling OCRmyPDF with args: {args}")
        ocrmypdf.ocr(**args)

        if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
            self.archive_path = archive_path

        self.text = self.extract_text(sidecar_file, archive_path)

        if not self.text:
            raise NoTextFoundException("No text was found in the original document")
    except (DigitalSignatureError, EncryptedPdfError):
        self.log.warning(
            "This file is encrypted and/or signed, OCR is impossible. Using "
            "any text present in the original file.",
        )
        if original_has_text:
            self.text = text_original
    except SubprocessOutputError as e:
        if "Ghostscript PDF/A rendering" in str(e):
            self.log.warning(
                "Ghostscript PDF/A rendering failed, consider setting "
                "PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
            )

        raise ParseError(
            f"SubprocessOutputError: {e!s}. See logs for more information.",
        ) from e
    except (NoTextFoundException, InputFileError) as e:
        self.log.warning(
            f"Encountered an error while running OCR: {e!s}. "
            f"Attempting force OCR to get the text.",
        )

        archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf"
        sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt"

        # Attempt to run OCR with safe settings.

        args = self.construct_ocrmypdf_parameters(
            document_path,
            mime_type,
            archive_path_fallback,
            sidecar_file_fallback,
            safe_fallback=True,
        )

        try:
            self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
            ocrmypdf.ocr(**args)

            # Don't return the archived file here, since this file
            # is bigger and blurry due to --force-ocr.

            self.text = self.extract_text(
                sidecar_file_fallback,
                archive_path_fallback,
            )

        except Exception as e:
            # If this fails, we have a serious issue at hand.
            raise ParseError(f"{e.__class__.__name__}: {e!s}") from e

    except Exception as e:
        # Anything else is probably serious.
        raise ParseError(f"{e.__class__.__name__}: {e!s}") from e

    # As a last resort, if we still don't have any text for any reason,
    # try to extract the text from the original document.
    if not self.text:
        if original_has_text:
            self.text = text_original
        else:
            self.log.warning(
                f"No text was found in {document_path}, the content will be empty.",
            )
            self.text = ""
def post_process_text(text):
    """
    Normalise whitespace in extracted text.

    Returns None for empty or missing input. Otherwise runs of non-newline
    whitespace are collapsed to one space, whitespace directly after a line
    break is removed, whitespace at the end of the text is removed, the
    result is stripped and NUL characters are replaced by spaces.
    """
    if not text:
        return None

    substitutions = (
        # collapse runs of spaces/tabs (anything but line breaks)
        (r"([^\S\r\n]+)", " "),
        # drop whitespace that follows a line break
        (r"([\n\r]+)([^\S\n\r]+)", "\\1"),
        # drop whitespace at the end of the text
        (r"([^\S\n\r]+)$", ""),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # TODO: this needs a rework
    # replace \0 prevents issues with saving to postgres.
    # text may contain \0 when this character is present in PDF files.
    return cleaned.strip().replace("\0", " ")

View File

@ -1,41 +0,0 @@
-- documents_correspondent definition
CREATE TABLE "documents_correspondent" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "name" varchar(128) NOT NULL, "match" varchar(256) NOT NULL, "matching_algorithm" integer unsigned NOT NULL CHECK ("matching_algorithm" >= 0), "is_insensitive" bool NOT NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, CONSTRAINT "documents_correspondent_unique_name_owner" UNIQUE ("name", "owner_id"));
CREATE UNIQUE INDEX "documents_correspondent_name_uniq" ON "documents_correspondent" ("name") WHERE "owner_id" IS NULL;
CREATE INDEX "documents_correspondent_owner_id_078f7f8a" ON "documents_correspondent" ("owner_id");
-- documents_customfield definition
CREATE TABLE "documents_customfield" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "created" datetime NOT NULL, "name" varchar(128) NOT NULL, "data_type" varchar(50) NOT NULL, "extra_data" text NULL CHECK ((JSON_VALID("extra_data") OR "extra_data" IS NULL)), CONSTRAINT "documents_customfield_unique_name" UNIQUE ("name"));
CREATE INDEX "documents_customfield_created_501ef047" ON "documents_customfield" ("created");
-- documents_customfieldinstance definition
CREATE TABLE "documents_customfieldinstance" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "created" datetime NOT NULL, "value_text" varchar(128) NULL, "value_bool" bool NULL, "value_url" varchar(200) NULL, "value_date" date NULL, "value_int" integer NULL, "value_float" real NULL, "value_monetary" varchar(128) NULL, "document_id" integer NOT NULL REFERENCES "documents_document" ("id") DEFERRABLE INITIALLY DEFERRED, "field_id" integer NOT NULL REFERENCES "documents_customfield" ("id") DEFERRABLE INITIALLY DEFERRED, "value_document_ids" text NULL CHECK ((JSON_VALID("value_document_ids") OR "value_document_ids" IS NULL)), "value_monetary_amount" decimal GENERATED ALWAYS AS ((CAST(CASE WHEN "value_monetary" REGEXP '^\d+' THEN CAST(SUBSTR("value_monetary", 1) AS decimal) ELSE CAST(SUBSTR("value_monetary", 4) AS decimal) END AS NUMERIC))) STORED, "deleted_at" datetime NULL, "restored_at" datetime NULL, "transaction_id" char(32) NULL, "value_select" varchar(16) NULL, CONSTRAINT "documents_customfieldinstance_unique_document_field" UNIQUE ("document_id", "field_id"));
CREATE INDEX "documents_customfieldinstance_created_75f17f1d" ON "documents_customfieldinstance" ("created");
CREATE INDEX "documents_customfieldinstance_document_id_610a968e" ON "documents_customfieldinstance" ("document_id");
CREATE INDEX "documents_customfieldinstance_field_id_6c59e32f" ON "documents_customfieldinstance" ("field_id");
-- documents_document definition
CREATE TABLE "documents_document" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "title" varchar(128) NOT NULL, "content" text NOT NULL, "created" datetime NOT NULL, "modified" datetime NOT NULL, "correspondent_id" integer NULL REFERENCES "documents_correspondent" ("id") DEFERRABLE INITIALLY DEFERRED, "checksum" varchar(32) NOT NULL UNIQUE, "added" datetime NOT NULL, "storage_type" varchar(11) NOT NULL, "filename" varchar(1024) NULL UNIQUE, "archive_serial_number" integer unsigned NULL UNIQUE CHECK ("archive_serial_number" >= 0), "document_type_id" integer NULL REFERENCES "documents_documenttype" ("id") DEFERRABLE INITIALLY DEFERRED, "mime_type" varchar(256) NOT NULL, "archive_checksum" varchar(32) NULL, "archive_filename" varchar(1024) NULL UNIQUE, "storage_path_id" integer NULL REFERENCES "documents_storagepath" ("id") DEFERRABLE INITIALLY DEFERRED, "original_filename" varchar(1024) NULL, "deleted_at" datetime NULL, "restored_at" datetime NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, "transaction_id" char(32) NULL, "page_count" integer unsigned NULL CHECK ("page_count" >= 0));
CREATE INDEX "documents_document_title_6b08e02a" ON "documents_document" ("title");
CREATE INDEX "documents_document_created_bedd0818" ON "documents_document" ("created");
CREATE INDEX "documents_document_modified_2eae15bc" ON "documents_document" ("modified");
CREATE INDEX "documents_document_correspondent_id_6164eb0c" ON "documents_document" ("correspondent_id");
CREATE INDEX "documents_document_added_28cfa360" ON "documents_document" ("added");
CREATE INDEX "documents_document_document_type_id_1f88b50c" ON "documents_document" ("document_type_id");
CREATE INDEX "documents_document_storage_path_id_07d27bdb" ON "documents_document" ("storage_path_id");
CREATE INDEX "documents_document_owner_id_04d2b723" ON "documents_document" ("owner_id");
-- documents_documenttype definition
CREATE TABLE "documents_documenttype" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "name" varchar(128) NOT NULL, "match" varchar(256) NOT NULL, "matching_algorithm" integer unsigned NOT NULL CHECK ("matching_algorithm" >= 0), "is_insensitive" bool NOT NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, CONSTRAINT "documents_documenttype_unique_name_owner" UNIQUE ("name", "owner_id"));
CREATE UNIQUE INDEX "documents_documenttype_name_uniq" ON "documents_documenttype" ("name") WHERE "owner_id" IS NULL;
CREATE INDEX "documents_documenttype_owner_id_a19f201d" ON "documents_documenttype" ("owner_id");

View File

@ -1,63 +0,0 @@
我提供的文件,是 paperless 的SQLite数据库的关键表。现在我们编写它的 PAPERLESS_POST_CONSUME_SCRIPT。需求如下
1, 我们提供的pdf文件格式为 {publish_date}_{report_type}_{org_sname}_{industry_name}_{stock_name}_{title}.pdf
2我们提取上面的各个字段然后
1 report_type 对应到 documents_documenttype.name 所以我们要查询 documents_documenttype 表如果对应的name不存在则插入一条记录然后得到对应的 documents_documenttype.id
2 org_sname 对应到 documents_correspondent.name 所以我们要查询 documents_correspondent 表如果对应的name 不存在,则插入一条记录,然后得到对应的 documents_correspondent.id
3 检查 documents_customfield 表是否包含 '行业' 和 '股票名称' 字段,如果不存在,则创建; 查到他们分别对应的 documents_customfield.id , 记为 stockname_id, industry_id
3我们开始更新数据表
1) 更新 documents_document 表对应的记录: created = publish_date, correspondent_id = documents_correspondent.id, document_type_id = documents_documenttype.id, title = {title}
2) 向 documents_customfieldinstance 表插入两条记录,分别为 (document_id, stockname_id, stock_name) 和 (document_id, industry_id, industry_name)
好了请你根据以上需求完成这个python脚本。注意异常情况的处理以及日志输出。如果文件名无法匹配以上的格式则忽略不用处理。
Paperless makes use of the Django REST Framework standard API interface. It provides a browsable API for most of its endpoints, which you can inspect at http://<paperless-host>:<port>/api/. This also documents most of the available filters and ordering fields.
The API provides the following main endpoints:
/api/correspondents/: Full CRUD support.
/api/custom_fields/: Full CRUD support.
/api/documents/: Full CRUD support, except POSTing new documents. See below.
/api/document_types/: Full CRUD support.
/api/groups/: Full CRUD support.
/api/logs/: Read-Only.
/api/mail_accounts/: Full CRUD support.
/api/mail_rules/: Full CRUD support.
/api/profile/: GET, PATCH
/api/share_links/: Full CRUD support.
/api/storage_paths/: Full CRUD support.
/api/tags/: Full CRUD support.
/api/tasks/: Read-only.
/api/users/: Full CRUD support.
/api/workflows/: Full CRUD support.
/api/search/ GET, see below.
All of these endpoints except for the logging endpoint allow you to fetch (and edit and delete where appropriate) individual objects by appending their primary key to the path, e.g. /api/documents/454/.
The objects served by the document endpoint contain the following fields:
id: ID of the document. Read-only.
title: Title of the document.
content: Plain text content of the document.
tags: List of IDs of tags assigned to this document, or empty list.
document_type: Document type of this document, or null.
correspondent: Correspondent of this document or null.
created: The date time at which this document was created.
created_date: The date (YYYY-MM-DD) at which this document was created. Optional. If also passed with created, this is ignored.
modified: The date at which this document was last edited in paperless. Read-only.
added: The date at which this document was added to paperless. Read-only.
archive_serial_number: The identifier of this document in a physical document archive.
original_file_name: Verbose filename of the original document. Read-only.
archived_file_name: Verbose filename of the archived document. Read-only. Null if no archived document is available.
notes: Array of notes associated with the document.
page_count: Number of pages.
set_permissions: Allows setting document permissions. Optional, write-only. See below.
custom_fields: Array of custom fields & values, specified as { field: CUSTOM_FIELD_ID, value: VALUE }
以上是paperless提供的api。我们现在使用 http://localhost:8000 来访问它。那么我想对编号为19的文档进行查询以及更新操作应该如何写对应的python代码

View File

@ -11,7 +11,8 @@ from requests.exceptions import RequestException
# Paperless 服务器信息 # Paperless 服务器信息
PAPERLESS_URL = "http://localhost:8000/api" PAPERLESS_URL = "http://localhost:8000/api"
AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth 认证 #AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth 认证 mac上用这个
AUTH = HTTPBasicAuth("admin", "paperless") # Basic Auth 认证NAS上用这个
# 日志配置 # 日志配置
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
@ -22,7 +23,7 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(me
DB_PATH = "/usr/src/paperless/data/db.sqlite3" DB_PATH = "/usr/src/paperless/data/db.sqlite3"
conn = sqlite3.connect(DB_PATH) conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor() cursor = conn.cursor()
enable_db = True enable_db = False # 标准用法用API
# 正则解析文件名 # 正则解析文件名
FILENAME_PATTERN = re.compile(r"(\d{4}-\d{2}-\d{2})_(.*?)_(.*?)_(.*?)_(.*?)_(.*)\.pdf") FILENAME_PATTERN = re.compile(r"(\d{4}-\d{2}-\d{2})_(.*?)_(.*?)_(.*?)_(.*?)_(.*)\.pdf")

View File

@ -0,0 +1,484 @@
import os
import re
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from django.conf import settings
from PIL import Image
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from documents.utils import maybe_override_pixel_limit
from documents.utils import run_subprocess
from paperless.config import OcrConfig
from paperless.models import ArchiveFileChoices
from paperless.models import CleanChoices
from paperless.models import ModeChoices
class NoTextFoundException(Exception):
pass
class RtlLanguageException(Exception):
pass
class RasterisedDocumentParser(DocumentParser):
"""
This parser uses Tesseract to try and get some text out of a rasterised
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
"""
logging_name = "paperless.parsing.tesseract"
def get_settings(self) -> OcrConfig:
"""
This parser uses the OCR configuration settings to parse documents
"""
return OcrConfig()
def get_page_count(self, document_path, mime_type):
page_count = None
if mime_type == "application/pdf":
try:
import pikepdf
with pikepdf.Pdf.open(document_path) as pdf:
page_count = len(pdf.pages)
except Exception as e:
self.log.warning(
f"Unable to determine PDF page count {document_path}: {e}",
)
return page_count
def extract_metadata(self, document_path, mime_type):
    """
    Read the metadata entries of a PDF document.

    Args:
        document_path: Path of the document to read.
        mime_type: MIME type of the document; anything other than
            "application/pdf" yields an empty list.

    Returns:
        A list of dicts with the keys ``namespace``, ``prefix``, ``key`` and
        ``value``, one per metadata entry whose key has the form
        ``{namespace}name``. Entries that cannot be processed are logged and
        skipped.
    """
    result = []
    if mime_type != "application/pdf":
        return result

    import pikepdf

    namespace_pattern = re.compile(r"\{(.*)\}(.*)")

    # Open through a context manager so the PDF handle is released even when
    # reading the metadata raises; previously the handle was never closed.
    with pikepdf.open(document_path) as pdf:
        meta = pdf.open_metadata()
        for key, value in meta.items():
            if isinstance(value, list):
                value = " ".join([str(e) for e in value])
            value = str(value)
            try:
                m = namespace_pattern.match(key)
                if m is None:  # pragma: no cover
                    continue
                namespace = m.group(1)
                key_value = m.group(2)
                try:
                    # Only keep entries that can be encoded as UTF-8.
                    namespace.encode("utf-8")
                    key_value.encode("utf-8")
                except UnicodeEncodeError as e:  # pragma: no cover
                    self.log.debug(f"Skipping metadata key {key}: {e}")
                    continue
                result.append(
                    {
                        "namespace": namespace,
                        "prefix": meta.REVERSE_NS[namespace],
                        "key": key_value,
                        "value": value,
                    },
                )
            except Exception as e:
                self.log.warning(
                    f"Error while reading metadata {key}: {value}. Error: {e}",
                )
    return result
def get_thumbnail(self, document_path, mime_type, file_name=None):
    """
    Create a thumbnail for the document.

    The archive version is used when ``self.archive_path`` is set, otherwise
    the original at ``document_path``. ``mime_type`` and ``file_name`` are
    accepted but not used here.
    """
    source_pdf = self.archive_path or document_path
    return make_thumbnail_from_pdf(source_pdf, self.tempdir, self.logging_group)
def is_image(self, mime_type) -> bool:
    """Tell whether ``mime_type`` is one of the image types this parser handles."""
    image_mime_types = (
        "image/png",
        "image/jpeg",
        "image/tiff",
        "image/bmp",
        "image/gif",
        "image/webp",
        "image/heic",
    )
    return mime_type in image_mime_types
def has_alpha(self, image) -> bool:
    """Report whether the image's mode is "RGBA" or "LA"."""
    alpha_modes = ("RGBA", "LA")
    with Image.open(image) as opened:
        mode = opened.mode
    return mode in alpha_modes
def remove_alpha(self, image_path: str) -> Path:
    """
    Write a copy of the image with its alpha channel switched off.

    Runs the configured convert binary with ``-alpha off`` and stores the
    output as "image-no-alpha" inside the parser's temporary directory.

    Returns:
        Path of the converted image.
    """
    target = Path(self.tempdir) / "image-no-alpha"
    command = [settings.CONVERT_BINARY, "-alpha", "off", image_path, target]
    run_subprocess(command, logger=self.log)
    return target
def get_dpi(self, image) -> int | None:
    """
    Read the horizontal DPI stored in the image's ``info["dpi"]`` entry.

    Returns:
        The rounded DPI, or None when it cannot be read (a warning is logged).
    """
    try:
        with Image.open(image) as opened:
            horizontal, _vertical = opened.info["dpi"]
            return round(horizontal)
    except Exception as error:
        self.log.warning(f"Error while getting DPI from image {image}: {error}")
    return None
def calculate_a4_dpi(self, image) -> int | None:
    """
    Estimate a DPI by assuming the image is as wide as an A4 page.

    Returns:
        The estimated DPI, or None when the image cannot be read (a warning
        is logged).
    """
    # A4 is 210mm (21cm) wide; dividing by 2.54 converts that to inches.
    a4_width_inches = 21 / 2.54
    try:
        with Image.open(image) as opened:
            width, _height = opened.size
            estimated = int(width / a4_width_inches)
            self.log.debug(f"Estimated DPI {estimated} based on image width {width}")
            return estimated
    except Exception as error:
        self.log.warning(f"Error while calculating DPI for image {image}: {error}")
    return None
def extract_text(
    self,
    sidecar_file: Path | None,
    pdf_file: Path,
) -> str | None:
    """
    Obtain the document text from the OCR sidecar or, failing that, the PDF.

    Args:
        sidecar_file: Sidecar text file written during OCR, or None.
        pdf_file: PDF to run ``pdftotext`` on when the sidecar is not usable.

    Returns:
        The post-processed text, or None when ``pdf_file`` is not a file or
        ``pdftotext`` fails.
    """
    # In "redo" mode the sidecar holds only the newly recognised text, not
    # the whole document, so it must not be used.
    sidecar_usable = (
        sidecar_file is not None
        and sidecar_file.is_file()
        and self.settings.mode != "redo"
    )
    if sidecar_usable:
        sidecar_text = self.read_file_handle_unicode_errors(sidecar_file)
        if "[OCR skipped on page" in sidecar_text:
            # Pages that already had text were skipped, so the sidecar only
            # covers the OCR'ed pages.
            self.log.debug("Incomplete sidecar file: discarding.")
        else:
            self.log.debug("Using text from sidecar file")
            return post_process_text(sidecar_text)

    # The sidecar did not help; try the PDF itself.
    if not Path(pdf_file).is_file():
        return None

    try:
        with tempfile.NamedTemporaryFile(mode="w+", dir=self.tempdir) as tmp:
            pdftotext_command = [
                "pdftotext",
                "-q",
                "-layout",
                "-enc",
                "UTF-8",
                pdf_file,
                tmp.name,
            ]
            run_subprocess(pdftotext_command, logger=self.log)
            extracted = self.read_file_handle_unicode_errors(Path(tmp.name))
        return post_process_text(extracted)
    except Exception:
        # pdftotext failed - the input is probably not a PDF; the caller
        # falls back to OCR.
        self.log.warning(
            "Error while getting text from PDF document with pdftotext",
            exc_info=True,
        )
    return None
def construct_ocrmypdf_parameters(
    self,
    input_file,
    mime_type,
    output_file,
    sidecar_file,
    *,
    safe_fallback=False,
):
    """
    Build the keyword arguments for an ``ocrmypdf.ocr`` call.

    Args:
        input_file: Document to process.
        mime_type: MIME type of the document; image types get extra DPI and
            alpha-channel handling.
        output_file: Where the resulting PDF should be written.
        sidecar_file: Where the sidecar text should be written (only used
            when no page limit is configured).
        safe_fallback: When true, ``force_ocr`` is set regardless of the
            configured mode.

    Returns:
        Dict of keyword arguments.

    Raises:
        ParseError: for an unknown OCR mode, or for an image without any
            usable DPI information.
    """
    if TYPE_CHECKING:
        assert isinstance(self.settings, OcrConfig)
    ocrmypdf_args = {
        "input_file": input_file,
        "output_file": output_file,
        # need to use threads, since this will be run in daemonized
        # processes via the task library.
        "use_threads": True,
        "jobs": settings.THREADS_PER_WORKER,
        "language": self.settings.language,
        "output_type": self.settings.output_type,
        "progress_bar": False,
    }
    if "pdfa" in ocrmypdf_args["output_type"]:
        ocrmypdf_args["color_conversion_strategy"] = (
            self.settings.color_conversion_strategy
        )
    # Translate the configured OCR mode into the matching ocrmypdf switch.
    if self.settings.mode == ModeChoices.FORCE or safe_fallback:
        ocrmypdf_args["force_ocr"] = True
    elif self.settings.mode in {
        ModeChoices.SKIP,
        ModeChoices.SKIP_NO_ARCHIVE,
    }:
        ocrmypdf_args["skip_text"] = True
    elif self.settings.mode == ModeChoices.REDO:
        ocrmypdf_args["redo_ocr"] = True
    else:  # pragma: no cover
        raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
    if self.settings.clean == CleanChoices.CLEAN:
        ocrmypdf_args["clean"] = True
    elif self.settings.clean == CleanChoices.FINAL:
        if self.settings.mode == ModeChoices.REDO:
            ocrmypdf_args["clean"] = True
        else:
            # --clean-final is not compatible with --redo-ocr
            ocrmypdf_args["clean_final"] = True
    if self.settings.deskew and self.settings.mode != ModeChoices.REDO:
        # --deskew is not compatible with --redo-ocr
        ocrmypdf_args["deskew"] = True
    if self.settings.rotate:
        ocrmypdf_args["rotate_pages"] = True
        ocrmypdf_args["rotate_pages_threshold"] = self.settings.rotate_threshold
    if self.settings.pages is not None and self.settings.pages > 0:
        ocrmypdf_args["pages"] = f"1-{self.settings.pages}"
    else:
        # sidecar is incompatible with pages
        ocrmypdf_args["sidecar"] = sidecar_file
    if self.is_image(mime_type):
        # This may be required, depending on the known information
        maybe_override_pixel_limit()
        dpi = self.get_dpi(input_file)
        a4_dpi = self.calculate_a4_dpi(input_file)
        if self.has_alpha(input_file):
            self.log.info(
                f"Removing alpha layer from {input_file} "
                "for compatibility with img2pdf",
            )
            # Replace the input file with the non-alpha
            ocrmypdf_args["input_file"] = self.remove_alpha(input_file)
        # DPI precedence: value stored in the image, then the configured
        # image_dpi setting, then the A4-width estimate.
        if dpi:
            self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
            ocrmypdf_args["image_dpi"] = dpi
        elif self.settings.image_dpi is not None:
            ocrmypdf_args["image_dpi"] = self.settings.image_dpi
        elif a4_dpi:
            ocrmypdf_args["image_dpi"] = a4_dpi
        else:
            raise ParseError(
                f"Cannot produce archive PDF for image {input_file}, "
                f"no DPI information is present in this image and "
                f"OCR_IMAGE_DPI is not set.",
            )
        if ocrmypdf_args["image_dpi"] < 70:  # pragma: no cover
            self.log.warning(
                f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail",
            )
    # User-supplied arguments are merged last and therefore override
    # anything computed above.
    if self.settings.user_args is not None:
        try:
            ocrmypdf_args = {**ocrmypdf_args, **self.settings.user_args}
        except Exception as e:
            self.log.warning(
                f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                f"they will not be used. Error: {e}",
            )
    if (
        self.settings.max_image_pixel is not None
        and self.settings.max_image_pixel >= 0
    ):
        # Convert pixels to mega-pixels and provide to ocrmypdf
        max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0
        msg = (
            "OCR pixel limit is disabled!"
            if max_pixels_mpixels == 0
            else f"Calculated {max_pixels_mpixels} megapixels for OCR"
        )
        self.log.debug(msg)
        ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
    return ocrmypdf_args
def parse(self, document_path: Path, mime_type, file_name=None):
    """
    Parse a document. In this patched version the method stores a fixed
    placeholder in ``self.text`` and returns without reading the file.

    NOTE(review): everything after the first ``return`` is unreachable,
    including the "force skip ocr" block further down. The placeholder
    string contains a typo ("defautl") that ends up as the stored document
    content - confirm whether that is intended before changing it, since
    other tooling may match on the exact string.

    Args:
        document_path: Path of the document to parse.
        mime_type: MIME type of the document.
        file_name: Not used by this method.
    """
    # This forces tesseract to use one core per page.
    os.environ["OMP_THREAD_LIMIT"] = "1"
    VALID_TEXT_LENGTH = 50
    # skip ocr process entirely to save time.
    self.text = "defautl text"
    self.log.debug("skipping reading file entirely.")
    return
    # ---- Unreachable from here on because of the unconditional return above ----
    if mime_type == "application/pdf":
        text_original = self.extract_text(None, document_path)
        original_has_text = (
            text_original is not None and len(text_original) > VALID_TEXT_LENGTH
        )
    else:
        text_original = None
        original_has_text = False
    # If the original has text, and the user doesn't want an archive,
    # we're done here
    skip_archive_for_text = (
        self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
        or self.settings.skip_archive_file
        in {
            ArchiveFileChoices.WITH_TEXT,
            ArchiveFileChoices.ALWAYS,
        }
    )
    # force skip ocr process.
    # Local patch: pretend the original had text so that, when archive
    # creation is skipped, OCR is never started for unreadable documents.
    if not original_has_text:
        original_has_text = True
        text_original = "this is default content, as we skipped ocr process..."
        self.log.warning("Cannot read text from Document, use default message.")
    if skip_archive_for_text and original_has_text:
        self.log.debug("Document has text, skipping OCRmyPDF entirely.")
        self.text = text_original
        return
    # Either no text was in the original or there should be an archive
    # file created, so OCR the file and create an archive with any
    # text located via OCR
    import ocrmypdf
    from ocrmypdf import EncryptedPdfError
    from ocrmypdf import InputFileError
    from ocrmypdf import SubprocessOutputError
    from ocrmypdf.exceptions import DigitalSignatureError
    archive_path = Path(self.tempdir) / "archive.pdf"
    sidecar_file = Path(self.tempdir) / "sidecar.txt"
    args = self.construct_ocrmypdf_parameters(
        document_path,
        mime_type,
        archive_path,
        sidecar_file,
    )
    try:
        self.log.debug(f"Calling OCRmyPDF with args: {args}")
        ocrmypdf.ocr(**args)
        if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
            self.archive_path = archive_path
        self.text = self.extract_text(sidecar_file, archive_path)
        if not self.text:
            raise NoTextFoundException("No text was found in the original document")
    except (DigitalSignatureError, EncryptedPdfError):
        self.log.warning(
            "This file is encrypted and/or signed, OCR is impossible. Using "
            "any text present in the original file.",
        )
        if original_has_text:
            self.text = text_original
    except SubprocessOutputError as e:
        if "Ghostscript PDF/A rendering" in str(e):
            self.log.warning(
                "Ghostscript PDF/A rendering failed, consider setting "
                "PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
            )
        raise ParseError(
            f"SubprocessOutputError: {e!s}. See logs for more information.",
        ) from e
    except (NoTextFoundException, InputFileError) as e:
        self.log.warning(
            f"Encountered an error while running OCR: {e!s}. "
            f"Attempting force OCR to get the text.",
        )
        archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf"
        sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt"
        # Attempt to run OCR with safe settings.
        args = self.construct_ocrmypdf_parameters(
            document_path,
            mime_type,
            archive_path_fallback,
            sidecar_file_fallback,
            safe_fallback=True,
        )
        try:
            self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
            ocrmypdf.ocr(**args)
            # Don't return the archived file here, since this file
            # is bigger and blurry due to --force-ocr.
            self.text = self.extract_text(
                sidecar_file_fallback,
                archive_path_fallback,
            )
        except Exception as e:
            # If this fails, we have a serious issue at hand.
            raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
    except Exception as e:
        # Anything else is probably serious.
        raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
    # As a last resort, if we still don't have any text for any reason,
    # try to extract the text from the original document.
    if not self.text:
        if original_has_text:
            self.text = text_original
        else:
            self.log.warning(
                f"No text was found in {document_path}, the content will be empty.",
            )
            self.text = ""
def post_process_text(text):
if not text:
return None
collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
no_leading_whitespace = re.sub(r"([\n\r]+)([^\S\n\r]+)", "\\1", collapsed_spaces)
no_trailing_whitespace = re.sub(r"([^\S\n\r]+)$", "", no_leading_whitespace)
# TODO: this needs a rework
# replace \0 prevents issues with saving to postgres.
# text may contain \0 when this character is present in PDF files.
return no_trailing_whitespace.strip().replace("\0", " ")

View File

@ -0,0 +1,37 @@
## 登陆
### 用户名: admin
### 密码: paperless
## 需要指定用户名
### 配置好 USERMAP_UID 和 USERMAP_GID,否则可能无法执行主机映射进去的脚本。
### 详见 https://docs.paperless-ngx.com/configuration/#USERMAP_UID
## 自定义的文件名解析脚本
```Bash
# 文档
https://docs.paperless-ngx.com/advanced_usage/#file-name-handling
https://docs.paperless-ngx.com/configuration/#PAPERLESS_POST_CONSUME_SCRIPT
# 配置
environment:
PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
```
## 源码修改,可以通过在容器里执行 docker_patch.sh 脚本来完成
### 对于无法简单读取 PDF 内容的文档,paperless 会启动 OCR 扫描,且复杂情况下会执行两遍,非常慢而且消耗资源。只能通过修改源码解决。
```Bash
# /usr/src/paperless/src/paperless_tesseract/parsers.py :
# force skip ocr process.
if not original_has_text:
original_has_text = True
text_original = "this is default content, as we skipped ocr process..."
self.log.warning("Cannot read text from Document, use default message.")
if skip_archive_for_text and original_has_text:
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
self.text = text_original
return
```

View File

@ -1,64 +0,0 @@
-------------------------------------------------------
------------------- paperless 无纸化pdf管理 ------------
-------------------------------------------------------
## 最好不要用命令使用docker-compose.yml来创建需要制定后端使用的数据库以及redis
docker run -itd \
--name paperless \
--network devops \
--platform linux/x86_64 \
-e TZ="Asia/Shanghai" \
-v /etc/localtime:/etc/localtime:ro \
-v "$(pwd)/dockers/paperless/pdfs:/usr/src/paperless/data" \
-v "$(pwd)/dockers/paperless/db:/usr/src/paperless/db" \
-e USERMAP_UID=1000 -e USERMAP_GID=1000 \
-p 8000:8000 \
ghcr.io/paperless-ngx/paperless-ngx
# 容器创建好之后,要手动设置密码(二选一操作,目前设置的 admin / admin
docker compose run --rm webserver createsuperuser
python3 manage.py createsuperuser
# 已有文档,放在指定目录下,等系统自动加载(或者手工启动)
cd /path/to/paperless/src/
python3 manage.py document_consumer
# 自动解析文件名
https://docs.paperless-ngx.com/advanced_usage/#file-name-handling
https://docs.paperless-ngx.com/configuration/#PAPERLESS_POST_CONSUME_SCRIPT
environment:
PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
paperless 默认不会删除重复的文件,这会导致如果重复添加,会不停扫描,加载,报错。没找到配置,直接修改源码解决:
/usr/src/paperless/src/documents/consumer.py
def pre_check_duplicate(self):
"""
Using the MD5 of the file, check this exact file doesn't already exist
"""
with open(self.input_doc.original_file, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
existing_doc = Document.global_objects.filter(
Q(checksum=checksum) | Q(archive_checksum=checksum),
)
if existing_doc.exists():
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})."
if existing_doc.first().deleted_at is not None:
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
log_msg += " Note: existing document is in the trash."
## 修改这里,让它删除重复文件。
if settings.CONSUMER_DELETE_DUPLICATES or True:
os.unlink(self.input_doc.original_file)
self._fail(
msg,
log_msg,
)

View File

@ -0,0 +1,281 @@
import sqlite3
import os
import logging
import json
from datetime import datetime
import argparse
import re
# Directory that receives the log file and the JSON report; created up front
# so the FileHandler below can open its file.
res_dir = './result'
os.makedirs(res_dir, exist_ok=True)
# Configure logging: every record goes to the log file and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(f'{res_dir}/rename_files.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
def preload_folders(conn, prefix):
    """
    Load folder paths into a dict mapping folder id -> path.

    Args:
        conn: Open sqlite3 connection.
        prefix: Optional filter. When non-blank, only folders whose path
            contains this text are returned (matched with ``LIKE '%prefix%'``,
            i.e. anywhere in the path, not only at the start). ``%`` and ``_``
            inside the value keep their LIKE wildcard meaning.

    Returns:
        Dict of ``{folder_id: path}``.

    Raises:
        sqlite3.Error: re-raised after logging when the query fails.
    """
    sqlstr = "SELECT id, path FROM folders where 1=1 "
    params = ()
    if prefix and prefix.strip():
        # Bind the value instead of interpolating it into the SQL text, so
        # quotes in the prefix can neither break nor alter the statement.
        sqlstr += " and path like ? "
        params = (f"%{prefix}%",)
    try:
        cursor = conn.cursor()
        cursor.execute(sqlstr, params)
        return {row[0]: row[1] for row in cursor.fetchall()}
    except sqlite3.Error as e:
        logger.error(f"预加载文件夹信息失败: {str(e)}")
        raise
def preload_studios(conn):
    """
    Load studio names into a dict mapping studio id -> name.

    The key ``None`` is added with the value "UnknownStudio" as a default
    for scenes without a studio.

    Raises:
        sqlite3.Error: re-raised after logging when the query fails.
    """
    try:
        rows = conn.cursor().execute("SELECT id, name FROM studios").fetchall()
    except sqlite3.Error as e:
        logger.error(f"预加载工作室信息失败: {str(e)}")
        raise
    studio_names = {record[0]: record[1] for record in rows}
    # Default entry for records that carry no studio id.
    studio_names[None] = "UnknownStudio"
    return studio_names
def get_performers(conn, scene_id):
    """
    Return the performers of a scene as one comma-separated string.

    Names are sorted by the query (``ORDER BY p.name``). When the joined
    string is empty, "UnknownPerformers" is returned instead.

    Raises:
        sqlite3.Error: re-raised after logging when the query fails.
    """
    sql = """
        SELECT p.name
        FROM performers p
        JOIN performers_scenes ps ON p.id = ps.performer_id
        WHERE ps.scene_id = ?
        ORDER BY p.name
    """
    try:
        rows = conn.cursor().execute(sql, (scene_id,)).fetchall()
    except sqlite3.Error as e:
        logger.error(f"获取演员信息失败 (scene_id={scene_id}): {str(e)}")
        raise
    joined = ','.join([record[0] for record in rows])
    return joined if joined else "UnknownPerformers"
def parse_date(date_str):
    """
    Convert a date string to ``yyyy.mm.dd``.

    Several layouts are tried in order and the first one that parses wins.
    Empty input or an unrecognised layout yields "0000.00.00" (the latter
    also logs a warning).
    """
    fallback = "0000.00.00"
    if not date_str:
        return fallback

    known_layouts = (
        "%Y-%m-%d", "%Y/%m/%d", "%d-%m-%Y", "%d/%m/%Y",
        "%Y%m%d", "%m-%d-%Y", "%m/%d/%Y",
    )
    for layout in known_layouts:
        try:
            formatted = datetime.strptime(date_str, layout).strftime("%Y.%m.%d")
        except ValueError:
            continue
        return formatted

    logger.warning(f"无法解析日期格式: {date_str},使用默认值")
    return fallback
def get_file_extension(basename):
    """Return the lower-cased text after the last dot, or '' when there is no dot."""
    if '.' not in basename:
        return ''
    _stem, _dot, extension = basename.rpartition('.')
    return extension.lower()
def sanitize_filename(name):
    """Replace each of the characters / \\ : * ? " < > | in ``name`` with '-'."""
    invalid_chars = '/\\:*?"<>|'
    replacement_table = str.maketrans(invalid_chars, '-' * len(invalid_chars))
    return name.translate(replacement_table)
def process_scene_files(conn, mode, prefix, rename_style):
    """
    Work out a new file name for every scene/file mapping and, in ``run``
    mode, rename the files on disk.

    Args:
        conn: Open sqlite3 connection to the database holding the ``scenes``,
            ``files``, ``folders``, ``studios`` and ``performers`` tables.
        mode: ``'run'`` renames the files; any other value only reports what
            would be renamed.
        prefix: Folder filter handed to ``preload_folders``; files whose
            folder is not returned by it are skipped.
        rename_style: ``'simple'`` produces ``CODE_date.ext`` for scenes that
            have a code; every other case uses
            ``Studio.date performers - title.ext``.

    Returns:
        List of dicts with ``file_id``, ``scene_id``, ``original_name`` and
        ``dest_name``. The same list is written to
        ``{res_dir}/rename_results.json``.

    Raises:
        sqlite3.Error: re-raised after logging when preloading or the main
            query fails. Errors for a single record are logged and skipped.
    """
    results = []
    try:
        # 1. Preload folders and studios into in-memory dicts (two queries).
        folders = preload_folders(conn, prefix)
        studios = preload_studios(conn)
        logger.info(f"预加载完成 - 文件夹: {len(folders)} 个, 工作室: {len(studios)}")

        # 2. Fetch every scene/file pair with its file and scene columns in
        #    a single query.
        cursor = conn.cursor()
        query = """
            SELECT
                sf.scene_id, sf.file_id,
                f.id AS file_id, f.basename, f.parent_folder_id,
                s.title, s.date as release_date, s.studio_id, s.code
            FROM scenes_files sf
            LEFT JOIN files f ON sf.file_id = f.id
            LEFT JOIN scenes s ON sf.scene_id = s.id
        """
        cursor.execute(query)
        mappings = cursor.fetchall()
        logger.info(f"共找到 {len(mappings)} 条场景-文件映射记录")

        for idx, row in enumerate(mappings, 1):
            # Bound before the try so the error handler can always report them.
            scene_id = row[0]
            file_id = row[1]
            try:
                file_info = {
                    'id': row[2],
                    'basename': row[3],
                    'parent_folder_id': row[4]
                }
                scene_info = {
                    'title': row[5],
                    'release_date': row[6],
                    'studio_id': row[7],
                    'code': row[8]
                }

                # Skip records with missing mandatory data.
                if not file_id or not file_info['id'] or not file_info['basename'] or not file_info['parent_folder_id']:
                    logger.debug(f"文件ID信息不完整 (scene_id={scene_id}, file_id={file_id}),跳过")
                    continue
                if not scene_id or not scene_info['title'] or not scene_info['release_date'] or not scene_info['studio_id']:
                    logger.debug(f"场景信息不完整 (scene_id={scene_id}, file_id={file_id}),跳过")
                    continue

                # 3. Resolve folder path and studio name from the caches.
                #    A folder missing here is either unknown or was filtered
                #    out by the prefix.
                folder_path = folders.get(file_info['parent_folder_id'])
                if not folder_path:
                    logger.debug(f"文件夹ID不存在 (folder_id={file_info['parent_folder_id']}),跳过")
                    continue
                studio_name = studios.get(scene_info['studio_id'])
                if not studio_name:
                    logger.debug(f"工作室ID不存在 (studio_id={scene_info['studio_id']}),跳过")
                    continue

                # 4. Performers still need one query per scene.
                performers = get_performers(conn, scene_id)

                # 5. Build the new file name.
                original_basename = file_info['basename'] or "unknown_file"
                ext = get_file_extension(original_basename)
                release_date = parse_date(scene_info['release_date'])
                title = scene_info['title'] or "Untitled"

                sanitized_studio = sanitize_filename(studio_name)
                # Cap the length of the variable parts to keep names short.
                sanitized_performers = sanitize_filename(performers)[0:100]
                sanitized_title = sanitize_filename(title)[0:100]
                if scene_info.get('code'):
                    sanitized_title = f"{sanitized_title} ({scene_info['code']})"
                # Remove whitespace, quotes, dashes and underscores from the studio.
                sanitized_studio = re.sub(r'[\'"\s\-_]+', '', sanitized_studio)

                if ext:
                    new_basename = f"{sanitized_studio}.{release_date} {sanitized_performers} - {sanitized_title}.{ext}"
                else:
                    new_basename = f"{sanitized_studio}.{release_date} {sanitized_performers} - {sanitized_title}"

                # Simplified naming: upper-cased code plus date. Scenes
                # without a code keep the standard name built above.
                if rename_style == 'simple':
                    if scene_info.get('code'):
                        new_code = scene_info['code'].upper()
                        new_basename = f"{new_code}_{release_date}.{ext}" if ext else f"{new_code}_{release_date}"

                if len(new_basename) > 254:
                    logger.warning(f"生成的文件名过长,跳过 (file_id={file_id}): {new_basename}")
                    continue

                original_path = os.path.join(folder_path, original_basename)
                new_path = os.path.join(folder_path, new_basename)

                if not os.path.exists(original_path):
                    logger.warning(f"文件不存在,跳过: {original_path}")
                    continue
                # Compare the names before probing the target: an unchanged
                # name always "exists", so the previous order reported such
                # files as a target conflict and never reached this branch.
                if original_path == new_path:
                    logger.info(f"文件名未变化,跳过 (file_id={file_id}): {original_path}")
                    continue
                if os.path.exists(new_path):
                    logger.warning(f"目标文件已存在,跳过: {new_path}")
                    continue

                result = {
                    'file_id': file_id,
                    'scene_id': scene_id,
                    'original_name': original_path,
                    'dest_name': new_path
                }
                results.append(result)
                logger.info(f"处理第 {idx}/{len(mappings)} 条: {original_path} -> {new_path}")

                # Run mode: rename on disk. All preconditions were checked
                # just above, so they are not repeated here.
                if mode == 'run':
                    os.rename(original_path, new_path)
                    # The database row is deliberately left untouched (the
                    # UPDATE below was disabled by the author).
                    #cursor.execute(
                    #    "UPDATE files SET basename = ? WHERE id = ?",
                    #    (new_basename, file_info['id'])
                    #)
                    #conn.commit()
                    logger.info(f"已更新文件 (file_id={file_info['id']})")
            except Exception as e:
                logger.error(f"处理记录失败 (scene_id={scene_id}, file_id={file_id}): {str(e)}", exc_info=True)
                if mode == 'run':
                    conn.rollback()
                continue

        # Persist the report.
        with open(f'{res_dir}/rename_results.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        logger.info(f"处理完成,结果已保存到 rename_results.json")
        return results
    except sqlite3.Error as e:
        logger.error(f"数据库操作失败: {str(e)}", exc_info=True)
        if mode == 'run':
            conn.rollback()
        raise
    finally:
        if mode == 'run':
            conn.commit()
def main():
    """
    Command-line entry point: parse the arguments, open the database and run
    ``process_scene_files``.

    Returns early (after logging an error) when the database file does not
    exist. The connection is always closed before returning.
    """
    cli = argparse.ArgumentParser(description='电影文件重命名工具(优化版)')
    cli.add_argument(
        '--mode',
        choices=['check', 'run'],
        default='check',
        help='运行模式: check(检查) 或 run(执行)',
    )
    cli.add_argument('--db', default='movies.db', help='SQLite数据库文件路径')
    cli.add_argument('--prefix', default='', help='目录前缀,用来过滤文件路径')
    cli.add_argument(
        '--rename_style',
        choices=['standard', 'simple'],
        default='standard',
        help='文件命名规则,标准格式和简化格式',
    )
    options = cli.parse_args()

    if not os.path.exists(options.db):
        logger.error(f"数据库文件不存在: {options.db}")
        return

    connection = None
    try:
        connection = sqlite3.connect(options.db)
        logger.info(f"成功连接到数据库: {options.db}")
        process_scene_files(connection, options.mode, options.prefix, options.rename_style)
    except sqlite3.Error as e:
        logger.error(f"数据库连接失败: {str(e)}", exc_info=True)
    finally:
        if connection:
            connection.close()
            logger.info("数据库连接已关闭")


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,288 @@
import sqlite3
import os
import logging
import json
from datetime import datetime
import argparse
import re
# The log directory has to exist before FileHandler opens its file. main()
# creates it too, but only after this module-level setup has already run,
# so a first run in a fresh directory would fail right here.
os.makedirs('./result', exist_ok=True)

# Configure logging: every record goes to the log file and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('./result/rename_files.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
def get_performers(conn, scene_id):
    """
    Return the performers of a scene as one comma-separated string.

    Names are sorted by the query (``ORDER BY p.name``); a scene without
    performers yields an empty string.

    Raises:
        sqlite3.Error: re-raised after logging when the query fails.
    """
    # A single JOIN fetches the names in one round trip.
    sql = """
        SELECT p.name
        FROM performers p
        JOIN performers_scenes ps ON p.id = ps.performer_id
        WHERE ps.scene_id = ?
        ORDER BY p.name
    """
    try:
        rows = conn.cursor().execute(sql, (scene_id,)).fetchall()
    except sqlite3.Error as e:
        logger.error(f"获取演员信息失败 (scene_id={scene_id}): {str(e)}")
        raise
    names = [record[0] for record in rows]
    return ','.join(names)
def get_file_info(conn, file_id):
    """
    Look up a row of the ``files`` table.

    Returns:
        Dict with ``id``, ``basename`` and ``parent_folder_id``.

    Raises:
        ValueError: when no row has this id.
        sqlite3.Error: re-raised after logging when the query fails.
    """
    sql = """
        SELECT id, basename, parent_folder_id
        FROM files
        WHERE id = ?
    """
    try:
        record = conn.cursor().execute(sql, (file_id,)).fetchone()
        if not record:
            raise ValueError(f"未找到文件信息 (file_id={file_id})")
        columns = ('id', 'basename', 'parent_folder_id')
        return {column: record[position] for position, column in enumerate(columns)}
    except sqlite3.Error as e:
        logger.error(f"获取文件信息失败 (file_id={file_id}): {str(e)}")
        raise
def get_folder_path(conn, folder_id):
    """
    Return the ``path`` column of the folder with the given id.

    Raises:
        ValueError: when no folder has this id.
        sqlite3.Error: re-raised after logging when the query fails.
    """
    try:
        lookup = conn.cursor()
        lookup.execute("SELECT path FROM folders WHERE id = ?", (folder_id,))
        record = lookup.fetchone()
        if record:
            return record[0]
        raise ValueError(f"未找到文件夹路径 (folder_id={folder_id})")
    except sqlite3.Error as e:
        logger.error(f"获取文件夹路径失败 (folder_id={folder_id}): {str(e)}")
        raise
def get_scene_info(conn, scene_id):
    """
    Look up a row of the ``scenes`` table.

    Returns:
        Dict with ``title``, ``release_date`` (the ``date`` column) and
        ``studio_id``.

    Raises:
        ValueError: when no scene has this id.
        sqlite3.Error: re-raised after logging when the query fails.
    """
    sql = """
        SELECT title, date as release_date, studio_id
        FROM scenes
        WHERE id = ?
    """
    try:
        record = conn.cursor().execute(sql, (scene_id,)).fetchone()
        if not record:
            raise ValueError(f"未找到场景信息 (scene_id={scene_id})")
        scene = {}
        scene['title'] = record[0]
        scene['release_date'] = record[1]
        scene['studio_id'] = record[2]
        return scene
    except sqlite3.Error as e:
        logger.error(f"获取场景信息失败 (scene_id={scene_id}): {str(e)}")
        raise
def get_studio_name(conn, studio_id):
    """
    Return the name of the studio with the given id.

    When no such studio exists a warning is logged and "UnknownStudio" is
    returned.

    Raises:
        sqlite3.Error: re-raised after logging when the query fails.
    """
    try:
        record = conn.cursor().execute(
            "SELECT name FROM studios WHERE id = ?", (studio_id,)
        ).fetchone()
        if record:
            return record[0]
        logger.warning(f"未找到工作室信息 (studio_id={studio_id}),使用默认名称")
        return "UnknownStudio"
    except sqlite3.Error as e:
        logger.error(f"获取工作室信息失败 (studio_id={studio_id}): {str(e)}")
        raise
def parse_date(date_str):
    """
    Convert a date string to ``yyyy.mm.dd``.

    Common layouts are tried one after another; the first that parses is
    used. Empty input or an unknown layout gives "0000.00.00" (the latter
    with a logged warning).
    """
    if not date_str:
        return "0000.00.00"

    def _reformat(layout):
        """Return the reformatted date, or None when ``layout`` does not fit."""
        try:
            return datetime.strptime(date_str, layout).strftime("%Y.%m.%d")
        except ValueError:
            return None

    # Candidate layouts, in priority order.
    for layout in (
        "%Y-%m-%d", "%Y/%m/%d", "%d-%m-%Y", "%d/%m/%Y",
        "%Y%m%d", "%m-%d-%Y", "%m/%d/%Y",
    ):
        converted = _reformat(layout)
        if converted is not None:
            return converted

    logger.warning(f"无法解析日期格式: {date_str},使用默认值")
    return "0000.00.00"
def get_file_extension(basename):
    """Return the lower-cased text after the last dot, or '' when there is no dot."""
    if '.' not in basename:
        return ''
    last_dot = basename.rfind('.')
    return basename[last_dot + 1:].lower()
def sanitize_filename(name):
    """Replace each of the characters / \\ : * ? " < > | in ``name`` with '-'."""
    dash_for_invalid = {ord(symbol): '-' for symbol in '/\\:*?"<>|'}
    return name.translate(dash_for_invalid)
def process_scene_files(conn, mode, prefix):
    """
    Work out a new file name for every scene/file mapping and, in ``run``
    mode, rename the files and update ``files.basename`` in the database.

    Args:
        conn: Open sqlite3 connection.
        mode: ``'run'`` applies the changes; any other value only reports
            what would be renamed.
        prefix: Optional folder filter. When non-blank, only files whose
            folder path contains this text are processed (case-sensitive
            substring match).

    Returns:
        List of dicts with ``file_id``, ``scene_id``, ``original_name`` and
        ``dest_name``. The same list is written to
        ``./result/rename_results.json``.

    Raises:
        sqlite3.Error: re-raised after logging when the mapping query fails.
            Errors for a single record are logged and that record is skipped.
    """
    results = []
    # Previously the prefix argument was accepted but never applied.
    filter_by_prefix = bool(prefix and prefix.strip())
    try:
        cursor = conn.cursor()
        # Fetch every scene/file mapping.
        cursor.execute("SELECT scene_id, file_id FROM scenes_files")
        mappings = cursor.fetchall()
        logger.debug(f"共找到 {len(mappings)} 条场景-文件映射记录")

        for idx, (scene_id, file_id) in enumerate(mappings, 1):
            logger.debug(f"处理第 {idx}/{len(mappings)} 条记录 (scene_id={scene_id}, file_id={file_id})")
            try:
                # 1. File information.
                file_info = get_file_info(conn, file_id)
                original_basename = file_info['basename']
                parent_folder_id = file_info['parent_folder_id']

                # 2. Folder path, filtered by the requested prefix.
                folder_path = get_folder_path(conn, parent_folder_id)
                if filter_by_prefix and prefix not in folder_path:
                    continue

                # 3. Performers; scenes without any are skipped.
                performers = get_performers(conn, scene_id)
                if not performers:
                    logger.warning(f"场景 {scene_id} 未找到演员信息,跳过")
                    continue

                # 4. Scene and studio information.
                scene_info = get_scene_info(conn, scene_id)
                if not scene_info['title'] or not scene_info['release_date'] or not scene_info['studio_id']:
                    logger.warning(f"场景 {scene_id} 信息不完整,跳过")
                    continue
                title = scene_info['title'] or "Untitled"
                release_date = parse_date(scene_info['release_date'])
                studio_name = get_studio_name(conn, scene_info['studio_id'])

                # 5. Build the new file name.
                ext = get_file_extension(original_basename)
                sanitized_studio = sanitize_filename(studio_name)
                # Cap the length of the variable parts to keep names short.
                sanitized_performers = sanitize_filename(performers)[0:100]
                sanitized_title = sanitize_filename(title)[0:100]
                if ext:
                    new_basename = f"{sanitized_studio} - {release_date} - {sanitized_performers} - {sanitized_title}.{ext}"
                else:
                    new_basename = f"{sanitized_studio} - {release_date} - {sanitized_performers} - {sanitized_title}"
                if len(new_basename) > 254:
                    logger.warning(f"生成的文件名过长,跳过 (file_id={file_id}): {new_basename}")
                    continue

                original_path = os.path.join(folder_path, original_basename)
                new_path = os.path.join(folder_path, new_basename)

                result = {
                    'file_id': file_id,
                    'scene_id': scene_id,
                    'original_name': original_path,
                    'dest_name': new_path
                }
                results.append(result)
                logger.info(f"准备重命名: {original_path} -> {new_path}")

                # Run mode: apply the rename and update the database.
                if mode == 'run':
                    if not os.path.exists(original_path):
                        logger.warning(f"文件不存在,跳过: {original_path}")
                        continue
                    if original_path != new_path:
                        # os.rename can silently replace an existing
                        # destination, so refuse to overwrite another file.
                        if os.path.exists(new_path):
                            logger.warning(f"目标文件已存在,跳过: {new_path}")
                            continue
                        os.rename(original_path, new_path)
                        logger.info(f"已重命名: {original_path} -> {new_path}")
                    cursor.execute(
                        "UPDATE files SET basename = ? WHERE id = ?",
                        (new_basename, file_id)
                    )
                    conn.commit()
                    logger.info(f"已更新数据库记录 (file_id={file_id})")
            except Exception as e:
                logger.error(f"处理记录失败 (scene_id={scene_id}, file_id={file_id}): {str(e)}", exc_info=True)
                # Undo the pending transaction for this record in run mode.
                if mode == 'run':
                    conn.rollback()
                continue

        # Persist the report.
        with open('./result/rename_results.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        logger.info(f"处理完成,结果已保存到 rename_results.json")
        return results
    except sqlite3.Error as e:
        logger.error(f"数据库操作失败: {str(e)}", exc_info=True)
        if mode == 'run':
            conn.rollback()
        raise
    finally:
        if mode == 'run':
            conn.commit()
def main():
    """
    Command-line entry point: parse the arguments, make sure ``./result``
    exists, open the database and run ``process_scene_files``.

    Returns early (after logging an error) when the database file does not
    exist. The connection is always closed before returning.
    """
    cli = argparse.ArgumentParser(description='电影文件重命名工具')
    cli.add_argument(
        '--mode',
        choices=['check', 'run'],
        default='check',
        help='运行模式: check(检查) 或 run(执行)',
    )
    cli.add_argument('--db', default='movies.db', help='SQLite数据库文件路径')
    cli.add_argument('--prefix', default='', help='目录的前缀,用来匹配')
    options = cli.parse_args()

    # Nothing to do without a database file.
    if not os.path.exists(options.db):
        logger.error(f"数据库文件不存在: {options.db}")
        return

    os.makedirs('./result', exist_ok=True)

    connection = None
    try:
        connection = sqlite3.connect(options.db)
        # Row objects also allow access by column name.
        connection.row_factory = sqlite3.Row
        logger.info(f"成功连接到数据库: {options.db}")
        process_scene_files(connection, options.mode, options.prefix)
    except sqlite3.Error as e:
        logger.error(f"数据库连接失败: {str(e)}", exc_info=True)
    finally:
        if connection:
            connection.close()
            logger.info("数据库连接已关闭")


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,110 @@
name: Javbus
sceneByFragment:
action: scrapeXPath
queryURL: https://www.javbus.com/{filename}
queryURLReplace:
filename:
- regex: -JG\d
with: ""
- regex: (.*[^a-zA-Z0-9])*([a-zA-Z-]+\d+)(.+)
with: $2
scraper: sceneScraper
sceneByURL:
- action: scrapeXPath
url:
- https://www.javbus.com
- https://www.seejav.bid
- https://www.cdnbus.lol
- https://www.dmmbus.lol
- https://www.seedmm.cfd
scraper: sceneScraper
sceneByName:
action: scrapeXPath
queryURL: https://www.javbus.com/search/{}&type=&parent=ce
scraper: sceneSearch
sceneByQueryFragment:
action: scrapeXPath
queryURL: "{url}"
scraper: sceneScraper
performerByURL:
- action: scrapeXPath
url:
- https://www.javbus.com
- https://www.seejav.bid
- https://www.cdnbus.lol
- https://www.dmmbus.lol
- https://www.seedmm.cfd
scraper: performerScraper
performerByName:
action: scrapeXPath
queryURL: https://www.javbus.com/searchstar/{}&type=&parent=ce
scraper: performerSearch
xPathScrapers:
performerSearch:
performer:
Name: //span[@class="mleft"]
URLs: //*[@id="waterfall"]/div/a/@href
performerScraper:
performer:
Name: //*[@id="waterfall"]/div[1]/div/div[2]/span
Birthdate:
selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '生日')]
postProcess:
- replace:
- regex: ^(.*? ){1}
with:
Height:
selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '身高')]
postProcess:
- replace:
- regex: ^(.*? ){1}
with:
# Measurements: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '胸圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '腰圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '臀圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '罩杯')]
Image:
selector: //*[@id="waterfall"]/div[1]/div/div[1]/img/@src
postProcess:
- replace:
- regex: ^
with: https://www.javbus.com
sceneSearch:
scene:
Title: //div[@class="photo-info"]/span
URL: //*[@id="waterfall"]/div/a/@href
sceneScraper:
scene:
Title:
selector: //div[@class="col-md-3 info"]//span[contains(text(), '識別碼')]/../span[2]/text()
URL:
selector: /html/head/link[@hreflang="zh"]/@href
Date:
selector: //div[@class="col-md-3 info"]//span[contains(text(), '發行日期')]/../text()
Details:
selector: //div[@class="container"]/h3/text()
postProcess:
- replace:
- regex: ^(.*? ){1}
with:
Tags:
Name: //div[@class="col-md-3 info"]//span[@class="genre"]/label/a/text()
Performers:
Name: //div[@class="star-name"]/a
Director: //div[@id='video_director']/table/tbody/tr/td[@class="text"]/span/a/text()
Image:
selector: //div[@class="row movie"]/div[@class="col-md-9 screencap"]/a[@class="bigImage"]/img/@src
postProcess:
- replace:
- regex: ^
with: https://www.javbus.com
Studio:
Name: //div[@class="col-md-3 info"]//span[contains(text(), '發行商')]/../a/text()
driver:
headers:
- Key: User-Agent
Value: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36
- Key: Accept-Language
Value: zh-cn
# Last Updated September 17, 2025

View File

@ -0,0 +1,9 @@
id: JavBus
name: Javbus
metadata: {}
version: 5ee93a34
date: "2025-09-17 10:48:13"
requires: []
source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
files:
- JavBus.yml

View File

@ -0,0 +1,111 @@
# Stash XPath scraper definition for the English (/en) pages of JavBus.
# The top-level *By* entries route each kind of lookup to one of the scrapers
# defined under xPathScrapers.
name: Javbus_en
sceneByFragment:
  action: scrapeXPath
  queryURL: https://www.javbus.com/en/{filename}
  queryURLReplace:
    filename:
      # Remove a "-JG<digit>" marker from the file name.
      - regex: -JG\d
        with: ""
      # Replace the whole match with capture group 2: letters/hyphens
      # followed by digits.
      - regex: (.*[^a-zA-Z0-9])*([a-zA-Z-]+\d+)(.+)
        with: $2
  scraper: sceneScraper
sceneByURL:
  - action: scrapeXPath
    url:
      - https://www.javbus.com/en
      - https://www.seejav.bid
      - https://www.cdnbus.lol
      - https://www.dmmbus.lol
      - https://www.seedmm.cfd
    scraper: sceneScraper
sceneByName:
  action: scrapeXPath
  queryURL: https://www.javbus.com/en/search/{}&type=&parent=ce
  scraper: sceneSearch
sceneByQueryFragment:
  action: scrapeXPath
  queryURL: "{url}"
  scraper: sceneScraper
performerByURL:
  - action: scrapeXPath
    url:
      - https://www.javbus.com/en
      - https://www.seejav.bid
      - https://www.cdnbus.lol
      - https://www.dmmbus.lol
      - https://www.seedmm.cfd
    scraper: performerScraper
performerByName:
  action: scrapeXPath
  queryURL: https://www.javbus.com/en/searchstar/{}&type=&parent=ce
  scraper: performerSearch
xPathScrapers:
  performerSearch:
    performer:
      Name: //span[@class="mleft"]
      URLs: //*[@id="waterfall"]/div/a/@href
  performerScraper:
    performer:
      Name: //*[@id="waterfall"]/div[1]/div/div[2]/span
      Birthdate:
        selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), 'D.O.B')]
        postProcess:
          - replace:
              # Strip everything up to and including the first space
              # (presumably the field label), keeping only the value.
              - regex: ^(.*? ){1}
                with:
      Height:
        selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), 'Height')]
        postProcess:
          - replace:
              # Same label-stripping rule as Birthdate.
              - regex: ^(.*? ){1}
                with:
      # Measurements are not scraped; the disabled draft selector is kept for reference.
      # Measurements: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '胸圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '腰圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '臀圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '罩杯')]
      Image:
        selector: //*[@id="waterfall"]/div[1]/div/div[1]/img/@src
        postProcess:
          - replace:
              # Prepend the site origin only. The previous prefix ended in
              # "/en", which put a language segment in front of the image
              # path; the Chinese-language JavBus scraper uses the bare
              # origin for this same selector.
              - regex: ^
                with: https://www.javbus.com
  sceneSearch:
    scene:
      Title: //div[@class="photo-info"]/span
      URL: //*[@id="waterfall"]/div/a/@href
  sceneScraper:
    scene:
      Title:
        selector: //div[@class="col-md-3 info"]//span[contains(text(), 'ID')]/../span[2]/text()
      URL:
        # NOTE(review): this reads the hreflang="zh" alternate link, so the
        # stored URL is presumably the non-/en page — confirm this is intended.
        selector: /html/head/link[@hreflang="zh"]/@href
      Date:
        selector: //div[@class="col-md-3 info"]//span[contains(normalize-space(text()), 'Release Date')]/../text()
        #selector: //div[@class="col-md-3 info"]//span[contains(text(), 'Release Date')]/../text()
      Details:
        selector: //div[@class="container"]/h3/text()
        postProcess:
          - replace:
              # Drop the leading token up to the first space from the heading.
              - regex: ^(.*? ){1}
                with:
      Tags:
        Name: //div[@class="col-md-3 info"]//span[@class="genre"]/label/a/text()
      Performers:
        Name: //div[@class="star-name"]/a
      Director: //div[@id='video_director']/table/tbody/tr/td[@class="text"]/span/a/text()
      Image:
        selector: //div[@class="row movie"]/div[@class="col-md-9 screencap"]/a[@class="bigImage"]/img/@src
        postProcess:
          - replace:
              # Prepend the site origin without a trailing slash so a
              # root-relative src does not end up with "//" after the host.
              - regex: ^
                with: https://www.javbus.com
      Studio:
        Name: //div[@class="col-md-3 info"]//span[contains(text(), 'Label')]/../a/text()
driver:
  headers:
    - Key: User-Agent
      Value: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36
    - Key: Accept-Language
      Value: zh-cn,en-US
# Last Updated September 17, 2025

View File

@ -0,0 +1,9 @@
# Package manifest for the Javbus_en scraper: package id and display name,
# a version stamp with its date, the index it was installed from, and the
# files that belong to the package.
# NOTE(review): looks tool-generated — confirm before editing by hand.
id: JavBus_en
name: Javbus_en
metadata: {}
version: b4672ccf
date: "2025-08-01 16:01:27"
requires: []
source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
files:
- JavBus_en.yml

11
gitignore Normal file
View File

@ -0,0 +1,11 @@
# Other pre-existing ignore rules
*.pyc
__pycache__/
# Ignore environment configuration files
.env
# Ignore every log, data and result directory, at any depth
**/log/
**/data/
**/result/

View File

@ -29,18 +29,47 @@ else
fi fi
servers=() servers=()
while IFS= read -r line; do while IFS= read -r line; do
# 跳过空行和注释行
[[ -z "$line" || "$line" =~ ^# ]] && continue
servers+=("$line") servers+=("$line")
done < "$file_path" done < "$file_path"
fi fi
# Push the local public key to every host listed in the "servers" array.

# Local public key that gets installed on each remote host.
public_key_file=~/.ssh/id_rsa.pub

#######################################
# Install a public key into the remote ~/.ssh/authorized_keys over ssh.
# The key travels on stdin rather than inside the command string, so quotes
# in the key comment cannot break the remote command. The remote side only
# appends the key when an identical line is not already present, which keeps
# repeated runs from piling up duplicate entries. "umask 077" makes a newly
# created ~/.ssh and authorized_keys private to the remote user.
# Arguments: $1 - ssh destination (host or user@host)
#            $2 - public key line
# Outputs:   combined stdout/stderr of ssh on stdout
# Returns:   exit status of ssh
#######################################
push_public_key() {
    local server=$1 key=$2
    # Must not contain single quotes: it is wrapped in '...' for "sh -c" below.
    local remote_script='umask 077; mkdir -p ~/.ssh && touch ~/.ssh/authorized_keys && key=$(cat) && { grep -qxF -e "$key" ~/.ssh/authorized_keys || printf "%s\n" "$key" >> ~/.ssh/authorized_keys; }'

    # "exec sh -c" runs the script under sh regardless of the remote login shell.
    printf '%s\n' "$key" | ssh "$server" "exec sh -c '${remote_script}'" 2>&1
}

#######################################
# Push the key to one server. If ssh reports a host key verification failure,
# drop the stale known_hosts entry for that host and try once more.
# Arguments: $1 - ssh destination (host or user@host)
#            $2 - public key line
# Outputs:   progress and failure messages on stdout
# Returns:   0 if the key was pushed, 1 otherwise
#######################################
push_key_to_server() {
    local server=$1 key=$2
    local output host cleanup_output retry_output

    # First attempt.
    echo "正在推送公钥到 $server..."
    if output=$(push_public_key "$server" "$key"); then
        echo "公钥已成功推送到 $server"
        return 0
    fi

    # Any failure other than a host key mismatch is reported as-is.
    if ! grep -q "Host key verification failed" <<<"$output"; then
        echo "推送 $server 失败:$output"
        return 1
    fi

    echo "检测到 $server 的主机密钥已变更,正在清理旧密钥..."
    # Keep only the host part of user@host; a bare host is left unchanged.
    host=${server##*@}
    if ! cleanup_output=$(ssh-keygen -R "$host" 2>&1); then
        echo "清理 $host 旧密钥失败:$cleanup_output"
        return 1
    fi
    echo "已清理 $host 的旧密钥,重新尝试推送..."

    # Second and last attempt.
    if retry_output=$(push_public_key "$server" "$key"); then
        echo "公钥已成功推送到 $server"
        return 0
    fi
    echo "重新推送 $server 失败:$retry_output"
    return 1
}

if (( ${#servers[@]} > 0 )); then
    # Stop early instead of appending an empty line to every server.
    if [[ ! -s "$public_key_file" ]]; then
        echo "未找到公钥文件或文件为空:$public_key_file" >&2
        exit 1
    fi
    public_key=$(<"$public_key_file")

    for server in "${servers[@]}"; do
        # Failures are reported per server; carry on with the remaining hosts.
        push_key_to_server "$server" "$public_key" || continue
    done
fi