Compare commits
23 Commits
c430547f0a
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| 2b0e1c0413 | |||
| dece263c8b | |||
| 00b267b651 | |||
| 0a4776479c | |||
| 6cf529541d | |||
| 2c0e3bd718 | |||
| ebae625165 | |||
| f8daffd47f | |||
| bed2de3cd1 | |||
| d1c543512e | |||
| 857339d261 | |||
| f189dcfaca | |||
| 1848510b65 | |||
| 04d76944ad | |||
| 40eae5569a | |||
| 15c4f7b823 | |||
| 17356c79f9 | |||
| 808dbaa985 | |||
| b7dffc539c | |||
| 91e7d38725 | |||
| fe153d69cc | |||
| 31e07abf14 | |||
| 30b315ecd0 |
11
.gitignore
vendored
Normal file
11
.gitignore
vendored
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
# 其他已有的忽略规则
|
||||||
|
*.pyc
|
||||||
|
__pycache__/
|
||||||
|
|
||||||
|
# 忽略环境配置文件
|
||||||
|
.env
|
||||||
|
|
||||||
|
# 忽略所有 log、data 和 result 目录
|
||||||
|
**/log/
|
||||||
|
**/data/
|
||||||
|
**/result/
|
||||||
@ -8,11 +8,6 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- "8000:8000"
|
- "8000:8000"
|
||||||
environment:
|
environment:
|
||||||
PAPERLESS_OCR_LANGUAGES: "" # 跳过OCR
|
|
||||||
PAPERLESS_OCR_SKIP_ARCHIVE_FILE: "always" # 跳过创建文档存档版本的时间
|
|
||||||
PAPERLESS_OCR_OUTPUT_TYPE: "pdf" # 尽量少修改PDF文档
|
|
||||||
PAPERLESS_CONSUMER_POLLING: "5" # 指定轮询间隔(以秒为单位),这将导致 paperless 定期检查消费目录中的更改
|
|
||||||
#PAPERLESS_CONSUMER_INOTIFY_DELAY: "2" # 设置消费者等待 inotify 发出的其他事件的时间(以秒为单位)
|
|
||||||
|
|
||||||
# 使用 SQLite 作为数据库(默认)
|
# 使用 SQLite 作为数据库(默认)
|
||||||
PAPERLESS_DBENGINE: sqlite3
|
PAPERLESS_DBENGINE: sqlite3
|
||||||
@ -34,11 +29,22 @@ services:
|
|||||||
# 定义文件命名规则和存储路径
|
# 定义文件命名规则和存储路径
|
||||||
# 作用不大,主要还是用消费后脚本,以及工作流来指定存储路径。
|
# 作用不大,主要还是用消费后脚本,以及工作流来指定存储路径。
|
||||||
# 工作流先于消费后脚本运行,因此消费后脚本里解析的document_type在工作流里无效。所以使用了文件名关键词匹配
|
# 工作流先于消费后脚本运行,因此消费后脚本里解析的document_type在工作流里无效。所以使用了文件名关键词匹配
|
||||||
PAPERLESS_FILENAME_FORMAT: "{{created}}_{{document_type}}_{{correspondent}}_{{title}}.pdf"
|
PAPERLESS_FILENAME_FORMAT: "{{created}}_{{document_type}}_{{correspondent}}_{{title}}"
|
||||||
|
|
||||||
# 解析文件里的关键信息,并更新。但无法更新storage path。这个字段要靠工作流才行。
|
# 解析文件里的关键信息,并更新。但无法更新storage path。这个字段要靠工作流才行。
|
||||||
PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
|
PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
|
||||||
|
|
||||||
|
# 自动删除重复文件
|
||||||
|
PAPERLESS_CONSUMER_DELETE_DUPLICATES: true
|
||||||
|
# 支持消费目录递归检索,即子目录。这样可以支持多个宿主机的目录映射到docker中
|
||||||
|
PAPERLESS_CONSUMER_RECURSIVE: true
|
||||||
|
|
||||||
|
PAPERLESS_OCR_LANGUAGES: "" # 跳过OCR,并不会,只会用默认的eng来执行
|
||||||
|
PAPERLESS_OCR_SKIP_ARCHIVE_FILE: "always" # 跳过创建文档存档版本的时间
|
||||||
|
PAPERLESS_OCR_OUTPUT_TYPE: "pdf" # 尽量少修改PDF文档
|
||||||
|
PAPERLESS_CONSUMER_POLLING: "5" # 指定轮询间隔(以秒为单位),这将导致 paperless 定期检查消费目录中的更改
|
||||||
|
#PAPERLESS_CONSUMER_INOTIFY_DELAY: "2" # 设置消费者等待 inotify 发出的其他事件的时间(以秒为单位)
|
||||||
|
|
||||||
# 运行用户
|
# 运行用户
|
||||||
USERMAP_UID: 1000
|
USERMAP_UID: 1000
|
||||||
USERMAP_GID: 1000
|
USERMAP_GID: 1000
|
||||||
@ -46,8 +52,9 @@ services:
|
|||||||
volumes:
|
volumes:
|
||||||
# 存储所有数据(搜索索引、SQLite 数据库、分类模型等)的地方
|
# 存储所有数据(搜索索引、SQLite 数据库、分类模型等)的地方
|
||||||
- ~/dockers/paperless/data:/usr/src/paperless/data
|
- ~/dockers/paperless/data:/usr/src/paperless/data
|
||||||
# 挂载文件导入目录
|
# 挂载文件导入目录,可以把多个宿主机的目录,挂到docker中,以子目录的形式存在
|
||||||
- ~/dockers/paperless/consume:/usr/src/paperless/consume
|
- ~/dockers/paperless/consume:/usr/src/paperless/consume
|
||||||
|
- ~/dockers/sharedata/consume:/usr/src/paperless/consume/subdir
|
||||||
# 挂载文件导出目录
|
# 挂载文件导出目录
|
||||||
- ~/dockers/paperless/export:/usr/src/paperless/export
|
- ~/dockers/paperless/export:/usr/src/paperless/export
|
||||||
# 存储您的文档和缩略图的地方
|
# 存储您的文档和缩略图的地方
|
||||||
|
|||||||
@ -9,7 +9,8 @@ import logging
|
|||||||
|
|
||||||
# Paperless 服务器信息
|
# Paperless 服务器信息
|
||||||
PAPERLESS_URL = "http://localhost:8000/api"
|
PAPERLESS_URL = "http://localhost:8000/api"
|
||||||
AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth 认证
|
#AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth 认证, mac上用这个
|
||||||
|
AUTH = HTTPBasicAuth("admin", "paperless") # Basic Auth 认证,NAS上用这个
|
||||||
|
|
||||||
# 日志配置
|
# 日志配置
|
||||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||||
|
|||||||
149
docker/paperless/plugins/docker_patch.sh
Executable file
149
docker/paperless/plugins/docker_patch.sh
Executable file
@ -0,0 +1,149 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Replacement pairs, stored as one flat array: each even index holds a source
# file and the odd index right after it holds the destination it overwrites.
# Always add or remove entries two at a time (source, then destination).
declare -a FILE_PAIRS=()
FILE_PAIRS+=(
  "/usr/src/paperless/scripts/parsers.py"
  "/usr/src/paperless/src/paperless_tesseract/parsers.py"
)
# Examples of additional pairs (kept disabled):
# FILE_PAIRS+=("/usr/src/paperless/git_scripts/parse_filename.py" "/usr/src/paperless/scripts/parse_filename.py")
# FILE_PAIRS+=("/path/to/source/file3" "/path/to/dest/file3")
# FILE_PAIRS+=("/path/to/source/file4" "/path/to/dest/file4")
|
||||||
|
|
||||||
|
# Verify that every file needed by the requested operation is present.
# Globals:   FILE_PAIRS (read) - flat list of "source destination" pairs
# Arguments: $1 - operation name: "replace", "check" or "rollback"
# Outputs:   one line per missing file on stdout
# Returns:   0 when the operation may proceed.
#            Exits the whole script with status 1 when FILE_PAIRS does not
#            consist of complete pairs, or when a source/destination file is
#            missing for "replace"/"check". For "rollback" a missing backup is
#            only reported and never aborts.
check_files_exist() {
  local mode="$1"
  local missing=0
  local pair_count=${#FILE_PAIRS[@]}
  # Declared local so the loop does not clobber same-named caller variables.
  local i source dest

  # A dangling source without a destination would otherwise be paired with an
  # empty path further down; treat it as a configuration error up front.
  if (( pair_count % 2 != 0 )); then
    echo "错误:FILE_PAIRS 配置不成对(元素个数:$pair_count)"
    exit 1
  fi

  # Walk the array two entries at a time: source, then destination.
  for ((i = 0; i < pair_count; i += 2)); do
    source="${FILE_PAIRS[i]}"
    dest="${FILE_PAIRS[i + 1]}"

    if [ "$mode" = "replace" ] || [ "$mode" = "check" ]; then
      if [ ! -f "$source" ]; then
        echo "错误:源文件不存在 - $source"
        missing=1
      fi
      if [ ! -f "$dest" ]; then
        echo "错误:目标文件不存在 - $dest"
        missing=1
      fi
    elif [ "$mode" = "rollback" ]; then
      if [ ! -f "$dest.bak" ]; then
        echo "警告:备份文件不存在(未执行过替换?) - $dest.bak"
        missing=1
      fi
    fi
  done

  # Only replace/check treat a missing file as fatal.
  if [ "$missing" -eq 1 ] && [ "$mode" != "rollback" ]; then
    echo "错误:关键文件缺失,无法继续执行"
    exit 1
  fi
  return 0
}
|
||||||
|
|
||||||
|
# Print a unified diff for every configured pair (destination vs. source).
# Globals:   FILE_PAIRS (read)
# Arguments: none
# Outputs:   a banner, then one header line and the diff per pair, on stdout
# Returns:   0 (differences between the files are not treated as an error)
show_diffs() {
  local pair_count=${#FILE_PAIRS[@]}
  # Declared local so the loop does not clobber same-named caller variables.
  local i source dest

  echo "=== 开始检查文件差异 ==="

  for ((i = 0; i < pair_count; i += 2)); do
    source="${FILE_PAIRS[i]}"
    dest="${FILE_PAIRS[i + 1]}"

    # printf keeps the paths literal; `echo -e` would expand backslash
    # sequences that happen to appear inside a path.
    printf '\n--- 检查 %s <-> %s 的差异 ---\n' "$dest" "$source"
    # diff exits non-zero when the files differ; that must not count as failure.
    diff -u -- "$dest" "$source" || true
  done
}
|
||||||
|
|
||||||
|
# Back up one file next to itself as "<file>.bak", preserving its attributes.
# Arguments: $1 - path of the file to back up
# Outputs:   progress messages on stdout, failure messages on stderr
# Returns:   0 when the backup was written, 1 when copying failed (an already
#            existing backup is left untouched in that case)
backup_file() {
  local file="$1"
  local backup="$file.bak"
  # Copy under a temporary name first: the previous backup is only replaced
  # once the new copy is complete.
  local staging="$backup.tmp.$$"

  if [ -f "$backup" ]; then
    echo "提示:旧备份文件已存在,将覆盖 - $backup"
  fi

  # -a keeps permissions, ownership and timestamps.
  if ! cp -a -- "$file" "$staging"; then
    rm -f -- "$staging"
    echo "错误:备份失败 - $file" >&2
    return 1
  fi

  if ! mv -f -- "$staging" "$backup"; then
    rm -f -- "$staging"
    echo "错误:备份失败 - $file" >&2
    return 1
  fi

  echo "已备份:$file -> $backup"
}
|
||||||
|
|
||||||
|
# Overwrite every destination with its source, backing the destination up first.
# Globals:   FILE_PAIRS (read)
# Arguments: none
# Outputs:   progress on stdout, failures on stderr, then the final diff check
# Returns:   0 when every pair was replaced, 1 when at least one pair failed.
#            A pair whose backup fails is skipped, so a destination is never
#            overwritten without a backup.
replace_files() {
  local pair_count=${#FILE_PAIRS[@]}
  # Declared local so the loop does not clobber same-named caller variables.
  local i source dest
  local failed=0

  echo "=== 开始替换文件(先备份目标文件) ==="

  for ((i = 0; i < pair_count; i += 2)); do
    source="${FILE_PAIRS[i]}"
    dest="${FILE_PAIRS[i + 1]}"

    printf '\n--- 处理文件对:%s -> %s ---\n' "$source" "$dest"

    # No backup, no overwrite.
    if ! backup_file "$dest"; then
      echo "错误:备份失败,跳过替换 - $dest" >&2
      failed=1
      continue
    fi

    if ! cp -f -- "$source" "$dest"; then
      echo "错误:替换失败 - $source -> $dest" >&2
      failed=1
      continue
    fi

    echo "已替换:$source 覆盖 $dest"
  done

  printf '\n=== 替换完成,验证最终差异(应无差异) ===\n'
  show_diffs

  return "$failed"
}
|
||||||
|
|
||||||
|
# Undo a previous replacement by moving each "<dest>.bak" back over <dest>.
# Globals:   FILE_PAIRS (read; only the destination of each pair is used)
# Arguments: none
# Outputs:   progress on stdout, failures on stderr
# Returns:   0 when every existing backup was restored (pairs without a backup
#            are skipped), 1 when at least one restore failed
rollback_files() {
  local pair_count=${#FILE_PAIRS[@]}
  # Declared local so the loop does not clobber same-named caller variables.
  local i dest backup safety
  local failed=0

  echo "=== 开始回滚替换操作 ==="

  for ((i = 0; i < pair_count; i += 2)); do
    dest="${FILE_PAIRS[i + 1]}"
    backup="$dest.bak"
    safety="$dest.rollback_temp"

    printf '\n--- 处理回滚:%s -> %s ---\n' "$backup" "$dest"

    if [ ! -f "$backup" ]; then
      echo "跳过:备份文件不存在 - $backup"
      continue
    fi

    # Best-effort copy of the current file so a failed restore can be undone;
    # the destination may be missing, which is not an error here.
    cp -a -- "$dest" "$safety" 2>/dev/null || true

    if mv -f -- "$backup" "$dest"; then
      echo "已回滚:$dest 恢复为备份版本"
      rm -f -- "$safety"
    else
      echo "错误:回滚失败 - $backup -> $dest" >&2
      # Put the pre-rollback copy back so the destination is left as it was.
      if [ -f "$safety" ]; then
        mv -f -- "$safety" "$dest"
      fi
      failed=1
    fi
  done

  printf '\n=== 回滚操作执行完成 ===\n'
  return "$failed"
}
|
||||||
|
|
||||||
|
# Entry point: run the sub-command named by the first argument.
# Arguments: $1 - "check", "replace" or "rollback"
# Outputs:   a banner for the chosen action; usage text for anything else
# Returns:   status of the last step of the chosen action; exits the script
#            with status 1 when the argument is not a known sub-command
main() {
  local action="$1"

  if [ "$action" = "check" ]; then
    echo "=== 执行文件差异检查(不修改文件) ==="
    check_files_exist "check"
    show_diffs
  elif [ "$action" = "replace" ]; then
    echo "=== 执行文件替换操作(自动备份) ==="
    check_files_exist "replace"
    replace_files
  elif [ "$action" = "rollback" ]; then
    echo "=== 执行文件回滚操作(恢复备份) ==="
    check_files_exist "rollback"
    rollback_files
  else
    # Unknown or missing sub-command: explain the interface and stop.
    echo "用法:$0 [check|replace|rollback]"
    echo "  check   - 仅检查所有文件对的差异,不做修改"
    echo "  replace - 备份所有目标文件并执行替换,完成后验证差异"
    echo "  rollback - 回滚替换操作(恢复 .bak 备份文件)"
    exit 1
  fi
}
|
||||||
|
|
||||||
|
# 启动主逻辑
|
||||||
|
main "$1"
|
||||||
47
docker/paperless/plugins/em_reports_consume.sh
Normal file
47
docker/paperless/plugins/em_reports_consume.sh
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
#!/bin/bash
# Move every *.pdf found directly in SRC into DST, setting owner, group and
# mode on the way, and record each file in LOG.
SRC="/volume1/docker/sharedata/stock_data/pdfs"
DST="/volume1/docker/sharedata/stock_data/em_reports_consume"
LOG="/volume1/docker/projects/devops/docker/paperless/plugins/log/paperless.log"

TARGET_UID=1000
TARGET_GID=1000

# A progress line is printed each time this many files have been moved.
PROGRESS_EVERY=100

# The log directory has to exist before the first message is tee'd into LOG,
# otherwise the directory-check errors below could not be recorded.
LOG_DIR=$(dirname "$LOG")
if [ ! -d "$LOG_DIR" ]; then
  if ! mkdir -p "$LOG_DIR"; then
    echo "$(date '+%F %T') [ERROR] 无法创建log目录: $LOG_DIR" >&2
    exit 1
  fi
  echo "$(date '+%F %T') [INFO] log目录不存在,已创建: $LOG_DIR" | tee -a "$LOG"
fi

# Both directories must already exist; nothing is created on their behalf.
if [ ! -d "$SRC" ]; then
  echo "$(date '+%F %T') [ERROR] 源目录不存在: $SRC" | tee -a "$LOG"
  exit 1
fi
if [ ! -d "$DST" ]; then
  echo "$(date '+%F %T') [ERROR] 目标目录不存在: $DST" | tee -a "$LOG"
  exit 1
fi

COUNT=0
for f in "$SRC"/*.pdf; do
  # Skips the literal pattern when nothing matches, and anything that is not
  # a regular file.
  [ -f "$f" ] || continue

  # "Move" = copy with the wanted owner/group/mode, then delete the source.
  if install -D -o "$TARGET_UID" -g "$TARGET_GID" -m 644 "$f" "$DST"; then
    if rm -f -- "$f"; then
      echo "$(date '+%F %T') [OK] Moved: $f" >> "$LOG"
      COUNT=$((COUNT + 1))

      # Report progress on screen and in the log every PROGRESS_EVERY files.
      if (( COUNT % PROGRESS_EVERY == 0 )); then
        PROGRESS_MSG="$(date '+%F %T') [PROGRESS] 已移动 $COUNT 个文件"
        echo "$PROGRESS_MSG" | tee -a "$LOG"
      fi
    else
      # The copy exists in DST but the source is still there; do not count it
      # as moved so the leftover is visible in the log.
      echo "$(date '+%F %T') [FAIL] Copied but could not remove source: $f" >> "$LOG"
    fi
  else
    echo "$(date '+%F %T') [FAIL] Failed: $f" >> "$LOG"
  fi
done

echo "$(date '+%F %T') [INFO] 搬运完成,共移动 $COUNT 个文件" | tee -a "$LOG"
|
||||||
|
|
||||||
472
docker/paperless/plugins/origin_parsers.py
Normal file
472
docker/paperless/plugins/origin_parsers.py
Normal file
@ -0,0 +1,472 @@
|
|||||||
|
import os
|
||||||
|
import re
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from documents.parsers import DocumentParser
|
||||||
|
from documents.parsers import ParseError
|
||||||
|
from documents.parsers import make_thumbnail_from_pdf
|
||||||
|
from documents.utils import maybe_override_pixel_limit
|
||||||
|
from documents.utils import run_subprocess
|
||||||
|
from paperless.config import OcrConfig
|
||||||
|
from paperless.models import ArchiveFileChoices
|
||||||
|
from paperless.models import CleanChoices
|
||||||
|
from paperless.models import ModeChoices
|
||||||
|
|
||||||
|
|
||||||
|
class NoTextFoundException(Exception):
    """Signals that an OCR attempt finished without yielding any text."""
|
||||||
|
|
||||||
|
|
||||||
|
class RtlLanguageException(Exception):
    """
    Exception type declared by this module.

    NOTE(review): nothing in the visible part of this file raises or catches
    it; presumably it relates to right-to-left languages - confirm before use.
    """
|
||||||
|
|
||||||
|
|
||||||
|
class RasterisedDocumentParser(DocumentParser):
    """
    This parser uses Tesseract to try and get some text out of a rasterised
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
    """

    logging_name = "paperless.parsing.tesseract"

    def get_settings(self) -> OcrConfig:
        """
        This parser uses the OCR configuration settings to parse documents
        """
        return OcrConfig()

    def get_page_count(self, document_path, mime_type):
        """
        Return the number of pages of a PDF, or None for any other mime type
        or when the PDF cannot be opened (a warning is logged in that case).
        """
        page_count = None
        if mime_type == "application/pdf":
            try:
                import pikepdf

                with pikepdf.Pdf.open(document_path) as pdf:
                    page_count = len(pdf.pages)
            except Exception as e:
                self.log.warning(
                    f"Unable to determine PDF page count {document_path}: {e}",
                )
        return page_count

    def extract_metadata(self, document_path, mime_type):
        """
        Collect the metadata entries of a PDF as a list of dicts with the keys
        "namespace", "prefix", "key" and "value". Non-PDF input yields [].

        Entries whose key is not of the form "{namespace}name", or which
        cannot be encoded as UTF-8, are skipped; any other per-entry error is
        logged as a warning and the entry is left out.
        """
        result = []
        if mime_type == "application/pdf":
            import pikepdf

            # Metadata keys look like "{namespace-uri}local-name".
            namespace_pattern = re.compile(r"\{(.*)\}(.*)")

            # Context manager so the file handle is released once all values
            # have been copied into plain strings.
            with pikepdf.open(document_path) as pdf:
                meta = pdf.open_metadata()
                for key, value in meta.items():
                    if isinstance(value, list):
                        value = " ".join([str(e) for e in value])
                    value = str(value)
                    try:
                        m = namespace_pattern.match(key)
                        if m is None:  # pragma: no cover
                            continue
                        namespace = m.group(1)
                        key_value = m.group(2)
                        try:
                            namespace.encode("utf-8")
                            key_value.encode("utf-8")
                        except UnicodeEncodeError as e:  # pragma: no cover
                            self.log.debug(f"Skipping metadata key {key}: {e}")
                            continue
                        result.append(
                            {
                                "namespace": namespace,
                                "prefix": meta.REVERSE_NS[namespace],
                                "key": key_value,
                                "value": value,
                            },
                        )
                    except Exception as e:
                        self.log.warning(
                            f"Error while reading metadata {key}: {value}. Error: {e}",
                        )
        return result

    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """
        Build the thumbnail from the archive PDF when one was produced,
        otherwise from the original document.
        """
        return make_thumbnail_from_pdf(
            self.archive_path or document_path,
            self.tempdir,
            self.logging_group,
        )

    def is_image(self, mime_type) -> bool:
        """Tell whether the mime type is one of the image types listed here."""
        return mime_type in [
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
            "image/webp",
            "image/heic",
        ]

    def has_alpha(self, image) -> bool:
        """Tell whether the image mode is "RGBA" or "LA"."""
        with Image.open(image) as im:
            return im.mode in ("RGBA", "LA")

    def remove_alpha(self, image_path: str) -> Path:
        """
        Run the configured convert binary with "-alpha off" and return the
        path of the resulting file inside the parser's temp directory.
        """
        no_alpha_image = Path(self.tempdir) / "image-no-alpha"
        run_subprocess(
            [
                settings.CONVERT_BINARY,
                "-alpha",
                "off",
                image_path,
                no_alpha_image,
            ],
            logger=self.log,
        )
        return no_alpha_image

    def get_dpi(self, image) -> int | None:
        """
        Return the horizontal DPI stored in the image info, rounded, or None
        when it cannot be read (a warning is logged).
        """
        try:
            with Image.open(image) as im:
                x, _ = im.info["dpi"]
                return round(x)
        except Exception as e:
            self.log.warning(f"Error while getting DPI from image {image}: {e}")
            return None

    def calculate_a4_dpi(self, image) -> int | None:
        """
        Estimate a DPI value by assuming the image is as wide as an A4 page.
        Returns None when the image cannot be read (a warning is logged).
        """
        try:
            with Image.open(image) as im:
                width, _ = im.size
                # divide image width by A4 width (210mm) in inches.
                dpi = int(width / (21 / 2.54))
                self.log.debug(f"Estimated DPI {dpi} based on image width {width}")
                return dpi

        except Exception as e:
            self.log.warning(f"Error while calculating DPI for image {image}: {e}")
            return None

    def extract_text(
        self,
        sidecar_file: Path | None,
        pdf_file: Path,
    ) -> str | None:
        """
        Return the post-processed text of a document, or None.

        A complete sidecar file is preferred (unless the OCR mode is "redo");
        otherwise the text is taken from the PDF with pdftotext. None is
        returned when the PDF does not exist or pdftotext fails.
        """
        # When re-doing OCR, the sidecar contains ONLY the new text, not
        # the whole text, so do not utilize it in that case
        if (
            sidecar_file is not None
            and sidecar_file.is_file()
            and self.settings.mode != ModeChoices.REDO
        ):
            text = self.read_file_handle_unicode_errors(sidecar_file)

            if "[OCR skipped on page" not in text:
                # This happens when there's already text in the input file.
                # The sidecar file will only contain text for OCR'ed pages.
                self.log.debug("Using text from sidecar file")
                return post_process_text(text)
            else:
                self.log.debug("Incomplete sidecar file: discarding.")

        # no success with the sidecar file, try PDF

        if not Path(pdf_file).is_file():
            return None

        try:
            text = None
            with tempfile.NamedTemporaryFile(
                mode="w+",
                dir=self.tempdir,
            ) as tmp:
                run_subprocess(
                    [
                        "pdftotext",
                        "-q",
                        "-layout",
                        "-enc",
                        "UTF-8",
                        pdf_file,
                        tmp.name,
                    ],
                    logger=self.log,
                )
                text = self.read_file_handle_unicode_errors(Path(tmp.name))

            return post_process_text(text)

        except Exception:
            # If pdftotext fails, fall back to OCR.
            self.log.warning(
                "Error while getting text from PDF document with pdftotext",
                exc_info=True,
            )
            # probably not a PDF file.
            return None

    def construct_ocrmypdf_parameters(
        self,
        input_file,
        mime_type,
        output_file,
        sidecar_file,
        *,
        safe_fallback=False,
    ):
        """
        Build the keyword arguments passed to ocrmypdf.ocr() from the OCR
        settings.

        safe_fallback=True forces OCR regardless of the configured mode.
        Raises ParseError for an unknown OCR mode, or for an image input
        whose DPI can be neither read, configured nor estimated.
        """
        if TYPE_CHECKING:
            assert isinstance(self.settings, OcrConfig)
        ocrmypdf_args = {
            "input_file": input_file,
            "output_file": output_file,
            # need to use threads, since this will be run in daemonized
            # processes via the task library.
            "use_threads": True,
            "jobs": settings.THREADS_PER_WORKER,
            "language": self.settings.language,
            "output_type": self.settings.output_type,
            "progress_bar": False,
        }

        if "pdfa" in ocrmypdf_args["output_type"]:
            ocrmypdf_args["color_conversion_strategy"] = (
                self.settings.color_conversion_strategy
            )

        if self.settings.mode == ModeChoices.FORCE or safe_fallback:
            ocrmypdf_args["force_ocr"] = True
        elif self.settings.mode in {
            ModeChoices.SKIP,
            ModeChoices.SKIP_NO_ARCHIVE,
        }:
            ocrmypdf_args["skip_text"] = True
        elif self.settings.mode == ModeChoices.REDO:
            ocrmypdf_args["redo_ocr"] = True
        else:  # pragma: no cover
            raise ParseError(f"Invalid ocr mode: {self.settings.mode}")

        if self.settings.clean == CleanChoices.CLEAN:
            ocrmypdf_args["clean"] = True
        elif self.settings.clean == CleanChoices.FINAL:
            if self.settings.mode == ModeChoices.REDO:
                ocrmypdf_args["clean"] = True
            else:
                # --clean-final is not compatible with --redo-ocr
                ocrmypdf_args["clean_final"] = True

        if self.settings.deskew and self.settings.mode != ModeChoices.REDO:
            # --deskew is not compatible with --redo-ocr
            ocrmypdf_args["deskew"] = True

        if self.settings.rotate:
            ocrmypdf_args["rotate_pages"] = True
            ocrmypdf_args["rotate_pages_threshold"] = self.settings.rotate_threshold

        if self.settings.pages is not None and self.settings.pages > 0:
            ocrmypdf_args["pages"] = f"1-{self.settings.pages}"
        else:
            # sidecar is incompatible with pages
            ocrmypdf_args["sidecar"] = sidecar_file

        if self.is_image(mime_type):
            # This may be required, depending on the known information
            maybe_override_pixel_limit()

            dpi = self.get_dpi(input_file)
            a4_dpi = self.calculate_a4_dpi(input_file)

            if self.has_alpha(input_file):
                self.log.info(
                    f"Removing alpha layer from {input_file} "
                    "for compatibility with img2pdf",
                )
                # Replace the input file with the non-alpha
                ocrmypdf_args["input_file"] = self.remove_alpha(input_file)

            # DPI precedence: value stored in the image, then the configured
            # value, then the A4-width estimate.
            if dpi:
                self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
                ocrmypdf_args["image_dpi"] = dpi
            elif self.settings.image_dpi is not None:
                ocrmypdf_args["image_dpi"] = self.settings.image_dpi
            elif a4_dpi:
                ocrmypdf_args["image_dpi"] = a4_dpi
            else:
                raise ParseError(
                    f"Cannot produce archive PDF for image {input_file}, "
                    f"no DPI information is present in this image and "
                    f"OCR_IMAGE_DPI is not set.",
                )
            if ocrmypdf_args["image_dpi"] < 70:  # pragma: no cover
                self.log.warning(
                    f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail",
                )

        if self.settings.user_args is not None:
            try:
                # User supplied arguments win over the computed ones.
                ocrmypdf_args = {**ocrmypdf_args, **self.settings.user_args}
            except Exception as e:
                self.log.warning(
                    f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                    f"they will not be used. Error: {e}",
                )

        if (
            self.settings.max_image_pixel is not None
            and self.settings.max_image_pixel >= 0
        ):
            # Convert pixels to mega-pixels and provide to ocrmypdf
            max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0
            msg = (
                "OCR pixel limit is disabled!"
                if max_pixels_mpixels == 0
                else f"Calculated {max_pixels_mpixels} megapixels for OCR"
            )
            self.log.debug(msg)
            ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels

        return ocrmypdf_args

    def parse(self, document_path: Path, mime_type, file_name=None):
        """
        Extract the document text into self.text and, depending on the
        settings, set self.archive_path to an OCRed archive PDF.

        PDFs that already carry text are returned as-is when the settings ask
        to skip the archive. Otherwise OCRmyPDF is run; if it finds no text or
        rejects the input, a second run with forced OCR is attempted.
        Raises ParseError when OCR fails for any other reason.
        """
        # This forces tesseract to use one core per page.
        os.environ["OMP_THREAD_LIMIT"] = "1"
        VALID_TEXT_LENGTH = 50

        if mime_type == "application/pdf":
            text_original = self.extract_text(None, document_path)
            original_has_text = (
                text_original is not None and len(text_original) > VALID_TEXT_LENGTH
            )
        else:
            text_original = None
            original_has_text = False

        # If the original has text, and the user doesn't want an archive,
        # we're done here
        skip_archive_for_text = (
            self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
            or self.settings.skip_archive_file
            in {
                ArchiveFileChoices.WITH_TEXT,
                ArchiveFileChoices.ALWAYS,
            }
        )
        if skip_archive_for_text and original_has_text:
            # Log only the size: writing the whole document text into the
            # log bloats it and exposes document content.
            self.log.debug(
                "Document has text, skipping OCRmyPDF entirely. "
                f"({len(text_original)} characters extracted)",
            )
            self.text = text_original
            return

        # Either no text was in the original or there should be an archive
        # file created, so OCR the file and create an archive with any
        # text located via OCR

        import ocrmypdf
        from ocrmypdf import EncryptedPdfError
        from ocrmypdf import InputFileError
        from ocrmypdf import SubprocessOutputError
        from ocrmypdf.exceptions import DigitalSignatureError

        archive_path = Path(self.tempdir) / "archive.pdf"
        sidecar_file = Path(self.tempdir) / "sidecar.txt"

        args = self.construct_ocrmypdf_parameters(
            document_path,
            mime_type,
            archive_path,
            sidecar_file,
        )

        try:
            self.log.debug(f"Calling OCRmyPDF with args: {args}")
            ocrmypdf.ocr(**args)

            if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
                self.archive_path = archive_path

            self.text = self.extract_text(sidecar_file, archive_path)

            if not self.text:
                raise NoTextFoundException("No text was found in the original document")
        except (DigitalSignatureError, EncryptedPdfError):
            self.log.warning(
                "This file is encrypted and/or signed, OCR is impossible. Using "
                "any text present in the original file.",
            )
            if original_has_text:
                self.text = text_original
        except SubprocessOutputError as e:
            if "Ghostscript PDF/A rendering" in str(e):
                self.log.warning(
                    "Ghostscript PDF/A rendering failed, consider setting "
                    "PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
                )

            raise ParseError(
                f"SubprocessOutputError: {e!s}. See logs for more information.",
            ) from e
        except (NoTextFoundException, InputFileError) as e:
            self.log.warning(
                f"Encountered an error while running OCR: {e!s}. "
                f"Attempting force OCR to get the text.",
            )

            archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf"
            sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt"

            # Attempt to run OCR with safe settings.

            args = self.construct_ocrmypdf_parameters(
                document_path,
                mime_type,
                archive_path_fallback,
                sidecar_file_fallback,
                safe_fallback=True,
            )

            try:
                self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
                ocrmypdf.ocr(**args)

                # Don't return the archived file here, since this file
                # is bigger and blurry due to --force-ocr.

                self.text = self.extract_text(
                    sidecar_file_fallback,
                    archive_path_fallback,
                )

            except Exception as e:
                # If this fails, we have a serious issue at hand.
                raise ParseError(f"{e.__class__.__name__}: {e!s}") from e

        except Exception as e:
            # Anything else is probably serious.
            raise ParseError(f"{e.__class__.__name__}: {e!s}") from e

        # As a last resort, if we still don't have any text for any reason,
        # try to extract the text from the original document.
        if not self.text:
            if original_has_text:
                self.text = text_original
            else:
                self.log.warning(
                    f"No text was found in {document_path}, the content will be empty.",
                )
                self.text = ""
|
||||||
|
|
||||||
|
|
||||||
|
def post_process_text(text):
    """
    Normalise whitespace in extracted text.

    NUL characters are turned into spaces, runs of non-newline whitespace are
    collapsed to a single space, whitespace at the start of each line is
    removed, and the result is stripped at both ends.

    Returns None when ``text`` is empty or None, otherwise the cleaned string
    (which may be empty if the input held only whitespace or NULs).
    """
    if not text:
        return None

    # Replace \0 first: it prevents issues with saving to postgres (text may
    # contain \0 when this character is present in PDF files), and doing it
    # before the whitespace handling means the spaces it introduces are
    # collapsed and stripped like any other.
    without_nul = text.replace("\0", " ")

    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", without_nul)
    no_leading_whitespace = re.sub(r"([\n\r]+)([^\S\n\r]+)", "\\1", collapsed_spaces)
    # Without re.MULTILINE this only trims whitespace at the very end of the text.
    no_trailing_whitespace = re.sub(r"([^\S\n\r]+)$", "", no_leading_whitespace)

    # TODO: this needs a rework
    return no_trailing_whitespace.strip()
|
||||||
@ -1,41 +0,0 @@
|
|||||||
-- documents_correspondent definition
|
|
||||||
|
|
||||||
CREATE TABLE "documents_correspondent" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "name" varchar(128) NOT NULL, "match" varchar(256) NOT NULL, "matching_algorithm" integer unsigned NOT NULL CHECK ("matching_algorithm" >= 0), "is_insensitive" bool NOT NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, CONSTRAINT "documents_correspondent_unique_name_owner" UNIQUE ("name", "owner_id"));
|
|
||||||
|
|
||||||
CREATE UNIQUE INDEX "documents_correspondent_name_uniq" ON "documents_correspondent" ("name") WHERE "owner_id" IS NULL;
|
|
||||||
CREATE INDEX "documents_correspondent_owner_id_078f7f8a" ON "documents_correspondent" ("owner_id");
|
|
||||||
|
|
||||||
-- documents_customfield definition
|
|
||||||
|
|
||||||
CREATE TABLE "documents_customfield" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "created" datetime NOT NULL, "name" varchar(128) NOT NULL, "data_type" varchar(50) NOT NULL, "extra_data" text NULL CHECK ((JSON_VALID("extra_data") OR "extra_data" IS NULL)), CONSTRAINT "documents_customfield_unique_name" UNIQUE ("name"));
|
|
||||||
|
|
||||||
CREATE INDEX "documents_customfield_created_501ef047" ON "documents_customfield" ("created");
|
|
||||||
|
|
||||||
-- documents_customfieldinstance definition
|
|
||||||
|
|
||||||
CREATE TABLE "documents_customfieldinstance" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "created" datetime NOT NULL, "value_text" varchar(128) NULL, "value_bool" bool NULL, "value_url" varchar(200) NULL, "value_date" date NULL, "value_int" integer NULL, "value_float" real NULL, "value_monetary" varchar(128) NULL, "document_id" integer NOT NULL REFERENCES "documents_document" ("id") DEFERRABLE INITIALLY DEFERRED, "field_id" integer NOT NULL REFERENCES "documents_customfield" ("id") DEFERRABLE INITIALLY DEFERRED, "value_document_ids" text NULL CHECK ((JSON_VALID("value_document_ids") OR "value_document_ids" IS NULL)), "value_monetary_amount" decimal GENERATED ALWAYS AS ((CAST(CASE WHEN "value_monetary" REGEXP '^\d+' THEN CAST(SUBSTR("value_monetary", 1) AS decimal) ELSE CAST(SUBSTR("value_monetary", 4) AS decimal) END AS NUMERIC))) STORED, "deleted_at" datetime NULL, "restored_at" datetime NULL, "transaction_id" char(32) NULL, "value_select" varchar(16) NULL, CONSTRAINT "documents_customfieldinstance_unique_document_field" UNIQUE ("document_id", "field_id"));
|
|
||||||
|
|
||||||
CREATE INDEX "documents_customfieldinstance_created_75f17f1d" ON "documents_customfieldinstance" ("created");
|
|
||||||
CREATE INDEX "documents_customfieldinstance_document_id_610a968e" ON "documents_customfieldinstance" ("document_id");
|
|
||||||
CREATE INDEX "documents_customfieldinstance_field_id_6c59e32f" ON "documents_customfieldinstance" ("field_id");
|
|
||||||
|
|
||||||
|
|
||||||
-- documents_document definition
|
|
||||||
|
|
||||||
CREATE TABLE "documents_document" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "title" varchar(128) NOT NULL, "content" text NOT NULL, "created" datetime NOT NULL, "modified" datetime NOT NULL, "correspondent_id" integer NULL REFERENCES "documents_correspondent" ("id") DEFERRABLE INITIALLY DEFERRED, "checksum" varchar(32) NOT NULL UNIQUE, "added" datetime NOT NULL, "storage_type" varchar(11) NOT NULL, "filename" varchar(1024) NULL UNIQUE, "archive_serial_number" integer unsigned NULL UNIQUE CHECK ("archive_serial_number" >= 0), "document_type_id" integer NULL REFERENCES "documents_documenttype" ("id") DEFERRABLE INITIALLY DEFERRED, "mime_type" varchar(256) NOT NULL, "archive_checksum" varchar(32) NULL, "archive_filename" varchar(1024) NULL UNIQUE, "storage_path_id" integer NULL REFERENCES "documents_storagepath" ("id") DEFERRABLE INITIALLY DEFERRED, "original_filename" varchar(1024) NULL, "deleted_at" datetime NULL, "restored_at" datetime NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, "transaction_id" char(32) NULL, "page_count" integer unsigned NULL CHECK ("page_count" >= 0));
|
|
||||||
|
|
||||||
CREATE INDEX "documents_document_title_6b08e02a" ON "documents_document" ("title");
|
|
||||||
CREATE INDEX "documents_document_created_bedd0818" ON "documents_document" ("created");
|
|
||||||
CREATE INDEX "documents_document_modified_2eae15bc" ON "documents_document" ("modified");
|
|
||||||
CREATE INDEX "documents_document_correspondent_id_6164eb0c" ON "documents_document" ("correspondent_id");
|
|
||||||
CREATE INDEX "documents_document_added_28cfa360" ON "documents_document" ("added");
|
|
||||||
CREATE INDEX "documents_document_document_type_id_1f88b50c" ON "documents_document" ("document_type_id");
|
|
||||||
CREATE INDEX "documents_document_storage_path_id_07d27bdb" ON "documents_document" ("storage_path_id");
|
|
||||||
CREATE INDEX "documents_document_owner_id_04d2b723" ON "documents_document" ("owner_id");
|
|
||||||
|
|
||||||
-- documents_documenttype definition
|
|
||||||
|
|
||||||
CREATE TABLE "documents_documenttype" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "name" varchar(128) NOT NULL, "match" varchar(256) NOT NULL, "matching_algorithm" integer unsigned NOT NULL CHECK ("matching_algorithm" >= 0), "is_insensitive" bool NOT NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, CONSTRAINT "documents_documenttype_unique_name_owner" UNIQUE ("name", "owner_id"));
|
|
||||||
|
|
||||||
CREATE UNIQUE INDEX "documents_documenttype_name_uniq" ON "documents_documenttype" ("name") WHERE "owner_id" IS NULL;
|
|
||||||
CREATE INDEX "documents_documenttype_owner_id_a19f201d" ON "documents_documenttype" ("owner_id");
|
|
||||||
@ -1,63 +0,0 @@
|
|||||||
我提供的文件,是 paperless 的SQLite数据库的关键表。现在我们编写它的 PAPERLESS_POST_CONSUME_SCRIPT。需求如下:
|
|
||||||
|
|
||||||
1, 我们提供的pdf文件格式为 {publish_date}_{report_type}_{org_sname}_{industry_name}_{stock_name}_{title}.pdf
|
|
||||||
2,我们提取上面的各个字段,然后:
|
|
||||||
1) report_type 对应到 documents_documenttype.name 所以我们要查询 documents_documenttype 表,如果对应的name不存在,则插入一条记录;然后得到对应的 documents_documenttype.id
|
|
||||||
2) org_sname 对应到 documents_correspondent.name 所以我们要查询 documents_correspondent 表,如果对应的name 不存在,则插入一条记录,然后得到对应的 documents_correspondent.id
|
|
||||||
3) 检查 documents_customfield 表是否包含 '行业' 和 '股票名称' 字段,如果不存在,则创建; 查到它们分别对应的 documents_customfield.id , 记为 industry_id, stockname_id
|
|
||||||
3,我们开始更新数据表:
|
|
||||||
1) 更新 documents_document 表对应的记录, created = publish_date, correspondent_id = documents_correspondent.id , document_type_id = documents_documenttype.id, title={title}
|
|
||||||
2) 向 documents_customfieldinstance 插入两条记录,分别为 (document_id, stockname_id, stock_name) 和 (document_id, industry_id, industry_name)
|
|
||||||
|
|
||||||
好了,请你根据以上需求,完成这个python脚本。注意异常情况的处理,以及日志输出。如果文件名无法匹配以上的格式,则忽略,不用处理。
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Paperless makes use of the Django REST Framework standard API interface. It provides a browsable API for most of its endpoints, which you can inspect at http://<paperless-host>:<port>/api/. This also documents most of the available filters and ordering fields.
|
|
||||||
|
|
||||||
The API provides the following main endpoints:
|
|
||||||
|
|
||||||
/api/correspondents/: Full CRUD support.
|
|
||||||
/api/custom_fields/: Full CRUD support.
|
|
||||||
/api/documents/: Full CRUD support, except POSTing new documents. See below.
|
|
||||||
/api/document_types/: Full CRUD support.
|
|
||||||
/api/groups/: Full CRUD support.
|
|
||||||
/api/logs/: Read-Only.
|
|
||||||
/api/mail_accounts/: Full CRUD support.
|
|
||||||
/api/mail_rules/: Full CRUD support.
|
|
||||||
/api/profile/: GET, PATCH
|
|
||||||
/api/share_links/: Full CRUD support.
|
|
||||||
/api/storage_paths/: Full CRUD support.
|
|
||||||
/api/tags/: Full CRUD support.
|
|
||||||
/api/tasks/: Read-only.
|
|
||||||
/api/users/: Full CRUD support.
|
|
||||||
/api/workflows/: Full CRUD support.
|
|
||||||
/api/search/ GET, see below.
|
|
||||||
All of these endpoints except for the logging endpoint allow you to fetch (and edit and delete where appropriate) individual objects by appending their primary key to the path, e.g. /api/documents/454/.
|
|
||||||
|
|
||||||
The objects served by the document endpoint contain the following fields:
|
|
||||||
|
|
||||||
id: ID of the document. Read-only.
|
|
||||||
title: Title of the document.
|
|
||||||
content: Plain text content of the document.
|
|
||||||
tags: List of IDs of tags assigned to this document, or empty list.
|
|
||||||
document_type: Document type of this document, or null.
|
|
||||||
correspondent: Correspondent of this document or null.
|
|
||||||
created: The date time at which this document was created.
|
|
||||||
created_date: The date (YYYY-MM-DD) at which this document was created. Optional. If also passed with created, this is ignored.
|
|
||||||
modified: The date at which this document was last edited in paperless. Read-only.
|
|
||||||
added: The date at which this document was added to paperless. Read-only.
|
|
||||||
archive_serial_number: The identifier of this document in a physical document archive.
|
|
||||||
original_file_name: Verbose filename of the original document. Read-only.
|
|
||||||
archived_file_name: Verbose filename of the archived document. Read-only. Null if no archived document is available.
|
|
||||||
notes: Array of notes associated with the document.
|
|
||||||
page_count: Number of pages.
|
|
||||||
set_permissions: Allows setting document permissions. Optional, write-only. See below.
|
|
||||||
custom_fields: Array of custom fields & values, specified as { field: CUSTOM_FIELD_ID, value: VALUE }
|
|
||||||
|
|
||||||
|
|
||||||
以上是paperless提供的api。我们现在使用 http://localhost:8000 来访问它。那么,我想对编号为19的文档进行查询,以及更新操作,应该如何写对应的python代码?
|
|
||||||
|
|
||||||
|
|
||||||
@ -11,7 +11,8 @@ from requests.exceptions import RequestException
|
|||||||
|
|
||||||
# Paperless 服务器信息
|
# Paperless 服务器信息
|
||||||
PAPERLESS_URL = "http://localhost:8000/api"
|
PAPERLESS_URL = "http://localhost:8000/api"
|
||||||
AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth 认证
|
#AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth 认证, mac上用这个
|
||||||
|
AUTH = HTTPBasicAuth("admin", "paperless") # Basic Auth 认证,NAS上用这个
|
||||||
|
|
||||||
# 日志配置
|
# 日志配置
|
||||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||||
@ -22,7 +23,7 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(me
|
|||||||
DB_PATH = "/usr/src/paperless/data/db.sqlite3"
|
DB_PATH = "/usr/src/paperless/data/db.sqlite3"
|
||||||
conn = sqlite3.connect(DB_PATH)
|
conn = sqlite3.connect(DB_PATH)
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
enable_db = True
|
enable_db = False # 标准用法,用API
|
||||||
|
|
||||||
# 正则解析文件名
|
# 正则解析文件名
|
||||||
FILENAME_PATTERN = re.compile(r"(\d{4}-\d{2}-\d{2})_(.*?)_(.*?)_(.*?)_(.*?)_(.*)\.pdf")
|
FILENAME_PATTERN = re.compile(r"(\d{4}-\d{2}-\d{2})_(.*?)_(.*?)_(.*?)_(.*?)_(.*)\.pdf")
|
||||||
|
|||||||
484
docker/paperless/plugins/parsers.py
Executable file
484
docker/paperless/plugins/parsers.py
Executable file
@ -0,0 +1,484 @@
|
|||||||
|
import os
|
||||||
|
import re
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from documents.parsers import DocumentParser
|
||||||
|
from documents.parsers import ParseError
|
||||||
|
from documents.parsers import make_thumbnail_from_pdf
|
||||||
|
from documents.utils import maybe_override_pixel_limit
|
||||||
|
from documents.utils import run_subprocess
|
||||||
|
from paperless.config import OcrConfig
|
||||||
|
from paperless.models import ArchiveFileChoices
|
||||||
|
from paperless.models import CleanChoices
|
||||||
|
from paperless.models import ModeChoices
|
||||||
|
|
||||||
|
|
||||||
|
class NoTextFoundException(Exception):
    """Raised when an OCR run completes but yields no text for the document."""
|
||||||
|
|
||||||
|
|
||||||
|
class RtlLanguageException(Exception):
    """
    Marker exception type.

    NOTE(review): nothing in the visible part of this module raises or
    catches it; presumably it relates to right-to-left language text --
    confirm against callers.
    """
|
||||||
|
|
||||||
|
|
||||||
|
class RasterisedDocumentParser(DocumentParser):
|
||||||
|
"""
|
||||||
|
This parser uses Tesseract to try and get some text out of a rasterised
|
||||||
|
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
|
||||||
|
"""
|
||||||
|
|
||||||
|
logging_name = "paperless.parsing.tesseract"
|
||||||
|
|
||||||
|
def get_settings(self) -> OcrConfig:
    """
    Build the configuration object this parser works from.

    Returns a freshly constructed ``OcrConfig``.
    """
    ocr_config = OcrConfig()
    return ocr_config
|
||||||
|
|
||||||
|
def get_page_count(self, document_path, mime_type):
    """
    Count the pages of a PDF document.

    Returns the number of pages when ``mime_type`` is ``application/pdf``
    and the file can be opened with pikepdf. Returns ``None`` for any other
    mime type, or when opening/reading the PDF fails (a warning is logged
    in that case).
    """
    if mime_type != "application/pdf":
        return None

    try:
        # Imported lazily, only when a PDF actually has to be inspected.
        import pikepdf

        with pikepdf.Pdf.open(document_path) as pdf_doc:
            return len(pdf_doc.pages)
    except Exception as err:
        self.log.warning(
            f"Unable to determine PDF page count {document_path}: {err}",
        )
    return None
|
||||||
|
|
||||||
|
def extract_metadata(self, document_path, mime_type):
    """
    Collect the XMP metadata entries of a PDF document.

    For ``application/pdf`` documents, returns a list of dicts with the
    keys ``namespace``, ``prefix``, ``key`` and ``value``, one per metadata
    entry whose key has the ``{namespace}name`` form. List values are
    joined with spaces and every value is converted to ``str``. Entries
    that cannot be processed are skipped with a log message.

    For any other mime type an empty list is returned.
    """
    result = []
    if mime_type != "application/pdf":
        return result

    # Imported lazily, only when a PDF actually has to be inspected.
    import pikepdf

    namespace_pattern = re.compile(r"\{(.*)\}(.*)")

    # Open through a context manager so the PDF is closed again once the
    # metadata has been read (previously the handle was never closed).
    with pikepdf.open(document_path) as pdf:
        meta = pdf.open_metadata()
        for key, value in meta.items():
            if isinstance(value, list):
                value = " ".join([str(e) for e in value])
            value = str(value)
            try:
                m = namespace_pattern.match(key)
                if m is None:  # pragma: no cover
                    continue
                namespace = m.group(1)
                key_value = m.group(2)
                try:
                    # Only keep entries whose parts can be encoded as UTF-8.
                    namespace.encode("utf-8")
                    key_value.encode("utf-8")
                except UnicodeEncodeError as e:  # pragma: no cover
                    self.log.debug(f"Skipping metadata key {key}: {e}")
                    continue
                result.append(
                    {
                        "namespace": namespace,
                        "prefix": meta.REVERSE_NS[namespace],
                        "key": key_value,
                        "value": value,
                    },
                )
            except Exception as e:
                self.log.warning(
                    f"Error while reading metadata {key}: {value}. Error: {e}",
                )
    return result
|
||||||
|
|
||||||
|
def get_thumbnail(self, document_path, mime_type, file_name=None):
    """
    Produce a thumbnail via ``make_thumbnail_from_pdf``.

    The archive version (``self.archive_path``) is used as the source when
    it is set; otherwise the original ``document_path`` is used.
    ``mime_type`` and ``file_name`` are not consulted.
    """
    source_pdf = self.archive_path or document_path
    return make_thumbnail_from_pdf(source_pdf, self.tempdir, self.logging_group)
|
||||||
|
|
||||||
|
def is_image(self, mime_type) -> bool:
    """Tell whether ``mime_type`` is one of the image types this parser handles."""
    image_mime_types = (
        "image/png",
        "image/jpeg",
        "image/tiff",
        "image/bmp",
        "image/gif",
        "image/webp",
        "image/heic",
    )
    return mime_type in image_mime_types
|
||||||
|
|
||||||
|
def has_alpha(self, image) -> bool:
    """Report whether the image's PIL mode is ``RGBA`` or ``LA``."""
    with Image.open(image) as img:
        mode = img.mode
    return mode == "RGBA" or mode == "LA"
|
||||||
|
|
||||||
|
def remove_alpha(self, image_path: str) -> Path:
    """
    Write a copy of the image with its alpha channel switched off.

    Runs ``settings.CONVERT_BINARY -alpha off <image_path> <output>`` and
    returns the output path, ``<self.tempdir>/image-no-alpha``.
    """
    flattened = Path(self.tempdir) / "image-no-alpha"
    convert_cmd = [
        settings.CONVERT_BINARY,
        "-alpha",
        "off",
        image_path,
        flattened,
    ]
    run_subprocess(convert_cmd, logger=self.log)
    return flattened
|
||||||
|
|
||||||
|
def get_dpi(self, image) -> int | None:
|
||||||
|
try:
|
||||||
|
with Image.open(image) as im:
|
||||||
|
x, _ = im.info["dpi"]
|
||||||
|
return round(x)
|
||||||
|
except Exception as e:
|
||||||
|
self.log.warning(f"Error while getting DPI from image {image}: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
def calculate_a4_dpi(self, image) -> int | None:
|
||||||
|
try:
|
||||||
|
with Image.open(image) as im:
|
||||||
|
width, _ = im.size
|
||||||
|
# divide image width by A4 width (210mm) in inches.
|
||||||
|
dpi = int(width / (21 / 2.54))
|
||||||
|
self.log.debug(f"Estimated DPI {dpi} based on image width {width}")
|
||||||
|
return dpi
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.log.warning(f"Error while calculating DPI for image {image}: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
def extract_text(
|
||||||
|
self,
|
||||||
|
sidecar_file: Path | None,
|
||||||
|
pdf_file: Path,
|
||||||
|
) -> str | None:
|
||||||
|
# When re-doing OCR, the sidecar contains ONLY the new text, not
|
||||||
|
# the whole text, so do not utilize it in that case
|
||||||
|
if (
|
||||||
|
sidecar_file is not None
|
||||||
|
and sidecar_file.is_file()
|
||||||
|
and self.settings.mode != "redo"
|
||||||
|
):
|
||||||
|
text = self.read_file_handle_unicode_errors(sidecar_file)
|
||||||
|
|
||||||
|
if "[OCR skipped on page" not in text:
|
||||||
|
# This happens when there's already text in the input file.
|
||||||
|
# The sidecar file will only contain text for OCR'ed pages.
|
||||||
|
self.log.debug("Using text from sidecar file")
|
||||||
|
return post_process_text(text)
|
||||||
|
else:
|
||||||
|
self.log.debug("Incomplete sidecar file: discarding.")
|
||||||
|
|
||||||
|
# no success with the sidecar file, try PDF
|
||||||
|
|
||||||
|
if not Path(pdf_file).is_file():
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
text = None
|
||||||
|
with tempfile.NamedTemporaryFile(
|
||||||
|
mode="w+",
|
||||||
|
dir=self.tempdir,
|
||||||
|
) as tmp:
|
||||||
|
run_subprocess(
|
||||||
|
[
|
||||||
|
"pdftotext",
|
||||||
|
"-q",
|
||||||
|
"-layout",
|
||||||
|
"-enc",
|
||||||
|
"UTF-8",
|
||||||
|
pdf_file,
|
||||||
|
tmp.name,
|
||||||
|
],
|
||||||
|
logger=self.log,
|
||||||
|
)
|
||||||
|
text = self.read_file_handle_unicode_errors(Path(tmp.name))
|
||||||
|
|
||||||
|
return post_process_text(text)
|
||||||
|
|
||||||
|
except Exception:
|
||||||
|
# If pdftotext fails, fall back to OCR.
|
||||||
|
self.log.warning(
|
||||||
|
"Error while getting text from PDF document with pdftotext",
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
|
# probably not a PDF file.
|
||||||
|
return None
|
||||||
|
|
||||||
|
def construct_ocrmypdf_parameters(
    self,
    input_file,
    mime_type,
    output_file,
    sidecar_file,
    *,
    safe_fallback=False,
):
    """
    Assemble the keyword arguments for an ``ocrmypdf.ocr`` call.

    The arguments are derived from ``self.settings`` (OCR mode, clean,
    deskew, rotate, page limit, image DPI, pixel limit) and finally merged
    with ``self.settings.user_args``, which take precedence.

    ``safe_fallback=True`` forces OCR regardless of the configured mode.

    Raises ``ParseError`` for an unknown OCR mode, or for an image input
    for which no DPI can be determined.
    """
    if TYPE_CHECKING:
        assert isinstance(self.settings, OcrConfig)

    cfg = self.settings
    redo_requested = cfg.mode == ModeChoices.REDO

    params = {
        "input_file": input_file,
        "output_file": output_file,
        # need to use threads, since this will be run in daemonized
        # processes via the task library.
        "use_threads": True,
        "jobs": settings.THREADS_PER_WORKER,
        "language": cfg.language,
        "output_type": cfg.output_type,
        "progress_bar": False,
    }

    if "pdfa" in params["output_type"]:
        params["color_conversion_strategy"] = cfg.color_conversion_strategy

    # Exactly one OCR strategy flag is set, chosen by the configured mode.
    if cfg.mode == ModeChoices.FORCE or safe_fallback:
        params["force_ocr"] = True
    elif cfg.mode in {ModeChoices.SKIP, ModeChoices.SKIP_NO_ARCHIVE}:
        params["skip_text"] = True
    elif redo_requested:
        params["redo_ocr"] = True
    else:  # pragma: no cover
        raise ParseError(f"Invalid ocr mode: {cfg.mode}")

    if cfg.clean == CleanChoices.CLEAN:
        params["clean"] = True
    elif cfg.clean == CleanChoices.FINAL:
        # --clean-final is not compatible with --redo-ocr
        params["clean" if redo_requested else "clean_final"] = True

    if cfg.deskew and not redo_requested:
        # --deskew is not compatible with --redo-ocr
        params["deskew"] = True

    if cfg.rotate:
        params["rotate_pages"] = True
        params["rotate_pages_threshold"] = cfg.rotate_threshold

    page_limit = cfg.pages
    if page_limit is not None and page_limit > 0:
        params["pages"] = f"1-{page_limit}"
    else:
        # sidecar is incompatible with pages
        params["sidecar"] = sidecar_file

    if self.is_image(mime_type):
        # This may be required, depending on the known information
        maybe_override_pixel_limit()

        detected_dpi = self.get_dpi(input_file)
        a4_estimate = self.calculate_a4_dpi(input_file)

        if self.has_alpha(input_file):
            self.log.info(
                f"Removing alpha layer from {input_file} "
                "for compatibility with img2pdf",
            )
            # Replace the input file with the non-alpha
            params["input_file"] = self.remove_alpha(input_file)

        # DPI precedence: value stored in the image, then the configured
        # value, then the A4-width estimate.
        if detected_dpi:
            self.log.debug(f"Detected DPI for image {input_file}: {detected_dpi}")
            params["image_dpi"] = detected_dpi
        elif cfg.image_dpi is not None:
            params["image_dpi"] = cfg.image_dpi
        elif a4_estimate:
            params["image_dpi"] = a4_estimate
        else:
            raise ParseError(
                f"Cannot produce archive PDF for image {input_file}, "
                f"no DPI information is present in this image and "
                f"OCR_IMAGE_DPI is not set.",
            )
        if params["image_dpi"] < 70:  # pragma: no cover
            self.log.warning(
                f"Image DPI of {params['image_dpi']} is low, OCR may fail",
            )

    extra_args = cfg.user_args
    if extra_args is not None:
        try:
            params = {**params, **extra_args}
        except Exception as err:
            self.log.warning(
                f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                f"they will not be used. Error: {err}",
            )

    pixel_cap = cfg.max_image_pixel
    if pixel_cap is not None and pixel_cap >= 0:
        # Convert pixels to mega-pixels and provide to ocrmypdf
        megapixels = pixel_cap / 1_000_000.0
        if megapixels == 0:
            self.log.debug("OCR pixel limit is disabled!")
        else:
            self.log.debug(f"Calculated {megapixels} megapixels for OCR")
        params["max_image_mpixels"] = megapixels

    return params
|
||||||
|
|
||||||
|
def parse(self, document_path: Path, mime_type, file_name=None):
    """
    Register placeholder text for the document without reading it.

    This patched parser deliberately bypasses text extraction and OCR to
    save time: the document is never opened, no archive version is
    produced, and ``self.text`` is always set to a fixed placeholder.

    ``document_path``, ``mime_type`` and ``file_name`` are accepted only to
    keep the parser interface unchanged; none of them is consulted.

    The text-extraction/OCRmyPDF logic that used to follow the early
    ``return`` could never execute and has been removed.
    """
    # This forces tesseract to use one core per page.
    os.environ["OMP_THREAD_LIMIT"] = "1"

    # skip ocr process entirely to save time.
    self.text = "default text"
    self.log.debug("skipping reading file entirely.")
|
||||||
|
|
||||||
|
|
||||||
|
def post_process_text(text):
    """
    Normalise whitespace in extracted document text.

    Runs of horizontal whitespace are collapsed to a single space,
    whitespace directly after line breaks and at the very end is removed,
    the result is stripped and NUL characters are replaced by spaces.

    Returns ``None`` when ``text`` is empty or ``None``.
    """
    if not text:
        return None

    cleaned = text
    substitutions = (
        # collapse runs of non-newline whitespace
        (r"([^\S\r\n]+)", " "),
        # drop whitespace that directly follows line breaks
        (r"([\n\r]+)([^\S\n\r]+)", "\\1"),
        # drop trailing non-newline whitespace
        (r"([^\S\n\r]+)$", ""),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # TODO: this needs a rework
    # replace \0 prevents issues with saving to postgres.
    # text may contain \0 when this character is present in PDF files.
    return cleaned.strip().replace("\0", " ")
|
||||||
37
docker/paperless/plugins/readme.md
Normal file
37
docker/paperless/plugins/readme.md
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
## 登陆
|
||||||
|
### 用户名: admin
|
||||||
|
### 密码: paperless
|
||||||
|
|
||||||
|
## 需要指定用户名
|
||||||
|
### 配置好 USERMAP_UID 和 USERMAP_GID,否则可能无法执行主机映射进去的脚本。
|
||||||
|
### 详见 https://docs.paperless-ngx.com/configuration/#USERMAP_UID
|
||||||
|
|
||||||
|
## 自定义的文件名解析脚本
|
||||||
|
```Bash
|
||||||
|
# 文档
|
||||||
|
https://docs.paperless-ngx.com/advanced_usage/#file-name-handling
|
||||||
|
https://docs.paperless-ngx.com/configuration/#PAPERLESS_POST_CONSUME_SCRIPT
|
||||||
|
|
||||||
|
# 配置
|
||||||
|
environment:
|
||||||
|
PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## 源码修改,可以通过在容器里执行 docker_patch.sh 脚本来完成
|
||||||
|
### 对于无法简单读取pdf内容的文档,paperless会启动OCR扫描,且复杂情况下会执行两遍,非常慢而且消耗资源。只能通过修改源码解决:
|
||||||
|
```Bash
|
||||||
|
# /usr/src/paperless/src/paperless_tesseract/parsers.py :
|
||||||
|
|
||||||
|
# force skip ocr process.
|
||||||
|
if not original_has_text:
|
||||||
|
original_has_text = True
|
||||||
|
text_original = "this is default content, as we skipped ocr process..."
|
||||||
|
self.log.warning("Cannot read text from Document, use default message.")
|
||||||
|
|
||||||
|
if skip_archive_for_text and original_has_text:
|
||||||
|
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
|
||||||
|
self.text = text_original
|
||||||
|
return
|
||||||
|
|
||||||
|
```
|
||||||
@ -1,64 +0,0 @@
|
|||||||
|
|
||||||
|
|
||||||
-------------------------------------------------------|
|
|
||||||
------------------- paperless 无纸化pdf管理 ------------|
|
|
||||||
-------------------------------------------------------|
|
|
||||||
|
|
||||||
## 最好不要用命令,使用docker-compose.yml来创建,需要指定后端使用的数据库,以及redis!
|
|
||||||
docker run -itd \
|
|
||||||
--name paperless \
|
|
||||||
--network devops \
|
|
||||||
--platform linux/x86_64 \
|
|
||||||
-e TZ="Asia/Shanghai" \
|
|
||||||
-v /etc/localtime:/etc/localtime:ro \
|
|
||||||
-v "$(pwd)/dockers/paperless/pdfs:/usr/src/paperless/data" \
|
|
||||||
-v "$(pwd)/dockers/paperless/db:/usr/src/paperless/db" \
|
|
||||||
-e USERMAP_UID=1000 -e USERMAP_GID=1000 \
|
|
||||||
-p 8000:8000 \
|
|
||||||
ghcr.io/paperless-ngx/paperless-ngx
|
|
||||||
|
|
||||||
|
|
||||||
# 容器创建好之后,要手动设置密码(二选一操作,目前设置的 admin / admin)
|
|
||||||
docker compose run --rm webserver createsuperuser
|
|
||||||
python3 manage.py createsuperuser
|
|
||||||
|
|
||||||
# 已有文档,放在指定目录下,等系统自动加载(或者手工启动)
|
|
||||||
cd /path/to/paperless/src/
|
|
||||||
python3 manage.py document_consumer
|
|
||||||
|
|
||||||
# 自动解析文件名
|
|
||||||
https://docs.paperless-ngx.com/advanced_usage/#file-name-handling
|
|
||||||
https://docs.paperless-ngx.com/configuration/#PAPERLESS_POST_CONSUME_SCRIPT
|
|
||||||
|
|
||||||
environment:
|
|
||||||
PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
|
|
||||||
|
|
||||||
|
|
||||||
paperless 默认不会删除重复的文件,这会导致如果重复添加,会不停扫描,加载,报错。没找到配置,直接修改源码解决:
|
|
||||||
|
|
||||||
/usr/src/paperless/src/documents/consumer.py
|
|
||||||
|
|
||||||
def pre_check_duplicate(self):
|
|
||||||
"""
|
|
||||||
Using the MD5 of the file, check this exact file doesn't already exist
|
|
||||||
"""
|
|
||||||
with open(self.input_doc.original_file, "rb") as f:
|
|
||||||
checksum = hashlib.md5(f.read()).hexdigest()
|
|
||||||
existing_doc = Document.global_objects.filter(
|
|
||||||
Q(checksum=checksum) | Q(archive_checksum=checksum),
|
|
||||||
)
|
|
||||||
if existing_doc.exists():
|
|
||||||
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
|
|
||||||
log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})."
|
|
||||||
|
|
||||||
if existing_doc.first().deleted_at is not None:
|
|
||||||
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
|
|
||||||
log_msg += " Note: existing document is in the trash."
|
|
||||||
|
|
||||||
## 修改这里,让它删除重复文件。
|
|
||||||
if settings.CONSUMER_DELETE_DUPLICATES or True:
|
|
||||||
os.unlink(self.input_doc.original_file)
|
|
||||||
self._fail(
|
|
||||||
msg,
|
|
||||||
log_msg,
|
|
||||||
)
|
|
||||||
281
docker/stash/scripts/batch_format_filename.py
Normal file
281
docker/stash/scripts/batch_format_filename.py
Normal file
@ -0,0 +1,281 @@
|
|||||||
|
import sqlite3
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
|
||||||
|
res_dir = './result'
|
||||||
|
os.makedirs(res_dir, exist_ok=True)
|
||||||
|
|
||||||
|
# 配置日志
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||||
|
handlers=[
|
||||||
|
logging.FileHandler(f'{res_dir}/rename_files.log'),
|
||||||
|
logging.StreamHandler()
|
||||||
|
]
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def preload_folders(conn, prefix):
    """
    Preload folder paths into a dict mapping folder id -> path.

    Args:
        conn: open ``sqlite3`` connection to a database with a ``folders``
            table that has ``id`` and ``path`` columns.
        prefix: optional substring filter. When it is non-empty (after
            stripping whitespace) only folders whose path contains it are
            returned; ``None``, ``""`` or whitespace-only means no filter.

    Returns:
        dict of ``{folder_id: path}``.

    Raises:
        sqlite3.Error: re-raised after being logged when the query fails.
    """
    sqlstr = "SELECT id, path FROM folders where 1=1 "
    params = ()
    if prefix and prefix.strip():
        # Bind the filter as a parameter instead of formatting it into the
        # SQL text: a prefix containing a quote no longer breaks (or alters)
        # the statement.
        sqlstr += " and path like ? "
        params = (f"%{prefix}%",)
    try:
        cursor = conn.cursor()
        cursor.execute(sqlstr, params)
        return {row[0]: row[1] for row in cursor.fetchall()}
    except sqlite3.Error as e:
        logger.error(f"预加载文件夹信息失败: {str(e)}")
        raise
|
||||||
|
|
||||||
|
def preload_studios(conn):
    """
    Preload studio names into a dict mapping studio id -> name.

    The returned dict additionally maps ``None`` to ``"UnknownStudio"`` so
    that a missing studio id still resolves to a name.

    Raises:
        sqlite3.Error: re-raised after being logged when the query fails.
    """
    try:
        cur = conn.cursor()
        cur.execute("SELECT id, name FROM studios")
        studio_names = {}
        for record in cur.fetchall():
            studio_names[record[0]] = record[1]
        # Default entry for scenes without a studio.
        studio_names[None] = "UnknownStudio"
        return studio_names
    except sqlite3.Error as e:
        logger.error(f"预加载工作室信息失败: {str(e)}")
        raise
|
||||||
|
|
||||||
|
def get_performers(conn, scene_id):
    """
    Return the performers of a scene as one comma-separated string.

    Names are ordered by the database (``ORDER BY p.name``). When the scene
    has no performers, or the joined string is empty,
    ``"UnknownPerformers"`` is returned.

    Raises:
        sqlite3.Error: re-raised after being logged when the query fails.
    """
    try:
        cur = conn.cursor()
        cur.execute(
            """
            SELECT p.name
            FROM performers p
            JOIN performers_scenes ps ON p.id = ps.performer_id
            WHERE ps.scene_id = ?
            ORDER BY p.name
            """,
            (scene_id,),
        )
        names = [record[0] for record in cur.fetchall()]
        joined = ','.join(names)
        if joined:
            return joined
        return "UnknownPerformers"
    except sqlite3.Error as e:
        logger.error(f"获取演员信息失败 (scene_id={scene_id}): {str(e)}")
        raise
|
||||||
|
|
||||||
|
def parse_date(date_str):
    """
    Convert a date string to ``yyyy.mm.dd`` format.

    The accepted input formats are tried in a fixed order and the first one
    that parses wins. An empty/``None`` input, or one that matches none of
    the formats (a warning is logged), yields ``"0000.00.00"``.
    """
    fallback = "0000.00.00"
    if not date_str:
        return fallback

    known_formats = (
        "%Y-%m-%d",
        "%Y/%m/%d",
        "%d-%m-%Y",
        "%d/%m/%Y",
        "%Y%m%d",
        "%m-%d-%Y",
        "%m/%d/%Y",
    )

    for pattern in known_formats:
        try:
            parsed = datetime.strptime(date_str, pattern)
            return parsed.strftime("%Y.%m.%d")
        except ValueError:
            # Not this format; try the next one.
            pass

    logger.warning(f"无法解析日期格式: {date_str},使用默认值")
    return fallback
|
||||||
|
|
||||||
|
def get_file_extension(basename):
    """
    Return the lower-cased text after the last dot of ``basename``.

    An empty string is returned when ``basename`` contains no dot.
    """
    _stem, dot, suffix = basename.rpartition('.')
    if not dot:
        return ''
    return suffix.lower()
|
||||||
|
|
||||||
|
def sanitize_filename(name):
    """Replace every character that is illegal in file names with a hyphen."""
    # One translation table covering all illegal characters at once.
    hyphenate = str.maketrans({illegal: '-' for illegal in '/\\:*?"<>|'})
    return name.translate(hyphenate)
|
||||||
|
|
||||||
|
def _build_new_basename(scene_info, studio_name, performers, original_basename, rename_style):
    """Build the target file name for one file from its scene metadata."""
    ext = get_file_extension(original_basename)
    release_date = parse_date(scene_info['release_date'])
    title = scene_info['title'] or "Untitled"

    # The scene code is free text from the database; sanitize it like every
    # other component so a '/' or similar cannot break the target path.
    code = sanitize_filename(scene_info['code']) if scene_info.get('code') else ''

    sanitized_studio = sanitize_filename(studio_name)
    sanitized_performers = sanitize_filename(performers)[0:100]  # cap length
    sanitized_title = sanitize_filename(title)[0:100]  # cap length
    if code:
        sanitized_title = f"{sanitized_title} ({code})"
    # Strip whitespace, quotes, hyphens and underscores from the studio name.
    sanitized_studio = re.sub(r'[\'"\s\-_]+', '', sanitized_studio)

    if rename_style == 'simple' and code:
        # Simplified rule: upper-cased code plus release date only.
        stem = f"{code.upper()}_{release_date}"
    else:
        stem = f"{sanitized_studio}.{release_date} {sanitized_performers} - {sanitized_title}"
    return f"{stem}.{ext}" if ext else stem


def process_scene_files(conn, mode, prefix, rename_style):
    """Compute (and in ``run`` mode apply) new file names for all scene files.

    Folders and studios are preloaded into dictionaries, all scene/file
    mappings are fetched with one joined query, and each mapping is turned
    into a rename plan. Performers are still looked up per scene. The plan
    is written to ``{res_dir}/rename_results.json`` (``res_dir`` is a
    module-level name defined outside this function).

    Args:
        conn: Open sqlite3 connection.
        mode: ``'run'`` renames files on disk; any other value only reports.
        prefix: Passed through to ``preload_folders``.
        rename_style: ``'simple'`` names files ``CODE_date.ext`` when the
            scene has a code; otherwise the standard
            ``Studio.date performers - title (code).ext`` layout is used.

    Returns:
        list[dict]: One entry per planned rename with keys ``file_id``,
        ``scene_id``, ``original_name`` and ``dest_name``.

    Raises:
        sqlite3.Error: Logged and re-raised when preloading or the main
            query fails. Failures on individual records are logged and the
            record is skipped.
    """
    results = []
    try:
        # 1. Preload folders and studios into in-memory dictionaries.
        folders = preload_folders(conn, prefix)
        studios = preload_studios(conn)
        logger.info(f"预加载完成 - 文件夹: {len(folders)} 个, 工作室: {len(studios)} 个")

        # 2. Fetch every scene/file mapping with its file and scene columns.
        cursor = conn.cursor()
        query = """
            SELECT
                sf.scene_id, sf.file_id,
                f.id AS file_id, f.basename, f.parent_folder_id,
                s.title, s.date as release_date, s.studio_id, s.code
            FROM scenes_files sf
            LEFT JOIN files f ON sf.file_id = f.id
            LEFT JOIN scenes s ON sf.scene_id = s.id
        """
        cursor.execute(query)
        mappings = cursor.fetchall()
        logger.info(f"共找到 {len(mappings)} 条场景-文件映射记录")

        for idx, row in enumerate(mappings, 1):
            # Bound before the inner try so the except handler can log them.
            scene_id, file_id = row[0], row[1]
            try:
                file_info = {
                    'id': row[2],
                    'basename': row[3],
                    'parent_folder_id': row[4]
                }
                scene_info = {
                    'title': row[5],
                    'release_date': row[6],
                    'studio_id': row[7],
                    'code': row[8]
                }

                # Skip mappings with missing file or scene data (the LEFT
                # JOINs yield NULLs when a side is absent).
                if not file_id or not file_info['id'] or not file_info['basename'] or not file_info['parent_folder_id']:
                    logger.debug(f"文件ID信息不完整 (scene_id={scene_id}, file_id={file_id}),跳过")
                    continue
                if not scene_id or not scene_info['title'] or not scene_info['release_date'] or not scene_info['studio_id']:
                    logger.debug(f"场景信息不完整 (scene_id={scene_id}, file_id={file_id}),跳过")
                    continue

                # 3. Resolve folder path and studio name from the caches.
                folder_path = folders.get(file_info['parent_folder_id'])
                if not folder_path:
                    logger.debug(f"文件夹ID不存在 (folder_id={file_info['parent_folder_id']}),跳过")
                    continue
                studio_name = studios.get(scene_info['studio_id'])
                if not studio_name:
                    logger.debug(f"工作室ID不存在 (studio_id={scene_info['studio_id']}),跳过")
                    continue

                # 4. Performers still need one query per scene.
                performers = get_performers(conn, scene_id)

                # 5. Build the new file name.
                original_basename = file_info['basename']
                new_basename = _build_new_basename(
                    scene_info, studio_name, performers, original_basename, rename_style
                )

                # File systems limit names by encoded bytes, not characters,
                # so measure the encoded form (CJK titles use several bytes
                # per character).
                if len(os.fsencode(new_basename)) > 254:
                    logger.warning(f"生成的文件名过长,跳过 (file_id={file_id}): {new_basename}")
                    continue

                original_path = os.path.join(folder_path, original_basename)
                new_path = os.path.join(folder_path, new_basename)

                if not os.path.exists(original_path):
                    logger.warning(f"文件不存在,跳过: {original_path}")
                    continue
                # Must come before the destination check: an unchanged name
                # trivially "exists" and would otherwise be reported as a
                # conflict.
                if original_path == new_path:
                    logger.info(f"文件名未变化,跳过 (file_id={file_id}): {original_path}")
                    continue
                if os.path.exists(new_path):
                    logger.warning(f"目标文件已存在,跳过: {new_path}")
                    continue

                results.append({
                    'file_id': file_id,
                    'scene_id': scene_id,
                    'original_name': original_path,
                    'dest_name': new_path
                })
                logger.info(f"处理第 {idx}/{len(mappings)} 条: {original_path} -> {new_path}")

                if mode == 'run':
                    os.rename(original_path, new_path)
                    # The database update below is disabled in the original
                    # script, so files.basename keeps the old name after a
                    # rename.
                    #cursor.execute(
                    #    "UPDATE files SET basename = ? WHERE id = ?",
                    #    (new_basename, file_info['id'])
                    #)
                    #conn.commit()
                    logger.info(f"已更新文件 (file_id={file_info['id']})")

            except Exception as e:
                logger.error(f"处理记录失败 (scene_id={scene_id}, file_id={file_id}): {str(e)}", exc_info=True)
                if mode == 'run':
                    conn.rollback()
                continue

        # Persist the rename plan.
        with open(f'{res_dir}/rename_results.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        logger.info(f"处理完成,结果已保存到 rename_results.json")
        return results

    except sqlite3.Error as e:
        logger.error(f"数据库操作失败: {str(e)}", exc_info=True)
        if mode == 'run':
            conn.rollback()
        raise
    finally:
        if mode == 'run':
            conn.commit()
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point: parse options, open the database, run the rename pass.

    Returns without doing anything (after logging an error) when the
    database file does not exist. sqlite3 errors raised while connecting or
    processing are logged; the connection is always closed afterwards.
    """
    cli = argparse.ArgumentParser(description='电影文件重命名工具(优化版)')
    cli.add_argument('--mode', choices=['check', 'run'], default='check',
                     help='运行模式: check(检查) 或 run(执行)')
    cli.add_argument('--db', default='movies.db', help='SQLite数据库文件路径')
    cli.add_argument('--prefix', default='', help='目录前缀,用来过滤文件路径')
    cli.add_argument('--rename_style', choices=['standard', 'simple'], default='standard',
                     help='文件命名规则,标准格式和简化格式')
    options = cli.parse_args()

    db_path = options.db
    if not os.path.exists(db_path):
        logger.error(f"数据库文件不存在: {db_path}")
        return

    connection = None
    try:
        connection = sqlite3.connect(db_path)
        logger.info(f"成功连接到数据库: {db_path}")
        process_scene_files(connection, options.mode, options.prefix, options.rename_style)
    except sqlite3.Error as e:
        logger.error(f"数据库连接失败: {str(e)}", exc_info=True)
    finally:
        # connection stays None when sqlite3.connect itself failed.
        if connection is not None:
            connection.close()
            logger.info("数据库连接已关闭")
|
||||||
|
|
||||||
|
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
|
||||||
288
docker/stash/scripts/format_filename.py
Normal file
288
docker/stash/scripts/format_filename.py
Normal file
@ -0,0 +1,288 @@
|
|||||||
|
import sqlite3
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
|
||||||
|
# 配置日志
|
||||||
|
# Logging setup: records go to ./result/rename_files.log and to the console.
# logging.FileHandler opens its file as soon as it is constructed, and this
# code runs at import time, i.e. before main() calls os.makedirs('./result').
# Create the directory here so a fresh checkout does not fail on startup.
os.makedirs('./result', exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8: the log messages contain non-ASCII text and the
        # default encoding depends on the locale.
        logging.FileHandler('./result/rename_files.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def get_performers(conn, scene_id):
    """Return the performers of a scene as a single comma-separated string.

    Names come back in the order produced by ``ORDER BY p.name``. An empty
    string is returned when the scene has no linked performers.

    Args:
        conn: Open sqlite3 connection.
        scene_id: Value matched against ``performers_scenes.scene_id``.

    Raises:
        sqlite3.Error: Logged and re-raised when the query fails.
    """
    # A single JOIN fetches every name for the scene.
    sql = """
        SELECT p.name
        FROM performers p
        JOIN performers_scenes ps ON p.id = ps.performer_id
        WHERE ps.scene_id = ?
        ORDER BY p.name
    """
    try:
        lookup = conn.cursor()
        lookup.execute(sql, (scene_id,))
        names = []
        for record in lookup.fetchall():
            names.append(record[0])
        return ','.join(names)
    except sqlite3.Error as e:
        logger.error(f"获取演员信息失败 (scene_id={scene_id}): {str(e)}")
        raise
|
||||||
|
|
||||||
|
def get_file_info(conn, file_id):
    """Fetch the ``files`` row with the given id.

    Returns:
        dict: Keys ``id``, ``basename`` and ``parent_folder_id``.

    Raises:
        ValueError: No row with that id exists.
        sqlite3.Error: Logged and re-raised when the query fails.
    """
    sql = """
        SELECT id, basename, parent_folder_id
        FROM files
        WHERE id = ?
    """
    try:
        lookup = conn.cursor()
        lookup.execute(sql, (file_id,))
        record = lookup.fetchone()
    except sqlite3.Error as e:
        logger.error(f"获取文件信息失败 (file_id={file_id}): {str(e)}")
        raise

    if record is None:
        raise ValueError(f"未找到文件信息 (file_id={file_id})")

    info = {}
    info['id'] = record[0]
    info['basename'] = record[1]
    info['parent_folder_id'] = record[2]
    return info
|
||||||
|
|
||||||
|
def get_folder_path(conn, folder_id):
    """Return the ``path`` column of the ``folders`` row with the given id.

    Raises:
        ValueError: No row with that id exists.
        sqlite3.Error: Logged and re-raised when the query fails.
    """
    try:
        lookup = conn.cursor()
        lookup.execute("SELECT path FROM folders WHERE id = ?", (folder_id,))
        record = lookup.fetchone()
    except sqlite3.Error as e:
        logger.error(f"获取文件夹路径失败 (folder_id={folder_id}): {str(e)}")
        raise

    if record is None:
        raise ValueError(f"未找到文件夹路径 (folder_id={folder_id})")
    return record[0]
|
||||||
|
|
||||||
|
def get_scene_info(conn, scene_id):
    """Fetch title, date and studio id of the ``scenes`` row with the given id.

    Returns:
        dict: Keys ``title``, ``release_date`` (the ``date`` column) and
        ``studio_id``.

    Raises:
        ValueError: No row with that id exists.
        sqlite3.Error: Logged and re-raised when the query fails.
    """
    sql = """
        SELECT title, date as release_date, studio_id
        FROM scenes
        WHERE id = ?
    """
    try:
        lookup = conn.cursor()
        lookup.execute(sql, (scene_id,))
        record = lookup.fetchone()
    except sqlite3.Error as e:
        logger.error(f"获取场景信息失败 (scene_id={scene_id}): {str(e)}")
        raise

    if record is None:
        raise ValueError(f"未找到场景信息 (scene_id={scene_id})")

    info = {}
    info['title'] = record[0]
    info['release_date'] = record[1]
    info['studio_id'] = record[2]
    return info
|
||||||
|
|
||||||
|
def get_studio_name(conn, studio_id):
    """Return the name of the studio with the given id.

    When no such studio exists a warning is logged and the placeholder
    ``"UnknownStudio"`` is returned.

    Raises:
        sqlite3.Error: Logged and re-raised when the query fails.
    """
    try:
        lookup = conn.cursor()
        lookup.execute("SELECT name FROM studios WHERE id = ?", (studio_id,))
        record = lookup.fetchone()
        if record:
            return record[0]
        logger.warning(f"未找到工作室信息 (studio_id={studio_id}),使用默认名称")
        return "UnknownStudio"
    except sqlite3.Error as e:
        logger.error(f"获取工作室信息失败 (studio_id={studio_id}): {str(e)}")
        raise
|
||||||
|
|
||||||
|
def parse_date(date_str):
    """Normalise a date string to the ``yyyy.mm.dd`` form.

    Several common layouts are tried in a fixed order and the first one
    that parses wins. Empty input, or input matching none of the layouts
    (which is logged as a warning), yields ``"0000.00.00"``.
    """
    placeholder = "0000.00.00"
    if not date_str:
        return placeholder

    # Candidate layouts, tried in this order.
    layouts = (
        "%Y-%m-%d", "%Y/%m/%d", "%d-%m-%Y", "%d/%m/%Y",
        "%Y%m%d", "%m-%d-%Y", "%m/%d/%Y",
    )
    for layout in layouts:
        try:
            formatted = datetime.strptime(date_str, layout).strftime("%Y.%m.%d")
        except ValueError:
            continue
        else:
            return formatted

    logger.warning(f"无法解析日期格式: {date_str},使用默认值")
    return placeholder
|
||||||
|
|
||||||
|
def get_file_extension(basename):
    """Return the lower-cased part of *basename* after its last ``.``.

    A name without any dot has no extension and yields ``''``.
    """
    if '.' not in basename:
        return ''
    pieces = basename.rsplit('.', 1)
    return pieces[1].lower()
|
||||||
|
|
||||||
|
def sanitize_filename(name):
    """Return *name* with every one of ``/ \\ : * ? " < > |`` replaced by ``-``."""
    table = dict.fromkeys(map(ord, '/\\:*?"<>|'), '-')
    return name.translate(table)
|
||||||
|
|
||||||
|
def process_scene_files(conn, mode, prefix):
    """Compute (and in ``run`` mode apply) new file names for all scene files.

    For every row of ``scenes_files`` the file, folder, performers, scene
    and studio are looked up and a name of the form
    ``Studio - date - performers - title.ext`` is built. The rename plan is
    written to ``./result/rename_results.json``.

    Args:
        conn: Open sqlite3 connection.
        mode: ``'run'`` renames files on disk and updates ``files.basename``;
            any other value only reports what would be done.
        prefix: Only files whose folder path starts with this string are
            processed. An empty prefix matches every folder.

    Returns:
        list[dict]: One entry per planned rename with keys ``file_id``,
        ``scene_id``, ``original_name`` and ``dest_name``.

    Raises:
        sqlite3.Error: Logged and re-raised when the mapping query fails.
            Failures on individual records are logged and the record is
            skipped.
    """
    results = []
    try:
        cursor = conn.cursor()
        # Fetch every scene/file mapping.
        cursor.execute("SELECT scene_id, file_id FROM scenes_files")
        mappings = cursor.fetchall()
        logger.debug(f"共找到 {len(mappings)} 条场景-文件映射记录")

        for idx, (scene_id, file_id) in enumerate(mappings, 1):
            logger.debug(f"处理第 {idx}/{len(mappings)} 条记录 (scene_id={scene_id}, file_id={file_id})")

            try:
                # 1. File record.
                file_info = get_file_info(conn, file_id)
                original_basename = file_info['basename']
                parent_folder_id = file_info['parent_folder_id']

                # 2. Folder path, filtered by the requested prefix.
                folder_path = get_folder_path(conn, parent_folder_id)
                if prefix and not folder_path.startswith(prefix):
                    logger.debug(f"文件夹路径不匹配前缀 {prefix},跳过: {folder_path}")
                    continue

                # 3. Performers; scenes without any are skipped.
                performers = get_performers(conn, scene_id)
                if not performers:
                    logger.warning(f"场景 {scene_id} 未找到演员信息,跳过")
                    continue

                # 4. Scene and studio.
                scene_info = get_scene_info(conn, scene_id)
                if not scene_info['title'] or not scene_info['release_date'] or not scene_info['studio_id']:
                    logger.warning(f"场景 {scene_id} 信息不完整,跳过")
                    continue
                title = scene_info['title'] or "Untitled"
                release_date = parse_date(scene_info['release_date'])
                studio_name = get_studio_name(conn, scene_info['studio_id'])

                # 5. Build the new file name.
                ext = get_file_extension(original_basename)
                sanitized_studio = sanitize_filename(studio_name)
                sanitized_performers = sanitize_filename(performers)[0:100]  # cap length
                sanitized_title = sanitize_filename(title)[0:100]  # cap length

                if ext:
                    new_basename = f"{sanitized_studio} - {release_date} - {sanitized_performers} - {sanitized_title}.{ext}"
                else:
                    new_basename = f"{sanitized_studio} - {release_date} - {sanitized_performers} - {sanitized_title}"

                # File systems limit names by encoded bytes, not characters,
                # so measure the encoded form.
                if len(os.fsencode(new_basename)) > 254:
                    logger.warning(f"生成的文件名过长,跳过 (file_id={file_id}): {new_basename}")
                    continue

                original_path = os.path.join(folder_path, original_basename)
                new_path = os.path.join(folder_path, new_basename)

                results.append({
                    'file_id': file_id,
                    'scene_id': scene_id,
                    'original_name': original_path,
                    'dest_name': new_path
                })

                logger.info(f"准备重命名: {original_path} -> {new_path}")

                if mode == 'run':
                    if not os.path.exists(original_path):
                        logger.warning(f"文件不存在,跳过: {original_path}")
                        continue

                    if original_path != new_path:
                        # os.rename can silently replace an existing
                        # destination, so never rename onto an existing file.
                        if os.path.exists(new_path):
                            logger.warning(f"目标文件已存在,跳过: {new_path}")
                            continue
                        os.rename(original_path, new_path)
                        logger.info(f"已重命名: {original_path} -> {new_path}")

                    # Keep the database in step with the file on disk.
                    cursor.execute(
                        "UPDATE files SET basename = ? WHERE id = ?",
                        (new_basename, file_id)
                    )
                    conn.commit()
                    logger.info(f"已更新数据库记录 (file_id={file_id})")

            except Exception as e:
                logger.error(f"处理记录失败 (scene_id={scene_id}, file_id={file_id}): {str(e)}", exc_info=True)
                # Discard any uncommitted change for this record.
                if mode == 'run':
                    conn.rollback()
                continue

        # Persist the rename plan.
        with open('./result/rename_results.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        logger.info(f"处理完成,结果已保存到 rename_results.json")

        return results

    except sqlite3.Error as e:
        logger.error(f"数据库操作失败: {str(e)}", exc_info=True)
        if mode == 'run':
            conn.rollback()
        raise
    finally:
        if mode == 'run':
            conn.commit()
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point: parse options, open the database, run the rename pass.

    Returns without doing anything (after logging an error) when the
    database file does not exist. sqlite3 errors raised while connecting or
    processing are logged; the connection is always closed afterwards.
    """
    # Command-line options.
    cli = argparse.ArgumentParser(description='电影文件重命名工具')
    cli.add_argument('--mode', choices=['check', 'run'], default='check',
                     help='运行模式: check(检查) 或 run(执行)')
    cli.add_argument('--db', default='movies.db', help='SQLite数据库文件路径')
    cli.add_argument('--prefix', default='', help='目录的前缀,用来匹配')
    options = cli.parse_args()

    # Refuse to continue without a database file.
    db_path = options.db
    if not os.path.exists(db_path):
        logger.error(f"数据库文件不存在: {db_path}")
        return

    # Output directory for the JSON results.
    os.makedirs('./result', exist_ok=True)

    connection = None
    try:
        connection = sqlite3.connect(db_path)
        # Row objects allow access by column name as well as by index.
        connection.row_factory = sqlite3.Row
        logger.info(f"成功连接到数据库: {db_path}")

        process_scene_files(connection, options.mode, options.prefix)

    except sqlite3.Error as e:
        logger.error(f"数据库连接失败: {str(e)}", exc_info=True)
    finally:
        # connection stays None when sqlite3.connect itself failed.
        if connection is not None:
            connection.close()
            logger.info("数据库连接已关闭")
|
||||||
|
|
||||||
|
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
|
||||||
110
docker/stash/scripts/scrapers/JavBus/JavBus.yml
Normal file
110
docker/stash/scripts/scrapers/JavBus/JavBus.yml
Normal file
@ -0,0 +1,110 @@
|
|||||||
|
name: Javbus
|
||||||
|
sceneByFragment:
|
||||||
|
action: scrapeXPath
|
||||||
|
queryURL: https://www.javbus.com/{filename}
|
||||||
|
queryURLReplace:
|
||||||
|
filename:
|
||||||
|
- regex: -JG\d
|
||||||
|
with: ""
|
||||||
|
- regex: (.*[^a-zA-Z0-9])*([a-zA-Z-]+\d+)(.+)
|
||||||
|
with: $2
|
||||||
|
scraper: sceneScraper
|
||||||
|
sceneByURL:
|
||||||
|
- action: scrapeXPath
|
||||||
|
url:
|
||||||
|
- https://www.javbus.com
|
||||||
|
- https://www.seejav.bid
|
||||||
|
- https://www.cdnbus.lol
|
||||||
|
- https://www.dmmbus.lol
|
||||||
|
- https://www.seedmm.cfd
|
||||||
|
scraper: sceneScraper
|
||||||
|
sceneByName:
|
||||||
|
action: scrapeXPath
|
||||||
|
queryURL: https://www.javbus.com/search/{}&type=&parent=ce
|
||||||
|
scraper: sceneSearch
|
||||||
|
sceneByQueryFragment:
|
||||||
|
action: scrapeXPath
|
||||||
|
queryURL: "{url}"
|
||||||
|
scraper: sceneScraper
|
||||||
|
|
||||||
|
performerByURL:
|
||||||
|
- action: scrapeXPath
|
||||||
|
url:
|
||||||
|
- https://www.javbus.com
|
||||||
|
- https://www.seejav.bid
|
||||||
|
- https://www.cdnbus.lol
|
||||||
|
- https://www.dmmbus.lol
|
||||||
|
- https://www.seedmm.cfd
|
||||||
|
scraper: performerScraper
|
||||||
|
performerByName:
|
||||||
|
action: scrapeXPath
|
||||||
|
queryURL: https://www.javbus.com/searchstar/{}&type=&parent=ce
|
||||||
|
scraper: performerSearch
|
||||||
|
|
||||||
|
xPathScrapers:
|
||||||
|
performerSearch:
|
||||||
|
performer:
|
||||||
|
Name: //span[@class="mleft"]
|
||||||
|
URLs: //*[@id="waterfall"]/div/a/@href
|
||||||
|
performerScraper:
|
||||||
|
performer:
|
||||||
|
Name: //*[@id="waterfall"]/div[1]/div/div[2]/span
|
||||||
|
Birthdate:
|
||||||
|
selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '生日')]
|
||||||
|
postProcess:
|
||||||
|
- replace:
|
||||||
|
- regex: ^(.*? ){1}
|
||||||
|
with:
|
||||||
|
Height:
|
||||||
|
selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '身高')]
|
||||||
|
postProcess:
|
||||||
|
- replace:
|
||||||
|
- regex: ^(.*? ){1}
|
||||||
|
with:
|
||||||
|
# Measurements: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '胸圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '腰圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '臀圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '罩杯')]
|
||||||
|
Image:
|
||||||
|
selector: //*[@id="waterfall"]/div[1]/div/div[1]/img/@src
|
||||||
|
postProcess:
|
||||||
|
- replace:
|
||||||
|
- regex: ^
|
||||||
|
with: https://www.javbus.com
|
||||||
|
|
||||||
|
sceneSearch:
|
||||||
|
scene:
|
||||||
|
Title: //div[@class="photo-info"]/span
|
||||||
|
URL: //*[@id="waterfall"]/div/a/@href
|
||||||
|
sceneScraper:
|
||||||
|
scene:
|
||||||
|
Title:
|
||||||
|
selector: //div[@class="col-md-3 info"]//span[contains(text(), '識別碼')]/../span[2]/text()
|
||||||
|
URL:
|
||||||
|
selector: /html/head/link[@hreflang="zh"]/@href
|
||||||
|
Date:
|
||||||
|
selector: //div[@class="col-md-3 info"]//span[contains(text(), '發行日期')]/../text()
|
||||||
|
Details:
|
||||||
|
selector: //div[@class="container"]/h3/text()
|
||||||
|
postProcess:
|
||||||
|
- replace:
|
||||||
|
- regex: ^(.*? ){1}
|
||||||
|
with:
|
||||||
|
Tags:
|
||||||
|
Name: //div[@class="col-md-3 info"]//span[@class="genre"]/label/a/text()
|
||||||
|
Performers:
|
||||||
|
Name: //div[@class="star-name"]/a
|
||||||
|
Director: //div[@id='video_director']/table/tbody/tr/td[@class="text"]/span/a/text()
|
||||||
|
Image:
|
||||||
|
selector: //div[@class="row movie"]/div[@class="col-md-9 screencap"]/a[@class="bigImage"]/img/@src
|
||||||
|
postProcess:
|
||||||
|
- replace:
|
||||||
|
- regex: ^
|
||||||
|
with: https://www.javbus.com
|
||||||
|
Studio:
|
||||||
|
Name: //div[@class="col-md-3 info"]//span[contains(text(), '發行商')]/../a/text()
|
||||||
|
|
||||||
|
driver:
|
||||||
|
headers:
|
||||||
|
- Key: User-Agent
|
||||||
|
Value: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36
|
||||||
|
- Key: Accept-Language
|
||||||
|
Value: zh-cn
|
||||||
|
# Last Updated September 17, 2025
|
||||||
9
docker/stash/scripts/scrapers/JavBus/manifest
Normal file
9
docker/stash/scripts/scrapers/JavBus/manifest
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
id: JavBus
|
||||||
|
name: Javbus
|
||||||
|
metadata: {}
|
||||||
|
version: 5ee93a34
|
||||||
|
date: "2025-09-17 10:48:13"
|
||||||
|
requires: []
|
||||||
|
source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
|
||||||
|
files:
|
||||||
|
- JavBus.yml
|
||||||
111
docker/stash/scripts/scrapers/JavBus_en/JavBus_en.yml
Normal file
111
docker/stash/scripts/scrapers/JavBus_en/JavBus_en.yml
Normal file
@ -0,0 +1,111 @@
|
|||||||
|
name: Javbus_en
|
||||||
|
sceneByFragment:
|
||||||
|
action: scrapeXPath
|
||||||
|
queryURL: https://www.javbus.com/en/{filename}
|
||||||
|
queryURLReplace:
|
||||||
|
filename:
|
||||||
|
- regex: -JG\d
|
||||||
|
with: ""
|
||||||
|
- regex: (.*[^a-zA-Z0-9])*([a-zA-Z-]+\d+)(.+)
|
||||||
|
with: $2
|
||||||
|
scraper: sceneScraper
|
||||||
|
sceneByURL:
|
||||||
|
- action: scrapeXPath
|
||||||
|
url:
|
||||||
|
- https://www.javbus.com/en
|
||||||
|
- https://www.seejav.bid
|
||||||
|
- https://www.cdnbus.lol
|
||||||
|
- https://www.dmmbus.lol
|
||||||
|
- https://www.seedmm.cfd
|
||||||
|
scraper: sceneScraper
|
||||||
|
sceneByName:
|
||||||
|
action: scrapeXPath
|
||||||
|
queryURL: https://www.javbus.com/en/search/{}&type=&parent=ce
|
||||||
|
scraper: sceneSearch
|
||||||
|
sceneByQueryFragment:
|
||||||
|
action: scrapeXPath
|
||||||
|
queryURL: "{url}"
|
||||||
|
scraper: sceneScraper
|
||||||
|
|
||||||
|
performerByURL:
|
||||||
|
- action: scrapeXPath
|
||||||
|
url:
|
||||||
|
- https://www.javbus.com/en
|
||||||
|
- https://www.seejav.bid
|
||||||
|
- https://www.cdnbus.lol
|
||||||
|
- https://www.dmmbus.lol
|
||||||
|
- https://www.seedmm.cfd
|
||||||
|
scraper: performerScraper
|
||||||
|
performerByName:
|
||||||
|
action: scrapeXPath
|
||||||
|
queryURL: https://www.javbus.com/en/searchstar/{}&type=&parent=ce
|
||||||
|
scraper: performerSearch
|
||||||
|
|
||||||
|
xPathScrapers:
|
||||||
|
performerSearch:
|
||||||
|
performer:
|
||||||
|
Name: //span[@class="mleft"]
|
||||||
|
URLs: //*[@id="waterfall"]/div/a/@href
|
||||||
|
performerScraper:
|
||||||
|
performer:
|
||||||
|
Name: //*[@id="waterfall"]/div[1]/div/div[2]/span
|
||||||
|
Birthdate:
|
||||||
|
selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), 'D.O.B')]
|
||||||
|
postProcess:
|
||||||
|
- replace:
|
||||||
|
- regex: ^(.*? ){1}
|
||||||
|
with:
|
||||||
|
Height:
|
||||||
|
selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), 'Height')]
|
||||||
|
postProcess:
|
||||||
|
- replace:
|
||||||
|
- regex: ^(.*? ){1}
|
||||||
|
with:
|
||||||
|
# Measurements: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '胸圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '腰圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '臀圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '罩杯')]
|
||||||
|
Image:
|
||||||
|
selector: //*[@id="waterfall"]/div[1]/div/div[1]/img/@src
|
||||||
|
postProcess:
|
||||||
|
- replace:
|
||||||
|
- regex: ^
|
||||||
|
with: https://www.javbus.com/en
|
||||||
|
|
||||||
|
sceneSearch:
|
||||||
|
scene:
|
||||||
|
Title: //div[@class="photo-info"]/span
|
||||||
|
URL: //*[@id="waterfall"]/div/a/@href
|
||||||
|
sceneScraper:
|
||||||
|
scene:
|
||||||
|
Title:
|
||||||
|
selector: //div[@class="col-md-3 info"]//span[contains(text(), 'ID')]/../span[2]/text()
|
||||||
|
URL:
|
||||||
|
selector: /html/head/link[@hreflang="zh"]/@href
|
||||||
|
Date:
|
||||||
|
selector: //div[@class="col-md-3 info"]//span[contains(normalize-space(text()), 'Release Date')]/../text()
|
||||||
|
#selector: //div[@class="col-md-3 info"]//span[contains(text(), 'Release Date')]/../text()
|
||||||
|
Details:
|
||||||
|
selector: //div[@class="container"]/h3/text()
|
||||||
|
postProcess:
|
||||||
|
- replace:
|
||||||
|
- regex: ^(.*? ){1}
|
||||||
|
with:
|
||||||
|
Tags:
|
||||||
|
Name: //div[@class="col-md-3 info"]//span[@class="genre"]/label/a/text()
|
||||||
|
Performers:
|
||||||
|
Name: //div[@class="star-name"]/a
|
||||||
|
Director: //div[@id='video_director']/table/tbody/tr/td[@class="text"]/span/a/text()
|
||||||
|
Image:
|
||||||
|
selector: //div[@class="row movie"]/div[@class="col-md-9 screencap"]/a[@class="bigImage"]/img/@src
|
||||||
|
postProcess:
|
||||||
|
- replace:
|
||||||
|
- regex: ^
|
||||||
|
with: https://www.javbus.com/
|
||||||
|
Studio:
|
||||||
|
Name: //div[@class="col-md-3 info"]//span[contains(text(), 'Label')]/../a/text()
|
||||||
|
|
||||||
|
driver:
|
||||||
|
headers:
|
||||||
|
- Key: User-Agent
|
||||||
|
Value: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36
|
||||||
|
- Key: Accept-Language
|
||||||
|
Value: zh-cn,en-US
|
||||||
|
# Last Updated September 17, 2025
|
||||||
9
docker/stash/scripts/scrapers/JavBus_en/manifest
Normal file
9
docker/stash/scripts/scrapers/JavBus_en/manifest
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
id: JavBus_en
|
||||||
|
name: Javbus_en
|
||||||
|
metadata: {}
|
||||||
|
version: b4672ccf
|
||||||
|
date: "2025-08-01 16:01:27"
|
||||||
|
requires: []
|
||||||
|
source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
|
||||||
|
files:
|
||||||
|
- JavBus_en.yml
|
||||||
11
gitignore
Normal file
11
gitignore
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
# 其他已有的忽略规则
|
||||||
|
*.pyc
|
||||||
|
__pycache__/
|
||||||
|
|
||||||
|
# 忽略环境配置文件
|
||||||
|
.env
|
||||||
|
|
||||||
|
# 忽略所有 log 目录 和 data 目录
|
||||||
|
**/log/
|
||||||
|
**/data/
|
||||||
|
**/result/
|
||||||
@ -29,18 +29,47 @@ else
|
|||||||
fi
|
fi
|
||||||
servers=()
|
servers=()
|
||||||
while IFS= read -r line; do
|
while IFS= read -r line; do
|
||||||
|
# 跳过空行和注释行
|
||||||
|
[[ -z "$line" || "$line" =~ ^# ]] && continue
|
||||||
servers+=("$line")
|
servers+=("$line")
|
||||||
done < "$file_path"
|
done < "$file_path"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# 推送公钥到远程服务器
|
# 推送公钥到远程服务器
|
||||||
for server in "${servers[@]}"; do
|
|
||||||
public_key=$(cat ~/.ssh/id_rsa.pub)
|
public_key=$(cat ~/.ssh/id_rsa.pub)
|
||||||
ssh "$server" "mkdir -p ~/.ssh && echo '$public_key' >> ~/.ssh/authorized_keys"
|
for server in "${servers[@]}"; do
|
||||||
if [ $? -eq 0 ]; then
|
# 第一次尝试推送
|
||||||
|
echo "正在推送公钥到 $server..."
|
||||||
|
output=$(ssh "$server" "mkdir -p ~/.ssh && echo '$public_key' >> ~/.ssh/authorized_keys" 2>&1)
|
||||||
|
exit_code=$?
|
||||||
|
|
||||||
|
if [ $exit_code -eq 0 ]; then
|
||||||
|
echo "公钥已成功推送到 $server"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 检测是否是主机密钥验证失败
|
||||||
|
if echo "$output" | grep -q "Host key verification failed"; then
|
||||||
|
echo "检测到 $server 的主机密钥已变更,正在清理旧密钥..."
|
||||||
|
# 提取主机地址(处理 user@host 格式,取 @ 后面的部分)
|
||||||
|
host=$(echo "$server" | cut -d'@' -f2)
|
||||||
|
# 清理旧密钥
|
||||||
|
cleanup_output=$(ssh-keygen -R "$host" 2>&1)
|
||||||
|
if [ $? -ne 0 ]; then
|
||||||
|
echo "清理 $host 旧密钥失败:$cleanup_output"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
echo "已清理 $host 的旧密钥,重新尝试推送..."
|
||||||
|
# 重新推送
|
||||||
|
retry_output=$(ssh "$server" "mkdir -p ~/.ssh && echo '$public_key' >> ~/.ssh/authorized_keys" 2>&1)
|
||||||
|
retry_code=$?
|
||||||
|
if [ $retry_code -eq 0 ]; then
|
||||||
echo "公钥已成功推送到 $server"
|
echo "公钥已成功推送到 $server"
|
||||||
else
|
else
|
||||||
echo "推送公钥到 $server 时出错。"
|
echo "重新推送 $server 失败:$retry_output"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
# 其他错误类型
|
||||||
|
echo "推送 $server 失败:$output"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
Reference in New Issue
Block a user