Compare commits
21 Commits
31e07abf14
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| 2b0e1c0413 | |||
| dece263c8b | |||
| 00b267b651 | |||
| 0a4776479c | |||
| 6cf529541d | |||
| 2c0e3bd718 | |||
| ebae625165 | |||
| f8daffd47f | |||
| bed2de3cd1 | |||
| d1c543512e | |||
| 857339d261 | |||
| f189dcfaca | |||
| 1848510b65 | |||
| 04d76944ad | |||
| 40eae5569a | |||
| 15c4f7b823 | |||
| 17356c79f9 | |||
| 808dbaa985 | |||
| b7dffc539c | |||
| 91e7d38725 | |||
| fe153d69cc |
11
.gitignore
vendored
Normal file
11
.gitignore
vendored
Normal file
@ -0,0 +1,11 @@
|
||||
# 其他已有的忽略规则
|
||||
*.pyc
|
||||
__pycache__/
|
||||
|
||||
# 忽略环境配置文件
|
||||
.env
|
||||
|
||||
# 忽略所有 log 目录 和 data 目录
|
||||
**/log/
|
||||
**/data/
|
||||
**/result/
|
||||
@ -8,11 +8,6 @@ services:
|
||||
ports:
|
||||
- "8000:8000"
|
||||
environment:
|
||||
PAPERLESS_OCR_LANGUAGES: "" # 跳过OCR
|
||||
PAPERLESS_OCR_SKIP_ARCHIVE_FILE: "always" # 跳过创建文档存档版本的时间
|
||||
PAPERLESS_OCR_OUTPUT_TYPE: "pdf" # 尽量少修改PDF文档
|
||||
PAPERLESS_CONSUMER_POLLING: "5" # 指定轮询间隔(以秒为单位),这将导致 paperless 定期检查消费目录中的更改
|
||||
#PAPERLESS_CONSUMER_INOTIFY_DELAY: "2" # 设置消费者等待 inotify 发出的其他事件的时间(以秒为单位)
|
||||
|
||||
# 使用 SQLite 作为数据库(默认)
|
||||
PAPERLESS_DBENGINE: sqlite3
|
||||
@ -34,11 +29,22 @@ services:
|
||||
# 定义文件命名规则和存储路径
|
||||
# 作用不大,主要还是用消费后脚本,以及工作流来指定存储路径。
|
||||
# 工作流先于消费后脚本运行,因此消费后脚本里解析的document_type在工作流里无效。所以使用了文件名关键词匹配
|
||||
PAPERLESS_FILENAME_FORMAT: "{{created}}_{{document_type}}_{{correspondent}}_{{title}}.pdf"
|
||||
PAPERLESS_FILENAME_FORMAT: "{{created}}_{{document_type}}_{{correspondent}}_{{title}}"
|
||||
|
||||
# 解析文件里的关键信息,并更新。但无法更新storage path。这个字段要靠工作流才行。
|
||||
PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
|
||||
|
||||
# 自动删除重复文件
|
||||
PAPERLESS_CONSUMER_DELETE_DUPLICATES: true
|
||||
# 支持消费目录递归检索,即子目录。这样可以支持多个宿主机的目录映射到docker中
|
||||
PAPERLESS_CONSUMER_RECURSIVE: true
|
||||
|
||||
PAPERLESS_OCR_LANGUAGES: "" # 跳过OCR,并不会,只会用默认的eng来执行
|
||||
PAPERLESS_OCR_SKIP_ARCHIVE_FILE: "always" # 跳过创建文档存档版本的时间
|
||||
PAPERLESS_OCR_OUTPUT_TYPE: "pdf" # 尽量少修改PDF文档
|
||||
PAPERLESS_CONSUMER_POLLING: "5" # 指定轮询间隔(以秒为单位),这将导致 paperless 定期检查消费目录中的更改
|
||||
#PAPERLESS_CONSUMER_INOTIFY_DELAY: "2" # 设置消费者等待 inotify 发出的其他事件的时间(以秒为单位)
|
||||
|
||||
# 运行用户
|
||||
USERMAP_UID: 1000
|
||||
USERMAP_GID: 1000
|
||||
@ -46,8 +52,9 @@ services:
|
||||
volumes:
|
||||
# 存储所有数据(搜索索引、SQLite 数据库、分类模型等)的地方
|
||||
- ~/dockers/paperless/data:/usr/src/paperless/data
|
||||
# 挂载文件导入目录
|
||||
# 挂载文件导入目录,可以把多个宿主机的目录,挂到docker中,以子目录的形式存在
|
||||
- ~/dockers/paperless/consume:/usr/src/paperless/consume
|
||||
- ~/dockers/sharedata/consume:/usr/src/paperless/consume/subdir
|
||||
# 挂载文件导出目录
|
||||
- ~/dockers/paperless/export:/usr/src/paperless/export
|
||||
# 存储您的文档和缩略图的地方
|
||||
|
||||
@ -9,7 +9,8 @@ import logging
|
||||
|
||||
# Paperless 服务器信息
|
||||
PAPERLESS_URL = "http://localhost:8000/api"
|
||||
AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth 认证
|
||||
#AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth 认证, mac上用这个
|
||||
AUTH = HTTPBasicAuth("admin", "paperless") # Basic Auth 认证,NAS上用这个
|
||||
|
||||
# 日志配置
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||
|
||||
149
docker/paperless/plugins/docker_patch.sh
Executable file
149
docker/paperless/plugins/docker_patch.sh
Executable file
@ -0,0 +1,149 @@
|
||||
#!/bin/bash

# Replacement pairs as a flat array: source file followed by destination file.
# Add/remove entries as needed — one pair per line, format: source dest.
FILE_PAIRS=(
    "/usr/src/paperless/scripts/parsers.py" "/usr/src/paperless/src/paperless_tesseract/parsers.py"
    # Example: add more file pairs
    #"/usr/src/paperless/git_scripts/parse_filename.py" "/usr/src/paperless/scripts/parse_filename.py"
    # "/path/to/source/file3" "/path/to/dest/file3"
    # "/path/to/source/file4" "/path/to/dest/file4"
)
|
||||
|
||||
# 检查所有文件是否存在(仅检查replace/check操作需要的文件)
|
||||
#######################################
# Verify that the files required by the requested operation exist.
# Globals:   FILE_PAIRS (read)
# Arguments: $1 - operation name: "replace", "check" or "rollback"
# Outputs:   error/warning messages to stdout
# Exits:     1 when a required file is missing (except for rollback,
#            where missing backups only produce warnings)
#######################################
check_files_exist() {
    local missing=0
    local pair_count=${#FILE_PAIRS[@]}

    # Walk the flat pair array with stride 2 (source, dest per pair).
    for ((i=0; i<pair_count; i+=2)); do
        local source="${FILE_PAIRS[$i]}"
        local dest="${FILE_PAIRS[$i+1]}"

        # Which files matter depends on the operation being run.
        if [ "$1" = "replace" ] || [ "$1" = "check" ]; then
            if [ ! -f "$source" ]; then
                echo "错误:源文件不存在 - $source"
                missing=1
            fi
            if [ ! -f "$dest" ]; then
                echo "错误:目标文件不存在 - $dest"
                missing=1
            fi
        elif [ "$1" = "rollback" ]; then
            if [ ! -f "$dest.bak" ]; then
                echo "警告:备份文件不存在(未执行过替换?) - $dest.bak"
                missing=1
            fi
        fi
    done

    # Rollback tolerates missing backups (warn only); other ops abort.
    if [ $missing -eq 1 ] && [ "$1" != "rollback" ]; then
        echo "错误:关键文件缺失,无法继续执行"
        exit 1
    fi
}
|
||||
|
||||
# 显示所有文件对的差异
|
||||
#######################################
# Print a unified diff for every configured file pair.
# Globals:   FILE_PAIRS (read)
# Outputs:   diff output to stdout
# Returns:   always 0 (differences are not treated as errors)
#######################################
show_diffs() {
    local total=${#FILE_PAIRS[@]}
    local idx src dst
    echo "=== 开始检查文件差异 ==="

    idx=0
    while (( idx < total )); do
        src="${FILE_PAIRS[$idx]}"
        dst="${FILE_PAIRS[$((idx + 1))]}"

        echo -e "\n--- 检查 $dst <-> $src 的差异 ---"
        # diff exits non-zero when files differ; that is expected here.
        diff -u "$dst" "$src" || true
        idx=$((idx + 2))
    done
}
|
||||
|
||||
# 备份单个文件(添加 .bak 后缀,保留原权限)
|
||||
#######################################
# Back up one file as "<file>.bak", preserving permissions, ownership
# and timestamps. An existing backup is overwritten.
# Arguments: $1 - path of the file to back up
# Outputs:   status messages to stdout
#######################################
backup_file() {
    local target=$1
    local bak="${target}.bak"

    if [ -f "$bak" ]; then
        echo "提示:旧备份文件已存在,将覆盖 - $bak"
        rm -f "$bak"
    fi

    # -a keeps mode, ownership and timestamps intact.
    cp -a "$target" "$bak"
    echo "已备份:$target -> $bak"
}
|
||||
|
||||
# 替换所有文件对
|
||||
#######################################
# Replace every destination file with its source, backing up each
# destination first, then re-run the diff to verify the result.
# Globals:   FILE_PAIRS (read)
# Outputs:   progress messages and final verification diff to stdout
#######################################
replace_files() {
    local pair_count=${#FILE_PAIRS[@]}
    echo "=== 开始替换文件(先备份目标文件) ==="

    for ((i=0; i<pair_count; i+=2)); do
        local source="${FILE_PAIRS[$i]}"
        local dest="${FILE_PAIRS[$i+1]}"

        echo -e "\n--- 处理文件对:$source -> $dest ---"
        # Back up the destination before overwriting it.
        backup_file "$dest"
        cp -f "$source" "$dest"
        echo "已替换:$source 覆盖 $dest"
    done

    # After replacing, the pairs should show no differences.
    echo -e "\n=== 替换完成,验证最终差异(应无差异) ==="
    show_diffs
}
|
||||
|
||||
# 回滚替换操作(恢复 .bak 备份文件)
|
||||
#######################################
# Undo a previous replace by restoring each destination from its
# ".bak" backup. Missing backups are skipped with a message.
# Globals:   FILE_PAIRS (read)
# Outputs:   progress messages to stdout
#######################################
rollback_files() {
    local pair_count=${#FILE_PAIRS[@]}
    echo "=== 开始回滚替换操作 ==="

    for ((i=0; i<pair_count; i+=2)); do
        local dest="${FILE_PAIRS[$i+1]}"
        local backup="$dest.bak"

        echo -e "\n--- 处理回滚:$backup -> $dest ---"
        if [ -f "$backup" ]; then
            # Keep a temporary copy of the current file in case the
            # restore itself fails.
            cp -a "$dest" "$dest.rollback_temp" 2>/dev/null || true
            # Restore the backup (mv consumes the .bak file).
            mv -f "$backup" "$dest"
            echo "已回滚:$dest 恢复为备份版本"
            # Drop the temporary safety copy.
            rm -f "$dest.rollback_temp" 2>/dev/null || true
        else
            echo "跳过:备份文件不存在 - $backup"
        fi
    done

    echo -e "\n=== 回滚操作执行完成 ==="
}
|
||||
|
||||
# 主逻辑
|
||||
#######################################
# Entry point: dispatch on the first CLI argument.
# Arguments: $1 - "check", "replace" or "rollback"
# Exits:     1 on unknown/missing argument (after printing usage)
#######################################
main() {
    case "$1" in
        check)
            echo "=== 执行文件差异检查(不修改文件) ==="
            check_files_exist "check"
            show_diffs
            ;;
        replace)
            echo "=== 执行文件替换操作(自动备份) ==="
            check_files_exist "replace"
            replace_files
            ;;
        rollback)
            echo "=== 执行文件回滚操作(恢复备份) ==="
            check_files_exist "rollback"
            rollback_files
            ;;
        *)
            echo "用法:$0 [check|replace|rollback]"
            echo "  check    - 仅检查所有文件对的差异,不做修改"
            echo "  replace  - 备份所有目标文件并执行替换,完成后验证差异"
            echo "  rollback - 回滚替换操作(恢复 .bak 备份文件)"
            exit 1
            ;;
    esac
}

# Run the dispatcher; only the first argument is consumed.
main "$1"
|
||||
47
docker/paperless/plugins/em_reports_consume.sh
Normal file
47
docker/paperless/plugins/em_reports_consume.sh
Normal file
@ -0,0 +1,47 @@
|
||||
#!/bin/bash
#
# Move downloaded stock-report PDFs from the shared data folder into the
# paperless consume directory, fixing ownership and permissions on the way.
# Progress and errors are appended to $LOG.

SRC="/volume1/docker/sharedata/stock_data/pdfs"
DST="/volume1/docker/sharedata/stock_data/em_reports_consume"
LOG="/volume1/docker/projects/devops/docker/paperless/plugins/log/paperless.log"

# Ownership applied to every moved file (paperless container user).
TARGET_UID=1000
TARGET_GID=1000

# FIX: create the log directory BEFORE the first write to $LOG. The
# original script ran the directory checks (which tee -a to $LOG) before
# ensuring the log directory existed, so first-run error logging failed.
LOG_DIR=$(dirname "$LOG")
if [ ! -d "$LOG_DIR" ]; then
    mkdir -p "$LOG_DIR"
    echo "$(date '+%F %T') [INFO] log目录不存在,已创建: $LOG_DIR" | tee -a "$LOG"
fi

# Verify source and destination directories.
if [ ! -d "$SRC" ]; then
    echo "$(date '+%F %T') [ERROR] 源目录不存在: $SRC" | tee -a "$LOG"
    exit 1
fi
if [ ! -d "$DST" ]; then
    echo "$(date '+%F %T') [ERROR] 目标目录不存在: $DST" | tee -a "$LOG"
    exit 1
fi

COUNT=0
for f in "$SRC"/*.pdf; do
    # Skip the literal glob when no PDFs match.
    [ -f "$f" ] || continue

    # Copy with owner/group/mode applied, then delete the source (= move).
    # FIX: install -D needs the full destination FILE name; the original
    # passed the directory "$DST", which GNU install rejects
    # ("cannot overwrite directory").
    if install -D -o "$TARGET_UID" -g "$TARGET_GID" -m 644 "$f" "$DST/${f##*/}"; then
        rm -f "$f"
        echo "$(date '+%F %T') [OK] Moved: $f" >> "$LOG"
        COUNT=$((COUNT + 1))

        # Every 100 files, report progress to the screen and the log.
        # (Original comment said 10, but the code checked 100 — keep 100.)
        if (( COUNT % 100 == 0 )); then
            PROGRESS_MSG="$(date '+%F %T') [PROGRESS] 已移动 $COUNT 个文件"
            echo "$PROGRESS_MSG" | tee -a "$LOG"
        fi
    else
        echo "$(date '+%F %T') [FAIL] Failed: $f" >> "$LOG"
    fi
done

echo "$(date '+%F %T') [INFO] 搬运完成,共移动 $COUNT 个文件" | tee -a "$LOG"
|
||||
|
||||
472
docker/paperless/plugins/origin_parsers.py
Normal file
472
docker/paperless/plugins/origin_parsers.py
Normal file
@ -0,0 +1,472 @@
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from django.conf import settings
|
||||
from PIL import Image
|
||||
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from documents.utils import maybe_override_pixel_limit
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.models import ArchiveFileChoices
|
||||
from paperless.models import CleanChoices
|
||||
from paperless.models import ModeChoices
|
||||
|
||||
|
||||
class NoTextFoundException(Exception):
    # Raised internally by parse() when neither the sidecar nor the
    # archive PDF yielded any text; triggers the force-OCR fallback.
    pass
|
||||
|
||||
|
||||
class RtlLanguageException(Exception):
    # Marker exception for right-to-left language handling.
    # NOTE(review): not raised anywhere in this file — possibly kept for
    # compatibility with callers elsewhere; confirm before removing.
    pass
|
||||
|
||||
|
||||
class RasterisedDocumentParser(DocumentParser):
|
||||
"""
|
||||
This parser uses Tesseract to try and get some text out of a rasterised
|
||||
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
|
||||
"""
|
||||
|
||||
logging_name = "paperless.parsing.tesseract"
|
||||
|
||||
    def get_settings(self) -> OcrConfig:
        """
        This parser uses the OCR configuration settings to parse documents.

        Returns a fresh OcrConfig instance built from the current
        Paperless configuration.
        """
        return OcrConfig()
|
||||
|
||||
def get_page_count(self, document_path, mime_type):
|
||||
page_count = None
|
||||
if mime_type == "application/pdf":
|
||||
try:
|
||||
import pikepdf
|
||||
|
||||
with pikepdf.Pdf.open(document_path) as pdf:
|
||||
page_count = len(pdf.pages)
|
||||
except Exception as e:
|
||||
self.log.warning(
|
||||
f"Unable to determine PDF page count {document_path}: {e}",
|
||||
)
|
||||
return page_count
|
||||
|
||||
def extract_metadata(self, document_path, mime_type):
|
||||
result = []
|
||||
if mime_type == "application/pdf":
|
||||
import pikepdf
|
||||
|
||||
namespace_pattern = re.compile(r"\{(.*)\}(.*)")
|
||||
|
||||
pdf = pikepdf.open(document_path)
|
||||
meta = pdf.open_metadata()
|
||||
for key, value in meta.items():
|
||||
if isinstance(value, list):
|
||||
value = " ".join([str(e) for e in value])
|
||||
value = str(value)
|
||||
try:
|
||||
m = namespace_pattern.match(key)
|
||||
if m is None: # pragma: no cover
|
||||
continue
|
||||
namespace = m.group(1)
|
||||
key_value = m.group(2)
|
||||
try:
|
||||
namespace.encode("utf-8")
|
||||
key_value.encode("utf-8")
|
||||
except UnicodeEncodeError as e: # pragma: no cover
|
||||
self.log.debug(f"Skipping metadata key {key}: {e}")
|
||||
continue
|
||||
result.append(
|
||||
{
|
||||
"namespace": namespace,
|
||||
"prefix": meta.REVERSE_NS[namespace],
|
||||
"key": key_value,
|
||||
"value": value,
|
||||
},
|
||||
)
|
||||
except Exception as e:
|
||||
self.log.warning(
|
||||
f"Error while reading metadata {key}: {value}. Error: {e}",
|
||||
)
|
||||
return result
|
||||
|
||||
    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """Generate a thumbnail, preferring the OCR'ed archive PDF if set."""
        # self.archive_path is only populated after a successful parse();
        # fall back to the original document otherwise.
        return make_thumbnail_from_pdf(
            self.archive_path or document_path,
            self.tempdir,
            self.logging_group,
        )
|
||||
|
||||
def is_image(self, mime_type) -> bool:
|
||||
return mime_type in [
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
"image/tiff",
|
||||
"image/bmp",
|
||||
"image/gif",
|
||||
"image/webp",
|
||||
"image/heic",
|
||||
]
|
||||
|
||||
    def has_alpha(self, image) -> bool:
        """Return True when the image has an alpha channel (RGBA/LA modes)."""
        # img2pdf (used downstream by ocrmypdf) cannot handle alpha;
        # callers strip it via remove_alpha() when this returns True.
        with Image.open(image) as im:
            return im.mode in ("RGBA", "LA")
|
||||
|
||||
    def remove_alpha(self, image_path: str) -> Path:
        """
        Strip the alpha channel from an image using the configured
        convert binary (ImageMagick).

        Writes the result to "image-no-alpha" inside this parser's temp
        directory and returns that path; the input file is untouched.
        """
        no_alpha_image = Path(self.tempdir) / "image-no-alpha"
        run_subprocess(
            [
                settings.CONVERT_BINARY,
                "-alpha",
                "off",
                image_path,
                no_alpha_image,
            ],
            logger=self.log,
        )
        return no_alpha_image
|
||||
|
||||
    def get_dpi(self, image) -> int | None:
        """Return the horizontal DPI stored in the image metadata, or None."""
        try:
            with Image.open(image) as im:
                # "dpi" is (x, y); only the horizontal value is used.
                x, _ = im.info["dpi"]
                return round(x)
        except Exception as e:
            # Missing "dpi" key or unreadable file both land here.
            self.log.warning(f"Error while getting DPI from image {image}: {e}")
            return None
|
||||
|
||||
    def calculate_a4_dpi(self, image) -> int | None:
        """Estimate DPI by assuming the image is an A4-width page, or None."""
        try:
            with Image.open(image) as im:
                width, _ = im.size
                # divide image width by A4 width (210mm) in inches.
                dpi = int(width / (21 / 2.54))
                self.log.debug(f"Estimated DPI {dpi} based on image width {width}")
                return dpi

        except Exception as e:
            self.log.warning(f"Error while calculating DPI for image {image}: {e}")
            return None
|
||||
|
||||
    def extract_text(
        self,
        sidecar_file: Path | None,
        pdf_file: Path,
    ) -> str | None:
        """
        Extract text for a document, preferring the OCR sidecar file and
        falling back to running pdftotext on the PDF.

        Returns post-processed text, or None when the PDF does not exist
        or pdftotext fails (e.g. the file is not actually a PDF).
        """
        # When re-doing OCR, the sidecar contains ONLY the new text, not
        # the whole text, so do not utilize it in that case
        if (
            sidecar_file is not None
            and sidecar_file.is_file()
            and self.settings.mode != "redo"
        ):
            text = self.read_file_handle_unicode_errors(sidecar_file)

            if "[OCR skipped on page" not in text:
                # This happens when there's already text in the input file.
                # The sidecar file will only contain text for OCR'ed pages.
                self.log.debug("Using text from sidecar file")
                return post_process_text(text)
            else:
                self.log.debug("Incomplete sidecar file: discarding.")

        # no success with the sidecar file, try PDF

        if not Path(pdf_file).is_file():
            return None

        try:
            text = None
            # Temp file lives in self.tempdir so it is cleaned up with
            # the rest of the parser's working files.
            with tempfile.NamedTemporaryFile(
                mode="w+",
                dir=self.tempdir,
            ) as tmp:
                run_subprocess(
                    [
                        "pdftotext",
                        "-q",
                        "-layout",
                        "-enc",
                        "UTF-8",
                        pdf_file,
                        tmp.name,
                    ],
                    logger=self.log,
                )
                text = self.read_file_handle_unicode_errors(Path(tmp.name))

            return post_process_text(text)

        except Exception:
            # If pdftotext fails, fall back to OCR.
            self.log.warning(
                "Error while getting text from PDF document with pdftotext",
                exc_info=True,
            )
            # probably not a PDF file.
            return None
|
||||
|
||||
    def construct_ocrmypdf_parameters(
        self,
        input_file,
        mime_type,
        output_file,
        sidecar_file,
        *,
        safe_fallback=False,
    ):
        """
        Build the keyword-argument dict passed to ocrmypdf.ocr().

        Combines the configured OCR mode, cleaning, deskew, rotation and
        page-limit settings; for image inputs also resolves a usable DPI
        (metadata -> OCR_IMAGE_DPI setting -> A4 estimate) and strips any
        alpha channel. With safe_fallback=True, force_ocr is used
        regardless of the configured mode.

        Raises ParseError on an invalid mode or when no DPI can be
        determined for an image input.
        """
        if TYPE_CHECKING:
            assert isinstance(self.settings, OcrConfig)
        ocrmypdf_args = {
            "input_file": input_file,
            "output_file": output_file,
            # need to use threads, since this will be run in daemonized
            # processes via the task library.
            "use_threads": True,
            "jobs": settings.THREADS_PER_WORKER,
            "language": self.settings.language,
            "output_type": self.settings.output_type,
            "progress_bar": False,
        }

        # Color conversion only applies to PDF/A output types.
        if "pdfa" in ocrmypdf_args["output_type"]:
            ocrmypdf_args["color_conversion_strategy"] = (
                self.settings.color_conversion_strategy
            )

        # Map the configured mode onto mutually exclusive ocrmypdf flags.
        if self.settings.mode == ModeChoices.FORCE or safe_fallback:
            ocrmypdf_args["force_ocr"] = True
        elif self.settings.mode in {
            ModeChoices.SKIP,
            ModeChoices.SKIP_NO_ARCHIVE,
        }:
            ocrmypdf_args["skip_text"] = True
        elif self.settings.mode == ModeChoices.REDO:
            ocrmypdf_args["redo_ocr"] = True
        else:  # pragma: no cover
            raise ParseError(f"Invalid ocr mode: {self.settings.mode}")

        if self.settings.clean == CleanChoices.CLEAN:
            ocrmypdf_args["clean"] = True
        elif self.settings.clean == CleanChoices.FINAL:
            if self.settings.mode == ModeChoices.REDO:
                ocrmypdf_args["clean"] = True
            else:
                # --clean-final is not compatible with --redo-ocr
                ocrmypdf_args["clean_final"] = True

        if self.settings.deskew and self.settings.mode != ModeChoices.REDO:
            # --deskew is not compatible with --redo-ocr
            ocrmypdf_args["deskew"] = True

        if self.settings.rotate:
            ocrmypdf_args["rotate_pages"] = True
            ocrmypdf_args["rotate_pages_threshold"] = self.settings.rotate_threshold

        if self.settings.pages is not None and self.settings.pages > 0:
            ocrmypdf_args["pages"] = f"1-{self.settings.pages}"
        else:
            # sidecar is incompatible with pages
            ocrmypdf_args["sidecar"] = sidecar_file

        if self.is_image(mime_type):
            # This may be required, depending on the known information
            maybe_override_pixel_limit()

            dpi = self.get_dpi(input_file)
            a4_dpi = self.calculate_a4_dpi(input_file)

            if self.has_alpha(input_file):
                self.log.info(
                    f"Removing alpha layer from {input_file} "
                    "for compatibility with img2pdf",
                )
                # Replace the input file with the non-alpha
                ocrmypdf_args["input_file"] = self.remove_alpha(input_file)

            # DPI resolution order: image metadata, then the configured
            # OCR_IMAGE_DPI, then the A4-width estimate.
            if dpi:
                self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
                ocrmypdf_args["image_dpi"] = dpi
            elif self.settings.image_dpi is not None:
                ocrmypdf_args["image_dpi"] = self.settings.image_dpi
            elif a4_dpi:
                ocrmypdf_args["image_dpi"] = a4_dpi
            else:
                raise ParseError(
                    f"Cannot produce archive PDF for image {input_file}, "
                    f"no DPI information is present in this image and "
                    f"OCR_IMAGE_DPI is not set.",
                )
            if ocrmypdf_args["image_dpi"] < 70:  # pragma: no cover
                self.log.warning(
                    f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail",
                )

        # User-supplied args override everything assembled above.
        if self.settings.user_args is not None:
            try:
                ocrmypdf_args = {**ocrmypdf_args, **self.settings.user_args}
            except Exception as e:
                self.log.warning(
                    f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                    f"they will not be used. Error: {e}",
                )

        if (
            self.settings.max_image_pixel is not None
            and self.settings.max_image_pixel >= 0
        ):
            # Convert pixels to mega-pixels and provide to ocrmypdf
            max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0
            msg = (
                "OCR pixel limit is disabled!"
                if max_pixels_mpixels == 0
                else f"Calculated {max_pixels_mpixels} megapixels for OCR"
            )
            self.log.debug(msg)
            ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels

        return ocrmypdf_args
|
||||
|
||||
    def parse(self, document_path: Path, mime_type, file_name=None):
        """
        Parse a document: extract its text and (usually) produce an
        OCR'ed archive PDF.

        Flow: try to read existing text from a PDF; if the settings say
        an archive is not needed and text exists, stop there. Otherwise
        run OCRmyPDF, with a force-OCR fallback when the first pass finds
        no text, and finally fall back to the original text (or "") if
        everything else failed. Results are stored on self.text and
        self.archive_path.

        Raises ParseError for unrecoverable OCR failures.
        """
        # This forces tesseract to use one core per page.
        os.environ["OMP_THREAD_LIMIT"] = "1"
        # Minimum character count for extracted text to be considered real.
        VALID_TEXT_LENGTH = 50

        if mime_type == "application/pdf":
            text_original = self.extract_text(None, document_path)
            original_has_text = (
                text_original is not None and len(text_original) > VALID_TEXT_LENGTH
            )
        else:
            text_original = None
            original_has_text = False

        # If the original has text, and the user doesn't want an archive,
        # we're done here
        skip_archive_for_text = (
            self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
            or self.settings.skip_archive_file
            in {
                ArchiveFileChoices.WITH_TEXT,
                ArchiveFileChoices.ALWAYS,
            }
        )
        if skip_archive_for_text and original_has_text:
            # NOTE(review): this logs the ENTIRE extracted document text
            # at debug level — potentially huge and may leak document
            # contents into logs; consider truncating.
            self.log.debug(f"Document has text, skipping OCRmyPDF entirely. {text_original}")
            self.text = text_original
            return

        # Either no text was in the original or there should be an archive
        # file created, so OCR the file and create an archive with any
        # text located via OCR

        import ocrmypdf
        from ocrmypdf import EncryptedPdfError
        from ocrmypdf import InputFileError
        from ocrmypdf import SubprocessOutputError
        from ocrmypdf.exceptions import DigitalSignatureError

        archive_path = Path(self.tempdir) / "archive.pdf"
        sidecar_file = Path(self.tempdir) / "sidecar.txt"

        args = self.construct_ocrmypdf_parameters(
            document_path,
            mime_type,
            archive_path,
            sidecar_file,
        )

        try:
            self.log.debug(f"Calling OCRmyPDF with args: {args}")
            ocrmypdf.ocr(**args)

            # ALWAYS means "always skip the archive file": do not keep it.
            if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
                self.archive_path = archive_path

            self.text = self.extract_text(sidecar_file, archive_path)

            if not self.text:
                raise NoTextFoundException("No text was found in the original document")
        except (DigitalSignatureError, EncryptedPdfError):
            self.log.warning(
                "This file is encrypted and/or signed, OCR is impossible. Using "
                "any text present in the original file.",
            )
            if original_has_text:
                self.text = text_original
        except SubprocessOutputError as e:
            if "Ghostscript PDF/A rendering" in str(e):
                self.log.warning(
                    "Ghostscript PDF/A rendering failed, consider setting "
                    "PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
                )

            raise ParseError(
                f"SubprocessOutputError: {e!s}. See logs for more information.",
            ) from e
        except (NoTextFoundException, InputFileError) as e:
            self.log.warning(
                f"Encountered an error while running OCR: {e!s}. "
                f"Attempting force OCR to get the text.",
            )

            archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf"
            sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt"

            # Attempt to run OCR with safe settings.

            args = self.construct_ocrmypdf_parameters(
                document_path,
                mime_type,
                archive_path_fallback,
                sidecar_file_fallback,
                safe_fallback=True,
            )

            try:
                self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
                ocrmypdf.ocr(**args)

                # Don't return the archived file here, since this file
                # is bigger and blurry due to --force-ocr.

                self.text = self.extract_text(
                    sidecar_file_fallback,
                    archive_path_fallback,
                )

            except Exception as e:
                # If this fails, we have a serious issue at hand.
                raise ParseError(f"{e.__class__.__name__}: {e!s}") from e

        except Exception as e:
            # Anything else is probably serious.
            raise ParseError(f"{e.__class__.__name__}: {e!s}") from e

        # As a last resort, if we still don't have any text for any reason,
        # try to extract the text from the original document.
        if not self.text:
            if original_has_text:
                self.text = text_original
            else:
                self.log.warning(
                    f"No text was found in {document_path}, the content will be empty.",
                )
                self.text = ""
|
||||
|
||||
|
||||
def post_process_text(text):
    """
    Normalize whitespace in extracted text.

    Returns None for falsy input. Otherwise: collapses runs of
    horizontal whitespace to a single space, removes indentation after
    line breaks, strips trailing whitespace at the end, trims the
    result, and replaces NUL bytes (which break saving to postgres).
    """
    if not text:
        return None

    # Runs of horizontal whitespace (not \r or \n) become one space.
    cleaned = re.sub(r"([^\S\r\n]+)", " ", text)
    # Whitespace directly after a line break is dropped.
    cleaned = re.sub(r"([\n\r]+)([^\S\n\r]+)", "\\1", cleaned)
    # Horizontal whitespace at the very end of the text is dropped.
    cleaned = re.sub(r"([^\S\n\r]+)$", "", cleaned)

    # TODO: this needs a rework
    # replace \0 prevents issues with saving to postgres.
    # text may contain \0 when this character is present in PDF files.
    return cleaned.strip().replace("\0", " ")
|
||||
@ -1,41 +0,0 @@
|
||||
-- documents_correspondent definition
|
||||
|
||||
CREATE TABLE "documents_correspondent" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "name" varchar(128) NOT NULL, "match" varchar(256) NOT NULL, "matching_algorithm" integer unsigned NOT NULL CHECK ("matching_algorithm" >= 0), "is_insensitive" bool NOT NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, CONSTRAINT "documents_correspondent_unique_name_owner" UNIQUE ("name", "owner_id"));
|
||||
|
||||
CREATE UNIQUE INDEX "documents_correspondent_name_uniq" ON "documents_correspondent" ("name") WHERE "owner_id" IS NULL;
|
||||
CREATE INDEX "documents_correspondent_owner_id_078f7f8a" ON "documents_correspondent" ("owner_id");
|
||||
|
||||
-- documents_customfield definition
|
||||
|
||||
CREATE TABLE "documents_customfield" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "created" datetime NOT NULL, "name" varchar(128) NOT NULL, "data_type" varchar(50) NOT NULL, "extra_data" text NULL CHECK ((JSON_VALID("extra_data") OR "extra_data" IS NULL)), CONSTRAINT "documents_customfield_unique_name" UNIQUE ("name"));
|
||||
|
||||
CREATE INDEX "documents_customfield_created_501ef047" ON "documents_customfield" ("created");
|
||||
|
||||
-- documents_customfieldinstance definition
|
||||
|
||||
CREATE TABLE "documents_customfieldinstance" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "created" datetime NOT NULL, "value_text" varchar(128) NULL, "value_bool" bool NULL, "value_url" varchar(200) NULL, "value_date" date NULL, "value_int" integer NULL, "value_float" real NULL, "value_monetary" varchar(128) NULL, "document_id" integer NOT NULL REFERENCES "documents_document" ("id") DEFERRABLE INITIALLY DEFERRED, "field_id" integer NOT NULL REFERENCES "documents_customfield" ("id") DEFERRABLE INITIALLY DEFERRED, "value_document_ids" text NULL CHECK ((JSON_VALID("value_document_ids") OR "value_document_ids" IS NULL)), "value_monetary_amount" decimal GENERATED ALWAYS AS ((CAST(CASE WHEN "value_monetary" REGEXP '^\d+' THEN CAST(SUBSTR("value_monetary", 1) AS decimal) ELSE CAST(SUBSTR("value_monetary", 4) AS decimal) END AS NUMERIC))) STORED, "deleted_at" datetime NULL, "restored_at" datetime NULL, "transaction_id" char(32) NULL, "value_select" varchar(16) NULL, CONSTRAINT "documents_customfieldinstance_unique_document_field" UNIQUE ("document_id", "field_id"));
|
||||
|
||||
CREATE INDEX "documents_customfieldinstance_created_75f17f1d" ON "documents_customfieldinstance" ("created");
|
||||
CREATE INDEX "documents_customfieldinstance_document_id_610a968e" ON "documents_customfieldinstance" ("document_id");
|
||||
CREATE INDEX "documents_customfieldinstance_field_id_6c59e32f" ON "documents_customfieldinstance" ("field_id");
|
||||
|
||||
|
||||
-- documents_document definition
|
||||
|
||||
CREATE TABLE "documents_document" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "title" varchar(128) NOT NULL, "content" text NOT NULL, "created" datetime NOT NULL, "modified" datetime NOT NULL, "correspondent_id" integer NULL REFERENCES "documents_correspondent" ("id") DEFERRABLE INITIALLY DEFERRED, "checksum" varchar(32) NOT NULL UNIQUE, "added" datetime NOT NULL, "storage_type" varchar(11) NOT NULL, "filename" varchar(1024) NULL UNIQUE, "archive_serial_number" integer unsigned NULL UNIQUE CHECK ("archive_serial_number" >= 0), "document_type_id" integer NULL REFERENCES "documents_documenttype" ("id") DEFERRABLE INITIALLY DEFERRED, "mime_type" varchar(256) NOT NULL, "archive_checksum" varchar(32) NULL, "archive_filename" varchar(1024) NULL UNIQUE, "storage_path_id" integer NULL REFERENCES "documents_storagepath" ("id") DEFERRABLE INITIALLY DEFERRED, "original_filename" varchar(1024) NULL, "deleted_at" datetime NULL, "restored_at" datetime NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, "transaction_id" char(32) NULL, "page_count" integer unsigned NULL CHECK ("page_count" >= 0));
|
||||
|
||||
CREATE INDEX "documents_document_title_6b08e02a" ON "documents_document" ("title");
|
||||
CREATE INDEX "documents_document_created_bedd0818" ON "documents_document" ("created");
|
||||
CREATE INDEX "documents_document_modified_2eae15bc" ON "documents_document" ("modified");
|
||||
CREATE INDEX "documents_document_correspondent_id_6164eb0c" ON "documents_document" ("correspondent_id");
|
||||
CREATE INDEX "documents_document_added_28cfa360" ON "documents_document" ("added");
|
||||
CREATE INDEX "documents_document_document_type_id_1f88b50c" ON "documents_document" ("document_type_id");
|
||||
CREATE INDEX "documents_document_storage_path_id_07d27bdb" ON "documents_document" ("storage_path_id");
|
||||
CREATE INDEX "documents_document_owner_id_04d2b723" ON "documents_document" ("owner_id");
|
||||
|
||||
-- documents_documenttype definition
|
||||
|
||||
CREATE TABLE "documents_documenttype" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "name" varchar(128) NOT NULL, "match" varchar(256) NOT NULL, "matching_algorithm" integer unsigned NOT NULL CHECK ("matching_algorithm" >= 0), "is_insensitive" bool NOT NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, CONSTRAINT "documents_documenttype_unique_name_owner" UNIQUE ("name", "owner_id"));
|
||||
|
||||
CREATE UNIQUE INDEX "documents_documenttype_name_uniq" ON "documents_documenttype" ("name") WHERE "owner_id" IS NULL;
|
||||
CREATE INDEX "documents_documenttype_owner_id_a19f201d" ON "documents_documenttype" ("owner_id");
|
||||
@ -1,63 +0,0 @@
|
||||
我提供的文件,是 paperless 的SQLite数据库的关键表。现在我们编写它的 PAPERLESS_POST_CONSUME_SCRIPT。需求如下:
|
||||
|
||||
1, 我们提供的pdf文件格式为 {publish_date}_{report_type}_{org_sname}_{industry_name}_{stock_name}_{title}.pdf
|
||||
2,我们提取上面的各个字段,然后:
|
||||
1) report_type 对应到 documents_documenttype.name 所以我们要查询 documents_documenttype 表,如果对应的name不存在,则插入一条记录;然后得到对应的 documents_documenttype.id
|
||||
2) org_sname 对应到 documents_correspondent.name 所以我们要查询 documents_correspondent 表,如果对应的name 不存在,则插入一条记录,然后得到对应的 documents_correspondent.id
|
||||
3) 检查 documents_customfield 表是否包含 '行业' 和 '股票名称' 字段,如果不存在,则创建; 查到他们分别对应的 documents_customfield.id , 记为 stockname_id, industry_id
|
||||
3,我们开始更新数据表:
|
||||
1) 更新 documents_document 表对应的记录, created = publish_date, correspondent_id = documents_correspondent.id , document_type_id = documents_documenttype.id, title={title}
|
||||
2) 向 documents_customfieldinstance 插入两条记录,分别为 (document_id, stockname_id, stock_name) 和 (document_id, industry_id, industry_name)
|
||||
|
||||
好了,请你根据以上需求,完成这个python脚本。注意异常情况的处理,以及日志输出。如果文件名无法匹配以上的格式,则忽略,不用处理。
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Paperless makes use of the Django REST Framework standard API interface. It provides a browsable API for most of its endpoints, which you can inspect at http://<paperless-host>:<port>/api/. This also documents most of the available filters and ordering fields.
|
||||
|
||||
The API provides the following main endpoints:
|
||||
|
||||
/api/correspondents/: Full CRUD support.
|
||||
/api/custom_fields/: Full CRUD support.
|
||||
/api/documents/: Full CRUD support, except POSTing new documents. See below.
|
||||
/api/document_types/: Full CRUD support.
|
||||
/api/groups/: Full CRUD support.
|
||||
/api/logs/: Read-Only.
|
||||
/api/mail_accounts/: Full CRUD support.
|
||||
/api/mail_rules/: Full CRUD support.
|
||||
/api/profile/: GET, PATCH
|
||||
/api/share_links/: Full CRUD support.
|
||||
/api/storage_paths/: Full CRUD support.
|
||||
/api/tags/: Full CRUD support.
|
||||
/api/tasks/: Read-only.
|
||||
/api/users/: Full CRUD support.
|
||||
/api/workflows/: Full CRUD support.
|
||||
/api/search/ GET, see below.
|
||||
All of these endpoints except for the logging endpoint allow you to fetch (and edit and delete where appropriate) individual objects by appending their primary key to the path, e.g. /api/documents/454/.
|
||||
|
||||
The objects served by the document endpoint contain the following fields:
|
||||
|
||||
id: ID of the document. Read-only.
|
||||
title: Title of the document.
|
||||
content: Plain text content of the document.
|
||||
tags: List of IDs of tags assigned to this document, or empty list.
|
||||
document_type: Document type of this document, or null.
|
||||
correspondent: Correspondent of this document or null.
|
||||
created: The date time at which this document was created.
|
||||
created_date: The date (YYYY-MM-DD) at which this document was created. Optional. If also passed with created, this is ignored.
|
||||
modified: The date at which this document was last edited in paperless. Read-only.
|
||||
added: The date at which this document was added to paperless. Read-only.
|
||||
archive_serial_number: The identifier of this document in a physical document archive.
|
||||
original_file_name: Verbose filename of the original document. Read-only.
|
||||
archived_file_name: Verbose filename of the archived document. Read-only. Null if no archived document is available.
|
||||
notes: Array of notes associated with the document.
|
||||
page_count: Number of pages.
|
||||
set_permissions: Allows setting document permissions. Optional, write-only. See below.
|
||||
custom_fields: Array of custom fields & values, specified as { field: CUSTOM_FIELD_ID, value: VALUE }
|
||||
|
||||
|
||||
以上是paperless提供的api。我们现在使用 http://localhost:8000 来访问它。那么,我想对编号为19的文档进行查询,以及更新操作,应该如何写对应的python代码?
|
||||
|
||||
|
||||
@ -11,7 +11,8 @@ from requests.exceptions import RequestException
|
||||
|
||||
# Paperless 服务器信息
|
||||
PAPERLESS_URL = "http://localhost:8000/api"
|
||||
AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth 认证
|
||||
#AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth 认证, mac上用这个
|
||||
AUTH = HTTPBasicAuth("admin", "paperless") # Basic Auth 认证,NAS上用这个
|
||||
|
||||
# 日志配置
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||
@ -22,7 +23,7 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(me
|
||||
DB_PATH = "/usr/src/paperless/data/db.sqlite3"
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
enable_db = True
|
||||
enable_db = False # 标准用法,用API
|
||||
|
||||
# 正则解析文件名
|
||||
FILENAME_PATTERN = re.compile(r"(\d{4}-\d{2}-\d{2})_(.*?)_(.*?)_(.*?)_(.*?)_(.*)\.pdf")
|
||||
|
||||
484
docker/paperless/plugins/parsers.py
Executable file
484
docker/paperless/plugins/parsers.py
Executable file
@ -0,0 +1,484 @@
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from django.conf import settings
|
||||
from PIL import Image
|
||||
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from documents.utils import maybe_override_pixel_limit
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.models import ArchiveFileChoices
|
||||
from paperless.models import CleanChoices
|
||||
from paperless.models import ModeChoices
|
||||
|
||||
|
||||
class NoTextFoundException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class RtlLanguageException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class RasterisedDocumentParser(DocumentParser):
|
||||
"""
|
||||
This parser uses Tesseract to try and get some text out of a rasterised
|
||||
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
|
||||
"""
|
||||
|
||||
logging_name = "paperless.parsing.tesseract"
|
||||
|
||||
def get_settings(self) -> OcrConfig:
|
||||
"""
|
||||
This parser uses the OCR configuration settings to parse documents
|
||||
"""
|
||||
return OcrConfig()
|
||||
|
||||
def get_page_count(self, document_path, mime_type):
|
||||
page_count = None
|
||||
if mime_type == "application/pdf":
|
||||
try:
|
||||
import pikepdf
|
||||
|
||||
with pikepdf.Pdf.open(document_path) as pdf:
|
||||
page_count = len(pdf.pages)
|
||||
except Exception as e:
|
||||
self.log.warning(
|
||||
f"Unable to determine PDF page count {document_path}: {e}",
|
||||
)
|
||||
return page_count
|
||||
|
||||
def extract_metadata(self, document_path, mime_type):
|
||||
result = []
|
||||
if mime_type == "application/pdf":
|
||||
import pikepdf
|
||||
|
||||
namespace_pattern = re.compile(r"\{(.*)\}(.*)")
|
||||
|
||||
pdf = pikepdf.open(document_path)
|
||||
meta = pdf.open_metadata()
|
||||
for key, value in meta.items():
|
||||
if isinstance(value, list):
|
||||
value = " ".join([str(e) for e in value])
|
||||
value = str(value)
|
||||
try:
|
||||
m = namespace_pattern.match(key)
|
||||
if m is None: # pragma: no cover
|
||||
continue
|
||||
namespace = m.group(1)
|
||||
key_value = m.group(2)
|
||||
try:
|
||||
namespace.encode("utf-8")
|
||||
key_value.encode("utf-8")
|
||||
except UnicodeEncodeError as e: # pragma: no cover
|
||||
self.log.debug(f"Skipping metadata key {key}: {e}")
|
||||
continue
|
||||
result.append(
|
||||
{
|
||||
"namespace": namespace,
|
||||
"prefix": meta.REVERSE_NS[namespace],
|
||||
"key": key_value,
|
||||
"value": value,
|
||||
},
|
||||
)
|
||||
except Exception as e:
|
||||
self.log.warning(
|
||||
f"Error while reading metadata {key}: {value}. Error: {e}",
|
||||
)
|
||||
return result
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
return make_thumbnail_from_pdf(
|
||||
self.archive_path or document_path,
|
||||
self.tempdir,
|
||||
self.logging_group,
|
||||
)
|
||||
|
||||
def is_image(self, mime_type) -> bool:
|
||||
return mime_type in [
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
"image/tiff",
|
||||
"image/bmp",
|
||||
"image/gif",
|
||||
"image/webp",
|
||||
"image/heic",
|
||||
]
|
||||
|
||||
def has_alpha(self, image) -> bool:
|
||||
with Image.open(image) as im:
|
||||
return im.mode in ("RGBA", "LA")
|
||||
|
||||
def remove_alpha(self, image_path: str) -> Path:
|
||||
no_alpha_image = Path(self.tempdir) / "image-no-alpha"
|
||||
run_subprocess(
|
||||
[
|
||||
settings.CONVERT_BINARY,
|
||||
"-alpha",
|
||||
"off",
|
||||
image_path,
|
||||
no_alpha_image,
|
||||
],
|
||||
logger=self.log,
|
||||
)
|
||||
return no_alpha_image
|
||||
|
||||
def get_dpi(self, image) -> int | None:
|
||||
try:
|
||||
with Image.open(image) as im:
|
||||
x, _ = im.info["dpi"]
|
||||
return round(x)
|
||||
except Exception as e:
|
||||
self.log.warning(f"Error while getting DPI from image {image}: {e}")
|
||||
return None
|
||||
|
||||
def calculate_a4_dpi(self, image) -> int | None:
|
||||
try:
|
||||
with Image.open(image) as im:
|
||||
width, _ = im.size
|
||||
# divide image width by A4 width (210mm) in inches.
|
||||
dpi = int(width / (21 / 2.54))
|
||||
self.log.debug(f"Estimated DPI {dpi} based on image width {width}")
|
||||
return dpi
|
||||
|
||||
except Exception as e:
|
||||
self.log.warning(f"Error while calculating DPI for image {image}: {e}")
|
||||
return None
|
||||
|
||||
def extract_text(
|
||||
self,
|
||||
sidecar_file: Path | None,
|
||||
pdf_file: Path,
|
||||
) -> str | None:
|
||||
# When re-doing OCR, the sidecar contains ONLY the new text, not
|
||||
# the whole text, so do not utilize it in that case
|
||||
if (
|
||||
sidecar_file is not None
|
||||
and sidecar_file.is_file()
|
||||
and self.settings.mode != "redo"
|
||||
):
|
||||
text = self.read_file_handle_unicode_errors(sidecar_file)
|
||||
|
||||
if "[OCR skipped on page" not in text:
|
||||
# This happens when there's already text in the input file.
|
||||
# The sidecar file will only contain text for OCR'ed pages.
|
||||
self.log.debug("Using text from sidecar file")
|
||||
return post_process_text(text)
|
||||
else:
|
||||
self.log.debug("Incomplete sidecar file: discarding.")
|
||||
|
||||
# no success with the sidecar file, try PDF
|
||||
|
||||
if not Path(pdf_file).is_file():
|
||||
return None
|
||||
|
||||
try:
|
||||
text = None
|
||||
with tempfile.NamedTemporaryFile(
|
||||
mode="w+",
|
||||
dir=self.tempdir,
|
||||
) as tmp:
|
||||
run_subprocess(
|
||||
[
|
||||
"pdftotext",
|
||||
"-q",
|
||||
"-layout",
|
||||
"-enc",
|
||||
"UTF-8",
|
||||
pdf_file,
|
||||
tmp.name,
|
||||
],
|
||||
logger=self.log,
|
||||
)
|
||||
text = self.read_file_handle_unicode_errors(Path(tmp.name))
|
||||
|
||||
return post_process_text(text)
|
||||
|
||||
except Exception:
|
||||
# If pdftotext fails, fall back to OCR.
|
||||
self.log.warning(
|
||||
"Error while getting text from PDF document with pdftotext",
|
||||
exc_info=True,
|
||||
)
|
||||
# probably not a PDF file.
|
||||
return None
|
||||
|
||||
def construct_ocrmypdf_parameters(
|
||||
self,
|
||||
input_file,
|
||||
mime_type,
|
||||
output_file,
|
||||
sidecar_file,
|
||||
*,
|
||||
safe_fallback=False,
|
||||
):
|
||||
if TYPE_CHECKING:
|
||||
assert isinstance(self.settings, OcrConfig)
|
||||
ocrmypdf_args = {
|
||||
"input_file": input_file,
|
||||
"output_file": output_file,
|
||||
# need to use threads, since this will be run in daemonized
|
||||
# processes via the task library.
|
||||
"use_threads": True,
|
||||
"jobs": settings.THREADS_PER_WORKER,
|
||||
"language": self.settings.language,
|
||||
"output_type": self.settings.output_type,
|
||||
"progress_bar": False,
|
||||
}
|
||||
|
||||
if "pdfa" in ocrmypdf_args["output_type"]:
|
||||
ocrmypdf_args["color_conversion_strategy"] = (
|
||||
self.settings.color_conversion_strategy
|
||||
)
|
||||
|
||||
if self.settings.mode == ModeChoices.FORCE or safe_fallback:
|
||||
ocrmypdf_args["force_ocr"] = True
|
||||
elif self.settings.mode in {
|
||||
ModeChoices.SKIP,
|
||||
ModeChoices.SKIP_NO_ARCHIVE,
|
||||
}:
|
||||
ocrmypdf_args["skip_text"] = True
|
||||
elif self.settings.mode == ModeChoices.REDO:
|
||||
ocrmypdf_args["redo_ocr"] = True
|
||||
else: # pragma: no cover
|
||||
raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
|
||||
|
||||
if self.settings.clean == CleanChoices.CLEAN:
|
||||
ocrmypdf_args["clean"] = True
|
||||
elif self.settings.clean == CleanChoices.FINAL:
|
||||
if self.settings.mode == ModeChoices.REDO:
|
||||
ocrmypdf_args["clean"] = True
|
||||
else:
|
||||
# --clean-final is not compatible with --redo-ocr
|
||||
ocrmypdf_args["clean_final"] = True
|
||||
|
||||
if self.settings.deskew and self.settings.mode != ModeChoices.REDO:
|
||||
# --deskew is not compatible with --redo-ocr
|
||||
ocrmypdf_args["deskew"] = True
|
||||
|
||||
if self.settings.rotate:
|
||||
ocrmypdf_args["rotate_pages"] = True
|
||||
ocrmypdf_args["rotate_pages_threshold"] = self.settings.rotate_threshold
|
||||
|
||||
if self.settings.pages is not None and self.settings.pages > 0:
|
||||
ocrmypdf_args["pages"] = f"1-{self.settings.pages}"
|
||||
else:
|
||||
# sidecar is incompatible with pages
|
||||
ocrmypdf_args["sidecar"] = sidecar_file
|
||||
|
||||
if self.is_image(mime_type):
|
||||
# This may be required, depending on the known information
|
||||
maybe_override_pixel_limit()
|
||||
|
||||
dpi = self.get_dpi(input_file)
|
||||
a4_dpi = self.calculate_a4_dpi(input_file)
|
||||
|
||||
if self.has_alpha(input_file):
|
||||
self.log.info(
|
||||
f"Removing alpha layer from {input_file} "
|
||||
"for compatibility with img2pdf",
|
||||
)
|
||||
# Replace the input file with the non-alpha
|
||||
ocrmypdf_args["input_file"] = self.remove_alpha(input_file)
|
||||
|
||||
if dpi:
|
||||
self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
|
||||
ocrmypdf_args["image_dpi"] = dpi
|
||||
elif self.settings.image_dpi is not None:
|
||||
ocrmypdf_args["image_dpi"] = self.settings.image_dpi
|
||||
elif a4_dpi:
|
||||
ocrmypdf_args["image_dpi"] = a4_dpi
|
||||
else:
|
||||
raise ParseError(
|
||||
f"Cannot produce archive PDF for image {input_file}, "
|
||||
f"no DPI information is present in this image and "
|
||||
f"OCR_IMAGE_DPI is not set.",
|
||||
)
|
||||
if ocrmypdf_args["image_dpi"] < 70: # pragma: no cover
|
||||
self.log.warning(
|
||||
f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail",
|
||||
)
|
||||
|
||||
if self.settings.user_args is not None:
|
||||
try:
|
||||
ocrmypdf_args = {**ocrmypdf_args, **self.settings.user_args}
|
||||
except Exception as e:
|
||||
self.log.warning(
|
||||
f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
|
||||
f"they will not be used. Error: {e}",
|
||||
)
|
||||
|
||||
if (
|
||||
self.settings.max_image_pixel is not None
|
||||
and self.settings.max_image_pixel >= 0
|
||||
):
|
||||
# Convert pixels to mega-pixels and provide to ocrmypdf
|
||||
max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0
|
||||
msg = (
|
||||
"OCR pixel limit is disabled!"
|
||||
if max_pixels_mpixels == 0
|
||||
else f"Calculated {max_pixels_mpixels} megapixels for OCR"
|
||||
)
|
||||
self.log.debug(msg)
|
||||
ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
|
||||
|
||||
return ocrmypdf_args
|
||||
|
||||
def parse(self, document_path: Path, mime_type, file_name=None):
|
||||
# This forces tesseract to use one core per page.
|
||||
os.environ["OMP_THREAD_LIMIT"] = "1"
|
||||
VALID_TEXT_LENGTH = 50
|
||||
|
||||
# skip ocr process entirely to save time.
|
||||
self.text = "defautl text"
|
||||
self.log.debug("skipping reading file entirely.")
|
||||
return
|
||||
|
||||
if mime_type == "application/pdf":
|
||||
text_original = self.extract_text(None, document_path)
|
||||
original_has_text = (
|
||||
text_original is not None and len(text_original) > VALID_TEXT_LENGTH
|
||||
)
|
||||
else:
|
||||
text_original = None
|
||||
original_has_text = False
|
||||
|
||||
# If the original has text, and the user doesn't want an archive,
|
||||
# we're done here
|
||||
skip_archive_for_text = (
|
||||
self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
|
||||
or self.settings.skip_archive_file
|
||||
in {
|
||||
ArchiveFileChoices.WITH_TEXT,
|
||||
ArchiveFileChoices.ALWAYS,
|
||||
}
|
||||
)
|
||||
|
||||
# force skip ocr process.
|
||||
if not original_has_text:
|
||||
original_has_text = True
|
||||
text_original = "this is default content, as we skipped ocr process..."
|
||||
self.log.warning("Cannot read text from Document, use default message.")
|
||||
|
||||
if skip_archive_for_text and original_has_text:
|
||||
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
|
||||
self.text = text_original
|
||||
return
|
||||
|
||||
# Either no text was in the original or there should be an archive
|
||||
# file created, so OCR the file and create an archive with any
|
||||
# text located via OCR
|
||||
|
||||
import ocrmypdf
|
||||
from ocrmypdf import EncryptedPdfError
|
||||
from ocrmypdf import InputFileError
|
||||
from ocrmypdf import SubprocessOutputError
|
||||
from ocrmypdf.exceptions import DigitalSignatureError
|
||||
|
||||
archive_path = Path(self.tempdir) / "archive.pdf"
|
||||
sidecar_file = Path(self.tempdir) / "sidecar.txt"
|
||||
|
||||
args = self.construct_ocrmypdf_parameters(
|
||||
document_path,
|
||||
mime_type,
|
||||
archive_path,
|
||||
sidecar_file,
|
||||
)
|
||||
|
||||
try:
|
||||
self.log.debug(f"Calling OCRmyPDF with args: {args}")
|
||||
ocrmypdf.ocr(**args)
|
||||
|
||||
if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
|
||||
self.archive_path = archive_path
|
||||
|
||||
self.text = self.extract_text(sidecar_file, archive_path)
|
||||
|
||||
if not self.text:
|
||||
raise NoTextFoundException("No text was found in the original document")
|
||||
except (DigitalSignatureError, EncryptedPdfError):
|
||||
self.log.warning(
|
||||
"This file is encrypted and/or signed, OCR is impossible. Using "
|
||||
"any text present in the original file.",
|
||||
)
|
||||
if original_has_text:
|
||||
self.text = text_original
|
||||
except SubprocessOutputError as e:
|
||||
if "Ghostscript PDF/A rendering" in str(e):
|
||||
self.log.warning(
|
||||
"Ghostscript PDF/A rendering failed, consider setting "
|
||||
"PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
|
||||
)
|
||||
|
||||
raise ParseError(
|
||||
f"SubprocessOutputError: {e!s}. See logs for more information.",
|
||||
) from e
|
||||
except (NoTextFoundException, InputFileError) as e:
|
||||
self.log.warning(
|
||||
f"Encountered an error while running OCR: {e!s}. "
|
||||
f"Attempting force OCR to get the text.",
|
||||
)
|
||||
|
||||
archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf"
|
||||
sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt"
|
||||
|
||||
# Attempt to run OCR with safe settings.
|
||||
|
||||
args = self.construct_ocrmypdf_parameters(
|
||||
document_path,
|
||||
mime_type,
|
||||
archive_path_fallback,
|
||||
sidecar_file_fallback,
|
||||
safe_fallback=True,
|
||||
)
|
||||
|
||||
try:
|
||||
self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
|
||||
ocrmypdf.ocr(**args)
|
||||
|
||||
# Don't return the archived file here, since this file
|
||||
# is bigger and blurry due to --force-ocr.
|
||||
|
||||
self.text = self.extract_text(
|
||||
sidecar_file_fallback,
|
||||
archive_path_fallback,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
# If this fails, we have a serious issue at hand.
|
||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||
|
||||
except Exception as e:
|
||||
# Anything else is probably serious.
|
||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||
|
||||
# As a last resort, if we still don't have any text for any reason,
|
||||
# try to extract the text from the original document.
|
||||
if not self.text:
|
||||
if original_has_text:
|
||||
self.text = text_original
|
||||
else:
|
||||
self.log.warning(
|
||||
f"No text was found in {document_path}, the content will be empty.",
|
||||
)
|
||||
self.text = ""
|
||||
|
||||
|
||||
def post_process_text(text):
|
||||
if not text:
|
||||
return None
|
||||
|
||||
collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
|
||||
no_leading_whitespace = re.sub(r"([\n\r]+)([^\S\n\r]+)", "\\1", collapsed_spaces)
|
||||
no_trailing_whitespace = re.sub(r"([^\S\n\r]+)$", "", no_leading_whitespace)
|
||||
|
||||
# TODO: this needs a rework
|
||||
# replace \0 prevents issues with saving to postgres.
|
||||
# text may contain \0 when this character is present in PDF files.
|
||||
return no_trailing_whitespace.strip().replace("\0", " ")
|
||||
37
docker/paperless/plugins/readme.md
Normal file
37
docker/paperless/plugins/readme.md
Normal file
@ -0,0 +1,37 @@
|
||||
## 登陆
|
||||
### 用户名: admin
|
||||
### 密码: paperless
|
||||
|
||||
## 需要指定用户名
|
||||
### 配置好 USERMAP_UID和USERMAP_GID,否则可能无法执行主机映射进去的脚本。
|
||||
### 详见 https://docs.paperless-ngx.com/configuration/#USERMAP_UID
|
||||
|
||||
## 自定义的文件名解析脚本
|
||||
```Bash
|
||||
# 文档
|
||||
https://docs.paperless-ngx.com/advanced_usage/#file-name-handling
|
||||
https://docs.paperless-ngx.com/configuration/#PAPERLESS_POST_CONSUME_SCRIPT
|
||||
|
||||
# 配置
|
||||
environment:
|
||||
PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
|
||||
```
|
||||
|
||||
|
||||
## 源码修改,可以通过在容器里执行 docker_patch.sh 脚本来完成
|
||||
### 对于无法简单读取pdf内容的文档,paperless会启动OCR扫描,且复杂情况下会执行两遍,非常慢而且消耗资源。只能通过修改源码解决:
|
||||
```Bash
|
||||
# /usr/src/paperless/src/paperless_tesseract/parsers.py :
|
||||
|
||||
# force skip ocr process.
|
||||
if not original_has_text:
|
||||
original_has_text = True
|
||||
text_original = "this is default content, as we skipped ocr process..."
|
||||
self.log.warning("Cannot read text from Document, use default message.")
|
||||
|
||||
if skip_archive_for_text and original_has_text:
|
||||
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
|
||||
self.text = text_original
|
||||
return
|
||||
|
||||
```
|
||||
@ -1,64 +0,0 @@
|
||||
|
||||
|
||||
-------------------------------------------------------|
|
||||
------------------- paperless 无纸化pdf管理 ------------|
|
||||
-------------------------------------------------------|
|
||||
|
||||
## 最好不要用命令,使用docker-compose.yml来创建,需要指定后端使用的数据库,以及redis!
|
||||
docker run -itd \
|
||||
--name paperless \
|
||||
--network devops \
|
||||
--platform linux/x86_64 \
|
||||
-e TZ="Asia/Shanghai" \
|
||||
-v /etc/localtime:/etc/localtime:ro \
|
||||
-v "$(pwd)/dockers/paperless/pdfs:/usr/src/paperless/data" \
|
||||
-v "$(pwd)/dockers/paperless/db:/usr/src/paperless/db" \
|
||||
-e USERMAP_UID=1000 -e USERMAP_GID=1000 \
|
||||
-p 8000:8000 \
|
||||
ghcr.io/paperless-ngx/paperless-ngx
|
||||
|
||||
|
||||
# 容器创建好之后,要手动设置密码(二选一操作,目前设置的 admin / admin)
|
||||
docker compose run --rm webserver createsuperuser
|
||||
python3 manage.py createsuperuser
|
||||
|
||||
# 已有文档,放在指定目录下,等系统自动加载(或者手工启动)
|
||||
cd /path/to/paperless/src/
|
||||
python3 manage.py document_consumer
|
||||
|
||||
# 自动解析文件名
|
||||
https://docs.paperless-ngx.com/advanced_usage/#file-name-handling
|
||||
https://docs.paperless-ngx.com/configuration/#PAPERLESS_POST_CONSUME_SCRIPT
|
||||
|
||||
environment:
|
||||
PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
|
||||
|
||||
|
||||
paperless 默认不会删除重复的文件,这会导致如果重复添加,会不停扫描,加载,报错。没找到配置,直接修改源码解决:
|
||||
|
||||
/usr/src/paperless/src/documents/consumer.py
|
||||
|
||||
def pre_check_duplicate(self):
|
||||
"""
|
||||
Using the MD5 of the file, check this exact file doesn't already exist
|
||||
"""
|
||||
with open(self.input_doc.original_file, "rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
existing_doc = Document.global_objects.filter(
|
||||
Q(checksum=checksum) | Q(archive_checksum=checksum),
|
||||
)
|
||||
if existing_doc.exists():
|
||||
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
|
||||
log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})."
|
||||
|
||||
if existing_doc.first().deleted_at is not None:
|
||||
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
|
||||
log_msg += " Note: existing document is in the trash."
|
||||
|
||||
## 修改这里,让它删除重复文件。
|
||||
if settings.CONSUMER_DELETE_DUPLICATES or True:
|
||||
os.unlink(self.input_doc.original_file)
|
||||
self._fail(
|
||||
msg,
|
||||
log_msg,
|
||||
)
|
||||
281
docker/stash/scripts/batch_format_filename.py
Normal file
281
docker/stash/scripts/batch_format_filename.py
Normal file
@ -0,0 +1,281 @@
|
||||
import sqlite3
|
||||
import os
|
||||
import logging
|
||||
import json
|
||||
from datetime import datetime
|
||||
import argparse
|
||||
import re
|
||||
|
||||
res_dir = './result'
|
||||
os.makedirs(res_dir, exist_ok=True)
|
||||
|
||||
# 配置日志
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.FileHandler(f'{res_dir}/rename_files.log'),
|
||||
logging.StreamHandler()
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def preload_folders(conn, prefix):
|
||||
"""预加载所有文件夹路径到字典(folder_id -> path)"""
|
||||
sqlstr = "SELECT id, path FROM folders where 1=1 "
|
||||
if prefix and prefix.strip():
|
||||
sqlstr += f" and path like '%{prefix}%' "
|
||||
try:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(sqlstr)
|
||||
return {row[0]: row[1] for row in cursor.fetchall()}
|
||||
except sqlite3.Error as e:
|
||||
logger.error(f"预加载文件夹信息失败: {str(e)}")
|
||||
raise
|
||||
|
||||
def preload_studios(conn):
|
||||
"""预加载所有工作室名称到字典(studio_id -> name)"""
|
||||
try:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT id, name FROM studios")
|
||||
studios = {row[0]: row[1] for row in cursor.fetchall()}
|
||||
# 补充默认值(未找到的工作室)
|
||||
studios[None] = "UnknownStudio"
|
||||
return studios
|
||||
except sqlite3.Error as e:
|
||||
logger.error(f"预加载工作室信息失败: {str(e)}")
|
||||
raise
|
||||
|
||||
def get_performers(conn, scene_id):
|
||||
"""获取场景对应的演员列表(按字母序排序,逗号分隔)"""
|
||||
try:
|
||||
cursor = conn.cursor()
|
||||
query = """
|
||||
SELECT p.name
|
||||
FROM performers p
|
||||
JOIN performers_scenes ps ON p.id = ps.performer_id
|
||||
WHERE ps.scene_id = ?
|
||||
ORDER BY p.name
|
||||
"""
|
||||
cursor.execute(query, (scene_id,))
|
||||
results = cursor.fetchall()
|
||||
return ','.join([row[0] for row in results]) or "UnknownPerformers"
|
||||
except sqlite3.Error as e:
|
||||
logger.error(f"获取演员信息失败 (scene_id={scene_id}): {str(e)}")
|
||||
raise
|
||||
|
||||
def parse_date(date_str):
|
||||
"""解析日期为yyyy.mm.dd格式"""
|
||||
if not date_str:
|
||||
return "0000.00.00"
|
||||
|
||||
date_formats = [
|
||||
"%Y-%m-%d", "%Y/%m/%d", "%d-%m-%Y", "%d/%m/%Y",
|
||||
"%Y%m%d", "%m-%d-%Y", "%m/%d/%Y"
|
||||
]
|
||||
|
||||
for fmt in date_formats:
|
||||
try:
|
||||
return datetime.strptime(date_str, fmt).strftime("%Y.%m.%d")
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
logger.warning(f"无法解析日期格式: {date_str},使用默认值")
|
||||
return "0000.00.00"
|
||||
|
||||
def get_file_extension(basename):
|
||||
"""获取文件扩展名"""
|
||||
if '.' in basename:
|
||||
return basename.split('.')[-1].lower()
|
||||
return ''
|
||||
|
||||
def sanitize_filename(name):
|
||||
"""清理文件名中的非法字符"""
|
||||
invalid_chars = '/\\:*?"<>|'
|
||||
for char in invalid_chars:
|
||||
name = name.replace(char, '-')
|
||||
return name
|
||||
|
||||
def process_scene_files(conn, mode, prefix, rename_style):
|
||||
"""处理所有场景文件映射关系(优化版:合并查询+预加载缓存)"""
|
||||
results = []
|
||||
try:
|
||||
# 1. 预加载文件夹和工作室到内存字典(仅2次SQL查询)
|
||||
folders = preload_folders(conn, prefix)
|
||||
studios = preload_studios(conn)
|
||||
logger.info(f"预加载完成 - 文件夹: {len(folders)} 个, 工作室: {len(studios)} 个")
|
||||
|
||||
# 2. 一次性查询所有关联数据(1次SQL查询替代多次)
|
||||
cursor = conn.cursor()
|
||||
query = """
|
||||
SELECT
|
||||
sf.scene_id, sf.file_id,
|
||||
f.id AS file_id, f.basename, f.parent_folder_id,
|
||||
s.title, s.date as release_date, s.studio_id, s.code
|
||||
FROM scenes_files sf
|
||||
LEFT JOIN files f ON sf.file_id = f.id
|
||||
LEFT JOIN scenes s ON sf.scene_id = s.id
|
||||
"""
|
||||
cursor.execute(query)
|
||||
mappings = cursor.fetchall()
|
||||
logger.info(f"共找到 {len(mappings)} 条场景-文件映射记录")
|
||||
|
||||
for idx, row in enumerate(mappings, 1):
|
||||
try:
|
||||
# 解析合并查询的结果
|
||||
scene_id = row[0]
|
||||
file_id = row[1]
|
||||
file_info = {
|
||||
'id': row[2],
|
||||
'basename': row[3],
|
||||
'parent_folder_id': row[4]
|
||||
}
|
||||
scene_info = {
|
||||
'title': row[5],
|
||||
'release_date': row[6],
|
||||
'studio_id': row[7],
|
||||
'code': row[8]
|
||||
}
|
||||
|
||||
# 校验必要数据
|
||||
if not file_id or not file_info['id'] or not file_info['basename'] or not file_info['parent_folder_id']:
|
||||
logger.debug(f"文件ID信息不完整 (scene_id={scene_id}, file_id={file_id}),跳过")
|
||||
continue
|
||||
if not scene_id or not scene_info['title'] or not scene_info['release_date'] or not scene_info['studio_id']:
|
||||
logger.debug(f"场景信息不完整 (scene_id={scene_id}, file_id={file_id}),跳过")
|
||||
continue
|
||||
|
||||
# 3. 从内存缓存获取文件夹路径和工作室名称(无SQL查询)
|
||||
folder_path = folders.get(file_info['parent_folder_id'])
|
||||
if not folder_path:
|
||||
logger.debug(f"文件夹ID不存在 (folder_id={file_info['parent_folder_id']}),跳过")
|
||||
continue
|
||||
studio_name = studios.get(scene_info['studio_id'])
|
||||
if not studio_name:
|
||||
logger.debug(f"工作室ID不存在 (studio_id={scene_info['studio_id']}),跳过")
|
||||
continue
|
||||
|
||||
# 4. 获取演员信息(仍需单独查询,因多对多关联需排序)
|
||||
performers = get_performers(conn, scene_id)
|
||||
|
||||
# 5. 构建新文件名
|
||||
original_basename = file_info['basename'] or "unknown_file"
|
||||
ext = get_file_extension(original_basename)
|
||||
release_date = parse_date(scene_info['release_date'])
|
||||
title = scene_info['title'] or "Untitled"
|
||||
|
||||
# 清理特殊字符
|
||||
sanitized_studio = sanitize_filename(studio_name)
|
||||
sanitized_performers = sanitize_filename(performers)[0:100] # 限制长度避免过长
|
||||
sanitized_title = sanitize_filename(title)[0:100] # 限制长度避免过长
|
||||
if scene_info.get('code'):
|
||||
sanitized_title = f"{sanitized_title} ({scene_info['code']})"
|
||||
# 去掉sanitized_studio的空格,以及' " 等特殊符号
|
||||
sanitized_studio = re.sub(r'[\'"\s\-_]+', '', sanitized_studio)
|
||||
|
||||
# 拼接新文件名
|
||||
if ext:
|
||||
new_basename = f"{sanitized_studio}.{release_date} {sanitized_performers} - {sanitized_title}.{ext}"
|
||||
else:
|
||||
new_basename = f"{sanitized_studio}.{release_date} {sanitized_performers} - {sanitized_title}"
|
||||
|
||||
# 简化命名规则,适用于日本影片
|
||||
if rename_style == 'simple':
|
||||
if scene_info.get('code'):
|
||||
# code 转换成大写
|
||||
new_code = scene_info['code'].upper()
|
||||
new_basename = f"{new_code}_{release_date}.{ext}" if ext else f"{new_code}_{release_date}"
|
||||
|
||||
if len(new_basename) > 254:
|
||||
logger.warning(f"生成的文件名过长,跳过 (file_id={file_id}): {new_basename}")
|
||||
continue
|
||||
|
||||
# 构建完整路径
|
||||
original_path = os.path.join(folder_path, original_basename)
|
||||
new_path = os.path.join(folder_path, new_basename)
|
||||
|
||||
if not os.path.exists(original_path):
|
||||
logger.warning(f"文件不存在,跳过: {original_path}")
|
||||
continue
|
||||
if os.path.exists(new_path):
|
||||
logger.warning(f"目标文件已存在,跳过: {new_path}")
|
||||
continue
|
||||
if original_path == new_path: # 文件名未变化
|
||||
logger.info(f"文件名未变化,跳过 (file_id={file_id}): {original_path}")
|
||||
continue
|
||||
|
||||
# 记录结果
|
||||
result = {
|
||||
'file_id': file_id,
|
||||
'scene_id': scene_id,
|
||||
'original_name': original_path,
|
||||
'dest_name': new_path
|
||||
}
|
||||
results.append(result)
|
||||
logger.info(f"处理第 {idx}/{len(mappings)} 条: {original_path} -> {new_path}")
|
||||
|
||||
# 运行模式:执行重命名和数据库更新
|
||||
if mode == 'run':
|
||||
if not os.path.exists(original_path):
|
||||
logger.warning(f"文件不存在,跳过: {original_path}")
|
||||
continue
|
||||
if os.path.exists(new_path):
|
||||
logger.warning(f"目标文件已存在,跳过: {new_path}")
|
||||
continue
|
||||
if original_path != new_path:
|
||||
os.rename(original_path, new_path)
|
||||
#cursor.execute(
|
||||
# "UPDATE files SET basename = ? WHERE id = ?",
|
||||
# (new_basename, file_info['id'])
|
||||
#)
|
||||
#conn.commit()
|
||||
logger.info(f"已更新文件 (file_id={file_info['id']})")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"处理记录失败 (scene_id={scene_id}, file_id={file_id}): {str(e)}", exc_info=True)
|
||||
if mode == 'run':
|
||||
conn.rollback()
|
||||
continue
|
||||
|
||||
# 保存结果
|
||||
with open(f'{res_dir}/rename_results.json', 'w', encoding='utf-8') as f:
|
||||
json.dump(results, f, ensure_ascii=False, indent=2)
|
||||
logger.info(f"处理完成,结果已保存到 rename_results.json")
|
||||
return results
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logger.error(f"数据库操作失败: {str(e)}", exc_info=True)
|
||||
if mode == 'run':
|
||||
conn.rollback()
|
||||
raise
|
||||
finally:
|
||||
if mode == 'run':
|
||||
conn.commit()
|
||||
|
||||
def main():
    """CLI entry point for the optimized renamer: parse args, connect, process."""
    arg_parser = argparse.ArgumentParser(description='电影文件重命名工具(优化版)')
    arg_parser.add_argument('--mode', choices=['check', 'run'], default='check',
                            help='运行模式: check(检查) 或 run(执行)')
    arg_parser.add_argument('--db', default='movies.db', help='SQLite数据库文件路径')
    arg_parser.add_argument('--prefix', default='', help='目录前缀,用来过滤文件路径')
    arg_parser.add_argument('--rename_style', choices=['standard', 'simple'], default='standard', help='文件命名规则,标准格式和简化格式')
    opts = arg_parser.parse_args()

    # Refuse to run against a database file that does not exist.
    if not os.path.exists(opts.db):
        logger.error(f"数据库文件不存在: {opts.db}")
        return

    connection = None
    try:
        connection = sqlite3.connect(opts.db)
        logger.info(f"成功连接到数据库: {opts.db}")
        process_scene_files(connection, opts.mode, opts.prefix, opts.rename_style)
    except sqlite3.Error as exc:
        logger.error(f"数据库连接失败: {str(exc)}", exc_info=True)
    finally:
        # Always release the connection, even after a failure.
        if connection:
            connection.close()
            logger.info("数据库连接已关闭")


if __name__ == "__main__":
    main()
|
||||
288
docker/stash/scripts/format_filename.py
Normal file
288
docker/stash/scripts/format_filename.py
Normal file
@ -0,0 +1,288 @@
|
||||
import sqlite3
import os
import logging
import json
from datetime import datetime
import argparse
import re

# Ensure the log directory exists BEFORE logging is configured: the
# FileHandler below opens ./result/rename_files.log at import time, but the
# directory was previously only created inside main() (after this ran), so a
# fresh checkout crashed with FileNotFoundError on import.
os.makedirs('./result', exist_ok=True)

# Log to both a file under ./result and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('./result/rename_files.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
|
||||
|
||||
def get_performers(conn, scene_id):
    """Return the scene's performer names, name-sorted and comma-joined ('' if none)."""
    try:
        # Single JOIN fetches everything we need in one round trip.
        query = """
            SELECT p.name
            FROM performers p
            JOIN performers_scenes ps ON p.id = ps.performer_id
            WHERE ps.scene_id = ?
            ORDER BY p.name
        """
        rows = conn.cursor().execute(query, (scene_id,)).fetchall()
        return ','.join(name for (name,) in rows)
    except sqlite3.Error as e:
        logger.error(f"获取演员信息失败 (scene_id={scene_id}): {str(e)}")
        raise
|
||||
|
||||
def get_file_info(conn, file_id):
    """Look up a file row; return {'id', 'basename', 'parent_folder_id'}.

    Raises ValueError when no row matches, sqlite3.Error on DB failure.
    """
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT id, basename, parent_folder_id
            FROM files
            WHERE id = ?
        """, (file_id,))
        row = cur.fetchone()
        if row is None:
            raise ValueError(f"未找到文件信息 (file_id={file_id})")
        return dict(zip(('id', 'basename', 'parent_folder_id'), row))
    except sqlite3.Error as e:
        logger.error(f"获取文件信息失败 (file_id={file_id}): {str(e)}")
        raise
|
||||
|
||||
def get_folder_path(conn, folder_id):
    """Return the filesystem path stored for a folder id.

    Raises ValueError for an unknown id, sqlite3.Error on DB failure.
    """
    try:
        row = conn.cursor().execute(
            "SELECT path FROM folders WHERE id = ?", (folder_id,)
        ).fetchone()
        if row is None:
            raise ValueError(f"未找到文件夹路径 (folder_id={folder_id})")
        return row[0]
    except sqlite3.Error as e:
        logger.error(f"获取文件夹路径失败 (folder_id={folder_id}): {str(e)}")
        raise
|
||||
|
||||
def get_scene_info(conn, scene_id):
    """Fetch a scene's title, release date and studio id as a dict.

    Raises ValueError for an unknown scene, sqlite3.Error on DB failure.
    """
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT title, date as release_date, studio_id
            FROM scenes
            WHERE id = ?
        """, (scene_id,))
        row = cur.fetchone()
        if row is None:
            raise ValueError(f"未找到场景信息 (scene_id={scene_id})")
        return dict(zip(('title', 'release_date', 'studio_id'), row))
    except sqlite3.Error as e:
        logger.error(f"获取场景信息失败 (scene_id={scene_id}): {str(e)}")
        raise
|
||||
|
||||
def get_studio_name(conn, studio_id):
    """Return the studio's name, or the fallback "UnknownStudio" when absent."""
    try:
        row = conn.cursor().execute(
            "SELECT name FROM studios WHERE id = ?", (studio_id,)
        ).fetchone()
        if row is not None:
            return row[0]
        # Missing studio is tolerated: warn and fall back to a placeholder.
        logger.warning(f"未找到工作室信息 (studio_id={studio_id}),使用默认名称")
        return "UnknownStudio"
    except sqlite3.Error as e:
        logger.error(f"获取工作室信息失败 (studio_id={studio_id}): {str(e)}")
        raise
|
||||
|
||||
def parse_date(date_str):
    """Normalize a date string to 'yyyy.mm.dd'; unparseable/empty -> '0000.00.00'."""
    if not date_str:
        return "0000.00.00"

    # Common formats, tried in order; first match wins.
    known_formats = (
        "%Y-%m-%d", "%Y/%m/%d", "%d-%m-%Y", "%d/%m/%Y",
        "%Y%m%d", "%m-%d-%Y", "%m/%d/%Y",
    )
    for fmt in known_formats:
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y.%m.%d")
        except ValueError:
            pass

    logger.warning(f"无法解析日期格式: {date_str},使用默认值")
    return "0000.00.00"
|
||||
|
||||
def get_file_extension(basename):
    """Return the lowercase extension of ``basename`` without the dot, or ''.

    Uses os.path.splitext instead of the old ``basename.split('.')[-1]``:
    the old form wrongly reported dotfiles (e.g. '.bashrc') as having the
    extension 'bashrc'. For ordinary names ('a.MP4' -> 'mp4', 'a.tar.gz'
    -> 'gz', 'README' -> '') behavior is unchanged.
    """
    return os.path.splitext(basename)[1][1:].lower()
|
||||
|
||||
def sanitize_filename(name):
    """Replace filesystem-reserved characters in ``name`` with '-'."""
    # One translation table beats a per-character replace loop.
    reserved = '/\\:*?"<>|'
    return name.translate(str.maketrans({ch: '-' for ch in reserved}))
|
||||
|
||||
def process_scene_files(conn, mode, prefix):
    """Walk every scene<->file mapping and rename files to the standard pattern.

    For each (scene_id, file_id) pair in scenes_files, builds a new basename
    "Studio - yyyy.mm.dd - Performers - Title[.ext]" from the related DB rows.
    In 'check' mode only logs/records the planned renames; in 'run' mode also
    renames the file on disk and updates files.basename. All planned renames
    are written to ./result/rename_results.json and returned as a list.

    Args:
        conn:   open sqlite3 connection to the stash database.
        mode:   'check' (dry run) or 'run' (rename + update DB).
        prefix: NOTE(review): accepted but never used in this body — presumably
                meant to filter by folder path prefix; TODO confirm.

    Returns:
        list of dicts with file_id, scene_id, original_name, dest_name.
    """
    results = []
    try:
        cursor = conn.cursor()
        # Fetch every scene-to-file mapping up front.
        cursor.execute("SELECT scene_id, file_id FROM scenes_files")
        mappings = cursor.fetchall()
        logger.debug(f"共找到 {len(mappings)} 条场景-文件映射记录")

        for idx, (scene_id, file_id) in enumerate(mappings, 1):
            logger.debug(f"处理第 {idx}/{len(mappings)} 条记录 (scene_id={scene_id}, file_id={file_id})")

            try:
                # 1. File row: original basename and containing folder id.
                file_info = get_file_info(conn, file_id)
                original_basename = file_info['basename']
                parent_folder_id = file_info['parent_folder_id']

                # 2. Resolve the folder id to an on-disk path.
                folder_path = get_folder_path(conn, parent_folder_id)

                # 3. Performers; scenes without any are skipped entirely.
                performers = get_performers(conn, scene_id)
                if not performers:
                    performers = "UnknownPerformers"
                    logger.warning(f"场景 {scene_id} 未找到演员信息,跳过")
                    continue

                # 4. Scene + studio info; incomplete scenes are skipped.
                scene_info = get_scene_info(conn, scene_id)
                if not scene_info['title'] or not scene_info['release_date'] or not scene_info['studio_id']:
                    logger.warning(f"场景 {scene_id} 信息不完整,跳过")
                    continue
                title = scene_info['title'] or "Untitled"
                release_date = parse_date(scene_info['release_date'])
                studio_name = get_studio_name(conn, scene_info['studio_id'])

                # 5. Assemble the new basename from the sanitized parts.
                ext = get_file_extension(original_basename)
                sanitized_studio = sanitize_filename(studio_name)
                sanitized_performers = sanitize_filename(performers)[0:100]  # cap length to avoid overly long names
                sanitized_title = sanitize_filename(title)[0:100]  # cap length to avoid overly long names

                if ext:
                    new_basename = f"{sanitized_studio} - {release_date} - {sanitized_performers} - {sanitized_title}.{ext}"
                else:
                    new_basename = f"{sanitized_studio} - {release_date} - {sanitized_performers} - {sanitized_title}"

                # Keep below the common 255-byte filename limit.
                if len(new_basename) > 254:
                    logger.warning(f"生成的文件名过长,跳过 (file_id={file_id}): {new_basename}")
                    continue

                # Build the full source and destination paths.
                original_path = os.path.join(folder_path, original_basename)
                new_path = os.path.join(folder_path, new_basename)

                # Record the planned rename regardless of mode.
                result = {
                    'file_id': file_id,
                    'scene_id': scene_id,
                    'original_name': original_path,
                    'dest_name': new_path
                }
                results.append(result)

                # Dry-run visibility.
                logger.info(f"准备重命名: {original_path} -> {new_path}")

                # Only 'run' mode touches the filesystem and the database.
                if mode == 'run':
                    # Skip entries whose source file is gone.
                    if not os.path.exists(original_path):
                        logger.warning(f"文件不存在,跳过: {original_path}")
                        continue

                    # Rename on disk, then keep the DB row in sync.
                    if original_path != new_path:
                        os.rename(original_path, new_path)
                        logger.info(f"已重命名: {original_path} -> {new_path}")

                        # Commit per file so a later failure doesn't lose earlier renames.
                        cursor.execute(
                            "UPDATE files SET basename = ? WHERE id = ?",
                            (new_basename, file_id)
                        )
                        conn.commit()
                        logger.info(f"已更新数据库记录 (file_id={file_id})")

            except Exception as e:
                # One bad record must not abort the whole batch.
                logger.error(f"处理记录失败 (scene_id={scene_id}, file_id={file_id}): {str(e)}", exc_info=True)
                if mode == 'run':
                    conn.rollback()
                continue

        # Persist the full plan/result list for later inspection.
        with open('./result/rename_results.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        logger.info(f"处理完成,结果已保存到 rename_results.json")

        return results

    except sqlite3.Error as e:
        logger.error(f"数据库操作失败: {str(e)}", exc_info=True)
        if mode == 'run':
            conn.rollback()
        raise
    finally:
        # Flush any remaining writes when we actually modified the DB.
        if mode == 'run':
            conn.commit()
|
||||
|
||||
def main():
    """CLI entry point: parse args, open the DB, run the rename pass."""
    arg_parser = argparse.ArgumentParser(description='电影文件重命名工具')
    arg_parser.add_argument('--mode', choices=['check', 'run'], default='check',
                            help='运行模式: check(检查) 或 run(执行)')
    arg_parser.add_argument('--db', default='movies.db', help='SQLite数据库文件路径')
    arg_parser.add_argument('--prefix', default='', help='目录的前缀,用来匹配')
    opts = arg_parser.parse_args()

    # Refuse to run against a database file that does not exist.
    if not os.path.exists(opts.db):
        logger.error(f"数据库文件不存在: {opts.db}")
        return

    # Results (JSON + log) are written under ./result.
    os.makedirs('./result', exist_ok=True)

    connection = None
    try:
        connection = sqlite3.connect(opts.db)
        connection.row_factory = sqlite3.Row  # allow access by column name
        logger.info(f"成功连接到数据库: {opts.db}")
        process_scene_files(connection, opts.mode, opts.prefix)
    except sqlite3.Error as exc:
        logger.error(f"数据库连接失败: {str(exc)}", exc_info=True)
    finally:
        # Always release the connection, even after a failure.
        if connection:
            connection.close()
            logger.info("数据库连接已关闭")


if __name__ == "__main__":
    main()
|
||||
110
docker/stash/scripts/scrapers/JavBus/JavBus.yml
Normal file
110
docker/stash/scripts/scrapers/JavBus/JavBus.yml
Normal file
@ -0,0 +1,110 @@
|
||||
name: Javbus
|
||||
sceneByFragment:
|
||||
action: scrapeXPath
|
||||
queryURL: https://www.javbus.com/{filename}
|
||||
queryURLReplace:
|
||||
filename:
|
||||
- regex: -JG\d
|
||||
with: ""
|
||||
- regex: (.*[^a-zA-Z0-9])*([a-zA-Z-]+\d+)(.+)
|
||||
with: $2
|
||||
scraper: sceneScraper
|
||||
sceneByURL:
|
||||
- action: scrapeXPath
|
||||
url:
|
||||
- https://www.javbus.com
|
||||
- https://www.seejav.bid
|
||||
- https://www.cdnbus.lol
|
||||
- https://www.dmmbus.lol
|
||||
- https://www.seedmm.cfd
|
||||
scraper: sceneScraper
|
||||
sceneByName:
|
||||
action: scrapeXPath
|
||||
queryURL: https://www.javbus.com/search/{}&type=&parent=ce
|
||||
scraper: sceneSearch
|
||||
sceneByQueryFragment:
|
||||
action: scrapeXPath
|
||||
queryURL: "{url}"
|
||||
scraper: sceneScraper
|
||||
|
||||
performerByURL:
|
||||
- action: scrapeXPath
|
||||
url:
|
||||
- https://www.javbus.com
|
||||
- https://www.seejav.bid
|
||||
- https://www.cdnbus.lol
|
||||
- https://www.dmmbus.lol
|
||||
- https://www.seedmm.cfd
|
||||
scraper: performerScraper
|
||||
performerByName:
|
||||
action: scrapeXPath
|
||||
queryURL: https://www.javbus.com/searchstar/{}&type=&parent=ce
|
||||
scraper: performerSearch
|
||||
|
||||
xPathScrapers:
|
||||
performerSearch:
|
||||
performer:
|
||||
Name: //span[@class="mleft"]
|
||||
URLs: //*[@id="waterfall"]/div/a/@href
|
||||
performerScraper:
|
||||
performer:
|
||||
Name: //*[@id="waterfall"]/div[1]/div/div[2]/span
|
||||
Birthdate:
|
||||
selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '生日')]
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^(.*? ){1}
|
||||
with:
|
||||
Height:
|
||||
selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '身高')]
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^(.*? ){1}
|
||||
with:
|
||||
# Measurements: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '胸圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '腰圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '臀圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '罩杯')]
|
||||
Image:
|
||||
selector: //*[@id="waterfall"]/div[1]/div/div[1]/img/@src
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^
|
||||
with: https://www.javbus.com
|
||||
|
||||
sceneSearch:
|
||||
scene:
|
||||
Title: //div[@class="photo-info"]/span
|
||||
URL: //*[@id="waterfall"]/div/a/@href
|
||||
sceneScraper:
|
||||
scene:
|
||||
Title:
|
||||
selector: //div[@class="col-md-3 info"]//span[contains(text(), '識別碼')]/../span[2]/text()
|
||||
URL:
|
||||
selector: /html/head/link[@hreflang="zh"]/@href
|
||||
Date:
|
||||
selector: //div[@class="col-md-3 info"]//span[contains(text(), '發行日期')]/../text()
|
||||
Details:
|
||||
selector: //div[@class="container"]/h3/text()
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^(.*? ){1}
|
||||
with:
|
||||
Tags:
|
||||
Name: //div[@class="col-md-3 info"]//span[@class="genre"]/label/a/text()
|
||||
Performers:
|
||||
Name: //div[@class="star-name"]/a
|
||||
Director: //div[@id='video_director']/table/tbody/tr/td[@class="text"]/span/a/text()
|
||||
Image:
|
||||
selector: //div[@class="row movie"]/div[@class="col-md-9 screencap"]/a[@class="bigImage"]/img/@src
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^
|
||||
with: https://www.javbus.com
|
||||
Studio:
|
||||
Name: //div[@class="col-md-3 info"]//span[contains(text(), '發行商')]/../a/text()
|
||||
|
||||
driver:
|
||||
headers:
|
||||
- Key: User-Agent
|
||||
Value: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36
|
||||
- Key: Accept-Language
|
||||
Value: zh-cn
|
||||
# Last Updated September 17, 2025
|
||||
9
docker/stash/scripts/scrapers/JavBus/manifest
Normal file
9
docker/stash/scripts/scrapers/JavBus/manifest
Normal file
@ -0,0 +1,9 @@
|
||||
id: JavBus
|
||||
name: Javbus
|
||||
metadata: {}
|
||||
version: 5ee93a34
|
||||
date: "2025-09-17 10:48:13"
|
||||
requires: []
|
||||
source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
|
||||
files:
|
||||
- JavBus.yml
|
||||
111
docker/stash/scripts/scrapers/JavBus_en/JavBus_en.yml
Normal file
111
docker/stash/scripts/scrapers/JavBus_en/JavBus_en.yml
Normal file
@ -0,0 +1,111 @@
|
||||
name: Javbus_en
|
||||
sceneByFragment:
|
||||
action: scrapeXPath
|
||||
queryURL: https://www.javbus.com/en/{filename}
|
||||
queryURLReplace:
|
||||
filename:
|
||||
- regex: -JG\d
|
||||
with: ""
|
||||
- regex: (.*[^a-zA-Z0-9])*([a-zA-Z-]+\d+)(.+)
|
||||
with: $2
|
||||
scraper: sceneScraper
|
||||
sceneByURL:
|
||||
- action: scrapeXPath
|
||||
url:
|
||||
- https://www.javbus.com/en
|
||||
- https://www.seejav.bid
|
||||
- https://www.cdnbus.lol
|
||||
- https://www.dmmbus.lol
|
||||
- https://www.seedmm.cfd
|
||||
scraper: sceneScraper
|
||||
sceneByName:
|
||||
action: scrapeXPath
|
||||
queryURL: https://www.javbus.com/en/search/{}&type=&parent=ce
|
||||
scraper: sceneSearch
|
||||
sceneByQueryFragment:
|
||||
action: scrapeXPath
|
||||
queryURL: "{url}"
|
||||
scraper: sceneScraper
|
||||
|
||||
performerByURL:
|
||||
- action: scrapeXPath
|
||||
url:
|
||||
- https://www.javbus.com/en
|
||||
- https://www.seejav.bid
|
||||
- https://www.cdnbus.lol
|
||||
- https://www.dmmbus.lol
|
||||
- https://www.seedmm.cfd
|
||||
scraper: performerScraper
|
||||
performerByName:
|
||||
action: scrapeXPath
|
||||
queryURL: https://www.javbus.com/en/searchstar/{}&type=&parent=ce
|
||||
scraper: performerSearch
|
||||
|
||||
xPathScrapers:
|
||||
performerSearch:
|
||||
performer:
|
||||
Name: //span[@class="mleft"]
|
||||
URLs: //*[@id="waterfall"]/div/a/@href
|
||||
performerScraper:
|
||||
performer:
|
||||
Name: //*[@id="waterfall"]/div[1]/div/div[2]/span
|
||||
Birthdate:
|
||||
selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), 'D.O.B')]
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^(.*? ){1}
|
||||
with:
|
||||
Height:
|
||||
selector: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), 'Height')]
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^(.*? ){1}
|
||||
with:
|
||||
# Measurements: //*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '胸圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '腰圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '臀圍')]//*[@id="waterfall"]/div[1]/div/div[2]/p[contains(text(), '罩杯')]
|
||||
Image:
|
||||
selector: //*[@id="waterfall"]/div[1]/div/div[1]/img/@src
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^
|
||||
with: https://www.javbus.com/en
|
||||
|
||||
sceneSearch:
|
||||
scene:
|
||||
Title: //div[@class="photo-info"]/span
|
||||
URL: //*[@id="waterfall"]/div/a/@href
|
||||
sceneScraper:
|
||||
scene:
|
||||
Title:
|
||||
selector: //div[@class="col-md-3 info"]//span[contains(text(), 'ID')]/../span[2]/text()
|
||||
URL:
|
||||
selector: /html/head/link[@hreflang="zh"]/@href
|
||||
Date:
|
||||
selector: //div[@class="col-md-3 info"]//span[contains(normalize-space(text()), 'Release Date')]/../text()
|
||||
#selector: //div[@class="col-md-3 info"]//span[contains(text(), 'Release Date')]/../text()
|
||||
Details:
|
||||
selector: //div[@class="container"]/h3/text()
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^(.*? ){1}
|
||||
with:
|
||||
Tags:
|
||||
Name: //div[@class="col-md-3 info"]//span[@class="genre"]/label/a/text()
|
||||
Performers:
|
||||
Name: //div[@class="star-name"]/a
|
||||
Director: //div[@id='video_director']/table/tbody/tr/td[@class="text"]/span/a/text()
|
||||
Image:
|
||||
selector: //div[@class="row movie"]/div[@class="col-md-9 screencap"]/a[@class="bigImage"]/img/@src
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^
|
||||
with: https://www.javbus.com/
|
||||
Studio:
|
||||
Name: //div[@class="col-md-3 info"]//span[contains(text(), 'Label')]/../a/text()
|
||||
|
||||
driver:
|
||||
headers:
|
||||
- Key: User-Agent
|
||||
Value: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36
|
||||
- Key: Accept-Language
|
||||
Value: zh-cn,en-US
|
||||
# Last Updated September 17, 2025
|
||||
9
docker/stash/scripts/scrapers/JavBus_en/manifest
Normal file
9
docker/stash/scripts/scrapers/JavBus_en/manifest
Normal file
@ -0,0 +1,9 @@
|
||||
id: JavBus_en
|
||||
name: Javbus_en
|
||||
metadata: {}
|
||||
version: b4672ccf
|
||||
date: "2025-08-01 16:01:27"
|
||||
requires: []
|
||||
source_repository: https://stashapp.github.io/CommunityScrapers/stable/index.yml
|
||||
files:
|
||||
- JavBus_en.yml
|
||||
Reference in New Issue
Block a user