modify scripts

This commit is contained in:
2025-11-07 10:08:19 +08:00
parent 17356c79f9
commit 15c4f7b823
5 changed files with 115 additions and 105 deletions

View File

@ -9,7 +9,8 @@ import logging
# Paperless server connection settings
PAPERLESS_URL = "http://localhost:8000/api"
# NOTE(review): AUTH is assigned twice in this view; this looks like a diff hunk
# in which the next line is the pre-change version -- confirm against the real file.
# NOTE(review): credentials are hard-coded here; consider reading them from the
# environment instead.
AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth credentials
#AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth credentials -- use this one on the Mac
AUTH = HTTPBasicAuth("admin", "paperless") # Basic Auth credentials -- use this one on the NAS
# Logging configuration: INFO level, "timestamp [LEVEL] message" format
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

View File

@ -0,0 +1,40 @@
#!/bin/bash
# Move every *.pdf from $SRC into $DST, giving each copy the owner, group and
# mode configured below. Each file is recorded in $LOG; progress lines and the
# final summary are also printed to the terminal.
#
# Exit status: 0 when every file was moved, 1 when a directory is missing or
# at least one file could not be moved.
SRC="/volume1/docker/sharedata/stock_data/pdfs"
DST="/volume1/docker/sharedata/stock_data/em_reports_consume"
LOG="./paperless.log"   # relative path: resolved against the current directory
TARGET_UID=1000
TARGET_GID=1000

# Both directories must already exist; nothing is created here.
if [ ! -d "$SRC" ]; then
  echo "$(date '+%F %T') [ERROR] 源目录不存在: $SRC" | tee -a "$LOG"
  exit 1
fi
if [ ! -d "$DST" ]; then
  echo "$(date '+%F %T') [ERROR] 目标目录不存在: $DST" | tee -a "$LOG"
  exit 1
fi

COUNT=0    # files fully moved (copied and source removed)
FAILED=0   # files that could not be copied or whose source could not be removed

for f in "$SRC"/*.pdf; do
  # An unmatched glob stays literal, so skip anything that is not a regular file.
  [ -f "$f" ] || continue

  # "Move" = copy with the target owner/group/mode, then delete the source.
  if ! install -D -o "$TARGET_UID" -g "$TARGET_GID" -m 644 "$f" "$DST"; then
    echo "$(date '+%F %T') [FAIL] Failed: $f" >> "$LOG"
    FAILED=$((FAILED + 1))
    continue
  fi

  # Only report the file as moved once the source is really gone; otherwise
  # the next run would pick it up and copy it again.
  if ! rm -f "$f"; then
    echo "$(date '+%F %T') [FAIL] Copied but could not remove source: $f" >> "$LOG"
    FAILED=$((FAILED + 1))
    continue
  fi

  echo "$(date '+%F %T') [OK] Moved: $f" >> "$LOG"
  COUNT=$((COUNT + 1))

  # Every 100 moved files, print a progress line to the screen and the log.
  if (( COUNT % 100 == 0 )); then
    PROGRESS_MSG="$(date '+%F %T') [PROGRESS] 已移动 $COUNT 个文件"
    echo "$PROGRESS_MSG" | tee -a "$LOG"
  fi
done

echo "$(date '+%F %T') [INFO] 搬运完成,共移动 $COUNT 个文件" | tee -a "$LOG"

# Surface failures on the terminal and in the exit status instead of leaving
# them visible only in the log file.
if (( FAILED > 0 )); then
  echo "$(date '+%F %T') [WARN] $FAILED 个文件处理失败,详见 $LOG" | tee -a "$LOG" >&2
  exit 1
fi

View File

@ -0,0 +1,73 @@
#!/bin/bash
# File locations used by this script. Each SOURCE_* file is the copy kept
# under git_scripts/; each DEST_* file is the one it is compared against and
# copied over.

# parse_filename.py
SOURCE_PARSE_FILENAME='/usr/src/paperless/git_scripts/parse_filename.py'
DEST_PARSE_FILENAME='/usr/src/paperless/scripts/parse_filename.py'

# parsers.py
SOURCE_PARSERS='/usr/src/paperless/git_scripts/parsers.py'
DEST_PARSERS='/usr/src/paperless/src/paperless_tesseract/parsers.py'
# Verify that all four configured files (both sources and both destinations)
# exist as regular files.
# Outputs: one error line on stdout per missing file.
# Exits the script with status 1 if any file is missing; otherwise returns 0.
check_files_exist() {
  local path
  local any_missing=no

  for path in \
    "$SOURCE_PARSE_FILENAME" "$DEST_PARSE_FILENAME" \
    "$SOURCE_PARSERS" "$DEST_PARSERS"; do
    [[ -f "$path" ]] && continue
    echo "错误:文件不存在 - $path"
    any_missing=yes
  done

  # Report every missing file before giving up, not just the first one.
  if [[ "$any_missing" == yes ]]; then
    exit 1
  fi
}
# Print a unified diff (destination file vs. source copy) for each of the two
# managed files, each preceded by a header line.
# Always returns 0: diff exits non-zero when the files differ, and that status
# is deliberately discarded so that finding differences is not an error.
show_diffs() {
  echo "=== 检查 parse_filename.py 差异 ==="
  if ! diff -u "$DEST_PARSE_FILENAME" "$SOURCE_PARSE_FILENAME"; then
    :  # differences are expected output here, not a failure
  fi

  echo
  echo "=== 检查 parsers.py 差异 ==="
  if ! diff -u "$DEST_PARSERS" "$SOURCE_PARSERS"; then
    :  # same as above
  fi
  return 0
}
# Back up both destination files, overwrite them with their source copies and
# then print the remaining differences (none are expected).
# Globals:  SOURCE_PARSE_FILENAME, DEST_PARSE_FILENAME, SOURCE_PARSERS,
#           DEST_PARSERS (read)
# Outputs:  progress on stdout, errors on stderr
# Returns:  0 on success; 1 if a backup or a copy failed. Nothing is
#           overwritten unless both backups succeeded.
replace_files() {
  local dest

  # Take both backups first. backup_file writes "<file>.bak"; the backup is
  # checked for existence as well as by exit status, so a destination is never
  # overwritten without a backup in place.
  for dest in "$DEST_PARSE_FILENAME" "$DEST_PARSERS"; do
    if ! backup_file "$dest" || [ ! -f "$dest.bak" ]; then
      echo "错误:备份失败,已中止替换 - $dest" >&2
      return 1
    fi
  done

  # Perform the replacement, stopping at the first failed copy so that the
  # "done" banner below is only printed when both files were really replaced.
  if ! cp -f "$SOURCE_PARSE_FILENAME" "$DEST_PARSE_FILENAME"; then
    echo "错误:替换失败 - $DEST_PARSE_FILENAME" >&2
    return 1
  fi
  if ! cp -f "$SOURCE_PARSERS" "$DEST_PARSERS"; then
    echo "错误:替换失败 - $DEST_PARSERS" >&2
    return 1
  fi

  echo -e "\n=== 替换完成,以下是替换后的差异(应无差异) ==="
  show_diffs
}
# Create (or refresh) a single backup of a file next to it, with a ".bak"
# suffix. Only one backup is kept per file so backups do not pile up.
# Arguments: $1 - path of the file to back up
# Outputs:   confirmation on stdout, error message on stderr
# Returns:   0 on success, 1 if the backup could not be written. On failure an
#            already existing backup is left untouched.
backup_file() {
  local file="$1"
  local backup="$file.bak"
  local staging="$backup.tmp.$$"

  # Copy to a staging name first and only then replace the old backup, so a
  # failed copy can never destroy the previous backup.
  # cp -a preserves permissions and other attributes.
  if ! { cp -a "$file" "$staging" && mv -f "$staging" "$backup"; }; then
    rm -f "$staging"
    echo "错误:备份失败 - $file" >&2
    return 1
  fi

  echo "已备份:$file -> $backup"
}
# Entry point: make sure all four files exist, then dispatch on the first
# command-line argument. Anything other than "check" or "replace" prints the
# usage text and exits with status 1.
check_files_exist
case "$1" in
  check)
    echo "=== 执行差异检查(不修改文件) ==="
    show_diffs
    ;;
  replace)
    echo "=== 执行文件替换(先备份) ==="
    replace_files
    ;;
  *)
    echo "用法:$0 [check|replace]"
    echo " check - 仅检查文件差异,不做修改"
    echo " replace - 备份目标文件并替换,然后显示最终差异"
    exit 1
    ;;
esac

View File

@ -1,41 +0,0 @@
-- documents_correspondent definition
-- One row per correspondent: a name, a match pattern with its matching
-- algorithm and case-insensitivity flag, and an optional owner (auth_user.id).
-- (name, owner_id) must be unique.
CREATE TABLE "documents_correspondent" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "name" varchar(128) NOT NULL, "match" varchar(256) NOT NULL, "matching_algorithm" integer unsigned NOT NULL CHECK ("matching_algorithm" >= 0), "is_insensitive" bool NOT NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, CONSTRAINT "documents_correspondent_unique_name_owner" UNIQUE ("name", "owner_id"));
-- Partial unique index: among rows without an owner, the name alone must be unique.
CREATE UNIQUE INDEX "documents_correspondent_name_uniq" ON "documents_correspondent" ("name") WHERE "owner_id" IS NULL;
-- Lookup index on the owner foreign key.
CREATE INDEX "documents_correspondent_owner_id_078f7f8a" ON "documents_correspondent" ("owner_id");
-- documents_customfield definition
-- One row per custom field definition: creation time, a globally unique name,
-- a data type string and optional extra_data, which must be valid JSON when
-- it is not NULL.
CREATE TABLE "documents_customfield" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "created" datetime NOT NULL, "name" varchar(128) NOT NULL, "data_type" varchar(50) NOT NULL, "extra_data" text NULL CHECK ((JSON_VALID("extra_data") OR "extra_data" IS NULL)), CONSTRAINT "documents_customfield_unique_name" UNIQUE ("name"));
-- Index on the creation timestamp.
CREATE INDEX "documents_customfield_created_501ef047" ON "documents_customfield" ("created");
-- documents_customfieldinstance definition
-- One row per (document, custom field) pair, holding that field's value for
-- the document. There is a separate nullable value_* column per kind of value
-- (text, bool, url, date, int, float, monetary, document ids as JSON, select).
-- value_monetary_amount is a stored generated column derived from
-- value_monetary: when the text matches the regular expression '^\d+' the
-- whole string is cast to a number, otherwise the cast starts at character 4
-- (presumably skipping a 3-letter currency prefix -- TODO confirm).
-- (document_id, field_id) must be unique.
CREATE TABLE "documents_customfieldinstance" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "created" datetime NOT NULL, "value_text" varchar(128) NULL, "value_bool" bool NULL, "value_url" varchar(200) NULL, "value_date" date NULL, "value_int" integer NULL, "value_float" real NULL, "value_monetary" varchar(128) NULL, "document_id" integer NOT NULL REFERENCES "documents_document" ("id") DEFERRABLE INITIALLY DEFERRED, "field_id" integer NOT NULL REFERENCES "documents_customfield" ("id") DEFERRABLE INITIALLY DEFERRED, "value_document_ids" text NULL CHECK ((JSON_VALID("value_document_ids") OR "value_document_ids" IS NULL)), "value_monetary_amount" decimal GENERATED ALWAYS AS ((CAST(CASE WHEN "value_monetary" REGEXP '^\d+' THEN CAST(SUBSTR("value_monetary", 1) AS decimal) ELSE CAST(SUBSTR("value_monetary", 4) AS decimal) END AS NUMERIC))) STORED, "deleted_at" datetime NULL, "restored_at" datetime NULL, "transaction_id" char(32) NULL, "value_select" varchar(16) NULL, CONSTRAINT "documents_customfieldinstance_unique_document_field" UNIQUE ("document_id", "field_id"));
-- Indexes on the creation timestamp and on both foreign keys.
CREATE INDEX "documents_customfieldinstance_created_75f17f1d" ON "documents_customfieldinstance" ("created");
CREATE INDEX "documents_customfieldinstance_document_id_610a968e" ON "documents_customfieldinstance" ("document_id");
CREATE INDEX "documents_customfieldinstance_field_id_6c59e32f" ON "documents_customfieldinstance" ("field_id");
-- documents_document definition
-- One row per document: title, text content, created/modified/added
-- timestamps, checksums and file names of the original and archived files,
-- MIME type, page count, and optional references to a correspondent, a
-- document type, a storage path and an owner.
-- checksum, filename, archive_filename and archive_serial_number are each
-- UNIQUE; archive_serial_number and page_count must be >= 0 when set.
CREATE TABLE "documents_document" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "title" varchar(128) NOT NULL, "content" text NOT NULL, "created" datetime NOT NULL, "modified" datetime NOT NULL, "correspondent_id" integer NULL REFERENCES "documents_correspondent" ("id") DEFERRABLE INITIALLY DEFERRED, "checksum" varchar(32) NOT NULL UNIQUE, "added" datetime NOT NULL, "storage_type" varchar(11) NOT NULL, "filename" varchar(1024) NULL UNIQUE, "archive_serial_number" integer unsigned NULL UNIQUE CHECK ("archive_serial_number" >= 0), "document_type_id" integer NULL REFERENCES "documents_documenttype" ("id") DEFERRABLE INITIALLY DEFERRED, "mime_type" varchar(256) NOT NULL, "archive_checksum" varchar(32) NULL, "archive_filename" varchar(1024) NULL UNIQUE, "storage_path_id" integer NULL REFERENCES "documents_storagepath" ("id") DEFERRABLE INITIALLY DEFERRED, "original_filename" varchar(1024) NULL, "deleted_at" datetime NULL, "restored_at" datetime NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, "transaction_id" char(32) NULL, "page_count" integer unsigned NULL CHECK ("page_count" >= 0));
-- Indexes on the title and on the three timestamps (created, modified, added).
CREATE INDEX "documents_document_title_6b08e02a" ON "documents_document" ("title");
CREATE INDEX "documents_document_created_bedd0818" ON "documents_document" ("created");
CREATE INDEX "documents_document_modified_2eae15bc" ON "documents_document" ("modified");
-- Indexes on the foreign-key columns.
CREATE INDEX "documents_document_correspondent_id_6164eb0c" ON "documents_document" ("correspondent_id");
CREATE INDEX "documents_document_added_28cfa360" ON "documents_document" ("added");
CREATE INDEX "documents_document_document_type_id_1f88b50c" ON "documents_document" ("document_type_id");
CREATE INDEX "documents_document_storage_path_id_07d27bdb" ON "documents_document" ("storage_path_id");
CREATE INDEX "documents_document_owner_id_04d2b723" ON "documents_document" ("owner_id");
-- documents_documenttype definition
-- One row per document type. The columns mirror documents_correspondent: a
-- name, a match pattern with its matching algorithm and case-insensitivity
-- flag, and an optional owner (auth_user.id). (name, owner_id) must be unique.
CREATE TABLE "documents_documenttype" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "name" varchar(128) NOT NULL, "match" varchar(256) NOT NULL, "matching_algorithm" integer unsigned NOT NULL CHECK ("matching_algorithm" >= 0), "is_insensitive" bool NOT NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, CONSTRAINT "documents_documenttype_unique_name_owner" UNIQUE ("name", "owner_id"));
-- Partial unique index: among rows without an owner, the name alone must be unique.
CREATE UNIQUE INDEX "documents_documenttype_name_uniq" ON "documents_documenttype" ("name") WHERE "owner_id" IS NULL;
-- Lookup index on the owner foreign key.
CREATE INDEX "documents_documenttype_owner_id_a19f201d" ON "documents_documenttype" ("owner_id");

View File

@ -1,63 +0,0 @@
我提供的文件,是 paperless 的SQLite数据库的关键表。现在我们编写它的 PAPERLESS_POST_CONSUME_SCRIPT。需求如下
1, 我们提供的pdf文件格式为 {publish_date}_{report_type}_{org_sname}_{industry_name}_{stock_name}_{title}.pdf
2,我们提取上面的各个字段,然后:
1) report_type 对应到 documents_documenttype.name,所以我们要查询 documents_documenttype 表;如果对应的 name 不存在,则插入一条记录,然后得到对应的 documents_documenttype.id
2) org_sname 对应到 documents_correspondent.name,所以我们要查询 documents_correspondent 表;如果对应的 name 不存在,则插入一条记录,然后得到对应的 documents_correspondent.id
3) 检查 documents_customfield 表是否包含 '行业' 和 '股票名称' 字段,如果不存在,则创建;查到它们分别对应的 documents_customfield.id,记为 industry_id, stockname_id
3,我们开始更新数据表:
1) 更新 documents_document 表对应的记录:created = publish_date, correspondent_id = documents_correspondent.id, document_type_id = documents_documenttype.id, title = {title}
2) 向 documents_customfieldinstance 插入两条记录,分别为 (document_id, stockname_id, stock_name) 和 (document_id, industry_id, industry_name)
好了请你根据以上需求完成这个python脚本。注意异常情况的处理以及日志输出。如果文件名无法匹配以上的格式则忽略不用处理。
Paperless makes use of the Django REST Framework standard API interface. It provides a browsable API for most of its endpoints, which you can inspect at http://<paperless-host>:<port>/api/. This also documents most of the available filters and ordering fields.
The API provides the following main endpoints:
/api/correspondents/: Full CRUD support.
/api/custom_fields/: Full CRUD support.
/api/documents/: Full CRUD support, except POSTing new documents. See below.
/api/document_types/: Full CRUD support.
/api/groups/: Full CRUD support.
/api/logs/: Read-Only.
/api/mail_accounts/: Full CRUD support.
/api/mail_rules/: Full CRUD support.
/api/profile/: GET, PATCH
/api/share_links/: Full CRUD support.
/api/storage_paths/: Full CRUD support.
/api/tags/: Full CRUD support.
/api/tasks/: Read-only.
/api/users/: Full CRUD support.
/api/workflows/: Full CRUD support.
/api/search/ GET, see below.
All of these endpoints except for the logging endpoint allow you to fetch (and edit and delete where appropriate) individual objects by appending their primary key to the path, e.g. /api/documents/454/.
The objects served by the document endpoint contain the following fields:
id: ID of the document. Read-only.
title: Title of the document.
content: Plain text content of the document.
tags: List of IDs of tags assigned to this document, or empty list.
document_type: Document type of this document, or null.
correspondent: Correspondent of this document or null.
created: The date time at which this document was created.
created_date: The date (YYYY-MM-DD) at which this document was created. Optional. If also passed with created, this is ignored.
modified: The date at which this document was last edited in paperless. Read-only.
added: The date at which this document was added to paperless. Read-only.
archive_serial_number: The identifier of this document in a physical document archive.
original_file_name: Verbose filename of the original document. Read-only.
archived_file_name: Verbose filename of the archived document. Read-only. Null if no archived document is available.
notes: Array of notes associated with the document.
page_count: Number of pages.
set_permissions: Allows setting document permissions. Optional, write-only. See below.
custom_fields: Array of custom fields & values, specified as { field: CUSTOM_FIELD_ID, value: VALUE }
以上是paperless提供的api。我们现在使用 http://localhost:8000 来访问它。那么我想对编号为19的文档进行查询以及更新操作应该如何写对应的python代码