modify scripts
This commit is contained in:
@ -9,7 +9,8 @@ import logging
|
||||
|
||||
# Paperless 服务器信息
|
||||
PAPERLESS_URL = "http://localhost:8000/api"
|
||||
AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth 认证
|
||||
#AUTH = HTTPBasicAuth("admin", "admin") # Basic Auth 认证, mac上用这个
|
||||
AUTH = HTTPBasicAuth("admin", "paperless") # Basic Auth 认证,NAS上用这个
|
||||
|
||||
# 日志配置
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||
|
||||
40
docker/paperless/plugins/consume.sh
Executable file
40
docker/paperless/plugins/consume.sh
Executable file
@ -0,0 +1,40 @@
|
||||
#!/bin/bash
#
# Moves every *.pdf from SRC into DST, setting owner, group and mode on the
# copy, and records each file in LOG.
#
# Exit status:
#   0  every candidate file was moved
#   1  SRC or DST is missing, or at least one file could not be moved
#
# NOTE(review): `install -o/-g` normally requires root privileges - confirm
# how this script is invoked.

SRC="/volume1/docker/sharedata/stock_data/pdfs"
DST="/volume1/docker/sharedata/stock_data/em_reports_consume"
# Relative path: the log lands in whatever directory the script is started from.
LOG="./paperless.log"

TARGET_UID=1000
TARGET_GID=1000

# Append one timestamped line to the log file only.
log_file() {
  echo "$(date '+%F %T') $*" >> "$LOG"
}

# Print one timestamped line to the screen and append it to the log file.
log_tee() {
  echo "$(date '+%F %T') $*" | tee -a "$LOG"
}

# Both directories must already exist; nothing is created here.
if [ ! -d "$SRC" ]; then
  log_tee "[ERROR] 源目录不存在: $SRC"
  exit 1
fi
if [ ! -d "$DST" ]; then
  log_tee "[ERROR] 目标目录不存在: $DST"
  exit 1
fi

COUNT=0
FAILED=0
for f in "$SRC"/*.pdf; do
  # An unmatched glob stays literal; skip it and anything that is not a
  # regular file.
  [ -f "$f" ] || continue

  # "Move" = copy with the target owner/group/mode, then delete the source.
  if ! install -D -o "$TARGET_UID" -g "$TARGET_GID" -m 644 "$f" "$DST"; then
    log_file "[FAIL] Failed: $f"
    FAILED=$((FAILED + 1))
    continue
  fi

  # A source that cannot be deleted would be picked up again on the next run,
  # so count it as a failure instead of reporting it as moved.
  if ! rm -f "$f"; then
    log_file "[FAIL] Copied but could not remove source: $f"
    FAILED=$((FAILED + 1))
    continue
  fi

  log_file "[OK] Moved: $f"
  COUNT=$((COUNT + 1))

  # Every 100 moved files, report progress on screen as well as in the log.
  if (( COUNT % 100 == 0 )); then
    log_tee "[PROGRESS] 已移动 $COUNT 个文件"
  fi
done

log_tee "[INFO] 搬运完成,共移动 $COUNT 个文件"

# Surface failures on screen and through the exit status; previously they were
# only visible inside the log file.
if (( FAILED > 0 )); then
  log_tee "[ERROR] 有 $FAILED 个文件移动失败"
  exit 1
fi
|
||||
|
||||
73
docker/paperless/plugins/docker_patch.sh
Executable file
73
docker/paperless/plugins/docker_patch.sh
Executable file
@ -0,0 +1,73 @@
|
||||
#!/bin/bash
|
||||
|
||||
# File locations. Each SOURCE_* file (under git_scripts/) is the replacement
# for the DEST_* file of the same kind.

# Replacement files.
SOURCE_PARSE_FILENAME="/usr/src/paperless/git_scripts/parse_filename.py"
SOURCE_PARSERS="/usr/src/paperless/git_scripts/parsers.py"

# Files that get compared against / overwritten by the replacements.
DEST_PARSE_FILENAME="/usr/src/paperless/scripts/parse_filename.py"
DEST_PARSERS="/usr/src/paperless/src/paperless_tesseract/parsers.py"
|
||||
|
||||
#######################################
# Verify that all four source/destination files exist as regular files.
# Globals:  SOURCE_PARSE_FILENAME, DEST_PARSE_FILENAME,
#           SOURCE_PARSERS, DEST_PARSERS (read)
# Outputs:  one error line per missing file, on stderr
# Exits:    terminates the script with status 1 if any file is missing
#           (after reporting every missing file, not just the first)
#######################################
check_files_exist() {
  local missing=0
  local file  # kept local so the loop variable does not leak into the script

  for file in "$SOURCE_PARSE_FILENAME" "$DEST_PARSE_FILENAME" \
              "$SOURCE_PARSERS" "$DEST_PARSERS"; do
    if [ ! -f "$file" ]; then
      echo "错误:文件不存在 - $file" >&2
      missing=1
    fi
  done

  if [ "$missing" -eq 1 ]; then
    exit 1
  fi
}
|
||||
|
||||
#######################################
# Print a unified diff (destination -> source) for both file pairs.
# Globals:  SOURCE_PARSE_FILENAME, DEST_PARSE_FILENAME,
#           SOURCE_PARSERS, DEST_PARSERS (read)
# Outputs:  headers and diffs on stdout; diff failures on stderr
# Returns:  0 when both diffs ran (whether or not the files differ),
#           1 when diff itself failed for either pair
#######################################
show_diffs() {
  local status=0
  local rc

  echo "=== 检查 parse_filename.py 差异 ==="
  # diff exits 1 when the files differ - that is a normal result here.
  # Only an exit status above 1 means diff could not do its job.
  rc=0
  diff -u "$DEST_PARSE_FILENAME" "$SOURCE_PARSE_FILENAME" || rc=$?
  if [ "$rc" -gt 1 ]; then
    echo "错误:diff 执行失败 (退出码 $rc) - $DEST_PARSE_FILENAME" >&2
    status=1
  fi

  printf '\n%s\n' "=== 检查 parsers.py 差异 ==="
  rc=0
  diff -u "$DEST_PARSERS" "$SOURCE_PARSERS" || rc=$?
  if [ "$rc" -gt 1 ]; then
    echo "错误:diff 执行失败 (退出码 $rc) - $DEST_PARSERS" >&2
    status=1
  fi

  return "$status"
}
|
||||
|
||||
#######################################
# Back up both destination files, overwrite them with the source files, then
# show the remaining differences.
# Globals:  SOURCE_PARSE_FILENAME, DEST_PARSE_FILENAME,
#           SOURCE_PARSERS, DEST_PARSERS (read)
# Outputs:  progress on stdout, failures on stderr
# Returns:  1 as soon as backup_file or cp reports a failure;
#           otherwise the status of show_diffs
#######################################
replace_files() {
  # Take both backups before touching anything, and stop if either one
  # reports failure, so no target is overwritten without a backup.
  if ! backup_file "$DEST_PARSE_FILENAME" || ! backup_file "$DEST_PARSERS"; then
    echo "错误:备份失败,已中止替换" >&2
    return 1
  fi

  if ! cp -f "$SOURCE_PARSE_FILENAME" "$DEST_PARSE_FILENAME"; then
    echo "错误:替换失败 - $DEST_PARSE_FILENAME" >&2
    return 1
  fi
  if ! cp -f "$SOURCE_PARSERS" "$DEST_PARSERS"; then
    echo "错误:替换失败 - $DEST_PARSERS" >&2
    return 1
  fi

  # After a successful replacement the diffs below are expected to be empty.
  printf '\n%s\n' "=== 替换完成,以下是替换后的差异(应无差异) ==="
  show_diffs
}
|
||||
|
||||
#######################################
# Copy a file to "<file>.bak", replacing any previous backup.
# Arguments: $1 - path of the file to back up
# Outputs:   confirmation line on stdout; failure message on stderr
# Returns:   0 on success, 1 if the backup could not be written
#######################################
backup_file() {
  local file="$1"
  local backup="$file.bak"
  local staged="$backup.tmp.$$"

  # Copy to a staging name and rename it over the old backup afterwards, so a
  # failed copy never destroys the backup that already exists.
  # cp -a keeps permissions and other attributes of the original.
  if ! cp -a "$file" "$staged" || ! mv -f "$staged" "$backup"; then
    rm -f "$staged"
    echo "错误:备份失败 - $file" >&2
    return 1
  fi

  echo "已备份:$file -> $backup"
}
|
||||
|
||||
# Main logic.
# The sub-command is validated before the file check, so a missing or unknown
# argument prints the usage text instead of file-missing errors.
# The script's exit status is that of the selected action.
case "$1" in
  check)
    check_files_exist
    echo "=== 执行差异检查(不修改文件) ==="
    show_diffs
    ;;
  replace)
    check_files_exist
    echo "=== 执行文件替换(先备份) ==="
    replace_files
    ;;
  *)
    echo "用法:$0 [check|replace]"
    echo " check - 仅检查文件差异,不做修改"
    echo " replace - 备份目标文件并替换,然后显示最终差异"
    exit 1
    ;;
esac
|
||||
@ -1,41 +0,0 @@
|
||||
-- documents_correspondent definition
|
||||
|
||||
CREATE TABLE "documents_correspondent" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "name" varchar(128) NOT NULL, "match" varchar(256) NOT NULL, "matching_algorithm" integer unsigned NOT NULL CHECK ("matching_algorithm" >= 0), "is_insensitive" bool NOT NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, CONSTRAINT "documents_correspondent_unique_name_owner" UNIQUE ("name", "owner_id"));
|
||||
|
||||
CREATE UNIQUE INDEX "documents_correspondent_name_uniq" ON "documents_correspondent" ("name") WHERE "owner_id" IS NULL;
|
||||
CREATE INDEX "documents_correspondent_owner_id_078f7f8a" ON "documents_correspondent" ("owner_id");
|
||||
|
||||
-- documents_customfield definition
|
||||
|
||||
CREATE TABLE "documents_customfield" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "created" datetime NOT NULL, "name" varchar(128) NOT NULL, "data_type" varchar(50) NOT NULL, "extra_data" text NULL CHECK ((JSON_VALID("extra_data") OR "extra_data" IS NULL)), CONSTRAINT "documents_customfield_unique_name" UNIQUE ("name"));
|
||||
|
||||
CREATE INDEX "documents_customfield_created_501ef047" ON "documents_customfield" ("created");
|
||||
|
||||
-- documents_customfieldinstance definition
|
||||
|
||||
CREATE TABLE "documents_customfieldinstance" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "created" datetime NOT NULL, "value_text" varchar(128) NULL, "value_bool" bool NULL, "value_url" varchar(200) NULL, "value_date" date NULL, "value_int" integer NULL, "value_float" real NULL, "value_monetary" varchar(128) NULL, "document_id" integer NOT NULL REFERENCES "documents_document" ("id") DEFERRABLE INITIALLY DEFERRED, "field_id" integer NOT NULL REFERENCES "documents_customfield" ("id") DEFERRABLE INITIALLY DEFERRED, "value_document_ids" text NULL CHECK ((JSON_VALID("value_document_ids") OR "value_document_ids" IS NULL)), "value_monetary_amount" decimal GENERATED ALWAYS AS ((CAST(CASE WHEN "value_monetary" REGEXP '^\d+' THEN CAST(SUBSTR("value_monetary", 1) AS decimal) ELSE CAST(SUBSTR("value_monetary", 4) AS decimal) END AS NUMERIC))) STORED, "deleted_at" datetime NULL, "restored_at" datetime NULL, "transaction_id" char(32) NULL, "value_select" varchar(16) NULL, CONSTRAINT "documents_customfieldinstance_unique_document_field" UNIQUE ("document_id", "field_id"));
|
||||
|
||||
CREATE INDEX "documents_customfieldinstance_created_75f17f1d" ON "documents_customfieldinstance" ("created");
|
||||
CREATE INDEX "documents_customfieldinstance_document_id_610a968e" ON "documents_customfieldinstance" ("document_id");
|
||||
CREATE INDEX "documents_customfieldinstance_field_id_6c59e32f" ON "documents_customfieldinstance" ("field_id");
|
||||
|
||||
|
||||
-- documents_document definition
|
||||
|
||||
CREATE TABLE "documents_document" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "title" varchar(128) NOT NULL, "content" text NOT NULL, "created" datetime NOT NULL, "modified" datetime NOT NULL, "correspondent_id" integer NULL REFERENCES "documents_correspondent" ("id") DEFERRABLE INITIALLY DEFERRED, "checksum" varchar(32) NOT NULL UNIQUE, "added" datetime NOT NULL, "storage_type" varchar(11) NOT NULL, "filename" varchar(1024) NULL UNIQUE, "archive_serial_number" integer unsigned NULL UNIQUE CHECK ("archive_serial_number" >= 0), "document_type_id" integer NULL REFERENCES "documents_documenttype" ("id") DEFERRABLE INITIALLY DEFERRED, "mime_type" varchar(256) NOT NULL, "archive_checksum" varchar(32) NULL, "archive_filename" varchar(1024) NULL UNIQUE, "storage_path_id" integer NULL REFERENCES "documents_storagepath" ("id") DEFERRABLE INITIALLY DEFERRED, "original_filename" varchar(1024) NULL, "deleted_at" datetime NULL, "restored_at" datetime NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, "transaction_id" char(32) NULL, "page_count" integer unsigned NULL CHECK ("page_count" >= 0));
|
||||
|
||||
CREATE INDEX "documents_document_title_6b08e02a" ON "documents_document" ("title");
|
||||
CREATE INDEX "documents_document_created_bedd0818" ON "documents_document" ("created");
|
||||
CREATE INDEX "documents_document_modified_2eae15bc" ON "documents_document" ("modified");
|
||||
CREATE INDEX "documents_document_correspondent_id_6164eb0c" ON "documents_document" ("correspondent_id");
|
||||
CREATE INDEX "documents_document_added_28cfa360" ON "documents_document" ("added");
|
||||
CREATE INDEX "documents_document_document_type_id_1f88b50c" ON "documents_document" ("document_type_id");
|
||||
CREATE INDEX "documents_document_storage_path_id_07d27bdb" ON "documents_document" ("storage_path_id");
|
||||
CREATE INDEX "documents_document_owner_id_04d2b723" ON "documents_document" ("owner_id");
|
||||
|
||||
-- documents_documenttype definition
|
||||
|
||||
CREATE TABLE "documents_documenttype" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "name" varchar(128) NOT NULL, "match" varchar(256) NOT NULL, "matching_algorithm" integer unsigned NOT NULL CHECK ("matching_algorithm" >= 0), "is_insensitive" bool NOT NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, CONSTRAINT "documents_documenttype_unique_name_owner" UNIQUE ("name", "owner_id"));
|
||||
|
||||
CREATE UNIQUE INDEX "documents_documenttype_name_uniq" ON "documents_documenttype" ("name") WHERE "owner_id" IS NULL;
|
||||
CREATE INDEX "documents_documenttype_owner_id_a19f201d" ON "documents_documenttype" ("owner_id");
|
||||
@ -1,63 +0,0 @@
|
||||
我提供的文件,是 paperless 的SQLite数据库的关键表。现在我们编写它的 PAPERLESS_POST_CONSUME_SCRIPT。需求如下:
|
||||
|
||||
1, 我们提供的pdf文件格式为 {publish_date}_{report_type}_{org_sname}_{industry_name}_{stock_name}_{title}.pdf
|
||||
2,我们提取上面的各个字段,然后:
|
||||
1) report_type 对应到 documents_documenttype.name 所以我们要查询 documents_documenttype 表,如果对应的name不存在,则插入一条记录;然后得到对应的 documents_documenttype.id
|
||||
2) org_sname 对应到 documents_correspondent.name 所以我们要查询 documents_correspondent 表,如果对应的name 不存在,则插入一条记录,然后得到对应的 documents_correspondent.id
|
||||
3) 检查 documents_customfield 表是否包含 '行业' 和 '股票名称' 字段,如果不存在,则创建; 查到他们分别对应的 documents_customfield.id , 记为 stockname_id, industry_id
|
||||
3,我们开始更新数据表:
|
||||
1) 更新 documents_document 表对应的记录, created = publish_date, correspondent_id = documents_correspondent.id , document_type_id = documents_documenttype.id, title={title}
|
||||
2) 向 documents_customfieldinstance 插入两条记录,分别为 (document_id, stockname_id, stock_name) 和 (document_id, industry_id, industry_name)
|
||||
|
||||
好了,请你根据以上需求,完成这个python脚本。注意异常情况的处理,以及日志输出。如果文件名无法匹配以上的格式,则忽略,不用处理。
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Paperless makes use of the Django REST Framework standard API interface. It provides a browsable API for most of its endpoints, which you can inspect at http://<paperless-host>:<port>/api/. This also documents most of the available filters and ordering fields.
|
||||
|
||||
The API provides the following main endpoints:
|
||||
|
||||
/api/correspondents/: Full CRUD support.
|
||||
/api/custom_fields/: Full CRUD support.
|
||||
/api/documents/: Full CRUD support, except POSTing new documents. See below.
|
||||
/api/document_types/: Full CRUD support.
|
||||
/api/groups/: Full CRUD support.
|
||||
/api/logs/: Read-Only.
|
||||
/api/mail_accounts/: Full CRUD support.
|
||||
/api/mail_rules/: Full CRUD support.
|
||||
/api/profile/: GET, PATCH
|
||||
/api/share_links/: Full CRUD support.
|
||||
/api/storage_paths/: Full CRUD support.
|
||||
/api/tags/: Full CRUD support.
|
||||
/api/tasks/: Read-only.
|
||||
/api/users/: Full CRUD support.
|
||||
/api/workflows/: Full CRUD support.
|
||||
/api/search/ GET, see below.
|
||||
All of these endpoints except for the logging endpoint allow you to fetch (and edit and delete where appropriate) individual objects by appending their primary key to the path, e.g. /api/documents/454/.
|
||||
|
||||
The objects served by the document endpoint contain the following fields:
|
||||
|
||||
id: ID of the document. Read-only.
|
||||
title: Title of the document.
|
||||
content: Plain text content of the document.
|
||||
tags: List of IDs of tags assigned to this document, or empty list.
|
||||
document_type: Document type of this document, or null.
|
||||
correspondent: Correspondent of this document or null.
|
||||
created: The date time at which this document was created.
|
||||
created_date: The date (YYYY-MM-DD) at which this document was created. Optional. If also passed with created, this is ignored.
|
||||
modified: The date at which this document was last edited in paperless. Read-only.
|
||||
added: The date at which this document was added to paperless. Read-only.
|
||||
archive_serial_number: The identifier of this document in a physical document archive.
|
||||
original_file_name: Verbose filename of the original document. Read-only.
|
||||
archived_file_name: Verbose filename of the archived document. Read-only. Null if no archived document is available.
|
||||
notes: Array of notes associated with the document.
|
||||
page_count: Number of pages.
|
||||
set_permissions: Allows setting document permissions. Optional, write-only. See below.
|
||||
custom_fields: Array of custom fields & values, specified as { field: CUSTOM_FIELD_ID, value: VALUE }
|
||||
|
||||
|
||||
以上是paperless提供的api。我们现在使用 http://localhost:8000 来访问它。那么,我想对编号为19的文档进行查询,以及更新操作,应该如何写对应的python代码?
|
||||
|
||||
|
||||
Reference in New Issue
Block a user