modify scripts
docker/paperless/plugins/batch_del.py (Normal file, 77 lines)
@@ -0,0 +1,77 @@
#!/usr/bin/env python3
import sqlite3
import requests
import time
from requests.auth import HTTPBasicAuth
from requests.exceptions import RequestException
import logging

# Paperless server settings
PAPERLESS_URL = "http://localhost:8000/api"
AUTH = HTTPBasicAuth("admin", "admin")  # Basic Auth credentials

# Logging setup
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Connect to the SQLite database
DB_PATH = "/usr/src/paperless/data/db.sqlite3"
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

# API request wrapper (with retries); supports GET, POST, DELETE, etc.
def api_request(method, url, data=None, retries=5):
    for attempt in range(retries):
        try:
            response = requests.request(method, url, json=data, auth=AUTH, timeout=5)

            if response.status_code in [200, 201, 204]:
                # DELETE returns 204 with an empty body, so fall back to True
                return response.json() if response.text else True
            elif response.status_code == 404:
                logging.warning(f"API resource not found: {method} {url}")
                return None
            else:
                logging.error(f"API request failed: {method} {url}, status: {response.status_code}, response: {response.text}")

        except RequestException as e:
            logging.error(f"API request error: {method} {url}, error: {e}")

        if attempt < retries - 1:
            logging.warning(f"Request failed, retrying in 2 seconds ({attempt+1}/{retries})...")
            time.sleep(2)

    logging.error(f"API request failed after all retries: {method} {url}")
    return None

# Fetch the ids of documents with page_count <= 6 that are not in the trash
def get_documents_to_delete():
    try:
        cursor.execute("SELECT id FROM documents_document WHERE page_count <= 6 AND (deleted_at IS NULL OR deleted_at = '')")
        rows = cursor.fetchall()
        return [row[0] for row in rows]
    except sqlite3.Error as e:
        logging.error(f"Error querying data: {e}")
        return []

# Delete the given documents via the REST API
def delete_documents(doc_ids):
    succ_count = 0
    for doc_id in doc_ids:
        url = f"{PAPERLESS_URL}/documents/{doc_id}/"
        result = api_request("DELETE", url)
        if result:
            logging.info(f"✅ Document {doc_id} deleted")
            succ_count += 1
        else:
            logging.error(f"❌ Failed to delete document {doc_id}")
    logging.info(f"total count: {len(doc_ids)}, deleted: {succ_count}")

if __name__ == "__main__":
    doc_ids = get_documents_to_delete()
    if doc_ids:
        delete_documents(doc_ids)
    else:
        logging.info("No documents to delete")

# Close the database connection
conn.close()

docker/paperless/plugins/paperless.sql (Normal file, 41 lines)
@@ -0,0 +1,41 @@
-- documents_correspondent definition

CREATE TABLE "documents_correspondent" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "name" varchar(128) NOT NULL, "match" varchar(256) NOT NULL, "matching_algorithm" integer unsigned NOT NULL CHECK ("matching_algorithm" >= 0), "is_insensitive" bool NOT NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, CONSTRAINT "documents_correspondent_unique_name_owner" UNIQUE ("name", "owner_id"));

CREATE UNIQUE INDEX "documents_correspondent_name_uniq" ON "documents_correspondent" ("name") WHERE "owner_id" IS NULL;
CREATE INDEX "documents_correspondent_owner_id_078f7f8a" ON "documents_correspondent" ("owner_id");

-- documents_customfield definition

CREATE TABLE "documents_customfield" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "created" datetime NOT NULL, "name" varchar(128) NOT NULL, "data_type" varchar(50) NOT NULL, "extra_data" text NULL CHECK ((JSON_VALID("extra_data") OR "extra_data" IS NULL)), CONSTRAINT "documents_customfield_unique_name" UNIQUE ("name"));

CREATE INDEX "documents_customfield_created_501ef047" ON "documents_customfield" ("created");

-- documents_customfieldinstance definition

CREATE TABLE "documents_customfieldinstance" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "created" datetime NOT NULL, "value_text" varchar(128) NULL, "value_bool" bool NULL, "value_url" varchar(200) NULL, "value_date" date NULL, "value_int" integer NULL, "value_float" real NULL, "value_monetary" varchar(128) NULL, "document_id" integer NOT NULL REFERENCES "documents_document" ("id") DEFERRABLE INITIALLY DEFERRED, "field_id" integer NOT NULL REFERENCES "documents_customfield" ("id") DEFERRABLE INITIALLY DEFERRED, "value_document_ids" text NULL CHECK ((JSON_VALID("value_document_ids") OR "value_document_ids" IS NULL)), "value_monetary_amount" decimal GENERATED ALWAYS AS ((CAST(CASE WHEN "value_monetary" REGEXP '^\d+' THEN CAST(SUBSTR("value_monetary", 1) AS decimal) ELSE CAST(SUBSTR("value_monetary", 4) AS decimal) END AS NUMERIC))) STORED, "deleted_at" datetime NULL, "restored_at" datetime NULL, "transaction_id" char(32) NULL, "value_select" varchar(16) NULL, CONSTRAINT "documents_customfieldinstance_unique_document_field" UNIQUE ("document_id", "field_id"));

CREATE INDEX "documents_customfieldinstance_created_75f17f1d" ON "documents_customfieldinstance" ("created");
CREATE INDEX "documents_customfieldinstance_document_id_610a968e" ON "documents_customfieldinstance" ("document_id");
CREATE INDEX "documents_customfieldinstance_field_id_6c59e32f" ON "documents_customfieldinstance" ("field_id");

-- documents_document definition

CREATE TABLE "documents_document" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "title" varchar(128) NOT NULL, "content" text NOT NULL, "created" datetime NOT NULL, "modified" datetime NOT NULL, "correspondent_id" integer NULL REFERENCES "documents_correspondent" ("id") DEFERRABLE INITIALLY DEFERRED, "checksum" varchar(32) NOT NULL UNIQUE, "added" datetime NOT NULL, "storage_type" varchar(11) NOT NULL, "filename" varchar(1024) NULL UNIQUE, "archive_serial_number" integer unsigned NULL UNIQUE CHECK ("archive_serial_number" >= 0), "document_type_id" integer NULL REFERENCES "documents_documenttype" ("id") DEFERRABLE INITIALLY DEFERRED, "mime_type" varchar(256) NOT NULL, "archive_checksum" varchar(32) NULL, "archive_filename" varchar(1024) NULL UNIQUE, "storage_path_id" integer NULL REFERENCES "documents_storagepath" ("id") DEFERRABLE INITIALLY DEFERRED, "original_filename" varchar(1024) NULL, "deleted_at" datetime NULL, "restored_at" datetime NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, "transaction_id" char(32) NULL, "page_count" integer unsigned NULL CHECK ("page_count" >= 0));

CREATE INDEX "documents_document_title_6b08e02a" ON "documents_document" ("title");
CREATE INDEX "documents_document_created_bedd0818" ON "documents_document" ("created");
CREATE INDEX "documents_document_modified_2eae15bc" ON "documents_document" ("modified");
CREATE INDEX "documents_document_correspondent_id_6164eb0c" ON "documents_document" ("correspondent_id");
CREATE INDEX "documents_document_added_28cfa360" ON "documents_document" ("added");
CREATE INDEX "documents_document_document_type_id_1f88b50c" ON "documents_document" ("document_type_id");
CREATE INDEX "documents_document_storage_path_id_07d27bdb" ON "documents_document" ("storage_path_id");
CREATE INDEX "documents_document_owner_id_04d2b723" ON "documents_document" ("owner_id");

-- documents_documenttype definition

CREATE TABLE "documents_documenttype" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "name" varchar(128) NOT NULL, "match" varchar(256) NOT NULL, "matching_algorithm" integer unsigned NOT NULL CHECK ("matching_algorithm" >= 0), "is_insensitive" bool NOT NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, CONSTRAINT "documents_documenttype_unique_name_owner" UNIQUE ("name", "owner_id"));

CREATE UNIQUE INDEX "documents_documenttype_name_uniq" ON "documents_documenttype" ("name") WHERE "owner_id" IS NULL;
CREATE INDEX "documents_documenttype_owner_id_a19f201d" ON "documents_documenttype" ("owner_id");

docker/paperless/plugins/paperless.txt (Normal file, 63 lines)
@@ -0,0 +1,63 @@
The files I provided are the key tables of paperless's SQLite database. We will now write its PAPERLESS_POST_CONSUME_SCRIPT. Requirements:

1. The PDF files we provide are named {publish_date}_{report_type}_{org_sname}_{industry_name}_{stock_name}_{title}.pdf
2. We extract each of the fields above, then:
  1) report_type maps to documents_documenttype.name, so we query the documents_documenttype table; if that name does not exist, insert a record; then obtain the corresponding documents_documenttype.id
  2) org_sname maps to documents_correspondent.name, so we query the documents_correspondent table; if that name does not exist, insert a record; then obtain the corresponding documents_correspondent.id
  3) Check whether the documents_customfield table contains the fields '行业' (industry) and '股票名称' (stock name); if not, create them. Look up their respective documents_customfield.id values, denoted stockname_id and industry_id
3. Then we update the tables:
  1) Update the matching documents_document record: created = publish_date, correspondent_id = documents_correspondent.id, document_type_id = documents_documenttype.id, title = {title}
  2) Insert two records into documents_customfieldinstance: (document_id, stockname_id, stock_name) and (document_id, industry_id, industry_name)

Now please implement this as a Python script according to the requirements above. Pay attention to error handling and log output. If a filename does not match the format above, ignore it; no processing is needed.

Paperless makes use of the Django REST Framework standard API interface. It provides a browsable API for most of its endpoints, which you can inspect at http://<paperless-host>:<port>/api/. This also documents most of the available filters and ordering fields.

The API provides the following main endpoints:

/api/correspondents/: Full CRUD support.
/api/custom_fields/: Full CRUD support.
/api/documents/: Full CRUD support, except POSTing new documents. See below.
/api/document_types/: Full CRUD support.
/api/groups/: Full CRUD support.
/api/logs/: Read-Only.
/api/mail_accounts/: Full CRUD support.
/api/mail_rules/: Full CRUD support.
/api/profile/: GET, PATCH
/api/share_links/: Full CRUD support.
/api/storage_paths/: Full CRUD support.
/api/tags/: Full CRUD support.
/api/tasks/: Read-only.
/api/users/: Full CRUD support.
/api/workflows/: Full CRUD support.
/api/search/: GET, see below.

All of these endpoints except for the logging endpoint allow you to fetch (and edit and delete where appropriate) individual objects by appending their primary key to the path, e.g. /api/documents/454/.

The objects served by the document endpoint contain the following fields:

id: ID of the document. Read-only.
title: Title of the document.
content: Plain text content of the document.
tags: List of IDs of tags assigned to this document, or empty list.
document_type: Document type of this document, or null.
correspondent: Correspondent of this document, or null.
created: The date time at which this document was created.
created_date: The date (YYYY-MM-DD) at which this document was created. Optional. If also passed with created, this is ignored.
modified: The date at which this document was last edited in paperless. Read-only.
added: The date at which this document was added to paperless. Read-only.
archive_serial_number: The identifier of this document in a physical document archive.
original_file_name: Verbose filename of the original document. Read-only.
archived_file_name: Verbose filename of the archived document. Read-only. Null if no archived document is available.
notes: Array of notes associated with the document.
page_count: Number of pages.
set_permissions: Allows setting document permissions. Optional, write-only. See below.
custom_fields: Array of custom fields & values, specified as { field: CUSTOM_FIELD_ID, value: VALUE }

The above is the API paperless provides. We access it at http://localhost:8000. So, how should the corresponding Python code look to query and then update the document with id 19?
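A minimal sketch of the corresponding calls (assuming the same admin/admin Basic Auth used by the other scripts in this directory; the PATCH payload values and the custom-field id are placeholders, not real data):

#!/usr/bin/env python3
# Sketch: query and update document #19 via the paperless REST API.
import requests
from requests.auth import HTTPBasicAuth

BASE = "http://localhost:8000/api"
AUTH = HTTPBasicAuth("admin", "admin")

# Query: GET the document by its primary key
resp = requests.get(f"{BASE}/documents/19/", auth=AUTH, timeout=5)
resp.raise_for_status()
doc = resp.json()
print(doc["title"], doc["document_type"], doc["correspondent"])

# Update: PATCH sends only the fields that should change
payload = {
    "title": "new title",                                 # placeholder
    "created_date": "2025-01-01",                         # placeholder
    "custom_fields": [{"field": 1, "value": "example"}],  # hypothetical field id
}
resp = requests.patch(f"{BASE}/documents/19/", json=payload, auth=AUTH, timeout=5)
resp.raise_for_status()
print("updated:", resp.json()["id"])
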
docker/paperless/plugins/parse_filename.py (Executable file, 191 lines)
@@ -0,0 +1,191 @@
#!/usr/bin/env python3
import os
import re
import sys
import time
import logging
import requests
import sqlite3
from requests.auth import HTTPBasicAuth
from requests.exceptions import RequestException

# Paperless server settings
PAPERLESS_URL = "http://localhost:8000/api"
AUTH = HTTPBasicAuth("admin", "admin")  # Basic Auth credentials

# Logging setup
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Connect to the SQLite database
#DB_PATH = "/usr/src/paperless/db/paperless.sqlite3"  # adjust to your actual database path
DB_PATH = "/usr/src/paperless/data/db.sqlite3"
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
enable_db = True  # look ids up directly in SQLite instead of via the REST API

# Parse the filename with a regex:
# {publish_date}_{report_type}_{org_sname}_{industry_name}_{stock_name}_{title}.pdf
# e.g. 2025-03-01_深度报告_某证券_电子_某公司_年度展望.pdf (hypothetical example)
FILENAME_PATTERN = re.compile(r"(\d{4}-\d{2}-\d{2})_(.*?)_(.*?)_(.*?)_(.*?)_(.*)\.pdf")

# Fetch the id directly from the database
def query_id_from_db(table, column, value):
    try:
        cursor.execute(f"SELECT id FROM {table} WHERE {column} = ?", (value,))
        row = cursor.fetchone()
        if row:
            logging.info(f'query {table} where {column}="{value}" get id={row[0]}')
            return row[0]
        else:
            return None
    except sqlite3.Error as e:
        logging.error(f"Error querying data: {e}")
        return None

# API request wrapper (with retries); supports GET, POST, etc.
def api_request(method, url, data=None, retries=5):
    for attempt in range(retries):
        try:
            response = requests.request(method, url, json=data, auth=AUTH, timeout=5)

            if response.status_code in [200, 201]:
                return response.json()
            elif response.status_code == 404:
                logging.warning(f"API resource not found: {method} {url}")
                return None
            else:
                logging.error(f"API request failed: {method} {url}, status: {response.status_code}, response: {response.text}")

        except RequestException as e:
            logging.error(f"API request error: {method} {url}, error: {e}")

        if attempt < retries - 1:
            logging.warning(f"Request failed, retrying in 2 seconds ({attempt+1}/{retries})...")
            time.sleep(2)

    logging.error(f"API request failed after all retries: {method} {url}")
    return None

# Fetch the id via the REST API
def query_id_from_rest_api(endpoint, name):
    url = f"{PAPERLESS_URL}/{endpoint}"
    method = 'GET'
    while url is not None:  # walk through all result pages
        json_data = api_request(method, url, data=None, retries=5)
        if json_data:
            if "results" in json_data and isinstance(json_data["results"], list):
                # Return as soon as the name is found
                for item in json_data.get('results', []):
                    if item["name"] == name:
                        return item["id"]
                # Not found; continue with the next page
                url = json_data.get("next", None)
            else:
                logging.warning(f"API returned no data: {method} {url}")
                break
        else:
            logging.warning(f"API returned no data: {method} {url}")
            break
    return None

# Get or create the document type
def get_or_create_document_type(name):
    if enable_db:
        doc_type_id = query_id_from_db("documents_documenttype", "name", name)
    else:
        doc_type_id = query_id_from_rest_api("document_types/", name)
    if doc_type_id:
        return doc_type_id

    # Create a new document type
    new_doc_type = api_request("POST", f"{PAPERLESS_URL}/document_types/", {"name": name})
    return new_doc_type.get("id") if new_doc_type else None

# Get or create the correspondent (the issuing organization)
def get_or_create_correspondent(name):
    if enable_db:
        correspondent_id = query_id_from_db("documents_correspondent", "name", name)
    else:
        correspondent_id = query_id_from_rest_api("correspondents/", name)
    if correspondent_id:
        return correspondent_id

    # Create a new correspondent
    new_correspondent = api_request("POST", f"{PAPERLESS_URL}/correspondents/", {"name": name})
    return new_correspondent.get("id") if new_correspondent else None

# Get or create a custom field
def get_or_create_custom_field(name):
    if enable_db:
        field_id = query_id_from_db("documents_customfield", "name", name)
    else:
        field_id = query_id_from_rest_api("custom_fields/", name)
    if field_id:
        return field_id

    # Create a new custom field
    new_field = api_request("POST", f"{PAPERLESS_URL}/custom_fields/", {"name": name, "data_type": "string"})
    return new_field.get("id") if new_field else None

# Update the document's metadata
def update_document(document_id, title, publish_date, doc_type_id, correspondent_id, custom_fields):
    payload = {
        "title": title,
        "created_date": publish_date,
        "document_type": doc_type_id,
        "correspondent": correspondent_id,
        "custom_fields": custom_fields,
    }
    return api_request("PATCH", f"{PAPERLESS_URL}/documents/{document_id}/", payload) is not None

# Main entry point: parse the filename and update Paperless
def process_document():
    doc_id = os.getenv("DOCUMENT_ID")
    filename = os.getenv("DOCUMENT_ORIGINAL_FILENAME")

    # debug
    #doc_id = sys.argv[1]
    #filename = sys.argv[2]

    if not doc_id or not filename:
        logging.error("❌ Required environment variables are missing; aborting")
        return

    match = FILENAME_PATTERN.match(filename)
    if not match:
        logging.warning(f"⚠️ Filename does not match the expected format, skipping: {filename}")
        return

    publish_date, report_type, org_sname, industry_name, stock_name, title = match.groups()

    logging.info(f"✅ Parsed: date={publish_date}, type={report_type}, org={org_sname}, industry={industry_name}, stock={stock_name}, title={title}")

    # Get or create the document type & correspondent
    doc_type_id = get_or_create_document_type(report_type)
    correspondent_id = get_or_create_correspondent(org_sname)
    if not doc_type_id or not correspondent_id:
        logging.error("❌ Failed to create document type/correspondent")
        return

    # Get or create the custom fields
    industry_field_id = get_or_create_custom_field("行业")
    stock_field_id = get_or_create_custom_field("股票名称")
    if not industry_field_id or not stock_field_id:
        logging.error("❌ Failed to create custom fields")
        return

    # Assemble the custom field data
    custom_fields = [
        {"field": industry_field_id, "value": industry_name},
        {"field": stock_field_id, "value": stock_name}
    ]

    # Update the Paperless document
    success = update_document(doc_id, title, publish_date, doc_type_id, correspondent_id, custom_fields)
    if success:
        logging.info(f"✅ Document {doc_id} updated: {title}")
    else:
        logging.error(f"❌ Failed to update document {doc_id}")

if __name__ == "__main__":
    process_document()

docker/paperless/plugins/redme.txt (Normal file, 64 lines)
@@ -0,0 +1,64 @@
-------------------------------------------------------|
------------------- paperless PDF management ----------|
-------------------------------------------------------|

## Better not to create the container with a plain command: use a docker-compose.yml, which lets you specify the backend database and redis! (A compose sketch follows the run command below.)
docker run -itd \
  --name paperless \
  --network devops \
  --platform linux/x86_64 \
  -e TZ="Asia/Shanghai" \
  -v /etc/localtime:/etc/localtime:ro \
  -v "$(pwd)/dockers/paperless/pdfs:/usr/src/paperless/data" \
  -v "$(pwd)/dockers/paperless/db:/usr/src/paperless/db" \
  -e USERMAP_UID=1000 -e USERMAP_GID=1000 \
  -p 8000:8000 \
  ghcr.io/paperless-ngx/paperless-ngx

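A minimal docker-compose sketch under those constraints (service names, image tags, and the postgres credentials are assumptions, not the actual deployment):

# docker-compose.yml sketch; postgres/redis settings are placeholders.
services:
  broker:
    image: redis:7
  db:
    image: postgres:16
    environment:
      POSTGRES_DB: paperless
      POSTGRES_USER: paperless
      POSTGRES_PASSWORD: paperless
  webserver:
    image: ghcr.io/paperless-ngx/paperless-ngx
    depends_on: [broker, db]
    ports:
      - "8000:8000"
    environment:
      PAPERLESS_REDIS: redis://broker:6379
      PAPERLESS_DBHOST: db
      PAPERLESS_TIME_ZONE: Asia/Shanghai
      USERMAP_UID: "1000"
      USERMAP_GID: "1000"
    volumes:
      - ./dockers/paperless/pdfs:/usr/src/paperless/data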

# After the container is created, set the password manually (pick one of the two; currently set to admin / admin)
docker compose run --rm webserver createsuperuser
python3 manage.py createsuperuser

# For existing documents: put them in the watched directory and wait for the system to load them automatically (or start the consumer by hand)
cd /path/to/paperless/src/
python3 manage.py document_consumer

# Parse filenames automatically
https://docs.paperless-ngx.com/advanced_usage/#file-name-handling
https://docs.paperless-ngx.com/configuration/#PAPERLESS_POST_CONSUME_SCRIPT

environment:
  PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"

By default paperless does not delete duplicate files, so if the same file is added again it keeps scanning, loading, and erroring. No configuration option was found for this, so the source is patched directly:

/usr/src/paperless/src/documents/consumer.py

def pre_check_duplicate(self):
    """
    Using the MD5 of the file, check this exact file doesn't already exist
    """
    with open(self.input_doc.original_file, "rb") as f:
        checksum = hashlib.md5(f.read()).hexdigest()
    existing_doc = Document.global_objects.filter(
        Q(checksum=checksum) | Q(archive_checksum=checksum),
    )
    if existing_doc.exists():
        msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
        log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})."

        if existing_doc.first().deleted_at is not None:
            msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
            log_msg += " Note: existing document is in the trash."

        ## Patched here so that duplicate files get deleted.
        if settings.CONSUMER_DELETE_DUPLICATES or True:
            os.unlink(self.input_doc.original_file)
        self._fail(
            msg,
            log_msg,
        )