modify scripts

2025-07-12 13:59:28 +08:00
parent 96790a8365
commit 83d0745695
5 changed files with 436 additions and 0 deletions

View File

@@ -0,0 +1,77 @@
#!/usr/bin/env python3
import os
import sqlite3
import requests
import time
from requests.auth import HTTPBasicAuth
from requests.exceptions import RequestException
import logging

# Paperless server info
PAPERLESS_URL = "http://localhost:8000/api"
AUTH = HTTPBasicAuth("admin", "admin")  # Basic Auth credentials

# Logging configuration
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Connect to the SQLite database
DB_PATH = "/usr/src/paperless/data/db.sqlite3"
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

# API request wrapper (with retries); supports GET, POST, DELETE, etc.
def api_request(method, url, data=None, retries=5):
    for attempt in range(retries):
        try:
            response = requests.request(method, url, json=data, auth=AUTH, timeout=5)
            if response.status_code in [200, 201, 204]:
                return response.json() if response.text else True
            elif response.status_code == 404:
                logging.warning(f"API resource not found: {method} {url}")
                return None
            else:
                logging.error(f"API request failed: {method} {url}, status: {response.status_code}, response: {response.text}")
        except RequestException as e:
            logging.error(f"API request exception: {method} {url}, error: {e}")
        if attempt < retries - 1:
            logging.warning(f"Request failed, retrying in 2 seconds ({attempt+1}/{retries})...")
            time.sleep(2)
    logging.error(f"API request ultimately failed: {method} {url}")
    return None

# Fetch the ids of documents with page_count <= 6 from the database
def get_documents_to_delete():
    try:
        cursor.execute("SELECT id FROM documents_document WHERE page_count <= 6 AND (deleted_at IS NULL OR deleted_at = '')")
        rows = cursor.fetchall()
        return [row[0] for row in rows]
    except sqlite3.Error as e:
        logging.error(f"Error querying data: {e}")
        return []

# Delete documents via the API
def delete_documents(doc_ids):
    succ_count = 0
    for doc_id in doc_ids:
        url = f"{PAPERLESS_URL}/documents/{doc_id}/"
        result = api_request("DELETE", url)
        if result:
            logging.info(f"✅ Document {doc_id} deleted")
            succ_count += 1
        else:
            logging.error(f"❌ Failed to delete document {doc_id}")
    logging.info(f"\ntotal count: {len(doc_ids)}, deleted: {succ_count}")

if __name__ == "__main__":
    doc_ids = get_documents_to_delete()
    if doc_ids:
        delete_documents(doc_ids)
    else:
        logging.info("No documents to delete")
    # Close the database connection
    conn.close()

View File

@@ -0,0 +1,41 @@
-- documents_correspondent definition
CREATE TABLE "documents_correspondent" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "name" varchar(128) NOT NULL, "match" varchar(256) NOT NULL, "matching_algorithm" integer unsigned NOT NULL CHECK ("matching_algorithm" >= 0), "is_insensitive" bool NOT NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, CONSTRAINT "documents_correspondent_unique_name_owner" UNIQUE ("name", "owner_id"));
CREATE UNIQUE INDEX "documents_correspondent_name_uniq" ON "documents_correspondent" ("name") WHERE "owner_id" IS NULL;
CREATE INDEX "documents_correspondent_owner_id_078f7f8a" ON "documents_correspondent" ("owner_id");
-- documents_customfield definition
CREATE TABLE "documents_customfield" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "created" datetime NOT NULL, "name" varchar(128) NOT NULL, "data_type" varchar(50) NOT NULL, "extra_data" text NULL CHECK ((JSON_VALID("extra_data") OR "extra_data" IS NULL)), CONSTRAINT "documents_customfield_unique_name" UNIQUE ("name"));
CREATE INDEX "documents_customfield_created_501ef047" ON "documents_customfield" ("created");
-- documents_customfieldinstance definition
CREATE TABLE "documents_customfieldinstance" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "created" datetime NOT NULL, "value_text" varchar(128) NULL, "value_bool" bool NULL, "value_url" varchar(200) NULL, "value_date" date NULL, "value_int" integer NULL, "value_float" real NULL, "value_monetary" varchar(128) NULL, "document_id" integer NOT NULL REFERENCES "documents_document" ("id") DEFERRABLE INITIALLY DEFERRED, "field_id" integer NOT NULL REFERENCES "documents_customfield" ("id") DEFERRABLE INITIALLY DEFERRED, "value_document_ids" text NULL CHECK ((JSON_VALID("value_document_ids") OR "value_document_ids" IS NULL)), "value_monetary_amount" decimal GENERATED ALWAYS AS ((CAST(CASE WHEN "value_monetary" REGEXP '^\d+' THEN CAST(SUBSTR("value_monetary", 1) AS decimal) ELSE CAST(SUBSTR("value_monetary", 4) AS decimal) END AS NUMERIC))) STORED, "deleted_at" datetime NULL, "restored_at" datetime NULL, "transaction_id" char(32) NULL, "value_select" varchar(16) NULL, CONSTRAINT "documents_customfieldinstance_unique_document_field" UNIQUE ("document_id", "field_id"));
CREATE INDEX "documents_customfieldinstance_created_75f17f1d" ON "documents_customfieldinstance" ("created");
CREATE INDEX "documents_customfieldinstance_document_id_610a968e" ON "documents_customfieldinstance" ("document_id");
CREATE INDEX "documents_customfieldinstance_field_id_6c59e32f" ON "documents_customfieldinstance" ("field_id");
-- documents_document definition
CREATE TABLE "documents_document" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "title" varchar(128) NOT NULL, "content" text NOT NULL, "created" datetime NOT NULL, "modified" datetime NOT NULL, "correspondent_id" integer NULL REFERENCES "documents_correspondent" ("id") DEFERRABLE INITIALLY DEFERRED, "checksum" varchar(32) NOT NULL UNIQUE, "added" datetime NOT NULL, "storage_type" varchar(11) NOT NULL, "filename" varchar(1024) NULL UNIQUE, "archive_serial_number" integer unsigned NULL UNIQUE CHECK ("archive_serial_number" >= 0), "document_type_id" integer NULL REFERENCES "documents_documenttype" ("id") DEFERRABLE INITIALLY DEFERRED, "mime_type" varchar(256) NOT NULL, "archive_checksum" varchar(32) NULL, "archive_filename" varchar(1024) NULL UNIQUE, "storage_path_id" integer NULL REFERENCES "documents_storagepath" ("id") DEFERRABLE INITIALLY DEFERRED, "original_filename" varchar(1024) NULL, "deleted_at" datetime NULL, "restored_at" datetime NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, "transaction_id" char(32) NULL, "page_count" integer unsigned NULL CHECK ("page_count" >= 0));
CREATE INDEX "documents_document_title_6b08e02a" ON "documents_document" ("title");
CREATE INDEX "documents_document_created_bedd0818" ON "documents_document" ("created");
CREATE INDEX "documents_document_modified_2eae15bc" ON "documents_document" ("modified");
CREATE INDEX "documents_document_correspondent_id_6164eb0c" ON "documents_document" ("correspondent_id");
CREATE INDEX "documents_document_added_28cfa360" ON "documents_document" ("added");
CREATE INDEX "documents_document_document_type_id_1f88b50c" ON "documents_document" ("document_type_id");
CREATE INDEX "documents_document_storage_path_id_07d27bdb" ON "documents_document" ("storage_path_id");
CREATE INDEX "documents_document_owner_id_04d2b723" ON "documents_document" ("owner_id");
-- documents_documenttype definition
CREATE TABLE "documents_documenttype" ("id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "name" varchar(128) NOT NULL, "match" varchar(256) NOT NULL, "matching_algorithm" integer unsigned NOT NULL CHECK ("matching_algorithm" >= 0), "is_insensitive" bool NOT NULL, "owner_id" integer NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, CONSTRAINT "documents_documenttype_unique_name_owner" UNIQUE ("name", "owner_id"));
CREATE UNIQUE INDEX "documents_documenttype_name_uniq" ON "documents_documenttype" ("name") WHERE "owner_id" IS NULL;
CREATE INDEX "documents_documenttype_owner_id_a19f201d" ON "documents_documenttype" ("owner_id");

View File

@@ -0,0 +1,63 @@
The file I provided contains the key tables of paperless's SQLite database. We are now going to write its PAPERLESS_POST_CONSUME_SCRIPT. The requirements are:
1. The PDF files we provide are named {publish_date}_{report_type}_{org_sname}_{industry_name}_{stock_name}_{title}.pdf (see the parsing sketch after this prompt).
2. We extract each of the fields above, then:
   1) report_type maps to documents_documenttype.name, so query the documents_documenttype table; if no row with that name exists, insert one; then obtain the corresponding documents_documenttype.id.
   2) org_sname maps to documents_correspondent.name, so query the documents_correspondent table; if no row with that name exists, insert one; then obtain the corresponding documents_correspondent.id.
   3) Check whether the documents_customfield table contains the '行业' (industry) and '股票名称' (stock name) fields; create them if they are missing, then look up their documents_customfield.id values, recorded as stockname_id and industry_id.
3. Then we update the data tables:
   1) Update the matching documents_document row: created = publish_date, correspondent_id = documents_correspondent.id, document_type_id = documents_documenttype.id, title = {title}.
   2) Insert two rows into documents_customfieldinstance: (document_id, stockname_id, stock_name) and (document_id, industry_id, industry_name).
Now please complete this Python script based on the requirements above. Pay attention to error handling and log output. If a filename does not match the format above, ignore it and do nothing.
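As a quick reference, here is a minimal sketch of how such a filename splits into the fields above, using the same regex as the post-consume script later in this commit. The sample filename is made up for illustration only.
import re

# Same pattern as the post-consume script: date_reporttype_org_industry_stock_title.pdf
FILENAME_PATTERN = re.compile(r"(\d{4}-\d{2}-\d{2})_(.*?)_(.*?)_(.*?)_(.*?)_(.*)\.pdf")

# Hypothetical example filename, for illustration only.
name = "2024-01-05_IndustryReport_SomeBroker_Semiconductors_SomeStock_Quarterly outlook.pdf"
m = FILENAME_PATTERN.match(name)
if m:
    publish_date, report_type, org_sname, industry_name, stock_name, title = m.groups()
    print(publish_date, report_type, org_sname, industry_name, stock_name, title)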
Paperless makes use of the Django REST Framework standard API interface. It provides a browsable API for most of its endpoints, which you can inspect at http://<paperless-host>:<port>/api/. This also documents most of the available filters and ordering fields.
The API provides the following main endpoints:
/api/correspondents/: Full CRUD support.
/api/custom_fields/: Full CRUD support.
/api/documents/: Full CRUD support, except POSTing new documents. See below.
/api/document_types/: Full CRUD support.
/api/groups/: Full CRUD support.
/api/logs/: Read-Only.
/api/mail_accounts/: Full CRUD support.
/api/mail_rules/: Full CRUD support.
/api/profile/: GET, PATCH
/api/share_links/: Full CRUD support.
/api/storage_paths/: Full CRUD support.
/api/tags/: Full CRUD support.
/api/tasks/: Read-only.
/api/users/: Full CRUD support.
/api/workflows/: Full CRUD support.
/api/search/ GET, see below.
All of these endpoints except for the logging endpoint allow you to fetch (and edit and delete where appropriate) individual objects by appending their primary key to the path, e.g. /api/documents/454/.
The objects served by the document endpoint contain the following fields:
id: ID of the document. Read-only.
title: Title of the document.
content: Plain text content of the document.
tags: List of IDs of tags assigned to this document, or empty list.
document_type: Document type of this document, or null.
correspondent: Correspondent of this document or null.
created: The date time at which this document was created.
created_date: The date (YYYY-MM-DD) at which this document was created. Optional. If also passed with created, this is ignored.
modified: The date at which this document was last edited in paperless. Read-only.
added: The date at which this document was added to paperless. Read-only.
archive_serial_number: The identifier of this document in a physical document archive.
original_file_name: Verbose filename of the original document. Read-only.
archived_file_name: Verbose filename of the archived document. Read-only. Null if no archived document is available.
notes: Array of notes associated with the document.
page_count: Number of pages.
set_permissions: Allows setting document permissions. Optional, write-only. See below.
custom_fields: Array of custom fields & values, specified as { field: CUSTOM_FIELD_ID, value: VALUE }
The above is the API provided by paperless. We access it at http://localhost:8000. How should the corresponding Python code be written to query and update the document with id 19?
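One way to do this, sketched with requests against the endpoints documented above. The admin/admin credentials match the scripts in this commit; the custom-field id in the PATCH payload is a hypothetical placeholder.
import requests
from requests.auth import HTTPBasicAuth

BASE = "http://localhost:8000/api"
AUTH = HTTPBasicAuth("admin", "admin")  # credentials assumed, same as elsewhere in this commit

# Query document #19
resp = requests.get(f"{BASE}/documents/19/", auth=AUTH, timeout=5)
resp.raise_for_status()
doc = resp.json()
print(doc["title"], doc["document_type"], doc["correspondent"])

# Update selected fields of document #19
payload = {
    "title": "New title",
    "created_date": "2024-01-05",
    "custom_fields": [{"field": 1, "value": "example"}],  # field id 1 is hypothetical
}
resp = requests.patch(f"{BASE}/documents/19/", json=payload, auth=AUTH, timeout=5)
resp.raise_for_status()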

View File

@@ -0,0 +1,191 @@
#!/usr/bin/env python3
import os
import re
import sys
import time
import logging
import requests
import sqlite3
from requests.auth import HTTPBasicAuth
from requests.exceptions import RequestException

# Paperless server info
PAPERLESS_URL = "http://localhost:8000/api"
AUTH = HTTPBasicAuth("admin", "admin")  # Basic Auth credentials

# Logging configuration
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Connect to the SQLite database
#DB_PATH = "/usr/src/paperless/db/paperless.sqlite3"  # adjust to your actual database path
DB_PATH = "/usr/src/paperless/data/db.sqlite3"
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

enable_db = True

# Regex for parsing the filename
FILENAME_PATTERN = re.compile(r"(\d{4}-\d{2}-\d{2})_(.*?)_(.*?)_(.*?)_(.*?)_(.*)\.pdf")

# Look up an id directly from the database
def query_id_from_db(table, column, value):
    try:
        cursor.execute(f"SELECT id FROM {table} WHERE {column} = ?", (value,))
        row = cursor.fetchone()
        if row:
            logging.info(f'query {table} where {column}="{value}" get id={row[0]}')
            return row[0]
        else:
            return None
    except sqlite3.Error as e:
        logging.error(f"Error querying data: {e}")
        return None
# API request wrapper (with retries); supports GET, POST, etc.
def api_request(method, url, data=None, retries=5):
    for attempt in range(retries):
        try:
            response = requests.request(method, url, json=data, auth=AUTH, timeout=5)
            if response.status_code in [200, 201]:
                return response.json()
            elif response.status_code == 404:
                logging.warning(f"API resource not found: {method} {url}")
                return None
            else:
                logging.error(f"API request failed: {method} {url}, status: {response.status_code}, response: {response.text}")
        except RequestException as e:
            logging.error(f"API request exception: {method} {url}, error: {e}")
        if attempt < retries - 1:
            logging.warning(f"Request failed, retrying in 2 seconds ({attempt+1}/{retries})...")
            time.sleep(2)
    logging.error(f"API request ultimately failed: {method} {url}")
    return None

# Look up an id via the REST API
def query_id_from_rest_api(endpoint, name):
    url = f"{PAPERLESS_URL}/{endpoint}"
    method = 'GET'
    while url is not None:  # loop over all paginated results
        json_data = api_request(method, url, data=None, retries=5)
        if json_data:
            if "results" in json_data and isinstance(json_data["results"], list):
                # return as soon as a match is found
                for item in json_data.get('results', []):
                    if item["name"] == name:
                        return item["id"]
                # not found on this page, move to the next one
                url = json_data.get("next", None)
            else:
                logging.warning(f"API returned no data: {method} {url}")
                break
        else:
            logging.warning(f"API returned no data: {method} {url}")
            break
    return None
# Get or create a document type
def get_or_create_document_type(name):
    if enable_db:
        id = query_id_from_db("documents_documenttype", "name", name)
    else:
        id = query_id_from_rest_api("document_types/", name)
    if id:
        return id
    # Create a new document type
    new_doc_type = api_request("POST", f"{PAPERLESS_URL}/document_types/", {"name": name})
    return new_doc_type.get("id") if new_doc_type else None

# Get or create a correspondent (the issuing organization)
def get_or_create_correspondent(name):
    if enable_db:
        id = query_id_from_db("documents_correspondent", "name", name)
    else:
        id = query_id_from_rest_api("correspondents/", name)
    if id:
        return id
    # Create a new correspondent
    new_correspondent = api_request("POST", f"{PAPERLESS_URL}/correspondents/", {"name": name})
    return new_correspondent.get("id") if new_correspondent else None

# Get or create a custom field
def get_or_create_custom_field(name):
    if enable_db:
        id = query_id_from_db("documents_customfield", "name", name)
    else:
        id = query_id_from_rest_api("custom_fields/", name)
    if id:
        return id
    # Create a new custom field
    new_field = api_request("POST", f"{PAPERLESS_URL}/custom_fields/", {"name": name, "data_type": "string"})
    return new_field.get("id") if new_field else None

# Update the document's metadata
def update_document(document_id, title, publish_date, doc_type_id, correspondent_id, custom_fields):
    payload = {
        "title": title,
        "created_date": publish_date,
        "document_type": doc_type_id,
        "correspondent": correspondent_id,
        "custom_fields": custom_fields,
    }
    return api_request("PATCH", f"{PAPERLESS_URL}/documents/{document_id}/", payload) is not None
# Main entry point - parse the filename and update Paperless
def process_document():
    doc_id = os.getenv("DOCUMENT_ID")
    filename = os.getenv("DOCUMENT_ORIGINAL_FILENAME")
    # debug
    #doc_id = sys.argv[1]
    #filename = sys.argv[2]
    if not doc_id or not filename:
        logging.error("❌ Required environment variables are missing, aborting")
        return
    match = FILENAME_PATTERN.match(filename)
    if not match:
        logging.warning(f"⚠️ Filename does not match the expected format, skipping: {filename}")
        return
    publish_date, report_type, org_sname, industry_name, stock_name, title = match.groups()
    logging.info(f"✅ Parsed: date={publish_date}, type={report_type}, org={org_sname}, industry={industry_name}, stock={stock_name}, title={title}")

    # Get or create the document type & correspondent
    doc_type_id = get_or_create_document_type(report_type)
    correspondent_id = get_or_create_correspondent(org_sname)
    if not doc_type_id or not correspondent_id:
        logging.error("❌ Failed to create document type / correspondent")
        return

    # Get or create the custom fields
    industry_field_id = get_or_create_custom_field("行业")
    stock_field_id = get_or_create_custom_field("股票名称")
    if not industry_field_id or not stock_field_id:
        logging.error("❌ Failed to create custom fields")
        return

    # Assemble the custom field values
    custom_fields = [
        {"field": industry_field_id, "value": industry_name},
        {"field": stock_field_id, "value": stock_name}
    ]

    # Update the Paperless document
    success = update_document(doc_id, title, publish_date, doc_type_id, correspondent_id, custom_fields)
    if success:
        logging.info(f"✅ Document {doc_id} updated: {title}")
    else:
        logging.error(f"❌ Failed to update document {doc_id}")

if __name__ == "__main__":
    process_document()

View File

@@ -0,0 +1,64 @@
-------------------------------------------------------
------------------- paperless: PDF document management ------------
-------------------------------------------------------
## Better to create this with docker-compose.yml rather than a raw command, since the backend database and redis need to be specified
docker run -itd \
--name paperless \
--network devops \
--platform linux/x86_64 \
-e TZ="Asia/Shanghai" \
-v /etc/localtime:/etc/localtime:ro \
-v "$(pwd)/dockers/paperless/pdfs:/usr/src/paperless/data" \
-v "$(pwd)/dockers/paperless/db:/usr/src/paperless/db" \
-e USERMAP_UID=1000 -e USERMAP_GID=1000 \
-p 8000:8000 \
ghcr.io/paperless-ngx/paperless-ngx
# After the container is created, set the superuser password manually (pick one of the two commands below; currently set to admin / admin)
docker compose run --rm webserver createsuperuser
python3 manage.py createsuperuser
# For existing documents, drop them into the designated directory and let the system load them automatically (or start the consumer manually)
cd /path/to/paperless/src/
python3 manage.py document_consumer
# Automatic filename parsing
https://docs.paperless-ngx.com/advanced_usage/#file-name-handling
https://docs.paperless-ngx.com/configuration/#PAPERLESS_POST_CONSUME_SCRIPT
environment:
PAPERLESS_POST_CONSUME_SCRIPT: "/usr/src/paperless/scripts/parse_filename.py"
By default paperless does not delete duplicate files, so if a duplicate keeps being added it is scanned and loaded over and over and keeps erroring out. I did not find a config option for this, so I patched the source directly:
/usr/src/paperless/src/documents/consumer.py
def pre_check_duplicate(self):
    """
    Using the MD5 of the file, check this exact file doesn't already exist
    """
    with open(self.input_doc.original_file, "rb") as f:
        checksum = hashlib.md5(f.read()).hexdigest()
    existing_doc = Document.global_objects.filter(
        Q(checksum=checksum) | Q(archive_checksum=checksum),
    )
    if existing_doc.exists():
        msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
        log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})."
        if existing_doc.first().deleted_at is not None:
            msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
            log_msg += " Note: existing document is in the trash."
        ## Modified here so that duplicate files get deleted.
        if settings.CONSUMER_DELETE_DUPLICATES or True:
            os.unlink(self.input_doc.original_file)
        self._fail(
            msg,
            log_msg,
        )