From bd6e1b6ed8dcb504880d9a3c9b6cac11306daaf0 Mon Sep 17 00:00:00 2001
From: sophon
Date: Mon, 10 Nov 2025 15:22:45 +0800
Subject: [PATCH] modify scripts

---
 requirements.txt                              | 20 ++++
 ...0b2c66f54410_auto_update_from_resources.py | 70 +++++++++++++
 scrapy_proj/my_sqlalchemy/models/resources.py | 97 ++++++++++++++++++-
 .../my_sqlalchemy/scripts/sync_resources.sh   | 11 +++
 4 files changed, 196 insertions(+), 2 deletions(-)
 create mode 100644 requirements.txt
 create mode 100644 scrapy_proj/my_sqlalchemy/migrations/resources/versions/0b2c66f54410_auto_update_from_resources.py
 create mode 100755 scrapy_proj/my_sqlalchemy/scripts/sync_resources.sh

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e6f5809
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,20 @@
+# Common packages; usually already installed
+requests
+beautifulsoup4
+lxml
+pymysql
+cloudscraper
+scrapy
+sqlalchemy>=2.0.0
+alembic>=1.12.0
+sqlacodegen>=3.0.0
+
+# On the NAS, installing scrapy directly with pip fails because cryptography conflicts with the system package, so everything must be installed and run inside a virtual environment
+# apt update && apt install -y python3.12-venv   # install python3.12-venv; the version must match python3 --version
+# python3 -m venv ~/sharedata/pyenv              # create the virtual environment
+# source ~/sharedata/pyenv/bin/activate          # activate the virtual environment
+# pip install -r requirements.txt                # install the required packages
+
+# Activate the virtual environment before every run
+# source ~/sharedata/pyenv/bin/activate
+# export DB_ENV=nas && scrapy crawl u3c3 -a begin='2025-11-09' -a end='2025-11-10' -s STATS_PUSH_MSG=False
diff --git a/scrapy_proj/my_sqlalchemy/migrations/resources/versions/0b2c66f54410_auto_update_from_resources.py b/scrapy_proj/my_sqlalchemy/migrations/resources/versions/0b2c66f54410_auto_update_from_resources.py
new file mode 100644
index 0000000..5f8f2fd
--- /dev/null
+++ b/scrapy_proj/my_sqlalchemy/migrations/resources/versions/0b2c66f54410_auto_update_from_resources.py
@@ -0,0 +1,70 @@
+"""Auto update from resources
+
+Revision ID: 0b2c66f54410
+Revises: 758b3971a51e
+Create Date: 2025-11-10 15:21:58.323573
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = '0b2c66f54410'
+down_revision: Union[str, Sequence[str], None] = '758b3971a51e'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Upgrade schema."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table('clm_keywords',
+    sa.Column('id', sa.Integer(), autoincrement=True, nullable=False, comment='主键ID'),
+    sa.Column('words', sa.String(length=512), nullable=True, comment='关键词(唯一)'),
+    sa.Column('groups', sa.Text(), nullable=True, comment='关键词分组'),
+    sa.Column('tags', sa.Text(), nullable=True, comment='标签'),
+    sa.Column('index_count', sa.Integer(), nullable=True, comment='关联索引数量'),
+    sa.Column('created_at', sa.DateTime(), nullable=True, comment='创建时间(本地时间)'),
+    sa.Column('updated_at', sa.DateTime(), nullable=True, comment='更新时间(本地时间)'),
+    sa.PrimaryKeyConstraint('id'),
+    sa.UniqueConstraint('words')
+    )
+    op.create_table('sis',
+    sa.Column('id', sa.Integer(), autoincrement=True, nullable=False, comment='主键ID'),
+    sa.Column('plate_name', sa.Text(), nullable=True, comment='板块名称'),
+    sa.Column('title', sa.Text(), nullable=True, comment='标题'),
+    sa.Column('url', sa.String(length=512), nullable=True, comment='资源链接(唯一)'),
+    sa.Column('size_text', sa.Text(), nullable=True, comment='大小文本描述'),
+    sa.Column('size_gb', sa.Float(), nullable=True, comment='大小(GB)'),
+    sa.Column('update_date', sa.Text(), nullable=True, comment='更新日期'),
+    sa.Column('created_at', sa.DateTime(), nullable=True, comment='创建时间(本地时间)'),
+    sa.Column('updated_at', sa.DateTime(), nullable=True, comment='更新时间(本地时间)'),
+    sa.PrimaryKeyConstraint('id'),
+    sa.UniqueConstraint('url')
+    )
+    op.create_table('clm_keywords_index',
+    sa.Column('id', sa.Integer(), autoincrement=True, nullable=False, comment='主键ID'),
+    sa.Column('words_id', sa.Integer(), nullable=True, comment='关键词ID(外键)'),
+    sa.Column('index_id', sa.Integer(), nullable=True, comment='索引ID(外键)'),
+    sa.Column('wid_iid', sa.String(length=255), nullable=True, comment='关键词与索引的关联标识'),
+    sa.Column('tags', sa.Text(), nullable=True, comment='关联标签'),
+    sa.Column('created_at', sa.DateTime(), nullable=True, comment='创建时间(本地时间)'),
+    sa.Column('updated_at', sa.DateTime(), nullable=True, comment='更新时间(本地时间)'),
+    sa.ForeignKeyConstraint(['index_id'], ['clm_index.id'], ),
+    sa.ForeignKeyConstraint(['words_id'], ['clm_keywords.id'], ),
+    sa.PrimaryKeyConstraint('id'),
+    sa.UniqueConstraint('wid_iid')
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    """Downgrade schema."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table('clm_keywords_index')
+    op.drop_table('sis')
+    op.drop_table('clm_keywords')
+    # ### end Alembic commands ###
diff --git a/scrapy_proj/my_sqlalchemy/models/resources.py b/scrapy_proj/my_sqlalchemy/models/resources.py
index eec6e4e..b336aa1 100644
--- a/scrapy_proj/my_sqlalchemy/models/resources.py
+++ b/scrapy_proj/my_sqlalchemy/models/resources.py
@@ -1,5 +1,7 @@
-from sqlalchemy import Column, Integer, Text, String, Float, DateTime, func
+from sqlalchemy import Column, Integer, Text, String, Float, DateTime, ForeignKey, func
 from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import relationship, Mapped
+from typing import List, Optional
 
 # 基础模型基类(如果已有全局 Base 可直接复用)
 ResourceBase = declarative_base()
@@ -26,4 +28,95 @@ class U3C3(ResourceBase):
 
     def __repr__(self):
         """打印实例时显示的信息"""
-        return f""
\ No newline at end of file
+        return f""
+
+class Sis(ResourceBase):
+    """ORM model mapped to the MySQL table ``sis``."""
+    __tablename__ = "sis"  # same name as the existing table
+
+    # Columns (mirror the existing table structure exactly)
+    id = Column(Integer, primary_key=True, autoincrement=True, comment="主键ID")
+    plate_name = Column(Text, comment="板块名称")
+    title = Column(Text, comment="标题")
+    url = Column(String(512), unique=True, comment="资源链接(唯一)")
+    size_text = Column(Text, comment="大小文本描述")
+    size_gb = Column(Float, comment="大小(GB)")
+    update_date = Column(Text, comment="更新日期")
+    # Note: on MySQL prefer func.now() over func.datetime(...) for better compatibility
+    created_at = Column(DateTime, default=func.now(), comment="创建时间(本地时间)")
+    updated_at = Column(DateTime, default=func.now(), onupdate=func.now(), comment="更新时间(本地时间)")
+
+    def __repr__(self):
+        return f"Sis(id={self.id!r}, title={self.title!r}, url={self.url!r})"
+
+
+class ClmIndex(ResourceBase):
+    """ORM model mapped to the MySQL table ``clm_index``."""
+    __tablename__ = "clm_index"  # same name as the existing table
+
+    # Columns (mirror the existing table structure exactly)
+    id = Column(Integer, primary_key=True, autoincrement=True, comment="主键ID")
+    category = Column(Text, comment="分类")
+    title = Column(Text, comment="标题")
+    href = Column(String(512), unique=True, comment="资源链接(唯一)")
+    magnet_href = Column(Text, comment="磁力链接")
+    size_text = Column(Text, comment="大小文本描述")
+    size_gb = Column(Float, comment="大小(GB)")
+    heat = Column(Integer, default=0, comment="热度")
+    add_date = Column(Text, comment="添加日期")
+    last_down_date = Column(Text, comment="最后下载日期")
+    created_at = Column(DateTime, default=func.now(), comment="创建时间(本地时间)")
+    updated_at = Column(DateTime, default=func.now(), onupdate=func.now(), comment="更新时间(本地时间)")
+
+    # Relationship: annotated as Mapped[List["ClmKeywordsIndex"]] rather than a bare List["ClmKeywordsIndex"]
+    clm_keywords_index: Mapped[List["ClmKeywordsIndex"]] = relationship(
+        "ClmKeywordsIndex", back_populates="index"
+    )
+
+
+    def __repr__(self):
+        return f"ClmIndex(id={self.id!r}, title={self.title!r}, href={self.href!r})"
+
+
+class ClmKeywords(ResourceBase):
+    """ORM model mapped to the MySQL table ``clm_keywords``."""
+    __tablename__ = "clm_keywords"  # same name as the existing table
+
+    # Columns (mirror the existing table structure exactly)
+    id = Column(Integer, primary_key=True, autoincrement=True, comment="主键ID")
+    words = Column(String(512), unique=True, comment="关键词(唯一)")
+    groups = Column(Text, comment="关键词分组")
+    tags = Column(Text, comment="标签")
+    index_count = Column(Integer, default=0, comment="关联索引数量")
+    created_at = Column(DateTime, default=func.now(), comment="创建时间(本地时间)")
+    updated_at = Column(DateTime, default=func.now(), onupdate=func.now(), comment="更新时间(本地时间)")
+
+    # Relationship: annotated as Mapped[List["ClmKeywordsIndex"]] rather than a bare List["ClmKeywordsIndex"]
+    clm_keywords_index: Mapped[List["ClmKeywordsIndex"]] = relationship(
+        "ClmKeywordsIndex", back_populates="words"
+    )
+
+    def __repr__(self):
+        return f"ClmKeywords(id={self.id!r}, words={self.words!r})"
+
+
+
+class ClmKeywordsIndex(ResourceBase):
+    """ORM model mapped to the MySQL table ``clm_keywords_index`` (association table)."""
+    __tablename__ = "clm_keywords_index"  # same name as the existing table
+
+    # Columns (mirror the existing table structure exactly)
+    id = Column(Integer, primary_key=True, autoincrement=True, comment="主键ID")
+    words_id = Column(Integer, ForeignKey("clm_keywords.id"), comment="关键词ID(外键)")
+    index_id = Column(Integer, ForeignKey("clm_index.id"), comment="索引ID(外键)")
+    wid_iid = Column(String(255), unique=True, comment="关键词与索引的关联标识")
+    tags = Column(Text, comment="关联标签")
+    created_at = Column(DateTime, default=func.now(), comment="创建时间(本地时间)")
+    updated_at = Column(DateTime, default=func.now(), onupdate=func.now(), comment="更新时间(本地时间)")
+
+    # Relationships: the single related object type is wrapped in Mapped
+    index: Mapped["ClmIndex"] = relationship("ClmIndex", back_populates="clm_keywords_index")
+    words: Mapped["ClmKeywords"] = relationship("ClmKeywords", back_populates="clm_keywords_index")
+
+    def __repr__(self):
+        return f"ClmKeywordsIndex(id={self.id!r}, words_id={self.words_id!r}, index_id={self.index_id!r})"
diff --git a/scrapy_proj/my_sqlalchemy/scripts/sync_resources.sh b/scrapy_proj/my_sqlalchemy/scripts/sync_resources.sh
new file mode 100755
index 0000000..befd32a
--- /dev/null
+++ b/scrapy_proj/my_sqlalchemy/scripts/sync_resources.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Autogenerate an Alembic revision for the resources models, then upgrade to head.
+# Abort on the first failure so a failed autogenerate is never followed by an upgrade or a success message.
+set -euo pipefail
+
+cd "$(dirname "$0")/.."
+
+alembic -c migrations/resources/alembic.ini revision --autogenerate -m "Auto update from resources"
+alembic -c migrations/resources/alembic.ini upgrade head
+
+echo "数据库 scrapy 同步完成"