From 1582c140bf84a1b825a7cb38993dd389c7a15929 Mon Sep 17 00:00:00 2001
From: sophon
Date: Mon, 10 Nov 2025 11:35:44 +0800
Subject: [PATCH] modify scripts

---
 .../{sqlalchemy => my_sqlalchemy}/alembic.ini |   0
 .../migrations/resources}/README              |   0
 .../migrations/resources/alembic.ini          | 146 ++++++++++++++++
 .../my_sqlalchemy/migrations/resources/env.py |  85 +++++++++
 .../migrations/resources}/script.py.mako      |   0
 .../resources/versions/758b3971a51e_init.py   |  47 +++++
 .../migrations/scrapy}/README                 |   0
 .../migrations/scrapy/alembic.ini             |   0
 .../migrations/scrapy/env.py                  |   0
 .../migrations/scrapy}/script.py.mako         |   0
 .../migrations/shared_comm}/README            |   0
 .../migrations/shared_comm/alembic.ini        |   0
 .../migrations/shared_comm/env.py             |   0
 .../migrations/shared_comm}/script.py.mako    |   0
 ...8c67f16b5e_auto_update_from_shared_comm.py |   0
 .../my_sqlalchemy/migrations/testdb/README    |   1 +
 .../migrations/testdb/alembic.ini             |   0
 .../migrations/testdb/env.py                  |   0
 .../migrations/testdb/script.py.mako          |  28 +++
 .../854378c8e332_initial_test_schema.py       |   0
 .../86eea10972c0_initial_test_schema.py       |   0
 .../bce23e0d0c3a_initial_test_schema.py       |   0
 scrapy_proj/my_sqlalchemy/models/__init__.py  |   1 +
 scrapy_proj/my_sqlalchemy/models/resources.py |  29 ++++
 .../models/scrapy.py                          |   0
 .../models/shared.py                          |   0
 .../{sqlalchemy => my_sqlalchemy}/readme.txt  |   0
 .../requirements.txt                          |   0
 .../scripts/reset_testdb.sh                   |   0
 .../scripts/sync_scrapy.sh                    |   0
 .../scripts/sync_shared_comm.sh               |   0
 .../scrapy_proj/db_wapper/mysql_handler.py    | 164 ++++++++++++++++++
 .../db_wapper/spider_db_handler.py            |   2 +-
 scrapy_proj/scrapy_proj/items.py              |   1 +
 scrapy_proj/scrapy_proj/pipelines.py          |  48 ++++-
 scrapy_proj/scrapy_proj/settings.py           |   1 +
 .../spiders/parser/javbus_parser.py           |   5 +-
 .../scrapy_proj/spiders/u3c3_spider.py        |  11 +-
 38 files changed, 562 insertions(+), 7 deletions(-)
 rename scrapy_proj/{sqlalchemy => my_sqlalchemy}/alembic.ini (100%)
 rename scrapy_proj/{sqlalchemy/migrations/scrapy => my_sqlalchemy/migrations/resources}/README (100%)
 create mode 100644 scrapy_proj/my_sqlalchemy/migrations/resources/alembic.ini
 create mode 100644 scrapy_proj/my_sqlalchemy/migrations/resources/env.py
 rename scrapy_proj/{sqlalchemy/migrations/scrapy => my_sqlalchemy/migrations/resources}/script.py.mako (100%)
 create mode 100644 scrapy_proj/my_sqlalchemy/migrations/resources/versions/758b3971a51e_init.py
 rename scrapy_proj/{sqlalchemy/migrations/shared_comm => my_sqlalchemy/migrations/scrapy}/README (100%)
 rename scrapy_proj/{sqlalchemy => my_sqlalchemy}/migrations/scrapy/alembic.ini (100%)
 rename scrapy_proj/{sqlalchemy => my_sqlalchemy}/migrations/scrapy/env.py (100%)
 rename scrapy_proj/{sqlalchemy/migrations/shared_comm => my_sqlalchemy/migrations/scrapy}/script.py.mako (100%)
 rename scrapy_proj/{sqlalchemy/migrations/testdb => my_sqlalchemy/migrations/shared_comm}/README (100%)
 rename scrapy_proj/{sqlalchemy => my_sqlalchemy}/migrations/shared_comm/alembic.ini (100%)
 rename scrapy_proj/{sqlalchemy => my_sqlalchemy}/migrations/shared_comm/env.py (100%)
 rename scrapy_proj/{sqlalchemy/migrations/testdb => my_sqlalchemy/migrations/shared_comm}/script.py.mako (100%)
 rename scrapy_proj/{sqlalchemy => my_sqlalchemy}/migrations/shared_comm/versions/098c67f16b5e_auto_update_from_shared_comm.py (100%)
 create mode 100644 scrapy_proj/my_sqlalchemy/migrations/testdb/README
 rename scrapy_proj/{sqlalchemy => my_sqlalchemy}/migrations/testdb/alembic.ini (100%)
 rename scrapy_proj/{sqlalchemy => my_sqlalchemy}/migrations/testdb/env.py (100%)
 create mode 100644 scrapy_proj/my_sqlalchemy/migrations/testdb/script.py.mako
 rename scrapy_proj/{sqlalchemy => my_sqlalchemy}/migrations/testdb/versions/854378c8e332_initial_test_schema.py (100%)
 rename scrapy_proj/{sqlalchemy => my_sqlalchemy}/migrations/testdb/versions/86eea10972c0_initial_test_schema.py (100%)
 rename scrapy_proj/{sqlalchemy => my_sqlalchemy}/migrations/testdb/versions/bce23e0d0c3a_initial_test_schema.py (100%)
 create mode 100644 scrapy_proj/my_sqlalchemy/models/__init__.py
 create mode 100644 scrapy_proj/my_sqlalchemy/models/resources.py
 rename scrapy_proj/{sqlalchemy => my_sqlalchemy}/models/scrapy.py (100%)
 rename scrapy_proj/{sqlalchemy => my_sqlalchemy}/models/shared.py (100%)
 rename scrapy_proj/{sqlalchemy => my_sqlalchemy}/readme.txt (100%)
 rename scrapy_proj/{sqlalchemy => my_sqlalchemy}/requirements.txt (100%)
 rename scrapy_proj/{sqlalchemy => my_sqlalchemy}/scripts/reset_testdb.sh (100%)
 rename scrapy_proj/{sqlalchemy => my_sqlalchemy}/scripts/sync_scrapy.sh (100%)
 rename scrapy_proj/{sqlalchemy => my_sqlalchemy}/scripts/sync_shared_comm.sh (100%)
 create mode 100644 scrapy_proj/scrapy_proj/db_wapper/mysql_handler.py

diff --git a/scrapy_proj/sqlalchemy/alembic.ini b/scrapy_proj/my_sqlalchemy/alembic.ini
similarity index 100%
rename from scrapy_proj/sqlalchemy/alembic.ini
rename to scrapy_proj/my_sqlalchemy/alembic.ini
diff --git a/scrapy_proj/sqlalchemy/migrations/scrapy/README b/scrapy_proj/my_sqlalchemy/migrations/resources/README
similarity index 100%
rename from scrapy_proj/sqlalchemy/migrations/scrapy/README
rename to scrapy_proj/my_sqlalchemy/migrations/resources/README
diff --git a/scrapy_proj/my_sqlalchemy/migrations/resources/alembic.ini b/scrapy_proj/my_sqlalchemy/migrations/resources/alembic.ini
new file mode 100644
index 0000000..565d7e6
--- /dev/null
+++ b/scrapy_proj/my_sqlalchemy/migrations/resources/alembic.ini
@@ -0,0 +1,146 @@
+# A generic, single database configuration.
+
+[alembic]
+# path to migration scripts.
+# this is typically a path given in POSIX (e.g. forward slashes)
+# format, relative to the token %(here)s which refers to the location of this
+# ini file
+script_location = %(here)s
+
+# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
+# Uncomment the line below if you want the files to be prepended with date and time
+# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
+# for all available tokens
+# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
+
+# sys.path path, will be prepended to sys.path if present.
+# defaults to the current working directory. for multiple paths, the path separator
+# is defined by "path_separator" below.
+prepend_sys_path = .
+
+
+# timezone to use when rendering the date within the migration file
+# as well as the filename.
+# If specified, requires the python>=3.9 or backports.zoneinfo library and tzdata library.
+# Any required deps can be installed by adding `alembic[tz]` to the pip requirements
+# string value is passed to ZoneInfo()
+# leave blank for localtime
+# timezone =
+
+# max length of characters to apply to the "slug" field
+# truncate_slug_length = 40
+
+# set to 'true' to run the environment during
+# the 'revision' command, regardless of autogenerate
+# revision_environment = false
+
+# set to 'true' to allow .pyc and .pyo files without
+# a source .py file to be detected as revisions in the
+# versions/ directory
+# sourceless = false
+
+# version location specification; This defaults
+# to <script_location>/versions. When using multiple version
+# directories, initial revisions must be specified with --version-path.
+# The path separator used here should be the separator specified by "path_separator"
+# below.
+# version_locations = %(here)s/bar:%(here)s/bat:%(here)s/alembic/versions
+
+# path_separator; This indicates what character is used to split lists of file
+# paths, including version_locations and prepend_sys_path within configparser
+# files such as alembic.ini.
+# The default rendered in new alembic.ini files is "os", which uses os.pathsep
+# to provide os-dependent path splitting.
+#
+# Note that in order to support legacy alembic.ini files, this default does NOT
+# take place if path_separator is not present in alembic.ini. If this
+# option is omitted entirely, fallback logic is as follows:
+#
+# 1. Parsing of the version_locations option falls back to using the legacy
+#    "version_path_separator" key, which if absent then falls back to the legacy
+#    behavior of splitting on spaces and/or commas.
+# 2. Parsing of the prepend_sys_path option falls back to the legacy
+#    behavior of splitting on spaces, commas, or colons.
+#
+# Valid values for path_separator are:
+#
+# path_separator = :
+# path_separator = ;
+# path_separator = space
+# path_separator = newline
+#
+# Use os.pathsep. Default configuration used for new projects.
+path_separator = os
+
+# set to 'true' to search source files recursively
+# in each "version_locations" directory
+# new in Alembic version 1.10
+# recursive_version_locations = false
+
+# the output encoding used when revision files
+# are written from script.py.mako
+# output_encoding = utf-8
+
+# database URL. This is consumed by the user-maintained env.py script only.
+# other means of configuring database URLs may be customized within the env.py
+# file.
+sqlalchemy.url = driver://user:pass@localhost/dbname
+
+[post_write_hooks]
+# post_write_hooks defines scripts or Python functions that are run
+# on newly generated revision scripts. See the documentation for further
+# detail and examples
+
+# format using "black" - use the console_scripts runner, against the "black" entrypoint
+# hooks = black
+# black.type = console_scripts
+# black.entrypoint = black
+# black.options = -l 79 REVISION_SCRIPT_FILENAME
+
+# lint with attempts to fix using "ruff" - use the module runner, against the "ruff" module
+# hooks = ruff
+# ruff.type = module
+# ruff.module = ruff
+# ruff.options = check --fix REVISION_SCRIPT_FILENAME
+
+# Alternatively, use the exec runner to execute a binary found on your PATH
+# hooks = ruff
+# ruff.type = exec
+# ruff.executable = ruff
+# ruff.options = check --fix REVISION_SCRIPT_FILENAME
+
+# Logging configuration. This is also consumed by the user-maintained
+# env.py script only.
+[loggers]
+keys = root,sqlalchemy,alembic
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = WARNING
+handlers = console
+qualname =
+
+[logger_sqlalchemy]
+level = WARNING
+handlers =
+qualname = sqlalchemy.engine
+
+[logger_alembic]
+level = INFO
+handlers =
+qualname = alembic
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = NOTSET
+formatter = generic
+
+[formatter_generic]
+format = %(levelname)-5.5s [%(name)s] %(message)s
+datefmt = %H:%M:%S
diff --git a/scrapy_proj/my_sqlalchemy/migrations/resources/env.py b/scrapy_proj/my_sqlalchemy/migrations/resources/env.py
new file mode 100644
index 0000000..3c4ce4a
--- /dev/null
+++ b/scrapy_proj/my_sqlalchemy/migrations/resources/env.py
@@ -0,0 +1,85 @@
+from logging.config import fileConfig
+
+from sqlalchemy import engine_from_config
+from sqlalchemy import pool
+
+from alembic import context
+
+# this is the Alembic Config object, which provides
+# access to the values within the .ini file in use.
+config = context.config
+
+# Interpret the config file for Python logging.
+# This line sets up loggers basically.
+if config.config_file_name is not None:
+    fileConfig(config.config_file_name)
+
+# add your model's MetaData object here
+# for 'autogenerate' support
+# from myapp import mymodel
+# target_metadata = mymodel.Base.metadata
+target_metadata = None
+
+# other values from the config, defined by the needs of env.py,
+# can be acquired:
+# my_important_option = config.get_main_option("my_important_option")
+# ... etc.
+
+
+def run_migrations_offline() -> None:
+    """Run migrations in 'offline' mode.
+
+    This configures the context with just a URL
+    and not an Engine, though an Engine is acceptable
+    here as well. By skipping the Engine creation
+    we don't even need a DBAPI to be available.
+
+    Calls to context.execute() here emit the given string to the
+    script output.
+
+    """
+    url = config.get_main_option("sqlalchemy.url")
+    context.configure(
+        url=url,
+        target_metadata=target_metadata,
+        literal_binds=True,
+        dialect_opts={"paramstyle": "named"},
+    )
+
+    with context.begin_transaction():
+        context.run_migrations()
+
+
+from sqlalchemy import create_engine
+# import the Base of the resources database (it carries all table definitions)
+from models.resources import ResourceBase
+target_metadata = ResourceBase.metadata  # bind the metadata of the MySQL models
+
+
+def run_migrations_online() -> None:
+    """Run migrations in 'online' mode.
+
+    In this scenario we need to create an Engine
+    and associate a connection with the context.
+
+    """
+    #ini_section = config.get_section_option(config.config_ini_section, "ini_section", fallback="dev")
+    #url = config.get_section_option(ini_section, "sqlalchemy.url")
+    #connectable = create_engine(url)
+    url = "mysql+pymysql://root:mysqlpw@testdb:3306/resources"
+    connectable = create_engine(url)
+
+    with connectable.connect() as connection:
+        context.configure(
+            connection=connection, target_metadata=target_metadata
+        )
+
+        with context.begin_transaction():
+            context.run_migrations()
+
+
+if context.is_offline_mode():
+    run_migrations_offline()
+else:
+    run_migrations_online()
diff --git a/scrapy_proj/sqlalchemy/migrations/scrapy/script.py.mako b/scrapy_proj/my_sqlalchemy/migrations/resources/script.py.mako
similarity index 100%
rename from scrapy_proj/sqlalchemy/migrations/scrapy/script.py.mako
rename to scrapy_proj/my_sqlalchemy/migrations/resources/script.py.mako
diff --git a/scrapy_proj/my_sqlalchemy/migrations/resources/versions/758b3971a51e_init.py b/scrapy_proj/my_sqlalchemy/migrations/resources/versions/758b3971a51e_init.py
new file mode 100644
index 0000000..32e71ab
--- /dev/null
+++ b/scrapy_proj/my_sqlalchemy/migrations/resources/versions/758b3971a51e_init.py
@@ -0,0 +1,47 @@
+"""init
+
+Revision ID: 758b3971a51e
+Revises:
+Create Date: 2025-11-10 10:01:19.228932
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = '758b3971a51e'
+down_revision: Union[str, Sequence[str], None] = None
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Upgrade schema."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table('u3c3',
+    sa.Column('id', sa.Integer(), autoincrement=True, nullable=False, comment='Primary key ID'),
+    sa.Column('sites', sa.Text(), nullable=True, comment='Site name'),
+    sa.Column('category', sa.Text(), nullable=True, comment='Category'),
+    sa.Column('title', sa.Text(), nullable=True, comment='Title'),
+    sa.Column('url', sa.String(length=512), nullable=True, comment='Resource URL (unique)'),
+    sa.Column('torrent_url', sa.Text(), nullable=True, comment='Torrent URL'),
+    sa.Column('magnet_url', sa.Text(), nullable=True, comment='Magnet URL'),
+    sa.Column('size_text', sa.Text(), nullable=True, comment='Size as text'),
+    sa.Column('size_gb', sa.Float(), nullable=True, comment='Size in GB'),
+    sa.Column('update_date', sa.Text(), nullable=True, comment='Update date'),
+    sa.Column('created_at', sa.DateTime(), nullable=True, comment='Created at (local time)'),
+    sa.Column('updated_at', sa.DateTime(), nullable=True, comment='Updated at (local time)'),
+    sa.PrimaryKeyConstraint('id'),
+    sa.UniqueConstraint('url')
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    """Downgrade schema."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table('u3c3')
+    # ### end Alembic commands ###
diff --git a/scrapy_proj/sqlalchemy/migrations/shared_comm/README b/scrapy_proj/my_sqlalchemy/migrations/scrapy/README
similarity index 100%
rename from scrapy_proj/sqlalchemy/migrations/shared_comm/README
rename to scrapy_proj/my_sqlalchemy/migrations/scrapy/README
diff --git a/scrapy_proj/sqlalchemy/migrations/scrapy/alembic.ini b/scrapy_proj/my_sqlalchemy/migrations/scrapy/alembic.ini
similarity index 100%
rename from scrapy_proj/sqlalchemy/migrations/scrapy/alembic.ini
rename to scrapy_proj/my_sqlalchemy/migrations/scrapy/alembic.ini
diff --git a/scrapy_proj/sqlalchemy/migrations/scrapy/env.py b/scrapy_proj/my_sqlalchemy/migrations/scrapy/env.py
similarity index 100%
rename from scrapy_proj/sqlalchemy/migrations/scrapy/env.py
rename to scrapy_proj/my_sqlalchemy/migrations/scrapy/env.py
diff --git a/scrapy_proj/sqlalchemy/migrations/shared_comm/script.py.mako b/scrapy_proj/my_sqlalchemy/migrations/scrapy/script.py.mako
similarity index 100%
rename from scrapy_proj/sqlalchemy/migrations/shared_comm/script.py.mako
rename to scrapy_proj/my_sqlalchemy/migrations/scrapy/script.py.mako
diff --git a/scrapy_proj/sqlalchemy/migrations/testdb/README b/scrapy_proj/my_sqlalchemy/migrations/shared_comm/README
similarity index 100%
rename from scrapy_proj/sqlalchemy/migrations/testdb/README
rename to scrapy_proj/my_sqlalchemy/migrations/shared_comm/README
diff --git a/scrapy_proj/sqlalchemy/migrations/shared_comm/alembic.ini b/scrapy_proj/my_sqlalchemy/migrations/shared_comm/alembic.ini
similarity index 100%
rename from scrapy_proj/sqlalchemy/migrations/shared_comm/alembic.ini
rename to scrapy_proj/my_sqlalchemy/migrations/shared_comm/alembic.ini
diff --git a/scrapy_proj/sqlalchemy/migrations/shared_comm/env.py b/scrapy_proj/my_sqlalchemy/migrations/shared_comm/env.py
similarity index 100%
rename from scrapy_proj/sqlalchemy/migrations/shared_comm/env.py
rename to scrapy_proj/my_sqlalchemy/migrations/shared_comm/env.py
diff --git a/scrapy_proj/sqlalchemy/migrations/testdb/script.py.mako b/scrapy_proj/my_sqlalchemy/migrations/shared_comm/script.py.mako
similarity index 100%
rename from scrapy_proj/sqlalchemy/migrations/testdb/script.py.mako
rename to scrapy_proj/my_sqlalchemy/migrations/shared_comm/script.py.mako
diff --git a/scrapy_proj/sqlalchemy/migrations/shared_comm/versions/098c67f16b5e_auto_update_from_shared_comm.py b/scrapy_proj/my_sqlalchemy/migrations/shared_comm/versions/098c67f16b5e_auto_update_from_shared_comm.py
similarity index 100%
rename from scrapy_proj/sqlalchemy/migrations/shared_comm/versions/098c67f16b5e_auto_update_from_shared_comm.py
rename to scrapy_proj/my_sqlalchemy/migrations/shared_comm/versions/098c67f16b5e_auto_update_from_shared_comm.py
diff --git a/scrapy_proj/my_sqlalchemy/migrations/testdb/README b/scrapy_proj/my_sqlalchemy/migrations/testdb/README
new file mode 100644
index 0000000..98e4f9c
--- /dev/null
+++ b/scrapy_proj/my_sqlalchemy/migrations/testdb/README
@@ -0,0 +1 @@
+Generic single-database configuration.
\ No newline at end of file
diff --git a/scrapy_proj/sqlalchemy/migrations/testdb/alembic.ini b/scrapy_proj/my_sqlalchemy/migrations/testdb/alembic.ini
similarity index 100%
rename from scrapy_proj/sqlalchemy/migrations/testdb/alembic.ini
rename to scrapy_proj/my_sqlalchemy/migrations/testdb/alembic.ini
diff --git a/scrapy_proj/sqlalchemy/migrations/testdb/env.py b/scrapy_proj/my_sqlalchemy/migrations/testdb/env.py
similarity index 100%
rename from scrapy_proj/sqlalchemy/migrations/testdb/env.py
rename to scrapy_proj/my_sqlalchemy/migrations/testdb/env.py
diff --git a/scrapy_proj/my_sqlalchemy/migrations/testdb/script.py.mako b/scrapy_proj/my_sqlalchemy/migrations/testdb/script.py.mako
new file mode 100644
index 0000000..1101630
--- /dev/null
+++ b/scrapy_proj/my_sqlalchemy/migrations/testdb/script.py.mako
@@ -0,0 +1,28 @@
+"""${message}
+
+Revision ID: ${up_revision}
+Revises: ${down_revision | comma,n}
+Create Date: ${create_date}
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+${imports if imports else ""}
+
+# revision identifiers, used by Alembic.
+revision: str = ${repr(up_revision)}
+down_revision: Union[str, Sequence[str], None] = ${repr(down_revision)}
+branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
+depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
+
+
+def upgrade() -> None:
+    """Upgrade schema."""
+    ${upgrades if upgrades else "pass"}
+
+
+def downgrade() -> None:
+    """Downgrade schema."""
+    ${downgrades if downgrades else "pass"}
diff --git a/scrapy_proj/sqlalchemy/migrations/testdb/versions/854378c8e332_initial_test_schema.py b/scrapy_proj/my_sqlalchemy/migrations/testdb/versions/854378c8e332_initial_test_schema.py
similarity index 100%
rename from scrapy_proj/sqlalchemy/migrations/testdb/versions/854378c8e332_initial_test_schema.py
rename to scrapy_proj/my_sqlalchemy/migrations/testdb/versions/854378c8e332_initial_test_schema.py
diff --git a/scrapy_proj/sqlalchemy/migrations/testdb/versions/86eea10972c0_initial_test_schema.py b/scrapy_proj/my_sqlalchemy/migrations/testdb/versions/86eea10972c0_initial_test_schema.py
similarity index 100%
rename from scrapy_proj/sqlalchemy/migrations/testdb/versions/86eea10972c0_initial_test_schema.py
rename to scrapy_proj/my_sqlalchemy/migrations/testdb/versions/86eea10972c0_initial_test_schema.py
diff --git a/scrapy_proj/sqlalchemy/migrations/testdb/versions/bce23e0d0c3a_initial_test_schema.py b/scrapy_proj/my_sqlalchemy/migrations/testdb/versions/bce23e0d0c3a_initial_test_schema.py
similarity index 100%
rename from scrapy_proj/sqlalchemy/migrations/testdb/versions/bce23e0d0c3a_initial_test_schema.py
rename to scrapy_proj/my_sqlalchemy/migrations/testdb/versions/bce23e0d0c3a_initial_test_schema.py
diff --git a/scrapy_proj/my_sqlalchemy/models/__init__.py b/scrapy_proj/my_sqlalchemy/models/__init__.py
new file mode 100644
index 0000000..5c61674
--- /dev/null
+++ b/scrapy_proj/my_sqlalchemy/models/__init__.py
@@ -0,0 +1 @@
+from .resources import ResourceBase, U3C3
\ No newline at end of file
diff --git a/scrapy_proj/my_sqlalchemy/models/resources.py b/scrapy_proj/my_sqlalchemy/models/resources.py
new file mode 100644
index 0000000..eec6e4e
--- /dev/null
+++ b/scrapy_proj/my_sqlalchemy/models/resources.py
@@ -0,0 +1,29 @@
+from sqlalchemy import Column, Integer, Text, String, Float, DateTime, func
+from sqlalchemy.ext.declarative import declarative_base
+
+# declarative base for the resources DB (reuse an existing global Base here if there is one)
+ResourceBase = declarative_base()
+
+class U3C3(ResourceBase):
+    """Maps the u3c3 table that previously lived in SQLite"""
+    __tablename__ = "u3c3"  # keep the same table name as the original table
+
+    # column definitions (strict mapping of the original schema)
+    id = Column(Integer, primary_key=True, autoincrement=True, comment="Primary key ID")
+    sites = Column(Text, comment="Site name")
+    category = Column(Text, comment="Category")
+    title = Column(Text, comment="Title")
+    # key change: String needs an explicit length on MySQL (512 here; adjust to real URL lengths)
+    url = Column(String(512), unique=True, comment="Resource URL (unique)")
+    torrent_url = Column(Text, comment="Torrent URL")
+    magnet_url = Column(Text, comment="Magnet URL")
+    size_text = Column(Text, comment="Size as text")
+    size_gb = Column(Float, comment="Size in GB")
+    update_date = Column(Text, comment="Update date")
+    # note: on MySQL prefer func.now() over func.datetime(...) for better compatibility
+    created_at = Column(DateTime, default=func.now(), comment="Created at (local time)")
+    updated_at = Column(DateTime, default=func.now(), onupdate=func.now(), comment="Updated at (local time)")
+
+    def __repr__(self):
+        """string shown when printing an instance"""
+        return f"<U3C3(id={self.id}, title={self.title}, url={self.url})>"
\ No newline at end of file
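
For a quick smoke test of the model outside Scrapy, a sketch along these lines should work; the DSN mirrors the one hard-coded in env.py and the import path assumes the package layout above:

    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker
    from my_sqlalchemy.models.resources import ResourceBase, U3C3

    engine = create_engine("mysql+pymysql://root:mysqlpw@testdb:3306/resources?charset=utf8mb4")
    ResourceBase.metadata.create_all(engine)  # or apply the Alembic revision instead
    Session = sessionmaker(bind=engine)

    with Session() as session:
        session.add(U3C3(sites="u3c3", title="demo", url="https://example.com/1"))
        session.commit()
        print(session.query(U3C3).count())
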
diff --git a/scrapy_proj/sqlalchemy/models/scrapy.py b/scrapy_proj/my_sqlalchemy/models/scrapy.py
similarity index 100%
rename from scrapy_proj/sqlalchemy/models/scrapy.py
rename to scrapy_proj/my_sqlalchemy/models/scrapy.py
diff --git a/scrapy_proj/sqlalchemy/models/shared.py b/scrapy_proj/my_sqlalchemy/models/shared.py
similarity index 100%
rename from scrapy_proj/sqlalchemy/models/shared.py
rename to scrapy_proj/my_sqlalchemy/models/shared.py
diff --git a/scrapy_proj/sqlalchemy/readme.txt b/scrapy_proj/my_sqlalchemy/readme.txt
similarity index 100%
rename from scrapy_proj/sqlalchemy/readme.txt
rename to scrapy_proj/my_sqlalchemy/readme.txt
diff --git a/scrapy_proj/sqlalchemy/requirements.txt b/scrapy_proj/my_sqlalchemy/requirements.txt
similarity index 100%
rename from scrapy_proj/sqlalchemy/requirements.txt
rename to scrapy_proj/my_sqlalchemy/requirements.txt
diff --git a/scrapy_proj/sqlalchemy/scripts/reset_testdb.sh b/scrapy_proj/my_sqlalchemy/scripts/reset_testdb.sh
similarity index 100%
rename from scrapy_proj/sqlalchemy/scripts/reset_testdb.sh
rename to scrapy_proj/my_sqlalchemy/scripts/reset_testdb.sh
diff --git a/scrapy_proj/sqlalchemy/scripts/sync_scrapy.sh b/scrapy_proj/my_sqlalchemy/scripts/sync_scrapy.sh
similarity index 100%
rename from scrapy_proj/sqlalchemy/scripts/sync_scrapy.sh
rename to scrapy_proj/my_sqlalchemy/scripts/sync_scrapy.sh
diff --git a/scrapy_proj/sqlalchemy/scripts/sync_shared_comm.sh b/scrapy_proj/my_sqlalchemy/scripts/sync_shared_comm.sh
similarity index 100%
rename from scrapy_proj/sqlalchemy/scripts/sync_shared_comm.sh
rename to scrapy_proj/my_sqlalchemy/scripts/sync_shared_comm.sh
diff --git a/scrapy_proj/scrapy_proj/db_wapper/mysql_handler.py b/scrapy_proj/scrapy_proj/db_wapper/mysql_handler.py
new file mode 100644
index 0000000..dbbab26
--- /dev/null
+++ b/scrapy_proj/scrapy_proj/db_wapper/mysql_handler.py
@@ -0,0 +1,164 @@
+import os
+import logging
+import scrapy_proj.comm.comm_def as comm
+from my_sqlalchemy.models.resources import U3C3, ResourceBase
+
+from typing import List, Dict, Optional
+from sqlalchemy import create_engine, func, exists
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy.exc import SQLAlchemyError
+
+# handler registry
+mysql_handler_registry = {}
+
+# singleton metaclass
+class SingletonMeta(type):
+    _instances = {}  # one instance per class
+
+    def __call__(cls, *args, **kwargs):
+        # create the instance only if it does not exist yet
+        if cls not in cls._instances:
+            cls._instances[cls] = super().__call__(*args, **kwargs)
+        return cls._instances[cls]
+
+# MySQL handler base class (built on SQLAlchemy)
+class BaseMysqlHandler(metaclass=SingletonMeta):
+    def __init__(self, db_url: Optional[str] = None):
+        # default MySQL DSN (reading it from an env var or config file is more flexible)
+        self.db_url = db_url or os.getenv(
+            "MYSQL_RESOURCES_URL",
+            "mysql+pymysql://root:mysqlpw@testdb:3306/resources?charset=utf8mb4"
+        )
+        # set up the engine and session factory
+        self.engine = create_engine(self.db_url)
+        self.Session = sessionmaker(bind=self.engine)
+        # make sure the tables exist (use Alembic migrations in production; this is only a fallback)
+        ResourceBase.metadata.create_all(self.engine)
+        self.logger = logging.getLogger(__name__)
+
+    def get_session(self):
+        """Get a DB session (close it manually or use it as a context manager)"""
+        return self.Session()
+
+    def insert_or_update_common(self, item: dict, tbl_model, uniq_key: str, exists_do_nothing: bool = True):
+        """
+        Generic insert-or-update (tolerates non-DB fields without raising)
+        :param item: spider item (dict-like)
+        :param tbl_model: SQLAlchemy model class (e.g. U3C3)
+        :param uniq_key: unique key column name (e.g. 'url')
+        :param exists_do_nothing: skip if the key already exists (True), otherwise update (False)
+        """
+        session = self.get_session()
+        try:
+            # 1. collect the model's real columns (ignore fields the table does not have)
+            model_fields = [col.name for col in tbl_model.__table__.columns]
+            # 2. make sure the unique key is present and valid
+            uniq_value = item.get(uniq_key)
+            if not uniq_value or uniq_key not in model_fields:
+                self.logger.warning(f"unique key {uniq_key} missing or invalid, skipping item: {item}")
+                return
+
+            # 3. does a row with this unique key already exist?
+            exists_query = session.query(
+                exists().where(getattr(tbl_model, uniq_key) == uniq_value)
+            ).scalar()
+
+            if exists_query:
+                if exists_do_nothing:
+                    self.logger.debug(f"unique key {uniq_key}={uniq_value} already exists, insert ignored")
+                    return
+                # 4. row exists: update it (keep only non-unique-key fields that exist on the model)
+                update_data = {
+                    k: v for k, v in item.items()
+                    if k != uniq_key and k in model_fields  # double filter: drop the unique key and non-DB fields
+                }
+                if not update_data:
+                    self.logger.debug(f"no valid fields to update, skipping: {item}")
+                    return
+                session.query(tbl_model).filter(
+                    getattr(tbl_model, uniq_key) == uniq_value
+                ).update(update_data)
+                self.logger.debug(f"updated row: {update_data}")
+            else:
+                # 5. row does not exist: insert it (keep only fields that exist on the model)
+                valid_item = {k: v for k, v in item.items() if k in model_fields}
+                new_record = tbl_model(**valid_item)
+                session.add(new_record)
+                self.logger.debug(f"inserted new row: {valid_item}")
+
+            session.commit()
+
+        except SQLAlchemyError as e:
+            session.rollback()
+            self.logger.error(f"database operation failed: {str(e)}, item: {item}")
+        finally:
+            session.close()
+
+    def generic_stats_query(self, stats_config: List[Dict]) -> Dict:
+        """
+        Generic count/statistics query
+        :param stats_config: list like [{'table': ModelClass, 'alias': 'result key'}]
+        :return: dict of statistics
+        """
+        session = self.get_session()
+        try:
+            result = {}
+            for config in stats_config:
+                tbl_model = config['table']  # a SQLAlchemy model class (e.g. U3C3)
+                alias = config['alias']      # key under which the count is reported
+                # plain row count (adjust per model if soft deletes are involved)
+                count = session.query(func.count(tbl_model.id)).scalar()
+                result[alias] = count
+            return result
+        except SQLAlchemyError as e:
+            self.logger.error(f"stats query failed: {str(e)}")
+            return {}
+        finally:
+            session.close()
+
+
+
+def register_handler(spider_name):
+    def decorator(cls):
+        mysql_handler_registry[spider_name.lower()] = cls
+        return cls
+    return decorator
+
+@register_handler(comm.SPIDER_NAME_U3C3)
+class U3C3MysqlHandler(BaseMysqlHandler):
+    def __init__(self, db_url: Optional[str] = None):
+        super().__init__(db_url)
+
+    def insert_item(self, item):
+        self.insert_or_update_common(item, tbl_model=U3C3, uniq_key='url', exists_do_nothing=True)
+
+    # statistics
+    def get_stat(self):
+        stats_config = [
+            {'table': U3C3, 'alias': 'cnt'},
+        ]
+        return self.generic_stats_query(stats_config)
+
+
+if __name__ == "__main__":
+    item = {
+        'sites': "u3c3",
+        'category': 'test',
+        'title': 'test',
+        'url': 'test',
+        'torrent_url': 'test',
+        'magnet_url': 'test',
+        'size_text': 'test',
+        'size_gb': 2,
+        'update_date': 'test'
+    }
+
+    testdb = U3C3MysqlHandler()
+    testdb.insert_item(item)
\ No newline at end of file
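
insert_or_update_common does a SELECT-then-INSERT, which can race if two writers target the same url at once. Should that ever matter, MySQL's native upsert is reachable through SQLAlchemy's dialect-specific insert; a sketch against the same U3C3 model (an alternative, not what the handler above does):

    from sqlalchemy.dialects.mysql import insert
    from my_sqlalchemy.models.resources import U3C3

    def upsert_u3c3(session, item: dict):
        # INSERT ... ON DUPLICATE KEY UPDATE, keyed on the unique url column
        stmt = insert(U3C3).values(**item)
        stmt = stmt.on_duplicate_key_update(
            **{k: stmt.inserted[k] for k in item if k != "url"}
        )
        session.execute(stmt)
        session.commit()
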
diff --git a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
index eeed01f..9e79035 100644
--- a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
+++ b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
@@ -35,7 +35,7 @@ class SisDBHandler(SQLiteDBHandler):
         ]
         return self.generic_stats_query(stats_config)
 
-@register_handler(comm.SPIDER_NAME_U3C3)
+#@register_handler(comm.SPIDER_NAME_U3C3)
 class U3C3DBHandler(SQLiteDBHandler):
     def __init__(self, db_path=default_dbpath):
         super().__init__(db_path)
diff --git a/scrapy_proj/scrapy_proj/items.py b/scrapy_proj/scrapy_proj/items.py
index fb22d57..a748e47 100644
--- a/scrapy_proj/scrapy_proj/items.py
+++ b/scrapy_proj/scrapy_proj/items.py
@@ -19,6 +19,7 @@ class U001Item(scrapy.Item):
     category = scrapy.Field()
     title = scrapy.Field()
     url = scrapy.Field()
+    sites = scrapy.Field()
     torrent_url = scrapy.Field()
     magnet_url = scrapy.Field()
     size_text = scrapy.Field()
diff --git a/scrapy_proj/scrapy_proj/pipelines.py b/scrapy_proj/scrapy_proj/pipelines.py
index 825d693..5fbd5f2 100644
--- a/scrapy_proj/scrapy_proj/pipelines.py
+++ b/scrapy_proj/scrapy_proj/pipelines.py
@@ -13,6 +13,7 @@ import json
 import scrapy
 from scrapy_proj.items import U001Item, Sis001Item, PBoxStuItem
 from scrapy_proj.db_wapper.spider_db_handler import spider_handler_registry, U3C3DBHandler, SisDBHandler, IAFDDBHandler, PboxDBHandler
+from scrapy_proj.db_wapper.mysql_handler import mysql_handler_registry, U3C3MysqlHandler
 
 class SQLitePipeline():
     def __init__(self):
@@ -22,7 +23,9 @@ class SQLitePipeline():
         spider_name = spider.name.lower()
         handler_class = spider_handler_registry.get(spider_name)
         if not handler_class:
-            raise ValueError(f"no DB handler class registered for spider {spider_name}")
+            #raise ValueError(f"no DB handler class registered for spider {spider_name}")
+            spider.logger.warning(f"no SQLite handler class registered for spider {spider_name}, skipping DB writes")
+            return
         self.db_handlers[spider_name] = handler_class()
 
@@ -42,7 +45,48 @@ class SQLitePipeline():
         #spider.logger.debug(f"spider name: {spider_name}, item: {item_json}")
 
         if not handler:
-            raise ValueError(f"no DB handler found for spider {spider_name}")
+            spider.logger.debug(f"no DB handler found for spider {spider_name}, skipping DB writes")
+            return item
+            #raise ValueError(f"no DB handler found for spider {spider_name}")
 
         handler.insert_item(item)
+        return item
+
+
+class MysqlPipeline():
+    def __init__(self):
+        self.db_handlers = {}
+
+    def open_spider(self, spider):
+        spider_name = spider.name.lower()
+        handler_class = mysql_handler_registry.get(spider_name)
+        if not handler_class:
+            #raise ValueError(f"no DB handler class registered for spider {spider_name}")
+            spider.logger.warning(f"no MySQL handler class registered for spider {spider_name}, skipping DB writes")
+            return
+        self.db_handlers[spider_name] = handler_class()
+
+
+    def close_spider(self, spider):
+        spider_name = spider.name.lower()
+        handler = self.db_handlers.pop(spider_name, None)
+        if handler:
+            pass
+            #handler.close()  # not closed here; the stats middleware closes it
+
+    def process_item(self, item, spider):
+        spider_name = spider.name.lower()
+        handler = self.db_handlers.get(spider_name)
+
+        # single-line JSON dump, for debugging
+        #item_json = json.dumps(dict(item), ensure_ascii=False, separators=(',', ':'))
+        #spider.logger.debug(f"spider name: {spider_name}, item: {item_json}")
+
+        if not handler:
+            spider.logger.warning(f"no DB handler found for spider {spider_name}, skipping DB writes")
+            return item
+            #raise ValueError(f"no DB handler found for spider {spider_name}")
+
+        handler.insert_item(item)
+        return item
diff --git a/scrapy_proj/scrapy_proj/settings.py b/scrapy_proj/scrapy_proj/settings.py
index 0779b79..b98fec3 100644
--- a/scrapy_proj/scrapy_proj/settings.py
+++ b/scrapy_proj/scrapy_proj/settings.py
@@ -40,6 +40,7 @@ DOWNLOAD_DELAY = 0.3
 # enable item pipelines
 ITEM_PIPELINES = {
     'scrapy_proj.pipelines.SQLitePipeline': 300,
+    'scrapy_proj.pipelines.MysqlPipeline': 400,
 }
 
 # user-agent pool
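
Scrapy calls these pipelines in ascending priority order, so SQLitePipeline (300) sees every item before MysqlPipeline (400); the chain only works because each process_item returns the item rather than raising. The contract in miniature (the DropItem branch is illustrative, not part of this patch):

    from scrapy.exceptions import DropItem

    class PassThroughPipeline:
        def process_item(self, item, spider):
            if not item.get("url"):
                raise DropItem("item without url")  # later pipelines never see it
            return item  # hands the item to the next pipeline in priority order
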
diff --git a/scrapy_proj/scrapy_proj/spiders/parser/javbus_parser.py b/scrapy_proj/scrapy_proj/spiders/parser/javbus_parser.py
index 61d2305..205bdc0 100644
--- a/scrapy_proj/scrapy_proj/spiders/parser/javbus_parser.py
+++ b/scrapy_proj/scrapy_proj/spiders/parser/javbus_parser.py
@@ -475,7 +475,8 @@ class JavbusCrawler(GenericCrawler):
             'duration': ['長度:', '长度:', 'Length:', '収録時間:'],
             'studio': ['製作商:', '制作商:', 'Studio:', 'メーカー:'],
             'label': ['發行商:', '发行商:', 'Label:', 'レーベル:'],
-            'series': ['系列:', 'Series:', 'シリーズ:']
+            'series': ['系列:', 'Series:', 'シリーズ:'],
+            'director': ['導演:', '导演:', 'Director:', '監督:']
         }
 
         # walk all p tags looking for the target info
@@ -491,7 +492,7 @@
 
             if target_key:
                 # extract the value (handles both plain text and links)
-                if target_key in ['studio', 'label', 'series']:
+                if target_key in ['studio', 'label', 'series', 'director']:
                     # fields whose value lives inside a link
                     a_tag = p.find('a')
                     if a_tag:
diff --git a/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py b/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py
index 8da4789..31fe0dc 100644
--- a/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py
+++ b/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py
@@ -9,7 +9,7 @@ class U001Spider(BaseSpider):
     name = SPIDER_NAME_U3C3
     allowed_domains = ["u001.25img.com", 'u9a9.com']
     start_urls = ["https://u001.25img.com/?p=1", 'https://u9a9.com/?type=2&p=1']
-    #start_urls = ['https://u9a9.com/?type=2&p=1']
+    #start_urls = ['https://u001.25img.com/?p=1']
 
     def __init__(self, debug='False', begin=None, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -23,7 +23,13 @@
         need_next = False
         for row in response.css('table.torrent-list tbody tr'):
             item = U001Item()
-            item['category'] = row.css('td:nth-child(1) a::attr(title)').get()
+            #item['category'] = row.css('td:nth-child(1) a::attr(title)').get()
+            # prefer the a tag's title; fall back to the img tag's alt when it is missing
+            item['category'] = row.css(
+                'td:nth-child(1) a::attr(title), '  # case 1: title on the a tag
+                'td:nth-child(1) img::attr(alt)'    # case 2: alt on the img tag (the comma means "or")
+            ).get()
+
             item['title'] = extract_text_from_element(row.css('td:nth-child(2) a'), use_title=True)
             item['url'] = response.urljoin(row.css('td:nth-child(2) a::attr(href)').get())
@@ -36,6 +42,7 @@
             item['size_gb'] = parse_size(size_text)
 
             item['update_date'] = row.css('td:nth-child(5)::text').get(default='').strip()
+            item['sites'] = 'u9a9' if "u9a9" in response.url else 'u3c3'
 
             # decide whether to keep paging: stop only when every item on the page is older than the begin date
             up_date = parse_date_to_datetime(item['update_date'])
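
The comma in that CSS selector is a standard selector group: both branches are evaluated and .get() returns the first match in document order, which covers rows whose first cell only has an img. Verifiable with parsel directly (hypothetical HTML, just to show the fallback):

    from parsel import Selector

    html = '<table><tr><td><img alt="Movies"></td></tr></table>'
    sel = Selector(text=html)
    print(sel.css('td a::attr(title), td img::attr(alt)').get())  # -> 'Movies'
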