From 02334a800520648131a96e7373e8198be20304a1 Mon Sep 17 00:00:00 2001 From: oscarz Date: Sat, 5 Jul 2025 16:47:46 +0800 Subject: [PATCH] modify scripts --- scrapy_proj/scrapy_proj/comm/comm_def.py | 17 +++ .../{iafd_query.py => spider_db_handler.py} | 73 +++++++++++- .../scrapy_proj/db_wapper/sqlite_base.py | 7 ++ scrapy_proj/scrapy_proj/items.py | 7 ++ scrapy_proj/scrapy_proj/pipelines.py | 112 +++++------------- .../scrapy_proj/spiders/iafd_spider.py | 7 +- scrapy_proj/scrapy_proj/spiders/sis_spider.py | 7 +- .../scrapy_proj/spiders/u3c3_spider.py | 7 +- 8 files changed, 144 insertions(+), 93 deletions(-) create mode 100644 scrapy_proj/scrapy_proj/comm/comm_def.py rename scrapy_proj/scrapy_proj/db_wapper/{iafd_query.py => spider_db_handler.py} (61%) diff --git a/scrapy_proj/scrapy_proj/comm/comm_def.py b/scrapy_proj/scrapy_proj/comm/comm_def.py new file mode 100644 index 0000000..e5cc83b --- /dev/null +++ b/scrapy_proj/scrapy_proj/comm/comm_def.py @@ -0,0 +1,17 @@ +# +# +# +# +# +# + + +SPIDER_NAME_SIS = 'sis' +SPIDER_NAME_U3C3 = 'u3c3' +SPIDER_NAME_IAFD = 'iafd' + +ITEM_TYPE_LIST = 'list' +ITEM_TYPE_MOVIE_INDEX = 'movie_index' +ITEM_TYPE_ACTOR_INDEX = 'actor_index' +ITEM_TYPE_MOVIE_DETAIL = 'movie_detail' +ITEM_TYPE_ACTOR_DETAIL = 'actor_detail' \ No newline at end of file diff --git a/scrapy_proj/scrapy_proj/db_wapper/iafd_query.py b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py similarity index 61% rename from scrapy_proj/scrapy_proj/db_wapper/iafd_query.py rename to scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py index 06f7508..fa9371d 100644 --- a/scrapy_proj/scrapy_proj/db_wapper/iafd_query.py +++ b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py @@ -2,16 +2,85 @@ import os import sqlite3 import logging from datetime import datetime -from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, shared_db_path +from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, default_dbpath, shared_db_path +import 
scrapy_proj.comm.comm_def as comm
+
+# 注册器字典
+spider_handler_registry = {}
+
+def register_handler(spider_name):
+    def decorator(cls):
+        spider_handler_registry[spider_name.lower()] = cls
+        return cls
+    return decorator
+
+@register_handler(comm.SPIDER_NAME_SIS)
+class SisDBHandler(SQLiteDBHandler):
+    def __init__(self, db_path=default_dbpath):
+        super().__init__(db_path)
+        self.tbl_name_sis = 'sis'
+
+    def insert_item(self, item):
+        self.insert_or_update_common(item, tbl_name=self.tbl_name_sis, uniq_key='url', exists_do_nothing=True)
+
+    def _create_tables(self):
+        # 创建 sis001 数据表
+        self.cursor.execute(f'''
+            CREATE TABLE IF NOT EXISTS {self.tbl_name_sis} (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                plate_name TEXT,
+                title TEXT,
+                url TEXT UNIQUE,
+                size_text TEXT,
+                size_gb REAL,
+                update_date TEXT,
+                created_at TEXT DEFAULT (datetime('now', 'localtime')),
+                updated_at TEXT DEFAULT (datetime('now', 'localtime'))
+            )
+        ''')
+        self.conn.commit()
 
-class IAFDQuery(SQLiteDBHandler):
+@register_handler(comm.SPIDER_NAME_U3C3)
+class U3C3DBHandler(SQLiteDBHandler):
+    def __init__(self, db_path=default_dbpath):
+        super().__init__(db_path)
+        self.tbl_name_u3c3 = 'u3c3'
+
+    def insert_item(self, item):
+        self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url', exists_do_nothing=True)
+
+    def _create_tables(self):
+        # 创建 u001 数据表
+        self.cursor.execute(f'''
+            CREATE TABLE IF NOT EXISTS {self.tbl_name_u3c3} (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                category TEXT,
+                title TEXT,
+                url TEXT UNIQUE,
+                torrent_url TEXT,
+                magnet_url TEXT,
+                size_text TEXT,
+                size_gb REAL,
+                update_date TEXT,
+                created_at TEXT DEFAULT (datetime('now', 'localtime')),
+                updated_at TEXT DEFAULT (datetime('now', 'localtime'))
+            )
+        ''')
+        self.conn.commit()
+
+
+@register_handler(comm.SPIDER_NAME_IAFD)
+class IAFDDBHandler(SQLiteDBHandler):
     def __init__(self, db_path=shared_db_path):
         super().__init__(db_path)
         self.tbl_name_performers = 'iafd_performers'
         self.tbl_name_movies =
'iafd_movies' self.uniq_key = 'href' + def insert_item(self, item): + pass + # 按条件查询 href 列表 def get_performers(self, **filters): try: diff --git a/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py b/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py index b349e91..8b69417 100644 --- a/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py +++ b/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py @@ -31,6 +31,13 @@ class SQLiteDBHandler: if sqlite_version < (3, 24, 0): self.lower_sqlite_version = True + def _create_tables(self): + pass + + # 接口函数,必须在各个子类中实现 + def insert_item(self, item): + raise NotImplementedError("子类必须实现 insert_item 方法") + def get_table_columns_and_defaults(self, tbl_name): try: self.cursor.execute(f"PRAGMA table_info({tbl_name})") diff --git a/scrapy_proj/scrapy_proj/items.py b/scrapy_proj/scrapy_proj/items.py index 02da5ff..bf83ae7 100644 --- a/scrapy_proj/scrapy_proj/items.py +++ b/scrapy_proj/scrapy_proj/items.py @@ -5,9 +5,11 @@ # items.py import scrapy +import scrapy_proj.comm.comm_def as comm # u3c3.in class U001Item(scrapy.Item): + item_tpye = comm.ITEM_TYPE_LIST category = scrapy.Field() title = scrapy.Field() url = scrapy.Field() @@ -19,6 +21,7 @@ class U001Item(scrapy.Item): # sis001.com class Sis001Item(scrapy.Item): + item_tpye = comm.ITEM_TYPE_LIST title = scrapy.Field() url = scrapy.Field() plate_name = scrapy.Field() @@ -27,6 +30,7 @@ class Sis001Item(scrapy.Item): update_date = scrapy.Field() class IAFDPersonItem(scrapy.Item): + item_tpye = comm.ITEM_TYPE_ACTOR_INDEX name = scrapy.Field() href = scrapy.Field() from_astro_list = scrapy.Field() @@ -35,6 +39,7 @@ class IAFDPersonItem(scrapy.Item): from_movie_list = scrapy.Field() class IAFDMovieItem(scrapy.Item): + item_tpye = comm.ITEM_TYPE_MOVIE_INDEX title = scrapy.Field() href = scrapy.Field() release_year = scrapy.Field() @@ -43,6 +48,7 @@ class IAFDMovieItem(scrapy.Item): from_stu_list = scrapy.Field() class IAFDPersonDetailItem(scrapy.Item): + item_tpye = comm.ITEM_TYPE_ACTOR_DETAIL href 
= scrapy.Field() person = scrapy.Field() gender = scrapy.Field() @@ -67,6 +73,7 @@ class IAFDPersonDetailItem(scrapy.Item): performer_aka = scrapy.Field() class IAFDMovieDetailItem(scrapy.Item): + item_tpye = comm.ITEM_TYPE_MOVIE_DETAIL title = scrapy.Field() href = scrapy.Field() # 可以根据实际需求添加更多影片详情字段 \ No newline at end of file diff --git a/scrapy_proj/scrapy_proj/pipelines.py b/scrapy_proj/scrapy_proj/pipelines.py index 5bd9a8a..4f19112 100644 --- a/scrapy_proj/scrapy_proj/pipelines.py +++ b/scrapy_proj/scrapy_proj/pipelines.py @@ -9,91 +9,39 @@ #class ScrapyProjPipeline: # def process_item(self, item, spider): # return item - - -import os -import sqlite3 +import json import scrapy -import logging -from datetime import datetime from scrapy_proj.items import U001Item, Sis001Item, IAFDPersonItem, IAFDPersonDetailItem, IAFDMovieItem, IAFDMovieDetailItem -from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler +from scrapy_proj.db_wapper.spider_db_handler import spider_handler_registry, U3C3DBHandler, SisDBHandler, IAFDDBHandler -class SQLitePipeline(SQLiteDBHandler): - def __init__(self, db_path=None): - super().__init__(db_path) - self.tbl_name_u3c3 = 'u3c3' - self.tbl_name_sis = 'sis' - self._create_tables() - - def _create_tables(self): - # 创建 u001 数据表 - self.cursor.execute(f''' - CREATE TABLE IF NOT EXISTS {self.tbl_name_u3c3} ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - category TEXT, - title TEXT, - url TEXT UNIQUE, - torrent_url TEXT, - magnet_url TEXT, - size_text TEXT, - size_gb REAL, - update_date TEXT, - created_at TEXT DEFAULT (datetime('now', 'localtime')), - updated_at TEXT DEFAULT (datetime('now', 'localtime')) - ) - ''') +class SQLitePipeline(): + def __init__(self): + self.db_handlers = {} + + def open_spider(self, spider): + spider_name = spider.name.lower() + handler_class = spider_handler_registry.get(spider_name) + if not handler_class: + raise ValueError(f"未注册 Spider {spider_name} 的数据库处理类") + self.db_handlers[spider_name] = handler_class() 
- # 创建 sis001 数据表 - self.cursor.execute(f''' - CREATE TABLE IF NOT EXISTS {self.tbl_name_sis} ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - plate_name TEXT, - title TEXT, - url TEXT UNIQUE, - size_text TEXT, - size_gb REAL, - update_date TEXT, - created_at TEXT DEFAULT (datetime('now', 'localtime')), - updated_at TEXT DEFAULT (datetime('now', 'localtime')) - ) - ''') - self.conn.commit() - def process_item(self, item, spider): - if isinstance(item, U001Item): - self._process_u001_item(item, spider) - elif isinstance(item, Sis001Item): - self._process_sis001_item(item, spider) - elif isinstance(item, IAFDPersonItem): - self._process_iafd_person_item(item, spider) - elif isinstance(item, IAFDPersonDetailItem): - self._process_iafd_person_detail_item(item, spider) - elif isinstance(item, IAFDMovieItem): - self._process_iafd_movie_item(item, spider) - elif isinstance(item, IAFDMovieDetailItem): - self._process_iafd_movie_detail_item(item, spider) - return item - - def _process_u001_item(self, item, spider): - spider.logger.debug(f"insert one item. href:{spider.name}") - return self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url', exists_do_nothing=True) - - def _process_sis001_item(self, item, spider): - spider.logger.debug(f"insert one item. href:{spider.name}") - return self.insert_or_update_common(item, tbl_name=self.tbl_name_sis, uniq_key='url', exists_do_nothing=True) - - def _process_iafd_person_item(self, item, spider): - spider.logger.debug(f"deal with persion item. {item}") - - def _process_iafd_movie_item(self, item, spider): - spider.logger.debug(f"deal with movie item. {item}") - - def _process_iafd_person_detail_item(self, item, spider): - spider.logger.debug(f"deal with persion item. {item}") - - def _process_iafd_movie_detail_item(self, item, spider): - spider.logger.debug(f"deal with movie item. 
{item}")
-
     def close_spider(self, spider):
-        self.conn.close()
\ No newline at end of file
+        spider_name = spider.name.lower()
+        handler = self.db_handlers.pop(spider_name, None)
+        if handler:
+            handler.close()
+
+    def process_item(self, item, spider):
+        spider_name = spider.name.lower()
+        handler = self.db_handlers.get(spider_name)
+
+        if not handler:
+            raise ValueError(f"未找到 Spider {spider_name} 的数据库处理器")
+
+        # 转换为单行JSON格式
+        item_json = json.dumps(dict(item), ensure_ascii=False, separators=(',', ':'))
+        spider.logger.debug(f"spider name: {spider_name}, item: {item_json}")
+
+        handler.insert_item(item)
+        return item
diff --git a/scrapy_proj/scrapy_proj/spiders/iafd_spider.py b/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
index e3113ef..0041194 100644
--- a/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
+++ b/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
@@ -2,12 +2,13 @@ import scrapy
 import re
 from scrapy_proj.spiders.base_spider import BaseSpider
 from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
-from scrapy_proj.db_wapper.iafd_query import IAFDQuery
+from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler
+from scrapy_proj.comm.comm_def import SPIDER_NAME_IAFD
 
-db_tools = IAFDQuery()
+db_tools = IAFDDBHandler()
 
 class IAFDSpider(BaseSpider):
-    name = "iafd"
+    name = SPIDER_NAME_IAFD
     allowed_domains = ["iafd.com"]
     host_url = "https://www.iafd.com"
 
diff --git a/scrapy_proj/scrapy_proj/spiders/sis_spider.py b/scrapy_proj/scrapy_proj/spiders/sis_spider.py
index d019102..0a7d15e 100644
--- a/scrapy_proj/scrapy_proj/spiders/sis_spider.py
+++ b/scrapy_proj/scrapy_proj/spiders/sis_spider.py
@@ -4,9 +4,10 @@ from urllib.parse import urljoin
 from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
 from scrapy_proj.items import Sis001Item
 from scrapy_proj.utils.utils import parse_size_format, parse_date_to_datetime
+from scrapy_proj.comm.comm_def import SPIDER_NAME_SIS
 
 class
Sis001Spider(BaseSpider): - name = "sis" + name = SPIDER_NAME_SIS allowed_domains = ["sis001.com"] def __init__(self, debug='False', begin=None, *args, **kwargs): @@ -111,9 +112,9 @@ class Sis001Spider(BaseSpider): # 判断是否还要翻页,只有满足所有页面的数据,日期均小于开始日期时,停止翻页 up_date = parse_date_to_datetime(item['update_date']) - self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}") + #self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}") if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()) : - self.logger.debug(f"find early data.") + pass else: need_next = True diff --git a/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py b/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py index 7268032..a839887 100644 --- a/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py +++ b/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py @@ -3,9 +3,10 @@ import scrapy from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element from scrapy_proj.items import U001Item from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime +from scrapy_proj.comm.comm_def import SPIDER_NAME_U3C3 class U001Spider(BaseSpider): - name = "u3c3" + name = SPIDER_NAME_U3C3 allowed_domains = ["u001.25img.com"] start_urls = ["https://u001.25img.com/?p=1"] @@ -37,9 +38,9 @@ class U001Spider(BaseSpider): # 判断是否还要翻页,只有满足所有页面的数据,日期均小于开始日期时,停止翻页 up_date = parse_date_to_datetime(item['update_date']) - self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}") + #self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}") if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()): - self.logger.debug(f"find early data.") + pass else: need_next = True yield item