From 02334a800520648131a96e7373e8198be20304a1 Mon Sep 17 00:00:00 2001 From: oscarz Date: Sat, 5 Jul 2025 16:47:46 +0800 Subject: [PATCH] modify scripts --- scrapy_proj/scrapy_proj/comm/comm_def.py | 17 +++ .../{iafd_query.py => spider_db_handler.py} | 73 +++++++++++- .../scrapy_proj/db_wapper/sqlite_base.py | 7 ++ scrapy_proj/scrapy_proj/items.py | 7 ++ scrapy_proj/scrapy_proj/pipelines.py | 112 +++++------------- .../scrapy_proj/spiders/iafd_spider.py | 7 +- scrapy_proj/scrapy_proj/spiders/sis_spider.py | 7 +- .../scrapy_proj/spiders/u3c3_spider.py | 7 +- 8 files changed, 144 insertions(+), 93 deletions(-) create mode 100644 scrapy_proj/scrapy_proj/comm/comm_def.py rename scrapy_proj/scrapy_proj/db_wapper/{iafd_query.py => spider_db_handler.py} (61%) diff --git a/scrapy_proj/scrapy_proj/comm/comm_def.py b/scrapy_proj/scrapy_proj/comm/comm_def.py new file mode 100644 index 0000000..e5cc83b --- /dev/null +++ b/scrapy_proj/scrapy_proj/comm/comm_def.py @@ -0,0 +1,17 @@ +# +# +# +# +# +# + + +SPIDER_NAME_SIS = 'sis' +SPIDER_NAME_U3C3 = 'u3c3' +SPIDER_NAME_IAFD = 'iafd' + +ITEM_TYPE_LIST = 'list' +ITEM_TYPE_MOVIE_INDEX = 'movie_index' +ITEM_TYPE_ACTOR_INDEX = 'actor_index' +ITEM_TYPE_MOVIE_DETAIL = 'movie_detail' +ITEM_TYPE_ACTOR_DETAIL = 'actor_detail' \ No newline at end of file diff --git a/scrapy_proj/scrapy_proj/db_wapper/iafd_query.py b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py similarity index 61% rename from scrapy_proj/scrapy_proj/db_wapper/iafd_query.py rename to scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py index 06f7508..fa9371d 100644 --- a/scrapy_proj/scrapy_proj/db_wapper/iafd_query.py +++ b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py @@ -2,16 +2,85 @@ import os import sqlite3 import logging from datetime import datetime -from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, shared_db_path +from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler, default_dbpath, shared_db_path +import 
scrapy_proj.comm.comm_def as comm
+
+# 注册器字典
+spider_handler_registry = {}
+
+def register_handler(spider_name):
+    def decorator(cls):
+        spider_handler_registry[spider_name.lower()] = cls
+        return cls
+    return decorator
+
+@register_handler(comm.SPIDER_NAME_SIS)
+class SisDBHandler(SQLiteDBHandler):
+    def __init__(self, db_path=default_dbpath):
+        super().__init__(db_path)
+        self.tbl_name_sis = 'sis'
+
+    def insert_item(self, item):
+        self.insert_or_update_common(item, tbl_name=self.tbl_name_sis, uniq_key='url', exists_do_nothing=True)
+
+    def _create_tables(self):
+        # 创建 sis001 数据表
+        self.cursor.execute(f'''
+            CREATE TABLE IF NOT EXISTS {self.tbl_name_sis} (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                plate_name TEXT,
+                title TEXT,
+                url TEXT UNIQUE,
+                size_text TEXT,
+                size_gb REAL,
+                update_date TEXT,
+                created_at TEXT DEFAULT (datetime('now', 'localtime')),
+                updated_at TEXT DEFAULT (datetime('now', 'localtime'))
+            )
+        ''')
+        self.conn.commit()
 
-class IAFDQuery(SQLiteDBHandler):
+@register_handler(comm.SPIDER_NAME_U3C3)
+class U3C3DBHandler(SQLiteDBHandler):
+    def __init__(self, db_path=default_dbpath):
+        super().__init__(db_path)
+        self.tbl_name_u3c3 = 'u3c3'
+
+    def insert_item(self, item):
+        self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url', exists_do_nothing=True)
+
+    def _create_tables(self):
+        # 创建 u001 数据表
+        self.cursor.execute(f'''
+            CREATE TABLE IF NOT EXISTS {self.tbl_name_u3c3} (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                category TEXT,
+                title TEXT,
+                url TEXT UNIQUE,
+                torrent_url TEXT,
+                magnet_url TEXT,
+                size_text TEXT,
+                size_gb REAL,
+                update_date TEXT,
+                created_at TEXT DEFAULT (datetime('now', 'localtime')),
+                updated_at TEXT DEFAULT (datetime('now', 'localtime'))
+            )
+        ''')
+        self.conn.commit()
+
+
+@register_handler(comm.SPIDER_NAME_IAFD)
+class IAFDDBHandler(SQLiteDBHandler):
     def __init__(self, db_path=shared_db_path):
         super().__init__(db_path)
         self.tbl_name_performers = 'iafd_performers'
         self.tbl_name_movies =
'iafd_movies' self.uniq_key = 'href' + def insert_item(self, item): + pass + # 按条件查询 href 列表 def get_performers(self, **filters): try: diff --git a/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py b/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py index b349e91..8b69417 100644 --- a/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py +++ b/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py @@ -31,6 +31,13 @@ class SQLiteDBHandler: if sqlite_version < (3, 24, 0): self.lower_sqlite_version = True + def _create_tables(self): + pass + + # 接口函数,必须在各个子类中实现 + def insert_item(self, item): + raise NotImplementedError("子类必须实现 insert_item 方法") + def get_table_columns_and_defaults(self, tbl_name): try: self.cursor.execute(f"PRAGMA table_info({tbl_name})") diff --git a/scrapy_proj/scrapy_proj/items.py b/scrapy_proj/scrapy_proj/items.py index 02da5ff..bf83ae7 100644 --- a/scrapy_proj/scrapy_proj/items.py +++ b/scrapy_proj/scrapy_proj/items.py @@ -5,9 +5,11 @@ # items.py import scrapy +import scrapy_proj.comm.comm_def as comm # u3c3.in class U001Item(scrapy.Item): + item_tpye = comm.ITEM_TYPE_LIST category = scrapy.Field() title = scrapy.Field() url = scrapy.Field() @@ -19,6 +21,7 @@ class U001Item(scrapy.Item): # sis001.com class Sis001Item(scrapy.Item): + item_tpye = comm.ITEM_TYPE_LIST title = scrapy.Field() url = scrapy.Field() plate_name = scrapy.Field() @@ -27,6 +30,7 @@ class Sis001Item(scrapy.Item): update_date = scrapy.Field() class IAFDPersonItem(scrapy.Item): + item_tpye = comm.ITEM_TYPE_ACTOR_INDEX name = scrapy.Field() href = scrapy.Field() from_astro_list = scrapy.Field() @@ -35,6 +39,7 @@ class IAFDPersonItem(scrapy.Item): from_movie_list = scrapy.Field() class IAFDMovieItem(scrapy.Item): + item_tpye = comm.ITEM_TYPE_MOVIE_INDEX title = scrapy.Field() href = scrapy.Field() release_year = scrapy.Field() @@ -43,6 +48,7 @@ class IAFDMovieItem(scrapy.Item): from_stu_list = scrapy.Field() class IAFDPersonDetailItem(scrapy.Item): + item_tpye = comm.ITEM_TYPE_ACTOR_DETAIL href 
= scrapy.Field() person = scrapy.Field() gender = scrapy.Field() @@ -67,6 +73,7 @@ class IAFDPersonDetailItem(scrapy.Item): performer_aka = scrapy.Field() class IAFDMovieDetailItem(scrapy.Item): + item_tpye = comm.ITEM_TYPE_MOVIE_DETAIL title = scrapy.Field() href = scrapy.Field() # 可以根据实际需求添加更多影片详情字段 \ No newline at end of file diff --git a/scrapy_proj/scrapy_proj/pipelines.py b/scrapy_proj/scrapy_proj/pipelines.py index 5bd9a8a..4f19112 100644 --- a/scrapy_proj/scrapy_proj/pipelines.py +++ b/scrapy_proj/scrapy_proj/pipelines.py @@ -9,91 +9,39 @@ #class ScrapyProjPipeline: # def process_item(self, item, spider): # return item - - -import os -import sqlite3 +import json import scrapy -import logging -from datetime import datetime from scrapy_proj.items import U001Item, Sis001Item, IAFDPersonItem, IAFDPersonDetailItem, IAFDMovieItem, IAFDMovieDetailItem -from scrapy_proj.db_wapper.sqlite_base import SQLiteDBHandler +from scrapy_proj.db_wapper.spider_db_handler import spider_handler_registry, U3C3DBHandler, SisDBHandler, IAFDDBHandler -class SQLitePipeline(SQLiteDBHandler): - def __init__(self, db_path=None): - super().__init__(db_path) - self.tbl_name_u3c3 = 'u3c3' - self.tbl_name_sis = 'sis' - self._create_tables() - - def _create_tables(self): - # 创建 u001 数据表 - self.cursor.execute(f''' - CREATE TABLE IF NOT EXISTS {self.tbl_name_u3c3} ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - category TEXT, - title TEXT, - url TEXT UNIQUE, - torrent_url TEXT, - magnet_url TEXT, - size_text TEXT, - size_gb REAL, - update_date TEXT, - created_at TEXT DEFAULT (datetime('now', 'localtime')), - updated_at TEXT DEFAULT (datetime('now', 'localtime')) - ) - ''') +class SQLitePipeline(): + def __init__(self): + self.db_handlers = {} + + def open_spider(self, spider): + spider_name = spider.name.lower() + handler_class = spider_handler_registry.get(spider_name) + if not handler_class: + raise ValueError(f"未注册 Spider {spider_name} 的数据库处理类") + self.db_handlers[spider_name] = handler_class() 
- # 创建 sis001 数据表 - self.cursor.execute(f''' - CREATE TABLE IF NOT EXISTS {self.tbl_name_sis} ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - plate_name TEXT, - title TEXT, - url TEXT UNIQUE, - size_text TEXT, - size_gb REAL, - update_date TEXT, - created_at TEXT DEFAULT (datetime('now', 'localtime')), - updated_at TEXT DEFAULT (datetime('now', 'localtime')) - ) - ''') - self.conn.commit() - def process_item(self, item, spider): - if isinstance(item, U001Item): - self._process_u001_item(item, spider) - elif isinstance(item, Sis001Item): - self._process_sis001_item(item, spider) - elif isinstance(item, IAFDPersonItem): - self._process_iafd_person_item(item, spider) - elif isinstance(item, IAFDPersonDetailItem): - self._process_iafd_person_detail_item(item, spider) - elif isinstance(item, IAFDMovieItem): - self._process_iafd_movie_item(item, spider) - elif isinstance(item, IAFDMovieDetailItem): - self._process_iafd_movie_detail_item(item, spider) - return item - - def _process_u001_item(self, item, spider): - spider.logger.debug(f"insert one item. href:{spider.name}") - return self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url', exists_do_nothing=True) - - def _process_sis001_item(self, item, spider): - spider.logger.debug(f"insert one item. href:{spider.name}") - return self.insert_or_update_common(item, tbl_name=self.tbl_name_sis, uniq_key='url', exists_do_nothing=True) - - def _process_iafd_person_item(self, item, spider): - spider.logger.debug(f"deal with persion item. {item}") - - def _process_iafd_movie_item(self, item, spider): - spider.logger.debug(f"deal with movie item. {item}") - - def _process_iafd_person_detail_item(self, item, spider): - spider.logger.debug(f"deal with persion item. {item}") - - def _process_iafd_movie_detail_item(self, item, spider): - spider.logger.debug(f"deal with movie item. 
{item}")
-
     def close_spider(self, spider):
-        self.conn.close()
\ No newline at end of file
+        spider_name = spider.name.lower()
+        handler = self.db_handlers.pop(spider_name, None)
+        if handler:
+            handler.close()
+
+    def process_item(self, item, spider):
+        spider_name = spider.name.lower()
+        handler = self.db_handlers.get(spider_name)
+
+        if not handler:
+            raise ValueError(f"未找到 Spider {spider_name} 的数据库处理器")
+
+        # 转换为单行JSON格式
+        item_json = json.dumps(dict(item), ensure_ascii=False, separators=(',', ':'))
+        spider.logger.debug(f"spider name: {spider_name}, item: {item_json}")
+
+        handler.insert_item(item)
+        return item
diff --git a/scrapy_proj/scrapy_proj/spiders/iafd_spider.py b/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
index e3113ef..0041194 100644
--- a/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
+++ b/scrapy_proj/scrapy_proj/spiders/iafd_spider.py
@@ -2,12 +2,13 @@ import scrapy
 import re
 from scrapy_proj.spiders.base_spider import BaseSpider
 from scrapy_proj.items import IAFDPersonItem, IAFDMovieItem, IAFDPersonDetailItem, IAFDMovieDetailItem
-from scrapy_proj.db_wapper.iafd_query import IAFDQuery
+from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler
+from scrapy_proj.comm.comm_def import SPIDER_NAME_IAFD
 
-db_tools = IAFDQuery()
+db_tools = IAFDDBHandler()
 
 class IAFDSpider(BaseSpider):
-    name = "iafd"
+    name = SPIDER_NAME_IAFD
     allowed_domains = ["iafd.com"]
     host_url = "https://www.iafd.com"
 
diff --git a/scrapy_proj/scrapy_proj/spiders/sis_spider.py b/scrapy_proj/scrapy_proj/spiders/sis_spider.py
index d019102..0a7d15e 100644
--- a/scrapy_proj/scrapy_proj/spiders/sis_spider.py
+++ b/scrapy_proj/scrapy_proj/spiders/sis_spider.py
@@ -4,9 +4,10 @@ from urllib.parse import urljoin
 from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
 from scrapy_proj.items import Sis001Item
 from scrapy_proj.utils.utils import parse_size_format, parse_date_to_datetime
+from scrapy_proj.comm.comm_def import SPIDER_NAME_SIS
 
 class
Sis001Spider(BaseSpider): - name = "sis" + name = SPIDER_NAME_SIS allowed_domains = ["sis001.com"] def __init__(self, debug='False', begin=None, *args, **kwargs): @@ -111,9 +112,9 @@ class Sis001Spider(BaseSpider): # 判断是否还要翻页,只有满足所有页面的数据,日期均小于开始日期时,停止翻页 up_date = parse_date_to_datetime(item['update_date']) - self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}") + #self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}") if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()) : - self.logger.debug(f"find early data.") + pass else: need_next = True diff --git a/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py b/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py index 7268032..a839887 100644 --- a/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py +++ b/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py @@ -3,9 +3,10 @@ import scrapy from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element from scrapy_proj.items import U001Item from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime +from scrapy_proj.comm.comm_def import SPIDER_NAME_U3C3 class U001Spider(BaseSpider): - name = "u3c3" + name = SPIDER_NAME_U3C3 allowed_domains = ["u001.25img.com"] start_urls = ["https://u001.25img.com/?p=1"] @@ -37,9 +38,9 @@ class U001Spider(BaseSpider): # 判断是否还要翻页,只有满足所有页面的数据,日期均小于开始日期时,停止翻页 up_date = parse_date_to_datetime(item['update_date']) - self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}") + #self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}") if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()): - self.logger.debug(f"find early data.") + pass else: need_next = True yield item