modify scripts

This commit is contained in:
oscarz
2025-07-02 17:59:56 +08:00
parent f1a9287834
commit 2ea1eec072
12 changed files with 706 additions and 0 deletions

scrapy_proj/scrapy.cfg

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = scrapy_proj.settings
[deploy]
#url = http://localhost:6800/
project = scrapy_proj

@ -0,0 +1,33 @@
#!/bin/bash
: << 'EOF'
Run this local script to report task status.
A notification script (WeCom) is deployed on the remote machine and sends the result out.
EOF
# Color definitions (currently unused, kept for output formatting)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
NC='\033[0m' # no color
REMOTE_SERVER="101.33.230.186"
REMOTE_USER="root"
SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=10"
# Main function
main() {
    # Check whether a command-line argument was provided
    if [ $# -eq 0 ]; then
        result='test' # default value when no argument is given
    else
        result=$1 # use the first argument as the result
    fi
    # Call the remote script and pass the result along
    ssh $SSH_OPTS $REMOTE_USER@$REMOTE_SERVER "cd /root/projects/devops/tools; python3 ./send_to_wecom.py '$result'"
    return $? # return the exit status of the remote command
}
# Run the main function
main "$@"
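
The remote send_to_wecom.py is not part of this commit; a minimal sketch of what it might look like, assuming the standard WeCom group-robot webhook (the webhook key below is a placeholder):

#!/usr/bin/env python3
# Hypothetical sketch of the remote send_to_wecom.py (not included in this commit).
# Assumes a WeCom group-robot webhook; WECOM_WEBHOOK_KEY is a placeholder.
import json
import sys
import urllib.request

WEBHOOK_URL = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=WECOM_WEBHOOK_KEY"

def main():
    # The shell script passes the stats JSON (or a plain string) as the first argument.
    content = sys.argv[1] if len(sys.argv) > 1 else "test"
    payload = {"msgtype": "text", "text": {"content": content}}
    req = urllib.request.Request(
        WEBHOOK_URL,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=10) as resp:
        print(resp.read().decode("utf-8"))

if __name__ == "__main__":
    main()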


@ -0,0 +1,116 @@
import subprocess
import time
import logging
from datetime import datetime
from scrapy import signals
from scrapy.exceptions import NotConfigured
from twisted.internet import task
logger = logging.getLogger()  # use the global (root) logger
class StatsExtension:
def __init__(self, stats, interval, script_path=None):
self.stats = stats
self.interval = interval
self.script_path = script_path
self.spider_name = None
        self.loop = None  # handle for the looping task
@classmethod
def from_crawler(cls, crawler):
interval = crawler.settings.getint('STATS_EXPORT_INTERVAL', 600)
script_path = crawler.settings.get('STATS_EXPORT_SCRIPT')
if interval <= 0:
raise NotConfigured
ext = cls(crawler.stats, interval, script_path)
crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
return ext
def spider_opened(self, spider):
self.spider_name = spider.name
logger.info(f"Spider {spider.name} opened - StatsExtension initialized")
#self._export_stats(spider)
        # create and start the looping task
        self.loop = task.LoopingCall(self._export_stats, spider)
        self.loop.start(self.interval)  # run once every interval seconds
def spider_closed(self, spider, reason):
        # stop the looping task
if self.loop and self.loop.running:
self.loop.stop()
self._export_stats(spider)
logger.info(f"Spider {spider.name} closed - reason: {reason}")
    def _export_stats(self, spider):
        # get the current stats
        stats = self.stats.get_stats()
        # compute the spider's uptime
        start_time = stats.get('start_time')
        if start_time:
            # convert the datetime object to a timestamp
            start_timestamp = start_time.timestamp()
            uptime = time.time() - start_timestamp
        else:
            uptime = 0
        # build the stats summary
        stats_summary = {
            't': datetime.now().strftime('%H:%M:%S'),
            'spider': self.spider_name,
            'uptime(s)': int(uptime),
            'recv_cnt': stats.get('response_received_count', 0),
            'total_req': stats.get('downloader/request_count', 0),
            '200_cnt': stats.get('downloader/response_status_count/200', 0),
            '404_cnt': stats.get('downloader/response_status_count/404', 0),
            'log_err_cnt': stats.get('log_count/ERROR', 0)
        }
        # log the stats summary
        logger.info(f"Stats Summary: {stats_summary}")
        # if a shell script is configured, invoke it
        if self.script_path:
            self._call_shell_script_async(stats_summary)
def _call_shell_script(self, stats):
try:
            # serialize the stats to JSON and pass it to the shell script as an argument
import json
stats_json = json.dumps(stats)
            # run the shell script via subprocess
result = subprocess.run(
[self.script_path, stats_json],
capture_output=True,
text=True,
check=True
)
logger.info(f"Shell script executed successfully: {result.stdout}")
except subprocess.CalledProcessError as e:
logger.error(f"Error executing shell script: {e.stderr}")
except Exception as e:
logger.error(f"Unexpected error calling shell script: {e}")
def _call_shell_script_async(self, stats):
try:
import json
stats_json = json.dumps(stats)
            # run the shell script without blocking
subprocess.Popen(
[self.script_path, stats_json],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)
            logger.info("Shell script started in background")
except Exception as e:
logger.error(f"Error starting shell script: {e}")


@ -0,0 +1,22 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
# items.py
import scrapy
class U001Item(scrapy.Item):
category = scrapy.Field()
title = scrapy.Field()
url = scrapy.Field()
torrent_url = scrapy.Field()
magnet_url = scrapy.Field()
size_text = scrapy.Field()
size_gb = scrapy.Field()
update_date = scrapy.Field()
class Sis001Item(scrapy.Item):
title = scrapy.Field()
url = scrapy.Field()
plate_name = scrapy.Field()


@ -0,0 +1,100 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class ScrapyProjSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
async def process_start(self, start):
# Called with an async iterator over the spider start() method or the
        # matching method of an earlier spider middleware.
async for item_or_request in start:
yield item_or_request
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class ScrapyProjDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
        # Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)


@ -0,0 +1,209 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
#from itemadapter import ItemAdapter
#class ScrapyProjPipeline:
# def process_item(self, item, spider):
# return item
import os
import sqlite3
import logging
from datetime import datetime
from scrapy_proj.items import U001Item, Sis001Item
home_dir = os.path.expanduser("~")
global_share_data_dir = f'{home_dir}/sharedata'
default_dbpath = f"{global_share_data_dir}/sqlite/scrapy.db"
# Database base class that wraps the common SQLite operations.
class SQLiteDBHandler:
def __init__(self, db_path=None):
        # use the provided db_path or fall back to the default path
self.DB_PATH = db_path or default_dbpath
        # make sure the parent directory exists (optional)
if db_path and not os.path.exists(os.path.dirname(db_path)):
os.makedirs(os.path.dirname(db_path))
self.conn = sqlite3.connect(self.DB_PATH, check_same_thread=False)
self.cursor = self.conn.cursor()
        # check the SQLite version (ON CONFLICT ... DO UPDATE needs >= 3.24)
self.lower_sqlite_version = False
sqlite_version = sqlite3.sqlite_version_info
if sqlite_version < (3, 24, 0):
self.lower_sqlite_version = True
def get_table_columns_and_defaults(self, tbl_name):
try:
self.cursor.execute(f"PRAGMA table_info({tbl_name})")
columns = self.cursor.fetchall()
column_info = {}
for col in columns:
col_name = col[1]
default_value = col[4]
column_info[col_name] = default_value
return column_info
except sqlite3.Error as e:
logging.error(f"Error getting table columns: {e}")
return None
def check_and_process_data(self, data, tbl_name):
column_info = self.get_table_columns_and_defaults(tbl_name)
if column_info is None:
return None
processed_data = {}
for col, default in column_info.items():
            if col == 'id' or col == 'created_at':  # auto-increment key and creation date rely on the table defaults
                continue
            if col == 'updated_at':  # always stamp updated_at with the current time
processed_data[col] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
if col in data:
processed_data[col] = data[col]
return processed_data
def insert_or_update_common(self, data, tbl_name, uniq_key='url'):
if self.lower_sqlite_version:
return self.insert_or_update_common_lower(data, tbl_name, uniq_key)
try:
processed_data = self.check_and_process_data(data, tbl_name)
if processed_data is None:
return None
columns = ', '.join(processed_data.keys())
values = list(processed_data.values())
placeholders = ', '.join(['?' for _ in values])
update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key])
sql = f'''
INSERT INTO {tbl_name} ({columns})
VALUES ({placeholders})
ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}
'''
self.cursor.execute(sql, values)
self.conn.commit()
            # fetch the id of the inserted or updated row
self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
record_id = self.cursor.fetchone()[0]
return record_id
except sqlite3.Error as e:
logging.error(f"Error inserting or updating data: {e}")
return None
def insert_or_update_common_lower(self, data, tbl_name, uniq_key='url'):
try:
processed_data = self.check_and_process_data(data, tbl_name)
if processed_data is None:
return None
columns = ', '.join(processed_data.keys())
values = list(processed_data.values())
placeholders = ', '.join(['?' for _ in values])
            # try a plain insert first
try:
sql = f'''
INSERT INTO {tbl_name} ({columns})
VALUES ({placeholders})
'''
self.cursor.execute(sql, values)
self.conn.commit()
            except sqlite3.IntegrityError:  # unique-key conflict: fall back to an UPDATE
update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key])
update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key]
update_values.append(data[uniq_key])
sql = f"UPDATE {tbl_name} SET {update_clause} WHERE {uniq_key} = ?"
self.cursor.execute(sql, update_values)
self.conn.commit()
            # fetch the id of the inserted or updated row
self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
record_id = self.cursor.fetchone()[0]
return record_id
except sqlite3.Error as e:
logging.error(f"Error inserting or updating data: {e}")
return None
def get_id_by_key(self, tbl, uniq_key, val):
self.cursor.execute(f"SELECT id FROM {tbl} WHERE {uniq_key} = ?", (val,))
row = self.cursor.fetchone()
return row[0] if row else None
def close(self):
self.cursor.close()
self.conn.close()
class SQLitePipeline(SQLiteDBHandler):
def __init__(self, db_path=None):
super().__init__(db_path)
self.tbl_name_u3c3 = 'u3c3'
self.tbl_name_sis = 'sis'
self._create_tables()
def _create_tables(self):
        # create the u3c3 data table
self.cursor.execute(f'''
CREATE TABLE IF NOT EXISTS {self.tbl_name_u3c3} (
id INTEGER PRIMARY KEY AUTOINCREMENT,
category TEXT,
title TEXT,
url TEXT UNIQUE,
torrent_url TEXT,
magnet_url TEXT,
size_text TEXT,
size_gb REAL,
update_date TEXT,
created_at TEXT DEFAULT (datetime('now', 'localtime')),
updated_at TEXT DEFAULT (datetime('now', 'localtime'))
)
''')
        # create the sis data table
self.cursor.execute(f'''
CREATE TABLE IF NOT EXISTS {self.tbl_name_sis} (
id INTEGER PRIMARY KEY AUTOINCREMENT,
plate_name TEXT,
title TEXT,
url TEXT UNIQUE,
size_text TEXT,
size_gb REAL,
update_date TEXT,
created_at TEXT DEFAULT (datetime('now', 'localtime')),
updated_at TEXT DEFAULT (datetime('now', 'localtime'))
)
''')
self.conn.commit()
def process_item(self, item, spider):
if isinstance(item, U001Item):
self._process_u001_item(item)
elif isinstance(item, Sis001Item):
self._process_sis001_item(item)
return item
def _process_u001_item(self, item):
return self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url')
def _process_sis001_item(self, item):
        self.cursor.execute(f'''
            INSERT OR IGNORE INTO {self.tbl_name_sis}
            (title, url, plate_name)
            VALUES (?, ?, ?)
''', (
item.get('title'),
item.get('url'),
item.get('plate_name')
))
self.conn.commit()
def close_spider(self, spider):
self.conn.close()
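
The upsert in insert_or_update_common relies on SQLite's ON CONFLICT ... DO UPDATE clause (available from SQLite 3.24, which is why the class keeps a fallback for older versions). A standalone sketch of the pattern, with an illustrative table and columns:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE demo (id INTEGER PRIMARY KEY AUTOINCREMENT, url TEXT UNIQUE, title TEXT)")
upsert = """
    INSERT INTO demo (url, title) VALUES (?, ?)
    ON CONFLICT (url) DO UPDATE SET title = EXCLUDED.title
"""
conn.execute(upsert, ("https://example.com/a", "first title"))
conn.execute(upsert, ("https://example.com/a", "updated title"))  # same url: the row is updated, not duplicated
print(conn.execute("SELECT id, url, title FROM demo").fetchall())
# [(1, 'https://example.com/a', 'updated title')]
conn.close()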


@ -0,0 +1,140 @@
# Scrapy settings for scrapy_proj project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import os
from datetime import datetime
# create the log directory
LOG_DIR = './log'
os.makedirs(LOG_DIR, exist_ok=True)
log_date = datetime.now().strftime('%Y%m%d')
# global logging configuration
LOG_LEVEL = 'INFO'  # log at INFO level
LOG_FILE = os.path.join(LOG_DIR, f'scrapy_{log_date}.log')  # log file path
# log format
LOG_FORMAT = '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S'
BOT_NAME = "scrapy_proj"
SPIDER_MODULES = ["scrapy_proj.spiders"]
NEWSPIDER_MODULE = "scrapy_proj.spiders"
ADDONS = {}
# concurrency settings
CONCURRENT_REQUESTS = 1
CONCURRENT_ITEMS = 100
# download delay
DOWNLOAD_DELAY = 1
# enable the item pipeline
ITEM_PIPELINES = {
'scrapy_proj.pipelines.SQLitePipeline': 300,
}
# user-agent pool
USER_AGENT_LIST = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    # more UAs...
]
# user-agent middlewares (note: both entries below are set to None, i.e. disabled)
DOWNLOADER_MIDDLEWARES = {
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
'scrapy_useragents.downloadermiddlewares.useragents.UserAgentsMiddleware': None,
}
# settings.py
EXTENSIONS = {
'scrapy_proj.extensions.stats_extension.StatsExtension': 500,
}
# stats-export settings
STATS_EXPORT_INTERVAL = 1800  # export every 30 minutes
STATS_EXPORT_SCRIPT = '/root/projects/resources/scrapy_proj/scrapy_proj/extensions/push_to_wecom.sh'  # path to the local shell script
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "scrapy_proj (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "scrapy_proj.middlewares.ScrapyProjSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "scrapy_proj.middlewares.ScrapyProjDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# "scrapy_proj.pipelines.ScrapyProjPipeline": 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"


@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.


@ -0,0 +1,20 @@
import scrapy
from scrapy_proj.items import Sis001Item
class Sis001Spider(scrapy.Spider):
name = "sis"
allowed_domains = ["sis001.com"]
start_urls = ["https://sis001.com/forum/forum-25-1.html"]
def parse(self, response):
for row in response.css('table[id="forum_25"] tbody[id^="normalthread_"] tr'):
item = Sis001Item()
item['title'] = row.css('td a::text').get()
item['url'] = response.urljoin(row.css('td a::attr(href)').get())
item['plate_name'] = '亚无转帖'
yield item
        # pagination
next_page = response.css('a.nxt::attr(href)').get()
if next_page:
yield response.follow(next_page, self.parse)


@ -0,0 +1,32 @@
import scrapy
from scrapy_proj.items import U001Item
from scrapy_proj.utils.size_converter import parse_size
class U001Spider(scrapy.Spider):
name = "u3c3"
allowed_domains = ["u001.25img.com"]
start_urls = ["https://u001.25img.com/?p=1"]
def parse(self, response):
for row in response.css('table.torrent-list tbody tr'):
item = U001Item()
item['category'] = row.css('td:nth-child(1) a::attr(title)').get()
item['title'] = row.css('td:nth-child(2) a::attr(title)').get()
item['url'] = response.urljoin(row.css('td:nth-child(2) a::attr(href)').get())
links = row.css('td:nth-child(3) a::attr(href)').getall()
item['torrent_url'] = response.urljoin(links[0]) if links else ''
item['magnet_url'] = links[1] if len(links) > 1 else ''
size_text = row.css('td:nth-child(4)::text').get(default='').strip()
item['size_text'] = size_text
item['size_gb'] = parse_size(size_text)
item['update_date'] = row.css('td:nth-child(5)::text').get(default='').strip()
yield item
        # pagination
        current_page = int(response.url.split('=')[-1])
        total_pages = int(response.css('script:contains("totalPages")').re_first(r'totalPages:\s*(\d+)', default='0'))
        if current_page < total_pages:
            yield response.follow(f"?p={current_page + 1}", self.parse)


@ -0,0 +1,19 @@
import re
def parse_size(size_text):
    """Convert a size string such as '1.5GB' or '512 MB' to gigabytes (returns 0.0 if it cannot be parsed)."""
    try:
match = re.search(r'(\d+\.\d+|\d+)\s*([A-Za-z]+)', size_text)
if not match:
return 0.0
value, unit = match.groups()
value = float(value)
if unit.lower() == 'mb':
return round(value / 1024, 2)
elif unit.lower() == 'kb':
return round(value / 1024 / 1024, 2)
elif unit.lower() == 'gb':
return round(value, 2)
else:
return 0.0
except Exception:
return 0.0
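
A quick usage sketch of parse_size (expected return values, in GB, shown as comments):

from scrapy_proj.utils.size_converter import parse_size

print(parse_size("1.5GB"))     # 1.5
print(parse_size("512 MB"))    # 0.5
print(parse_size("700.5 mb"))  # 0.68
print(parse_size("2 TB"))      # 0.0  (TB is not handled and falls back to the default)
print(parse_size("n/a"))       # 0.0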