From 767858f7a4ccb59dc2e6c8b3791a923554dd557c Mon Sep 17 00:00:00 2001
From: oscarz
Date: Thu, 19 Jun 2025 16:55:21 +0800
Subject: [PATCH] add sis001 forum list fetching and CSV backup/append helpers

---
 u9a9/src/fetch.py   |  93 ++++++++++++++++++++++++++++++-
 u9a9/src/scraper.py | 133 +++++++++++++++++++++++++++++++++++++++++++-
 u9a9/src/utils.py   |  61 ++++++++++++++++++--
 3 files changed, 277 insertions(+), 10 deletions(-)

diff --git a/u9a9/src/fetch.py b/u9a9/src/fetch.py
index cedd307..318d848 100644
--- a/u9a9/src/fetch.py
+++ b/u9a9/src/fetch.py
@@ -27,6 +27,8 @@ target_torrent_dir = f"{config.global_share_data_dir}/u3c3_torrents"
 def fetch_list(start_p=1):
     p = start_p
     total_results = []
+    # Back up any existing output file before writing
+    utils.backup_existing_file(target_csv)
     while True:
         url = f"https://u001.25img.com/?p={p}"
         logging.info(f"fetching url {url}")
@@ -40,10 +42,15 @@ def fetch_list(start_p=1):
             if total_pages:
                 if p >= total_pages:
                     url = None
+                    break
                 else:
                     p += 1
                     if p % 10 == 0 :
-                        utils.write_to_csv(total_results, target_csv)
+                        #utils.write_to_csv(total_results, target_csv)
+                        lines = utils.append_to_csv(total_results, target_csv)
+                        if lines:
+                            logging.info(f"write to csv file. new lines: {len(total_results)}, total lines: {lines}")
+                        total_results.clear()  # clear the buffer
                     time.sleep(1)
             else:
                 logging.warning(f"fetch_list failed. url: {url} ")
@@ -51,14 +58,18 @@ def fetch_list(start_p=1):
         else:
             logging.warning(f'fetch_page error. url: {url}, status_code: {status_code}')
 
+        if not url:
+            break
+
         if debug:
             break
 
     # Write the remaining rows to the CSV file
     lines = utils.write_to_csv(total_results, target_csv)
+    total_results.clear()
     if lines:
-        logging.info(f"write to file succ. total lines: {lines}, file: {target_csv}")
+        logging.info(f"write to file succeeded. file: {target_csv}. total lines: {lines}")
     logging.info(f"fetch list finished. total pages: {p}")
 
@@ -112,10 +123,86 @@ def down_torrents():
                 break
         time.sleep(1)
 
+
+# Fetch the thread list of a sis001 forum board
+def fetch_sis_list(url='https://sis001.com/forum/forum-25-1.html', target_csv_sis=f"{config.global_share_data_dir}/sis_asia_zt.csv", ident='forum_25'):
+    total_results = []
+    cnt = 0
+    # Back up any existing output file before writing
+    utils.backup_existing_file(target_csv_sis)
+    while url:
+        logging.info(f"fetching url {url}")
+        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier=ident, attr_type="id"))
+        if soup:
+            list_data, next_url = scraper.parse_sis_list(soup, url, ident)
+            if list_data:
+                total_results.extend(list_data)
+            else:
+                logging.warning(f"parse_sis_list returned no rows. url: {url}")
+            if next_url:
+                url = next_url
+                cnt += 1
+                if cnt % 10 == 0:
+                    #utils.write_to_csv(total_results, target_csv_sis)
+                    lines = utils.append_to_csv(total_results, target_csv_sis)
+                    if lines:
+                        logging.info(f"write to csv file. new lines: {len(total_results)}, total lines: {lines}")
+                    total_results.clear()
+                time.sleep(1)
+            else:
+                logging.info(f"no next-page link found, stopping. url: {url}")
+                url = None
+
+        else:
+            logging.warning(f'fetch_page error. url: {url}, status_code: {status_code}')
+
+        if debug:
+            break
+
+    # Write the remaining rows to the CSV file
+    lines = utils.write_to_csv(total_results, target_csv_sis)
+    total_results.clear()
+    if lines:
+        logging.info(f"write to file succeeded. file: {target_csv_sis}, total lines: {lines}")
+    logging.info(f"fetch list finished. total pages: {cnt}")
+
+def fetch_sis_all():
+    sections = [
+        {
+            'plate': 'sis_asia_yc',
+            'url': 'https://sis001.com/forum/forum-143-1.html',
+            'ident': 'forum_143'
+        },
+        {
+            'plate': 'sis_asia_zt',
+            'url': 'https://sis001.com/forum/forum-25-1.html',
+            'ident': 'forum_25'
+        },
+        {
+            'plate': 'sis_oumei_yc',
+            'url': 'https://sis001.com/forum/forum-229-1.html',
+            'ident': 'forum_229'
+        },
+        {
+            'plate': 'sis_oumei_zt',
+            'url': 'https://sis001.com/forum/forum-77-1.html',
+            'ident': 'forum_77'
+        },
+    ]
+    for item in sections:
+        section = item['plate']
+        url = item['url']
+        logging.info(f"---------------start fetching {section}, begin url: {url}")
+        csv_file = f"{config.global_share_data_dir}/{section}.csv"
+        fetch_sis_list(url=url, target_csv_sis=csv_file, ident=item['ident'])
+
+
 # Map command abbreviations to their functions
 function_map = {
     "list": fetch_list,
     "down" : down_torrents,
+    "sis": fetch_sis_list,
+    "sis_all": fetch_sis_all,
 }
 
 # Main entry point
@@ -168,6 +255,8 @@ if __name__ == "__main__":
     python3 ./fetch.py               # refresh the list and download new resources
     python3 ./fetch.py --cmd=list    # refresh the list only
     python3 ./fetch.py --cmd=down    # download new resources only
+    python3 ./fetch.py --cmd=sis     # refresh the sis list (default: Asian repost board)
+    python3 ./fetch.py --cmd=sis_all # refresh the sis lists for all configured boards
     ''')
 
     parser = argparse.ArgumentParser(
diff --git a/u9a9/src/scraper.py b/u9a9/src/scraper.py
index 0e48f40..7c50599 100644
--- a/u9a9/src/scraper.py
+++ b/u9a9/src/scraper.py
@@ -11,6 +11,7 @@ import random
 from bs4 import BeautifulSoup
 from requests.exceptions import RequestException
 from functools import partial
+from urllib.parse import urljoin
 
 import config
 import utils
@@ -19,6 +20,8 @@ host_url = 'https://u001.25img.com'
 list_url_update = f'{host_url}/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update'
 #list_url_wordcount = 'https://aabook.xyz/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
 
+sis_host_url = 'https://sis001.com'
+
 # User-Agent list
 user_agents = [
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
@@ -32,7 +35,7 @@ user_agents = [
 def fetch_page(url, validator, parser="html.parser", preprocessor=None, max_retries=3, sleep_time=5, default_timeout=10):
     for attempt in range(max_retries):
         try:
-            if '25img.com' not in url.lower():
+            if '25img.com' not in url.lower() and 'sis001.com' not in url.lower():
                 logging.error(f'wrong url format: {url}')
                 return None, None
 
@@ -212,6 +215,121 @@ def download_torrent(torrent_url, target_file):
         logging.warning(f"Error downloading {torrent_url}: {str(e)}")
         return False
 
+def parse_size_format(size_text: str):
+    """Parse a string such as "1.5GB / MP4" into (size_in_gb, format)."""
+    try:
+        if not size_text:
+            return 0.0, "unknown format"
+
+        # Split into size and format parts
+        parts = size_text.split('/')
+        format_part = parts[1].strip() if len(parts) > 1 else "unknown format"
+
+        # Parse the size value
+        size_part = parts[0].strip()
+        match = re.search(r'(\d+\.\d+|\d+)\s*([A-Za-z]+)', size_part)
+
+        if not match:
+            logging.warning(f"cannot parse size: {size_part}")
+            return 0.0, format_part
+
+        value, unit = match.groups()
+        value = float(value)
+
+        if unit.lower() == 'mb' or unit.lower() == 'm':
+            return round(value / 1024, 2), format_part
+        elif unit.lower() == 'gb' or unit.lower() == 'g':
+            return round(value, 2), format_part
+        else:
+            logging.warning(f"unknown size unit: {unit}")
+            return 0.0, format_part
+
+    except Exception as e:
+        logging.error(f"error parsing size/format: {e}")
+        return 0.0, "unknown format"
+
+def parse_sis_list(soup, curr_url, ident):
+    """Parse the thread table whose id matches ident and return (rows, next_page_url)."""
+    tables = soup.find_all('table', {'id': ident})
+    if not tables:
+        logging.warning(f"cannot find table. url: {curr_url}")
+        return None, None
+
+    main_table = None
+    for table in tables:
+        try:
+            # Check whether the table header contains "版块主题" (the board-topics heading on the site)
+            thead = table.find('thead')
+            if thead and '版块主题' in thead.get_text():
+                main_table = table
+                break
+        except Exception as e:
+            logging.warning(f"error parsing table: {e} url: {curr_url}")
+
+    if not main_table:
+        logging.warning(f"cannot find a table with the expected topic header. url: {curr_url}")
+        return None, None
+
+    results = []
+    bodies = main_table.find_all('tbody', id=re.compile(r'normalthread_\d+'))
+    for body in bodies:
+        try:
+            rows = body.find_all('tr')
+            for row in rows:
+                tds = row.find_all('td')
+                if len(tds) < 6:
+                    logging.warning(f"skipping incomplete row, column count: {len(tds)}")
+                    continue
+
+                # Parse category and title
+                th_lock = row.find('th')
+                if not th_lock:
+                    logging.warning("no th element found in row")
+                    continue
+
+                # Parse the category link
+                category_links = th_lock.find_all('a', href=re.compile(r'forumdisplay.php'))
+                category = category_links[0].text.strip() if category_links else "unknown category"
+
+                # Parse the title link
+                title_links = th_lock.find_all('a', href=re.compile(r'thread-\d+-\d+-\d+.html'))
+                title = title_links[0].text.strip() if title_links else "unknown title"
+                url = title_links[0]['href'] if title_links else ""
+                url = urljoin(curr_url, url)
+
+                # Parse the publish date
+                author_td = tds[2]
+                date = author_td.find('em').text.strip() if author_td.find('em') else "unknown date"
+
+                # Parse size and format
+                size_td = tds[4]
+                size_text = size_td.text.strip()
+                size_gb, file_format = parse_size_format(size_text)
+
+                # Append to results
+                results.append({
+                    "category": category,
+                    "title": title,
+                    "url": url,
+                    "date": date,
+                    "size_text": size_text,
+                    "size_gb": size_gb,
+                    "format": file_format
+                })
+        except Exception as e:
+            logging.error(f"error parsing tbody: {e}")
+
+    next_url = None
+    pages_btns = soup.find('div', class_='pages_btns')
+    if not pages_btns:
+        logging.debug("page navigation bar not found")
+    else:
+        next_link = pages_btns.find('a', class_='next')
+        if next_link:
+            next_url = urljoin(curr_url, next_link['href'])
+
+    return results, next_url
+
 def test_chapter_page(url):
     soup, status_code = fetch_page(url, partial(generic_validator, tag="div", identifier="table-responsive", attr_type="class"))
     if soup:
@@ -222,7 +340,18 @@ def test_chapter_page(url):
         if total_pages :
             print(total_pages)
 
+
+def test_sis_page(url):
+    soup, status_code = fetch_page(url, partial(generic_validator, tag="table", identifier="forum_25", attr_type="id"))
+    if soup:
+        data, next_url = parse_sis_list(soup, url, 'forum_25')
+        if data:
+            print(data)
+        if next_url:
+            print(next_url)
+
 if __name__ == "__main__":
-    test_chapter_page('https://u001.25img.com/?p=1')
+    #test_chapter_page('https://u001.25img.com/?p=1')
+    test_sis_page('https://sis001.com/forum/forum-25-1.html')
\ No newline at end of file
diff --git a/u9a9/src/utils.py b/u9a9/src/utils.py
index 5781680..bea2b6b 100644
--- a/u9a9/src/utils.py
+++ b/u9a9/src/utils.py
@@ -1,5 +1,28 @@
 import csv
 import os
+import time
+
+def backup_existing_file(file_path):
+    """If the file exists, rename it to a timestamped .bak backup."""
+    if os.path.exists(file_path):
+        # Split into directory and base file name
+        dir_name = os.path.dirname(file_path)
+        base_name = os.path.basename(file_path)
+
+        # Build the backup name with a timestamp so earlier backups are not overwritten
+        timestamp = time.strftime("%Y%m%d-%H%M%S")
+        backup_name = f"{os.path.splitext(base_name)[0]}_{timestamp}.bak"
+        backup_path = os.path.join(dir_name, backup_name)
+
+        try:
+            # Rename the existing file
+            os.rename(file_path, backup_path)
+            print(f"existing file backed up as: {backup_path}")
+        except Exception as e:
+            print(f"error backing up file: {e}")
+            return False
+
+    return True
 
 def write_to_csv(data, filename='output.csv'):
     """Write resource data to a CSV file."""
@@ -7,12 +30,8 @@ def write_to_csv(data, filename='output.csv'):
         print("no data to write")
         return None
 
-    # Column names for the CSV file
-    fieldnames = [
-        'category', 'title', 'url',
-        'torrent_url', 'magnet_url',
-        'size_text', 'size_gb', 'update_date'
-    ]
+    # Derive the field names from the first record
+    fieldnames = list(data[0].keys()) if data else []
 
     try:
         # Write the CSV file
@@ -32,6 +51,36 @@ def write_to_csv(data, filename='output.csv'):
         print(f"error writing CSV file: {e}")
         return None
 
+def append_to_csv(data, filename='output.csv'):
+    """Append a batch of records to the CSV file and return its total line count."""
+    if not data:
+        print("no data to write")
+        return None
+
+    # Derive the field names from the first record
+    fieldnames = list(data[0].keys()) if data else []
+    file_exists = os.path.exists(filename)
+
+    try:
+        # Open the file in append mode
+        with open(filename, 'a', newline='', encoding='utf-8-sig') as csvfile:
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+
+            # Write the header first if the file did not exist yet
+            if not file_exists:
+                writer.writeheader()
+
+            # Write the data rows (writerows handles the whole batch)
+            writer.writerows(data)
+
+        # Count the total number of lines in the file (header included)
+        with open(filename, 'r', encoding='utf-8-sig') as f:
+            return sum(1 for _ in f)
+
+    except Exception as e:
+        print(f"error writing CSV file: {e}")
+        return None
+
 def read_csv_data(csv_file):
     """Read the CSV file and return a list of rows."""
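
Usage note (not part of the patch): a minimal sketch of how the new CSV helpers in utils.py are meant to be driven by the fetch loops. It assumes it is run from u9a9/src so that utils is importable; the target path and sample rows below are made up for illustration.

    import utils

    target = "/tmp/demo.csv"
    # Rename any previous output to <name>_<timestamp>.bak before the run starts.
    utils.backup_existing_file(target)

    buffer = []
    for page in range(1, 31):
        # In the real scripts each row comes from scraper.parse_sis_list or the u3c3 parser;
        # all rows must share the same keys, since the header is taken from the first record.
        buffer.append({"title": f"item {page}", "size_gb": 1.0, "format": "MP4"})

        # Flush every 10 pages; append_to_csv writes the header only when the file is
        # first created and returns the file's total line count (header included).
        if page % 10 == 0:
            total = utils.append_to_csv(buffer, target)
            print(f"flushed {len(buffer)} rows, file now has {total} lines")
            buffer.clear()

    # Final flush of whatever is left in the buffer.
    if buffer:
        utils.append_to_csv(buffer, target)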