From 767858f7a4ccb59dc2e6c8b3791a923554dd557c Mon Sep 17 00:00:00 2001
From: oscarz
Date: Thu, 19 Jun 2025 16:55:21 +0800
Subject: [PATCH] add sis001 forum list fetching and CSV backup/append helpers

---
 u9a9/src/fetch.py   |  93 ++++++++++++++++++++++++++++++-
 u9a9/src/scraper.py | 133 +++++++++++++++++++++++++++++++++++++++++++-
 u9a9/src/utils.py   |  61 ++++++++++++++++++--
 3 files changed, 277 insertions(+), 10 deletions(-)

diff --git a/u9a9/src/fetch.py b/u9a9/src/fetch.py
index cedd307..318d848 100644
--- a/u9a9/src/fetch.py
+++ b/u9a9/src/fetch.py
@@ -27,6 +27,8 @@ target_torrent_dir = f"{config.global_share_data_dir}/u3c3_torrents"
 def fetch_list(start_p=1):
     p = start_p
     total_results = []
+    # Back up any existing output file before writing
+    utils.backup_existing_file(target_csv)
     while True:
         url = f"https://u001.25img.com/?p={p}"
         logging.info(f"fetching url {url}")
@@ -40,10 +42,15 @@ def fetch_list(start_p=1):
             if total_pages:
                 if p >= total_pages:
                     url = None
+                    break
                 else:
                     p += 1
                     if p % 10 == 0 :
-                        utils.write_to_csv(total_results, target_csv)
+                        #utils.write_to_csv(total_results, target_csv)
+                        lines = utils.append_to_csv(total_results, target_csv)
+                        if lines:
+                            logging.info(f"write to csv file. new lines: {len(total_results)}, total lines: {lines}")
+                        total_results.clear()  # clear the buffer
                     time.sleep(1)
             else:
                 logging.warning(f"fetch_list failed. url: {url} ")
@@ -51,14 +58,18 @@ def fetch_list(start_p=1):
         else:
             logging.warning(f'fetch_page error. url: {url}, status_code: {status_code}')
 
+        if not url:
+            break
+
         if debug:
             break
 
     # Write the remaining rows to the CSV file
     lines = utils.write_to_csv(total_results, target_csv)
+    total_results.clear()
     if lines:
-        logging.info(f"write to file succ. total lines: {lines}, file: {target_csv}")
+        logging.info(f"write to file succeeded. file: {target_csv}. total lines: {lines}")
     logging.info(f"fetch list finished. total pages: {p}")
 
@@ -112,10 +123,86 @@ def down_torrents():
                 break
         time.sleep(1)
 
+
+# Fetch the thread list of a sis001 forum board
+def fetch_sis_list(url='https://sis001.com/forum/forum-25-1.html', target_csv_sis=f"{config.global_share_data_dir}/sis_asia_zt.csv", ident='forum_25'):
+    total_results = []
+    cnt = 0
+    # Back up any existing output file before writing
+    utils.backup_existing_file(target_csv_sis)
+    while url:
+        logging.info(f"fetching url {url}")
+        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier=ident, attr_type="id"))
+        if soup:
+            list_data, next_url = scraper.parse_sis_list(soup, url, ident)
+            if list_data:
+                total_results.extend(list_data)
+            else:
+                logging.warning(f"parse_sis_list returned no rows. url: {url}")
+            if next_url:
+                url = next_url
+                cnt += 1
+                if cnt % 10 == 0:
+                    #utils.write_to_csv(total_results, target_csv_sis)
+                    lines = utils.append_to_csv(total_results, target_csv_sis)
+                    if lines:
+                        logging.info(f"write to csv file. new lines: {len(total_results)}, total lines: {lines}")
+                    total_results.clear()
+                time.sleep(1)
+            else:
+                logging.info(f"no next-page link found, stopping. url: {url}")
+                url = None
+
+        else:
+            logging.warning(f'fetch_page error. url: {url}, status_code: {status_code}')
+
+        if debug:
+            break
+
+    # Write the remaining rows to the CSV file
+    lines = utils.write_to_csv(total_results, target_csv_sis)
+    total_results.clear()
+    if lines:
+        logging.info(f"write to file succeeded. file: {target_csv_sis}, total lines: {lines}")
+    logging.info(f"fetch list finished. total pages: {cnt}")
+
+def fetch_sis_all():
+    sections = [
+        {
+            'plate': 'sis_asia_yc',
+            'url': 'https://sis001.com/forum/forum-143-1.html',
+            'ident': 'forum_143'
+        },
+        {
+            'plate': 'sis_asia_zt',
+            'url': 'https://sis001.com/forum/forum-25-1.html',
+            'ident': 'forum_25'
+        },
+        {
+            'plate': 'sis_oumei_yc',
+            'url': 'https://sis001.com/forum/forum-229-1.html',
+            'ident': 'forum_229'
+        },
+        {
+            'plate': 'sis_oumei_zt',
+            'url': 'https://sis001.com/forum/forum-77-1.html',
+            'ident': 'forum_77'
+        },
+    ]
+    for item in sections:
+        section = item['plate']
+        url = item['url']
+        logging.info(f"---------------start fetching {section}, begin url: {url}")
+        csv_file = f"{config.global_share_data_dir}/{section}.csv"
+        fetch_sis_list(url=url, target_csv_sis=csv_file, ident=item['ident'])
+
+
 # Map command abbreviations to their functions
 function_map = {
     "list": fetch_list,
     "down" : down_torrents,
+    "sis": fetch_sis_list,
+    "sis_all": fetch_sis_all,
 }
 
 # Main entry point
@@ -168,6 +255,8 @@ if __name__ == "__main__":
     python3 ./fetch.py               # refresh the list and download new resources
     python3 ./fetch.py --cmd=list    # refresh the list only
     python3 ./fetch.py --cmd=down    # download new resources only
+    python3 ./fetch.py --cmd=sis     # refresh the sis list (default: Asian repost board)
+    python3 ./fetch.py --cmd=sis_all # refresh the sis lists for all configured boards
     ''')
 
     parser = argparse.ArgumentParser(
diff --git a/u9a9/src/scraper.py b/u9a9/src/scraper.py
index 0e48f40..7c50599 100644
--- a/u9a9/src/scraper.py
+++ b/u9a9/src/scraper.py
@@ -11,6 +11,7 @@ import random
 from bs4 import BeautifulSoup
 from requests.exceptions import RequestException
 from functools import partial
+from urllib.parse import urljoin
 
 import config
 import utils
@@ -19,6 +20,8 @@ host_url = 'https://u001.25img.com'
 list_url_update = f'{host_url}/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update'
 #list_url_wordcount = 'https://aabook.xyz/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
 
+sis_host_url = 'https://sis001.com'
+
 # User-Agent list
 user_agents = [
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
@@ -32,7 +35,7 @@ user_agents = [
 def fetch_page(url, validator, parser="html.parser", preprocessor=None, max_retries=3, sleep_time=5, default_timeout=10):
     for attempt in range(max_retries):
         try:
-            if '25img.com' not in url.lower():
+            if '25img.com' not in url.lower() and 'sis001.com' not in url.lower():
                 logging.error(f'wrong url format: {url}')
                 return None, None
 
@@ -212,6 +215,121 @@ def download_torrent(torrent_url, target_file):
         logging.warning(f"Error downloading {torrent_url}: {str(e)}")
         return False
 
+def parse_size_format(size_text: str):
+    """Parse a string such as "1.5GB / MP4" into (size_in_gb, format)."""
+    try:
+        if not size_text:
+            return 0.0, "unknown format"
+
+        # Split into size and format parts
+        parts = size_text.split('/')
+        format_part = parts[1].strip() if len(parts) > 1 else "unknown format"
+
+        # Parse the size value
+        size_part = parts[0].strip()
+        match = re.search(r'(\d+\.\d+|\d+)\s*([A-Za-z]+)', size_part)
+
+        if not match:
+            logging.warning(f"cannot parse size: {size_part}")
+            return 0.0, format_part
+
+        value, unit = match.groups()
+        value = float(value)
+
+        if unit.lower() == 'mb' or unit.lower() == 'm':
+            return round(value / 1024, 2), format_part
+        elif unit.lower() == 'gb' or unit.lower() == 'g':
+            return round(value, 2), format_part
+        else:
+            logging.warning(f"unknown size unit: {unit}")
+            return 0.0, format_part
+
+    except Exception as e:
+        logging.error(f"error parsing size/format: {e}")
+        return 0.0, "unknown format"
+
+def parse_sis_list(soup, curr_url, ident):
+    """Parse the thread table whose id matches ident and return (rows, next_page_url)."""
+    tables = soup.find_all('table', {'id': ident})
+    if not tables:
+        logging.warning(f"cannot find table. url: {curr_url}")
+        return None, None
+
+    main_table = None
+    for table in tables:
+        try:
+            # Check whether the table header contains "版块主题" (the board-topics heading on the site)
+            thead = table.find('thead')
+            if thead and '版块主题' in thead.get_text():
+                main_table = table
+                break
+        except Exception as e:
+            logging.warning(f"error parsing table: {e} url: {curr_url}")
+
+    if not main_table:
+        logging.warning(f"cannot find a table with the expected topic header. url: {curr_url}")
+        return None, None
+
+    results = []
+    bodies = main_table.find_all('tbody', id=re.compile(r'normalthread_\d+'))
+    for body in bodies:
+        try:
+            rows = body.find_all('tr')
+            for row in rows:
+                tds = row.find_all('td')
+                if len(tds) < 6:
+                    logging.warning(f"skipping incomplete row, column count: {len(tds)}")
+                    continue
+
+                # Parse category and title
+                th_lock = row.find('th')
+                if not th_lock:
+                    logging.warning("no th element found in row")
+                    continue
+
+                # Parse the category link
+                category_links = th_lock.find_all('a', href=re.compile(r'forumdisplay.php'))
+                category = category_links[0].text.strip() if category_links else "unknown category"
+
+                # Parse the title link
+                title_links = th_lock.find_all('a', href=re.compile(r'thread-\d+-\d+-\d+.html'))
+                title = title_links[0].text.strip() if title_links else "unknown title"
+                url = title_links[0]['href'] if title_links else ""
+                url = urljoin(curr_url, url)
+
+                # Parse the publish date
+                author_td = tds[2]
+                date = author_td.find('em').text.strip() if author_td.find('em') else "unknown date"
+
+                # Parse size and format
+                size_td = tds[4]
+                size_text = size_td.text.strip()
+                size_gb, file_format = parse_size_format(size_text)
+
+                # Append to results
+                results.append({
+                    "category": category,
+                    "title": title,
+                    "url": url,
+                    "date": date,
+                    "size_text": size_text,
+                    "size_gb": size_gb,
+                    "format": file_format
+                })
+        except Exception as e:
+            logging.error(f"error parsing tbody: {e}")
+
+    next_url = None
+    pages_btns = soup.find('div', class_='pages_btns')
+    if not pages_btns:
+        logging.debug("page navigation bar not found")
+    else:
+        next_link = pages_btns.find('a', class_='next')
+        if next_link:
+            next_url = urljoin(curr_url, next_link['href'])
+
+    return results, next_url
+
 def test_chapter_page(url):
     soup, status_code = fetch_page(url, partial(generic_validator, tag="div", identifier="table-responsive", attr_type="class"))
     if soup:
@@ -222,7 +340,18 @@ def test_chapter_page(url):
         if total_pages :
             print(total_pages)
 
+
+def test_sis_page(url):
+    soup, status_code = fetch_page(url, partial(generic_validator, tag="table", identifier="forum_25", attr_type="id"))
+    if soup:
+        data, next_url = parse_sis_list(soup, url, 'forum_25')
+        if data:
+            print(data)
+        if next_url:
+            print(next_url)
+
 if __name__ == "__main__":
-    test_chapter_page('https://u001.25img.com/?p=1')
+    #test_chapter_page('https://u001.25img.com/?p=1')
+    test_sis_page('https://sis001.com/forum/forum-25-1.html')
\ No newline at end of file
diff --git a/u9a9/src/utils.py b/u9a9/src/utils.py
index 5781680..bea2b6b 100644
--- a/u9a9/src/utils.py
+++ b/u9a9/src/utils.py
@@ -1,5 +1,28 @@
 import csv
 import os
+import time
+
+def backup_existing_file(file_path):
+    """If the file exists, rename it to a timestamped .bak backup."""
+    if os.path.exists(file_path):
+        # Split into directory and base file name
+        dir_name = os.path.dirname(file_path)
+        base_name = os.path.basename(file_path)
+
+        # Build the backup name with a timestamp so earlier backups are not overwritten
+        timestamp = time.strftime("%Y%m%d-%H%M%S")
+        backup_name = f"{os.path.splitext(base_name)[0]}_{timestamp}.bak"
+        backup_path = os.path.join(dir_name, backup_name)
+
+        try:
+            # Rename the existing file
+            os.rename(file_path, backup_path)
+            print(f"existing file backed up as: {backup_path}")
+        except Exception as e:
+            print(f"error backing up file: {e}")
+            return False
+
+    return True
 
 def write_to_csv(data, filename='output.csv'):
     """Write resource data to a CSV file."""
@@ -7,12 +30,8 @@ def write_to_csv(data, filename='output.csv'):
         print("no data to write")
         return None
 
-    # Column names for the CSV file
-    fieldnames = [
-        'category', 'title', 'url',
-        'torrent_url', 'magnet_url',
-        'size_text', 'size_gb', 'update_date'
-    ]
+    # Derive the field names from the first record
+    fieldnames = list(data[0].keys()) if data else []
 
     try:
         # Write the CSV file
@@ -32,6 +51,36 @@ def write_to_csv(data, filename='output.csv'):
         print(f"error writing CSV file: {e}")
         return None
 
+def append_to_csv(data, filename='output.csv'):
+    """Append a batch of records to the CSV file and return its total line count."""
+    if not data:
+        print("no data to write")
+        return None
+
+    # Derive the field names from the first record
+    fieldnames = list(data[0].keys()) if data else []
+    file_exists = os.path.exists(filename)
+
+    try:
+        # Open the file in append mode
+        with open(filename, 'a', newline='', encoding='utf-8-sig') as csvfile:
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+
+            # Write the header first if the file did not exist yet
+            if not file_exists:
+                writer.writeheader()
+
+            # Write the data rows (writerows handles the whole batch)
+            writer.writerows(data)
+
+        # Count the total number of lines in the file (header included)
+        with open(filename, 'r', encoding='utf-8-sig') as f:
+            return sum(1 for _ in f)
+
+    except Exception as e:
+        print(f"error writing CSV file: {e}")
+        return None
+
 def read_csv_data(csv_file):
     """Read the CSV file and return a list of rows."""
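
Usage note (not part of the patch): a minimal sketch of how the new CSV helpers in utils.py are meant to be driven by the fetch loops. It assumes it is run from u9a9/src so that utils is importable; the target path and sample rows below are made up for illustration.

    import utils

    target = "/tmp/demo.csv"
    # Rename any previous output to <name>_<timestamp>.bak before the run starts.
    utils.backup_existing_file(target)

    buffer = []
    for page in range(1, 31):
        # In the real scripts each row comes from scraper.parse_sis_list or the u3c3 parser;
        # all rows must share the same keys, since the header is taken from the first record.
        buffer.append({"title": f"item {page}", "size_gb": 1.0, "format": "MP4"})

        # Flush every 10 pages; append_to_csv writes the header only when the file is
        # first created and returns the file's total line count (header included).
        if page % 10 == 0:
            total = utils.append_to_csv(buffer, target)
            print(f"flushed {len(buffer)} rows, file now has {total} lines")
            buffer.clear()

    # Final flush of whatever is left in the buffer.
    if buffer:
        utils.append_to_csv(buffer, target)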