resources/u9a9/src/fetch.py

import time
import os
import argparse
import textwrap
import logging
from datetime import datetime
from functools import partial
from urllib.parse import urlparse

import config
import scraper
import utils

config.setup_logging()

debug = False
skip_local = False
scan_mode = 0
update_mode = 0
current_date_str = datetime.now().strftime("%Y-%m-%d")
target_csv = f"{config.global_share_data_dir}/u3c3.csv"
target_torrent_dir = f"{config.global_share_data_dir}/u3c3_torrents"
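
# Assumed interfaces of the helper modules, inferred from how they are called in
# this file (the actual definitions live in config.py, scraper.py and utils.py):
#   config.setup_logging()                    -- configure the logging module
#   config.global_share_data_dir              -- base directory for shared data
#   scraper.fetch_page(url, validator)        -> (soup, status_code)
#   scraper.generic_validator(..., tag=..., identifier=..., attr_type=...) -> bool
#   scraper.parse_page(soup, url)             -> (list_data, total_pages)
#   scraper.download_torrent(url, local_path) -> bool
#   utils.write_to_csv(rows, path)            -> number of lines written
#   utils.read_csv_data(path)                 -> list of dicts keyed by CSV header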

# Fetch the resource list, page by page
def fetch_list(start_p=1):
    p = start_p
    total_results = []
    while True:
        url = f"https://u001.25img.com/?p={p}"
        logging.info(f"fetching url {url}")
        # A page is considered valid only if it contains a <div class="table-responsive">
        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="table-responsive", attr_type="class"))
        if soup:
            list_data, total_pages = scraper.parse_page(soup, url)
            if list_data:
                total_results.extend(list_data)
            else:
                logging.warning(f"fetch_list failed. url: {url}")
            if total_pages:
                if p >= total_pages:
                    # Last page reached
                    break
                p += 1
                time.sleep(1)
            else:
                logging.warning(f"fetch_list failed. url: {url}")
                break
        else:
            logging.warning(f'fetch_page error. url: {url}, status_code: {status_code}')
            break
        if debug:
            break

    # Write the results to the CSV file
    lines = utils.write_to_csv(total_results, target_csv)
    if lines:
        logging.info(f"write to file succ. total lines: {lines}, file: {target_csv}")
    logging.info(f"fetch list finished. total pages: {p}")
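
# Usage note: fetch_list() walks pages starting from start_p, so an interrupted run
# can be resumed with e.g. fetch_list(start_p=5). Results are buffered in memory and
# written to target_csv in a single batch once paging stops.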

# Download the torrent files
def down_torrents():
    # Read the CSV data
    rows = utils.read_csv_data(target_csv)
    if not rows:
        return
    # Create the main download directory
    os.makedirs(target_torrent_dir, exist_ok=True)
    for row in rows:
        title = row.get('title', '')
        torrent_url = row.get('torrent_url', '')
        # Validate the URL
        if not (torrent_url.startswith('https') and torrent_url.endswith('.torrent')):
            logging.warning(f"skipping invalid torrent link: {torrent_url}")
            continue
        # Extract the file name from the URL path
        try:
            parsed_url = urlparse(torrent_url)
            filename = os.path.basename(parsed_url.path)
            if not filename:
                logging.warning(f"cannot derive a file name from URL: {torrent_url}")
                continue
        except Exception as e:
            logging.warning(f"error parsing URL: {e}")
            continue
        # Create a subdirectory keyed by the lowercased first character of the file name
        first_char = filename[0].lower()
        subdir = os.path.join(target_torrent_dir, first_char)
        os.makedirs(subdir, exist_ok=True)
        # Skip the download if the file already exists locally
        local_path = os.path.join(subdir, filename)
        if os.path.exists(local_path):
            logging.info(f"file already exists, skipping download: {title}, {local_path}")
            continue
        succ = scraper.download_torrent(torrent_url, local_path)
        if succ:
            logging.info(f"download succ. {title}, {local_path}")
        if debug:
            break
        time.sleep(1)
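
# Design note: sharding torrents into one-character subdirectories keeps any single
# directory from accumulating thousands of files, and the exists-check makes the
# function idempotent, so it is safe to re-run after new rows are appended to the CSV.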

# Map shortcut names to functions
function_map = {
    "list": fetch_list,
    "down": down_torrents,
}
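
# --cmd selects entries from this map; without --cmd, main() runs every entry in
# insertion order ("list" first, then "down"): refresh the list, then download.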

# Main entry point
def main(cmd, args):
    if cmd:
        # Run only the requested functions
        function_names = cmd.split(",")
        for short_name in function_names:
            func = function_map.get(short_name.strip())  # look up the function in the map
            if callable(func):
                func()
            else:
                logging.warning(f"{short_name} is not a valid function shortcut.")
    else:
        # No --cmd given: run every mapped function
        for name, func in function_map.items():
            if callable(func):
                func()
            else:
                logging.warning(f"{name} is not a valid function shortcut.")
    logging.info('all processing completed!')
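
# Example: `python3 ./fetch.py --cmd=list,down` runs fetch_list() and then
# down_torrents(); unrecognized shortcuts are logged and skipped rather than aborting.
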
# TODO:
# 1,

# Apply the command-line flags to the module-level settings
def set_env(args):
    global debug
    debug = args.debug
    if debug:
        logger = logging.getLogger()
        logger.setLevel(logging.DEBUG)
    global skip_local
    skip_local = args.skip_local
    global scan_mode
    scan_mode = args.scan_mode
    global update_mode
    if args.update:
        update_mode = args.update
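
# Note: skip_local, scan_mode and update_mode are stored here but never read by
# fetch_list() or down_torrents(); they appear to be flags shared with sibling
# fetch scripts in this repository.
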
if __name__ == "__main__":
    # Command-line argument handling
    keys_str = ",".join(function_map.keys())
    usage_examples = textwrap.dedent('''
        Examples:
            python3 ./fetch.py              # refresh the list and download new resources
            python3 ./fetch.py --cmd=list   # refresh the list only
            python3 ./fetch.py --cmd=down   # download newly added resources only
    ''')
    parser = argparse.ArgumentParser(
        description='fetch u3c3 data.\n\n' + usage_examples,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
    parser.add_argument('--update', type=int, choices=[0, 1, 2, 3, 4], default=0, help='0 - only rows with is_full_data=0 (default), 1 - only is_full_data=1, 2 - is_full_data<=1, 3 - only is_full_data>1 (abnormal data), 4 - all rows')
    parser.add_argument('--scan_mode', type=int, choices=[0, 1, 2], default=1, help='1 - only uncensored makers/series/actors/movies (default), 0 - the opposite of 1, 2 - everything')
    parser.add_argument('--skip_local', action='store_true', help='Skip database operations when the page is already cached locally')
    parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
    args = parser.parse_args()
    set_env(args)
    main(args.cmd, args)