diff --git a/scripts/iafd/src_json/movie_detail_fetch.py b/scripts/iafd/src_json/movie_detail_fetch.py index 247dd34..911208d 100644 --- a/scripts/iafd/src_json/movie_detail_fetch.py +++ b/scripts/iafd/src_json/movie_detail_fetch.py @@ -22,6 +22,7 @@ INPUT_FILE = os.path.join(RESULT_DIR, "movie_list.json") OUTPUT_JSON = os.path.join(OUTPUT_DIR, "movie_details.json") OUTPUT_CSV = os.path.join(OUTPUT_DIR, "movie_details.csv") BATCH_SIZE = 100 # 每100条数据写入文件 +movies_dir = f'{RESULT_DIR}/movies' # 初始化 Cloudflare 绕过工具 scraper = cloudscraper.create_scraper() @@ -197,6 +198,34 @@ def parse_movie_details(html, href, title): "AppearsIn": appears_in, } +# 创建目录 +def create_sub_directory(base_dir, str): + # 获取 person 的前两个字母并转为小写 + sub_dir = str[:1].lower() + full_path = os.path.join(base_dir, sub_dir) + if not os.path.exists(full_path): + os.makedirs(full_path) + return full_path + +# 从 https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586 中抽取 id 的值 +def extract_id_from_href(href): + """从href中提取id参数""" + match = re.search(r'id=([a-f0-9\-]+)', href) + return match.group(1) if match else '' + +# 写入每个 performer 的单独 JSON 文件 +def write_movie_json(href, data): + # 获取目录 + movie_id = extract_id_from_href(href) + person_dir = create_sub_directory(movies_dir, movie_id) + person_filename = f"{movie_id}.json" # 用 - 替换空格 + full_path = os.path.join(person_dir, person_filename) + + try: + with open(full_path, 'w', encoding='utf-8') as json_file: + json.dump(data, json_file, indent=4, ensure_ascii=False) + except Exception as e: + logging.error(f"Error writing file {full_path}: {e}") def process_movies(): """处理电影数据""" @@ -233,6 +262,9 @@ def process_movies(): else: all_movies.append(movie) count += 1 + + # 写入本地文件 + write_movie_json(href, movie) break # 每 BATCH_SIZE 条数据刷新一次文件 @@ -270,15 +302,10 @@ def process_one(href): logging.warning(f'fetching {href} error. retrying...') continue # 获取失败,跳过 - id = extract_id_from_href(href) - filename = f"./log/{id}.json" # 用 - 替换空格 + if movie: + write_movie_json(href, movie) - try: - with open(filename, 'w', encoding='utf-8') as json_file: - json.dump(movie, json_file, indent=4, ensure_ascii=False) - except Exception as e: - logging.error(f"Error writing file {filename}: {e}") - print(f'fetch succ. saved result in {filename}') + print(f'fetch succ. saved result in {movies_dir}') # 处理程序被终止时的数据 def handle_exit_signal(signal, frame):