Modify movie_detail_fetch.py: write each movie's details to its own JSON file.

2025-03-05 16:36:34 +08:00
parent 3f0a8acb6b
commit 977338a281
1 changed file with 35 additions and 8 deletions
--- a/scripts/iafd/src_json/movie_detail_fetch.py
+++ b/scripts/iafd/src_json/movie_detail_fetch.py
@@ -22,6 +22,7 @@ INPUT_FILE = os.path.join(RESULT_DIR, "movie_list.json")
 OUTPUT_JSON = os.path.join(OUTPUT_DIR, "movie_details.json")
 OUTPUT_CSV = os.path.join(OUTPUT_DIR, "movie_details.csv")
 BATCH_SIZE = 100  # 每100条数据写入文件
+movies_dir = f'{RESULT_DIR}/movies'

 # 初始化 Cloudflare 绕过工具
 scraper = cloudscraper.create_scraper()
@@ -197,6 +198,34 @@ def parse_movie_details(html, href, title):
        "AppearsIn": appears_in,
    }

+# 创建目录
+def create_sub_directory(base_dir, str):
+    # Use the first character of the string, lowercased, as the sub-directory name
+    sub_dir = str[:1].lower()
+    full_path = os.path.join(base_dir, sub_dir)
+    if not os.path.exists(full_path):
+        os.makedirs(full_path)
+    return full_path
+
+# 从 https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586 中抽取 id 的值
+def extract_id_from_href(href):
+    """从href中提取id参数"""
+    match = re.search(r'id=([a-f0-9\-]+)', href)
+    return match.group(1) if match else ''
+
+# Write each movie's data to its own JSON file
+def write_movie_json(href, data):
+    # 获取目录
+    movie_id = extract_id_from_href(href)
+    person_dir = create_sub_directory(movies_dir, movie_id)
+    person_filename = f"{movie_id}.json"  # file is named after the movie id
+    full_path = os.path.join(person_dir, person_filename)
+
+    try:
+        with open(full_path, 'w', encoding='utf-8') as json_file:
+            json.dump(data, json_file, indent=4, ensure_ascii=False)
+    except Exception as e:
+        logging.error(f"Error writing file {full_path}: {e}")

 def process_movies():
    """处理电影数据"""
@@ -233,6 +262,9 @@ def process_movies():
                else:
                    all_movies.append(movie)
                    count += 1
+
+                    # 写入本地文件
+                    write_movie_json(href, movie)
                    break

        # 每 BATCH_SIZE 条数据刷新一次文件
@@ -270,15 +302,10 @@ def process_one(href):
            logging.warning(f'fetching {href} error. retrying...')
            continue  # 获取失败，跳过
    
-    id = extract_id_from_href(href)
-    filename = f"./log/{id}.json"  # 用 - 替换空格
+    if movie:
+        write_movie_json(href, movie)

-    try:
-        with open(filename, 'w', encoding='utf-8') as json_file:
-            json.dump(movie, json_file, indent=4, ensure_ascii=False)
-    except Exception as e:
-        logging.error(f"Error writing file {filename}: {e}")
-    print(f'fetch succ. saved result in {filename}')
+    print(f'fetch succ. saved result in {movies_dir}')

 # 处理程序被终止时的数据
 def handle_exit_signal(signal, frame):