modify some scripts.

2025-03-03 19:01:41 +08:00
parent 8fd48687fc
commit f1e5abd6b3
10 changed files with 1642 additions and 0 deletions
--- a/scripts/iafd/src/utils.py
+++ b/scripts/iafd/src/utils.py
@ -0,0 +1,92 @@
+import re
+import os
+import json
+import time
+import csv
+import logging
+
+# Parse height and weight strings into numbers.
+def parse_height(height_str):
+    """Extract the centimetre figure from a height string.
+
+    The number is taken from the text after the last "(" with a trailing
+    " cm)" removed, e.g. "5 feet, 7 inches (170 cm)" gives 170.
+
+    An unconditional ``return 0`` used to sit at the top of this function and
+    made the parsing unreachable. It is removed; input that cannot be parsed
+    still yields 0, so callers keep receiving an int in every case.
+
+    Args:
+        height_str: Height text to parse.
+
+    Returns:
+        The parsed integer, or 0 when the input is not a string or does not
+        contain an integer in the expected position.
+    """
+    try:
+        return int(height_str.split("(")[-1].replace(" cm)", ""))
+    except (AttributeError, TypeError, ValueError):
+        # Not a string at all, or the extracted text is not an integer.
+        return 0
+
+def parse_weight(weight_str):
+    """Extract the leading integer from a weight string.
+
+    The number is the first space-separated token, e.g. "125 lbs (57 kg)"
+    gives 125.
+
+    An unconditional ``return 0`` used to sit at the top of this function and
+    made the parsing unreachable. It is removed; input that cannot be parsed
+    still yields 0, so callers keep receiving an int in every case.
+
+    Args:
+        weight_str: Weight text to parse.
+
+    Returns:
+        The parsed integer, or 0 when the input is not a string or its first
+        token is not an integer.
+    """
+    try:
+        return int(weight_str.split(" ")[0])
+    except (AttributeError, TypeError, ValueError):
+        # Not a string at all, or the first token is not an integer.
+        return 0
+    
+# Root output directory and the per-kind sub-directories below it.
+update_dir = '../result'
+performers_dir = update_dir + '/performers'
+movies_dir = update_dir + '/movies'
+
+def uniq_performers(new_performers):
+    """Remove duplicate performer entries, keeping the first one per href.
+
+    Args:
+        new_performers: List of mappings, each with a non-None "href" key.
+
+    Returns:
+        A new list with the first entry for every distinct "href", in input
+        order. If the argument is not a list, or any entry is empty, lacks
+        "href" or has a None "href", the problem is logged and an empty list
+        is returned instead of raising.
+    """
+    try:
+        if not isinstance(new_performers, list):
+            raise TypeError(f"new_performers should be a list, but got {type(new_performers)}")
+
+        seen = set()
+        unique_performers = []
+
+        for item in new_performers:
+            if not item or item['href'] is None:
+                raise ValueError(f"Invalid item in new_performers: {item}")
+
+            href = item["href"]
+            if href not in seen:
+                seen.add(href)
+                unique_performers.append(item)
+
+        return unique_performers
+
+    except Exception as e:
+        # Deliberate catch-all: one bad batch must not crash the caller.
+        # The message now names this function (it used to say
+        # "remove_duplicate_performers", which does not exist here).
+        logging.error("Error in uniq_performers: %s", e)
+        return []
+
+# Create the bucket sub-directory for a name.
+def create_sub_directory(base_dir, str):
+    """Ensure the one-character bucket directory for a name exists.
+
+    The bucket is the first character of the name, lower-cased; an empty name
+    maps to ``base_dir`` itself.
+
+    Args:
+        base_dir: Directory under which the bucket is created.
+        str: Name whose first character selects the bucket. The parameter
+            name shadows the builtin but is kept so keyword callers still work.
+
+    Returns:
+        The path of the bucket directory.
+    """
+    sub_dir = str[:1].lower()
+    full_path = os.path.join(base_dir, sub_dir)
+    # exist_ok avoids the check-then-create race when two writers target the
+    # same bucket at once.
+    os.makedirs(full_path, exist_ok=True)
+    return full_path
+
+# Pull the value of "id=" out of a URL such as
+# https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
+def extract_id_from_href(href):
+    """Return the id value found in href, or '' when there is none."""
+    found = re.search(r'id=([a-f0-9\-]+)', href)
+    if found is None:
+        return ''
+    return found.group(1)
+
+# Write one performer's data to its own JSON file.
+def write_person_json(person, href, data):
+    """Dump data as JSON into the performer's bucket directory.
+
+    The file is named "<person with spaces as dashes>(<id from href>).json".
+    Failures while opening or writing the file are logged, not raised.
+    """
+    # Bucket directory is chosen from the performer's name.
+    target_dir = create_sub_directory(performers_dir, person)
+    person_id = extract_id_from_href(href)
+    # Spaces in the name become dashes in the file name.
+    full_path = os.path.join(target_dir, f"{person.replace(' ', '-')}({person_id}).json")
+
+    try:
+        with open(full_path, 'w', encoding='utf-8') as out:
+            json.dump(data, out, indent=4, ensure_ascii=False)
+    except Exception as e:
+        logging.error(f"Error writing file {full_path}: {e}")
+
+
+
+# Write one movie's data to its own JSON file.
+def write_movie_json(href, data):
+    """Dump data as JSON into the movie's bucket directory.
+
+    The id is taken from href; it selects the bucket directory and is also the
+    file name ("<id>.json"). Failures while opening or writing the file are
+    logged, not raised.
+    """
+    movie_id = extract_id_from_href(href)
+    # Bucket directory is chosen from the movie id.
+    target_dir = create_sub_directory(movies_dir, movie_id)
+    full_path = os.path.join(target_dir, f"{movie_id}.json")
+
+    try:
+        with open(full_path, 'w', encoding='utf-8') as out:
+            json.dump(data, out, indent=4, ensure_ascii=False)
+    except Exception as e:
+        logging.error(f"Error writing file {full_path}: {e}")
+
+