# resources/scripts/iafd/src/utils.py

import re
import os
import json
import time
import csv
import logging

# Height and weight parsing (string -> number).
def parse_height(height_str):
    """Return the height as a number; parsing is currently disabled.

    The function has always returned ``0`` for every input, because an
    unconditional ``return 0`` preceded its parsing code. That unreachable
    code (and its bare ``except``) has been removed so the body matches the
    real behavior.

    NOTE(review): the stub looks intentional -- confirm. The disabled rule
    was: take the text after the last "(", strip the trailing " cm)", and
    convert it to ``int`` (returning ``None`` on failure).

    Args:
        height_str: Raw height text. Currently ignored.

    Returns:
        Always ``0``.
    """
    return 0

def parse_weight(weight_str):
    """Return the weight as a number; parsing is currently disabled.

    The function has always returned ``0`` for every input, because an
    unconditional ``return 0`` preceded its parsing code. That unreachable
    code (and its bare ``except``) has been removed so the body matches the
    real behavior.

    NOTE(review): the stub looks intentional -- confirm. The disabled rule
    was: convert the first space-separated token to ``int`` (returning
    ``None`` on failure).

    Args:
        weight_str: Raw weight text. Currently ignored.

    Returns:
        Always ``0``.
    """
    return 0

# Output locations. The root is a relative path, so it is resolved against
# the process's current working directory when files are written.
update_dir = '../result'
# Per-performer and per-movie JSON files live in sub-folders of the root.
performers_dir = update_dir + '/performers'
movies_dir = update_dir + '/movies'

def to_number(value):
    """Convert *value* to a float, returning 0 when it cannot be converted.

    Args:
        value: Anything accepted by ``float()`` (numeric string, int, float).

    Returns:
        ``float(value)`` on success, otherwise the integer ``0``.
    """
    try:
        number = float(value)
    except (ValueError, TypeError):
        # ValueError: non-numeric text; TypeError: None or an unsupported type.
        number = 0
    return number

def dist_stu_href_rewrite(href):
    """Rewrite a distributor/studio link into a normalized IAFD URL.

    Looks for ``distrib=<digits>`` or ``studio=<digits>`` anywhere in *href*
    and rebuilds the URL as ``https://www.iafd.com/<key>.rme/<key>=<digits>``,
    dropping whatever surrounded the match in the original link.

    Args:
        href: Link text to inspect.

    Returns:
        The rewritten URL, or ``None`` when *href* is not a string or does
        not contain a distrib/studio id.
    """
    # Uses the module-level `re` import; no function-local import is needed.
    # Non-string input (e.g. None from a missing attribute) is treated like
    # any other non-target link instead of raising TypeError.
    if not isinstance(href, str):
        return None

    match = re.search(r"(distrib|studio)=(\d+)", href)
    if match is None:
        return None  # not a distrib/studio URL

    key, id_number = match.groups()
    return f"https://www.iafd.com/{key}.rme/{key}={id_number}"

# Create the bucket directory for an entry
def create_sub_directory(base_dir, str):
    """Ensure the one-character bucket directory for a name exists.

    The bucket is the first character of *str*, lower-cased (an empty string
    maps to *base_dir* itself). Missing parent directories are created too.

    Args:
        base_dir: Directory under which the bucket is created.
        str: Name or id whose first character selects the bucket. The
            parameter name shadows the builtin but is kept so existing
            keyword callers continue to work.

    Returns:
        The path of the bucket directory.
    """
    sub_dir = str[:1].lower()
    full_path = os.path.join(base_dir, sub_dir)
    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(full_path, exist_ok=True)
    except FileExistsError:
        # A non-directory already occupies the path. As before, just return
        # the path and let the caller's open() report the problem.
        pass
    return full_path

# Pull the id value out of a URL such as
# https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
    """Return the value of the ``id=`` parameter in *href*.

    The value is the run of lowercase hex digits and hyphens that follows
    the first matching ``id=``.

    Args:
        href: URL text to search.

    Returns:
        The extracted id, or ``''`` when no id is found.
    """
    found = re.search(r'id=([a-f0-9\-]+)', href)
    if found is None:
        return ''
    return found.group(1)

# Write one JSON file per performer
def write_person_json(person, href, data):
    """Dump *data* as JSON to the performer's own file.

    The file is ``<name>(<id>).json`` inside the bucket directory returned by
    ``create_sub_directory(performers_dir, person)``, where ``<name>`` is
    *person* with spaces replaced by ``-`` and ``<id>`` comes from
    ``extract_id_from_href(href)``.

    Args:
        person: Performer name.
        href: Performer URL the id is extracted from.
        data: JSON-serializable payload.

    Errors raised while opening or writing the file are logged with
    ``logging.error`` and not re-raised.
    """
    target_dir = create_sub_directory(performers_dir, person)
    performer_id = extract_id_from_href(href)
    dashed_name = person.replace(' ', '-')  # spaces become hyphens in the file name
    full_path = os.path.join(target_dir, f"{dashed_name}({performer_id}).json")

    try:
        with open(full_path, 'w', encoding='utf-8') as out:
            json.dump(data, out, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {full_path}: {e}")


# Write one JSON file per movie
def write_movie_json(href, data):
    """Dump *data* as JSON to the movie's own file.

    The file is ``<id>.json`` inside the bucket directory returned by
    ``create_sub_directory(movies_dir, <id>)``, where ``<id>`` comes from
    ``extract_id_from_href(href)``.

    Args:
        href: Movie URL the id is extracted from.
        data: JSON-serializable payload.

    If *href* carries no id, the problem is logged and nothing is written.
    Errors raised while opening or writing the file are logged with
    ``logging.error`` and not re-raised.
    """
    movie_id = extract_id_from_href(href)
    if not movie_id:
        # An empty id would make every id-less movie share the single file
        # "<movies_dir>/.json", each write overwriting the previous one.
        logging.error(f"Cannot write movie JSON, no id found in href: {href}")
        return

    movie_dir = create_sub_directory(movies_dir, movie_id)
    full_path = os.path.join(movie_dir, f"{movie_id}.json")

    try:
        with open(full_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {full_path}: {e}")


# Read a JSON file and return its content
def read_json(file_path):
    """Load and return the JSON document stored at *file_path*.

    Args:
        file_path: Path of a UTF-8 encoded JSON file.

    Returns:
        The decoded JSON value, or ``None`` when the file does not exist or
        is not valid JSON (a message is printed in both cases).
    """
    content = None
    try:
        with open(file_path, 'r', encoding='utf-8') as source:
            content = json.load(source)
    except FileNotFoundError:
        print(f"文件 {file_path} 未找到.")
    except json.JSONDecodeError:
        print(f"文件 {file_path} 解析错误.")
    return content