# resources/iafd/src/utils.py

import re
import os
import json
import time
import csv
from datetime import datetime
import logging
import config

# Parse height and weight (convert to numbers)
def parse_height(height_str):
    """Return the height as a number; parsing is currently disabled.

    Always returns 0, whatever ``height_str`` contains.

    NOTE(review): this function previously had an unconditional ``return 0``
    placed ahead of its parsing code, so that code (and its bare ``except:``)
    could never run. The unreachable code took the text after the last "("
    in ``height_str``, removed " cm)" and converted the remainder with
    ``int``, returning ``None`` on any failure. The dead code has been
    removed and the observable behaviour (always 0) kept; confirm whether
    the short-circuit is intentional before restoring real parsing.

    Args:
        height_str: Raw height text. Currently ignored.

    Returns:
        The integer 0.
    """
    return 0

def parse_weight(weight_str):
    """Return the weight as a number; parsing is currently disabled.

    Always returns 0, whatever ``weight_str`` contains.

    NOTE(review): this function previously had an unconditional ``return 0``
    placed ahead of its parsing code, so that code (and its bare ``except:``)
    could never run. The unreachable code took the text before the first
    space in ``weight_str`` and converted it with ``int``, returning ``None``
    on any failure. The dead code has been removed and the observable
    behaviour (always 0) kept; confirm whether the short-circuit is
    intentional before restoring real parsing.

    Args:
        weight_str: Raw weight text. Currently ignored.

    Returns:
        The integer 0.
    """
    return 0

# Root directory for IAFD data, built from config.global_host_data_dir.
update_dir = f'{config.global_host_data_dir}/iafd'
# Directory that write_person_json() writes per-performer JSON files under.
performers_dir = f'{update_dir}/performers'
# Directory that write_movie_json() writes per-movie JSON files under.
movies_dir = f'{update_dir}/movies'

def to_number(value):
    """Convert ``value`` to a float, falling back to 0 when it cannot be converted.

    ``float(value)`` is attempted; a ``ValueError`` or ``TypeError`` (for
    example a non-numeric string or ``None``) yields 0 instead.
    """
    try:
        number = float(value)
    except (ValueError, TypeError):
        number = 0
    return number

def is_valid_person_url(url):
    """Return True when ``url`` contains 'person.rme' (case-insensitive), else False."""
    return 'person.rme' in url.lower()

def dist_stu_href_rewrite(href):
    """Rebuild a distributor or studio link in a fixed iafd.com URL form.

    Searches ``href`` for the first ``distrib=<digits>`` or
    ``studio=<digits>`` and returns
    ``https://www.iafd.com/<key>.rme/<key>=<digits>``.
    Returns ``None`` when ``href`` contains neither pattern.
    """
    found = re.search(r"(distrib|studio)=(\d+)", href)
    if found is None:
        # Not a distributor/studio link.
        return None

    kind = found.group(1)
    number = found.group(2)
    return f"https://www.iafd.com/{kind}.rme/{kind}={number}"

# Create the shard sub-directory for a name
def create_sub_directory(base_dir, str):
    """Ensure the shard directory for ``str`` exists under ``base_dir``.

    The shard name is the first character of ``str``, lower-cased (an empty
    ``str`` gives an empty shard name, i.e. ``base_dir`` itself with a
    trailing separator). Missing parent directories are created as well.

    Args:
        base_dir: Directory under which the shard directory lives.
        str: Name or id whose first character selects the shard. The
            parameter keeps its original name (which shadows the builtin)
            so existing callers are unaffected.

    Returns:
        The path of the shard directory, ``os.path.join(base_dir, shard)``.

    Raises:
        FileExistsError: If the shard path exists but is not a directory.
    """
    shard = str[:1].lower()
    full_path = os.path.join(base_dir, shard)
    # exist_ok=True replaces the separate exists() check, so a directory
    # created by another process between check and creation is not an error.
    os.makedirs(full_path, exist_ok=True)
    return full_path

# Extract the id value from a link such as
# https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
    """Return the value following ``id=`` in ``href``, or '' if there is none.

    The value is the first run of lowercase hex characters (``a-f``, ``0-9``)
    and hyphens that follows ``id=``.
    """
    found = re.search(r'id=([a-f0-9\-]+)', href)
    if not found:
        return ''
    return found.group(1)

# Write each performer's data to its own JSON file
def write_person_json(person, href, data):
    """Dump ``data`` as JSON into the performer's file under ``performers_dir``.

    The file is placed in the shard directory returned by
    ``create_sub_directory`` for ``person`` and is named
    ``<person with spaces replaced by '-'>(<id from href>).json``.
    Any exception raised while opening or writing the file is logged as an
    error and not re-raised.
    """
    target_dir = create_sub_directory(performers_dir, person)
    performer_id = extract_id_from_href(href)
    # Spaces in the performer name are replaced with '-'.
    dashed_name = person.replace(' ', '-')
    full_path = os.path.join(target_dir, f"{dashed_name}({performer_id}).json")

    try:
        with open(full_path, 'w', encoding='utf-8') as out:
            json.dump(data, out, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {full_path}: {e}")


# Write each movie's data to its own JSON file
def write_movie_json(href, data):
    """Dump ``data`` as JSON into ``<movie id>.json`` under ``movies_dir``.

    The movie id is taken from ``href`` with ``extract_id_from_href`` and
    also selects the shard directory via ``create_sub_directory``. Any
    exception raised while opening or writing the file is logged as an
    error and not re-raised.
    """
    movie_id = extract_id_from_href(href)
    target_dir = create_sub_directory(movies_dir, movie_id)
    full_path = os.path.join(target_dir, f"{movie_id}.json")

    try:
        with open(full_path, 'w', encoding='utf-8') as out:
            json.dump(data, out, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {full_path}: {e}")

# Save the fetched raw HTML so it can be verified later
def write_raw_html(href, html_text):
    """Store ``html_text`` as ``<id>.html`` for a performer or movie page.

    Links containing 'person.rme' go under ``raw_performers`` and links
    containing 'title.rme' go under ``raw_movies`` (both checks are
    case-insensitive, both directories live under ``update_dir``). Any other
    link is ignored. Errors while opening or writing the file are logged as
    warnings and not re-raised.
    """
    page_id = extract_id_from_href(href)
    lowered = href.lower()
    # First matching marker wins; 'person.rme' is checked before 'title.rme'.
    routes = (('person.rme', 'raw_performers'), ('title.rme', 'raw_movies'))
    dir_prefix = next((prefix for marker, prefix in routes if marker in lowered), None)
    if dir_prefix is None:
        return

    target_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", page_id)
    full_path = os.path.join(target_dir, f"{page_id}.html")

    try:
        with open(full_path, 'w', encoding='utf-8') as out:
            out.write(html_text)
    except FileNotFoundError:
        logging.warning(f"错误：指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误：没有权限写入文件 {full_path}。")
    except Exception as e:
        logging.warning(f"发生未知错误：{e}")


# Read back previously saved raw HTML, if the saved copy is recent enough
def read_raw_html(href, expire_date="2025-03-01"):
    """Return the saved raw HTML for ``href`` if it is newer than ``expire_date``.

    Links containing 'person.rme' are looked up under ``raw_performers`` and
    links containing 'title.rme' under ``raw_movies`` (case-insensitive,
    both under ``update_dir``); the file name is ``<id>.html``.

    Args:
        href: Page link; its ``id=`` value names the saved file.
        expire_date: Cut-off for the file's last-modified time, either a
            ``"YYYY-MM-DD"`` string or a ``datetime``. A file modified at or
            before the cut-off is treated as expired.

    Returns:
        The file's text, or ``None`` when the link is of another kind, the
        file is missing or expired, ``expire_date`` is an invalid string, or
        reading fails (failures are logged as warnings).
    """
    page_id = extract_id_from_href(href)
    lowered = href.lower()
    if 'person.rme' in lowered:
        dir_prefix = 'raw_performers'
    elif 'title.rme' in lowered:
        dir_prefix = 'raw_movies'
    else:
        return None

    # NOTE: create_sub_directory also creates the shard directory if it does
    # not exist yet, so a read can create an empty directory.
    file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", page_id)
    full_path = os.path.join(file_dir, f"{page_id}.html")

    # The cut-off must be a datetime: comparing the file's datetime with the
    # raw string raises TypeError, which the broad handler below would
    # swallow, so the saved copy would never be returned.
    if isinstance(expire_date, str):
        try:
            expire_date = datetime.strptime(expire_date, "%Y-%m-%d")
        except ValueError:
            logging.warning(f"invalid expire_date {expire_date!r}, expected YYYY-MM-DD")
            return None

    try:
        if not os.path.exists(full_path):
            return None

        # Last modification time of the saved file, as a naive local datetime.
        last_modified_date = datetime.fromtimestamp(os.path.getmtime(full_path))
        if last_modified_date <= expire_date:
            logging.debug(f"expired file {last_modified_date} on href {href}")
            return None

        logging.debug(f"find local file on href {href}")
        with open(full_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        logging.warning(f"错误：指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误：没有权限读取文件 {full_path}。")
    except Exception as e:
        logging.warning(f"发生未知错误：{e}")
    return None


# Read a JSON file and return its content
def read_json(file_path):
    """Load and return the JSON document stored at ``file_path``.

    Returns ``None`` (after printing a message) when the file does not exist
    or its content is not valid JSON.
    """
    parsed = None
    try:
        with open(file_path, 'r', encoding='utf-8') as source:
            parsed = json.load(source)
    except FileNotFoundError:
        print(f"文件 {file_path} 未找到.")
    except json.JSONDecodeError:
        print(f"文件 {file_path} 解析错误.")
    return parsed