import re
import os
import json
import time
import csv
from datetime import datetime
import logging

import config


def parse_height(height_str):
    """Return the numeric height for *height_str*; currently always ``0``.

    NOTE(review): the previous body returned 0 unconditionally, ahead of an
    unreachable parser that took the text after the last "(", removed the
    " cm)" suffix and converted the rest to ``int`` (``None`` on failure).
    The observable behaviour (always 0) is kept so callers are unaffected;
    confirm whether real parsing should be re-enabled.
    """
    return 0


def parse_weight(weight_str):
    """Return the numeric weight for *weight_str*; currently always ``0``.

    NOTE(review): the previous body returned 0 unconditionally, ahead of an
    unreachable parser that converted the first space-separated token to
    ``int`` (``None`` on failure). The observable behaviour (always 0) is
    kept so callers are unaffected; confirm whether real parsing should be
    re-enabled.
    """
    return 0


# Output locations, all rooted at the data directory configured in `config`.
update_dir = f'{config.global_host_data_dir}/iafd'
performers_dir = f'{update_dir}/performers'
movies_dir = f'{update_dir}/movies'


def to_number(value):
    """Convert *value* to ``float``; return 0 when it cannot be converted."""
    try:
        return float(value)
    except (ValueError, TypeError):
        return 0


def is_valid_person_url(url):
    """Return True when *url* contains ``person.rme`` (case-insensitive)."""
    return 'person.rme' in url.lower()


def dist_stu_href_rewrite(href):
    """Rewrite a distributor/studio link into its canonical IAFD URL.

    Looks for ``distrib=<digits>`` or ``studio=<digits>`` in *href* and
    returns ``https://www.iafd.com/<key>.rme/<key>=<digits>``. Returns
    ``None`` when *href* contains neither pattern.
    """
    match = re.search(r"(distrib|studio)=(\d+)", href)
    if not match:
        return None  # not a distributor/studio URL

    key, id_number = match.groups()
    return f"https://www.iafd.com/{key}.rme/{key}={id_number}"


def create_sub_directory(base_dir, str):
    """Ensure the one-character bucket directory for *str* exists.

    The bucket name is the first character of *str*, lower-cased (an empty
    *str* maps to *base_dir* itself). Returns the bucket's path.

    The parameter name ``str`` shadows the builtin; it is kept unchanged so
    that existing keyword callers keep working.
    """
    sub_dir = str[:1].lower()
    full_path = os.path.join(base_dir, sub_dir)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(full_path, exist_ok=True)
    return full_path


def extract_id_from_href(href):
    """Extract the ``id=`` value from *href*, or ``''`` when there is none.

    Example input:
    https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
    """
    match = re.search(r'id=([a-f0-9\-]+)', href)
    return match.group(1) if match else ''


def _write_json_file(full_path, data):
    """Dump *data* as indented UTF-8 JSON to *full_path*, logging any failure."""
    try:
        with open(full_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {full_path}: {e}")


def write_person_json(person, href, data):
    """Write one performer's *data* to its own JSON file.

    The file is stored under ``performers_dir`` in the bucket chosen from
    the performer name, and is named ``<name-with-dashes>(<id>).json`` where
    the id is extracted from *href*. Write errors are logged, not raised.
    """
    person_dir = create_sub_directory(performers_dir, person)
    person_id = extract_id_from_href(href)
    # Spaces in the performer name are replaced with '-'.
    person_filename = f"{person.replace(' ', '-')}({person_id}).json"
    _write_json_file(os.path.join(person_dir, person_filename), data)


def write_movie_json(href, data):
    """Write one movie's *data* to its own JSON file.

    The file is stored under ``movies_dir`` in the bucket chosen from the
    movie id extracted from *href*, and is named ``<id>.json``. Write errors
    are logged, not raised.
    """
    movie_id = extract_id_from_href(href)
    movie_dir = create_sub_directory(movies_dir, movie_id)
    _write_json_file(os.path.join(movie_dir, f"{movie_id}.json"), data)


def _raw_html_dir_prefix(href):
    """Return the raw-HTML directory name for *href*, or None if unsupported."""
    lowered = href.lower()
    if 'person.rme' in lowered:
        return 'raw_performers'
    if 'title.rme' in lowered:
        return 'raw_movies'
    return None


def write_raw_html(href, html_text):
    """Save the fetched raw HTML for *href* so it can be verified later.

    Only performer (``person.rme``) and movie (``title.rme``) links are
    stored; any other *href* is ignored. Write errors are logged as
    warnings, not raised.
    """
    dir_prefix = _raw_html_dir_prefix(href)
    if dir_prefix is None:
        return

    raw_id = extract_id_from_href(href)
    file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", raw_id)
    full_path = os.path.join(file_dir, f"{raw_id}.html")

    try:
        with open(full_path, 'w', encoding='utf-8') as file:
            file.write(html_text)
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限写入文件 {full_path}。")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")


def read_raw_html(href, expire_date="2025-03-01"):
    """Read previously saved raw HTML for *href*, if it is fresh enough.

    Args:
        href: performer (``person.rme``) or movie (``title.rme``) link.
        expire_date: cut-off as a ``datetime`` or a ``YYYY-MM-DD`` string.
            A saved file is used only when its modification time is later
            than this cut-off.

    Returns:
        The file content, or ``None`` when *href* is unsupported, no file
        exists, the file is older than the cut-off, or reading fails
        (failures are logged as warnings).
    """
    dir_prefix = _raw_html_dir_prefix(href)
    if dir_prefix is None:
        return None

    raw_id = extract_id_from_href(href)
    # Same layout as write_raw_html, but the path is only computed here:
    # a read must not create directories as a side effect.
    full_path = os.path.join(
        f"{update_dir}/{dir_prefix}", raw_id[:1].lower(), f"{raw_id}.html"
    )

    try:
        # The default is a string; comparing it directly with a datetime
        # raises TypeError, so coerce it to a datetime first.
        cutoff = expire_date
        if isinstance(cutoff, str):
            cutoff = datetime.strptime(cutoff, "%Y-%m-%d")

        if not os.path.exists(full_path):
            return None

        last_modified_date = datetime.fromtimestamp(os.path.getmtime(full_path))
        if last_modified_date > cutoff:
            logging.debug(f"find local file on href {href}")
            with open(full_path, 'r', encoding='utf-8') as file:
                return file.read()

        logging.debug(f"expired file {last_modified_date} on href {href}")
        return None
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限读取文件 {full_path}。")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")
    return None


def read_json(file_path):
    """Load and return the JSON content of *file_path*.

    Returns ``None`` (after logging a warning) when the file is missing or
    is not valid JSON.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        # Reported through logging, like the rest of this module.
        logging.warning(f"文件 {file_path} 未找到.")
        return None
    except json.JSONDecodeError:
        logging.warning(f"文件 {file_path} 解析错误.")
        return None