This repository has been archived on 2026-01-07. You can view files and clone it, but cannot push or open issues or pull requests.
Files
resources/iafd/src/utils.py
2025-04-05 17:20:28 +08:00

173 lines
5.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import os
import json
import time
import csv
from datetime import datetime
import logging
import config
# 解析 height 和 weight转换成数字
def parse_height(height_str):
    """Return the numeric height for *height_str*; parsing is switched off.

    The unconditional ``return 0`` means every input currently yields ``0``.
    The statements after it are unreachable. If re-enabled they would take
    the text following the last ``(``, remove `` cm)`` from it and convert
    the remainder to ``int``, returning ``None`` when that fails.
    """
    # NOTE(review): confirm whether this short-circuit is intentional before
    # removing it -- callers currently always receive 0.
    return 0
    try:
        centimetres = height_str.split("(")[-1].replace(" cm)", "")
        return int(centimetres)
    except Exception:
        return None
def parse_weight(weight_str):
    """Return the numeric weight for *weight_str*; parsing is switched off.

    The unconditional ``return 0`` means every input currently yields ``0``.
    The statements after it are unreachable. If re-enabled they would
    convert the text before the first space to ``int``, returning ``None``
    when that fails.
    """
    # NOTE(review): confirm whether this short-circuit is intentional before
    # removing it -- callers currently always receive 0.
    return 0
    try:
        leading_token = weight_str.split(" ")[0]
        return int(leading_token)
    except Exception:
        return None
# Root output directory for IAFD data, built from config.global_host_data_dir.
update_dir = f'{config.global_host_data_dir}/iafd'
# Base directory under which write_person_json stores performer JSON files.
performers_dir = f'{update_dir}/performers'
# Base directory under which write_movie_json stores movie JSON files.
movies_dir = f'{update_dir}/movies'
def to_number(value):
    """Convert *value* to ``float``; return ``0`` when it cannot be converted.

    ``ValueError`` (e.g. non-numeric text) and ``TypeError`` (e.g. ``None``)
    are treated as "invalid" and mapped to ``0``.
    """
    try:
        number = float(value)
    except (ValueError, TypeError):
        number = 0
    return number
def is_valid_person_url(url):
    """Return ``True`` when *url* contains ``person.rme``, ignoring case."""
    return 'person.rme' in url.lower()
def dist_stu_href_rewrite(href):
    """Rewrite a distributor/studio link into an absolute IAFD URL.

    Searches *href* for ``distrib=<digits>`` or ``studio=<digits>`` and
    returns ``https://www.iafd.com/<key>.rme/<key>=<digits>``. Returns
    ``None`` when *href* contains neither pattern.
    """
    found = re.search(r"(distrib|studio)=(\d+)", href)
    if found is None:
        # Not a distributor/studio URL.
        return None
    key = found.group(1)
    id_number = found.group(2)
    return f"https://www.iafd.com/{key}.rme/{key}={id_number}"
# Create the bucket sub-directory for a name.
def create_sub_directory(base_dir, str):
    """Ensure the bucket directory for *str* exists and return its path.

    The bucket is the first character of *str*, lower-cased, joined onto
    *base_dir*. An empty *str* therefore maps to *base_dir* itself.

    Args:
        base_dir: Parent directory that holds the buckets.
        str: Name or id whose first character selects the bucket. The
            parameter name shadows the builtin and is kept only so existing
            keyword callers continue to work.

    Returns:
        The path of the bucket directory.

    Raises:
        OSError: If the directory cannot be created, including when the
            target path already exists but is not a directory.
    """
    bucket = str[:1].lower()
    full_path = os.path.join(base_dir, bucket)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(full_path, exist_ok=True)
    return full_path
# Pull the id value out of a link such as
# https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
    """Return the ``id=`` value found in *href*, or ``''`` when absent.

    The value is the run of lowercase hex digits and hyphens that follows
    the first ``id=`` in the string.
    """
    found = re.search(r'id=([a-f0-9\-]+)', href)
    if found is None:
        return ''
    return found.group(1)
# Write one JSON file per performer.
def write_person_json(person, href, data):
    """Dump *data* as JSON into the performer's bucket directory.

    The bucket directory comes from ``create_sub_directory(performers_dir,
    person)``. The file is named ``<person>(<id>).json``, with spaces in
    *person* replaced by ``-`` and the id taken from
    ``extract_id_from_href(href)``. Errors raised while opening or writing
    the file are logged and not re-raised.
    """
    target_dir = create_sub_directory(performers_dir, person)
    person_id = extract_id_from_href(href)
    slug = person.replace(' ', '-')  # spaces become hyphens in the filename
    full_path = os.path.join(target_dir, f"{slug}({person_id}).json")
    try:
        with open(full_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {full_path}: {e}")
# Write one JSON file per movie.
def write_movie_json(href, data):
    """Dump *data* as JSON into the movie's bucket directory.

    The movie id comes from ``extract_id_from_href(href)``. It selects the
    bucket via ``create_sub_directory(movies_dir, movie_id)`` and names the
    file ``<movie_id>.json``. Errors raised while opening or writing the
    file are logged and not re-raised.
    """
    movie_id = extract_id_from_href(href)
    target_dir = create_sub_directory(movies_dir, movie_id)
    full_path = os.path.join(target_dir, f"{movie_id}.json")
    try:
        with open(full_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {full_path}: {e}")
# Save the fetched raw HTML so it can be checked later.
def write_raw_html(href, html_text):
    """Store *html_text* on disk under a path derived from *href*.

    Links containing ``person.rme`` go under ``raw_performers`` and links
    containing ``title.rme`` go under ``raw_movies`` (matched ignoring
    case). Any other link is ignored. The file is named ``<id>.html`` with
    the id taken from ``extract_id_from_href(href)``. Failures while
    opening or writing the file are logged as warnings and not re-raised.
    """
    page_id = extract_id_from_href(href)
    lowered = href.lower()
    known_kinds = (('person.rme', 'raw_performers'), ('title.rme', 'raw_movies'))
    dir_prefix = next(
        (folder for marker, folder in known_kinds if marker in lowered),
        None,
    )
    if dir_prefix is None:
        return

    file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", page_id)
    full_path = os.path.join(file_dir, f"{page_id}.html")
    try:
        with open(full_path, 'w', encoding='utf-8') as file:
            file.write(html_text)
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限写入文件 {full_path}")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")
# Load previously saved raw HTML when the local copy is recent enough.
def read_raw_html(href, expire_date="2025-03-01"):
    """Return the saved HTML for *href* if the file is newer than the cutoff.

    Links containing ``person.rme`` are looked up under ``raw_performers``
    and links containing ``title.rme`` under ``raw_movies`` (matched
    ignoring case). Any other link returns ``None``.

    Args:
        href: Page link; its id (via ``extract_id_from_href``) names the file.
        expire_date: Cutoff for the file's modification time. Either a
            ``datetime`` or an ISO-format string such as ``"2025-03-01"``.
            Only files modified strictly after the cutoff are returned.

    Returns:
        The file contents as ``str``. Returns ``None`` when the link is not
        recognised, the file is missing or too old, the cutoff string
        cannot be parsed, or reading fails (failures are logged as
        warnings).
    """
    page_id = extract_id_from_href(href)
    lowered = href.lower()
    if 'person.rme' in lowered:
        dir_prefix = 'raw_performers'
    elif 'title.rme' in lowered:
        dir_prefix = 'raw_movies'
    else:
        return None

    # A datetime cannot be ordered against a str: the comparison raises
    # TypeError, which the blanket handler below would swallow, so a string
    # cutoff (including the default) would never yield a cached file.
    if isinstance(expire_date, str):
        try:
            expire_date = datetime.fromisoformat(expire_date)
        except ValueError:
            logging.warning(f"invalid expire_date {expire_date!r} on href {href}")
            return None

    file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", page_id)
    full_path = os.path.join(file_dir, f"{page_id}.html")
    try:
        if not os.path.exists(full_path):
            return None
        # Last-modified time of the saved copy, as a naive local datetime.
        last_modified_date = datetime.fromtimestamp(os.path.getmtime(full_path))
        if last_modified_date > expire_date:
            logging.debug(f"find local file on href {href}")
            with open(full_path, 'r', encoding='utf-8') as file:
                return file.read()
        logging.debug(f"expired file {last_modified_date} on href {href}")
        return None
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限读取文件 {full_path}")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")
    return None
# Read a JSON file and return its content.
def read_json(file_path):
    """Load and return the JSON document stored at *file_path*.

    Returns ``None`` after printing a message when the file does not exist
    or its content is not valid JSON.
    """
    parsed = None
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            parsed = json.load(f)
    except FileNotFoundError:
        print(f"文件 {file_path} 未找到.")
    except json.JSONDecodeError:
        print(f"文件 {file_path} 解析错误.")
    return parsed