173 lines
5.6 KiB
Python
173 lines
5.6 KiB
Python
import re
|
||
import os
|
||
import json
|
||
import time
|
||
import csv
|
||
from datetime import datetime
|
||
import logging
|
||
import config
|
||
|
||
# Parse height and weight strings into numbers.
def parse_height(height_str):
    """Extract the integer value from the trailing "(NNN cm)" part of a height string.

    The text after the last "(" is taken, the literal " cm)" suffix is
    removed, and the remainder is converted with ``int``. A bare number such
    as ``"168"`` (no parentheses) is therefore accepted as well.

    Previously an unconditional ``return 0`` at the top of the function made
    the parsing code unreachable, so every input produced 0.

    Args:
        height_str: Raw height text, e.g. ``"5 feet, 6 inches (168 cm)"``.

    Returns:
        The parsed ``int``, or ``0`` when the input is empty, is not a
        string, or does not contain a parsable number. ``0`` matches the
        "invalid -> 0" convention of ``to_number`` in this module.
    """
    if not height_str:
        return 0
    try:
        # Keep only what follows the last "(" and strip the " cm)" suffix.
        return int(height_str.split("(")[-1].replace(" cm)", ""))
    except (AttributeError, TypeError, ValueError):
        # Non-string input or non-numeric text: report "unknown" as 0.
        return 0
|
||
|
||
def parse_weight(weight_str):
    """Extract the leading integer token from a weight string.

    The string is split on single spaces and the first token is converted
    with ``int``; the unit of that number is whatever the input text uses.

    Previously an unconditional ``return 0`` at the top of the function made
    the parsing code unreachable, so every input produced 0.

    Args:
        weight_str: Raw weight text, e.g. ``"125 lbs (57 kg)"``.

    Returns:
        The parsed ``int``, or ``0`` when the input is empty, is not a
        string, or does not start with a parsable integer. ``0`` matches the
        "invalid -> 0" convention of ``to_number`` in this module.
    """
    if not weight_str:
        return 0
    try:
        return int(weight_str.split(" ")[0])
    except (AttributeError, TypeError, ValueError):
        # Non-string input or non-numeric first token: report "unknown" as 0.
        return 0
|
||
|
||
# Root of the IAFD output tree, built from config.global_host_data_dir.
update_dir = f'{config.global_host_data_dir}/iafd'
# Sub-trees used by the JSON writers in this module.
performers_dir = '/'.join((update_dir, 'performers'))
movies_dir = '/'.join((update_dir, 'movies'))
|
||
|
||
def to_number(value):
    """Convert *value* to a float; return 0 when the conversion is invalid."""
    try:
        number = float(value)
    except (ValueError, TypeError):
        number = 0
    return number
|
||
|
||
def is_valid_person_url(url):
    """Return True when *url* contains ``person.rme``, ignoring letter case."""
    lowered = url.lower()
    return 'person.rme' in lowered
|
||
|
||
def dist_stu_href_rewrite(href):
    """Rebuild a canonical distributor/studio URL from *href*.

    Looks for ``distrib=<digits>`` or ``studio=<digits>`` anywhere in *href*.

    Returns:
        ``https://www.iafd.com/<key>.rme/<key>=<digits>`` for the first
        match, or ``None`` when *href* contains neither pattern.
    """
    found = re.search(r"(distrib|studio)=(\d+)", href)
    if found is None:
        # Not a distributor/studio link.
        return None

    kind = found.group(1)
    number = found.group(2)
    return f"https://www.iafd.com/{kind}.rme/{kind}={number}"
|
||
|
||
# Create the bucket directory for a name.
def create_sub_directory(base_dir, str):
    """Ensure the one-character bucket directory for *str* exists and return it.

    The bucket name is the first character of *str*, lower-cased (an empty
    *str* yields an empty bucket name, i.e. *base_dir* itself with a trailing
    separator).

    Args:
        base_dir: Directory under which the bucket is created.
        str: Name or id whose first character selects the bucket. The
            parameter shadows the builtin ``str``; the name is kept so that
            existing keyword callers continue to work.

    Returns:
        The path of the bucket directory.
    """
    sub_dir = str[:1].lower()
    full_path = os.path.join(base_dir, sub_dir)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(full_path, exist_ok=True)
    return full_path
|
||
|
||
# Example: https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
# yields 21898a3c-1ddd-4793-8d93-375d6db20586.
def extract_id_from_href(href):
    """Return the value of the ``id=`` parameter in *href*.

    Only lower-case hex digits and dashes are captured. An empty string is
    returned when *href* has no such parameter.
    """
    found = re.search(r'id=([a-f0-9\-]+)', href)
    if found is None:
        return ''
    return found.group(1)
|
||
|
||
# Write one JSON file per performer.
def write_person_json(person, href, data):
    """Dump *data* as JSON into the performer's bucket directory.

    The file is named ``<person with spaces replaced by dashes>(<id>).json``,
    where ``<id>`` comes from ``extract_id_from_href(href)``, and is placed
    in the bucket returned by ``create_sub_directory(performers_dir, person)``.
    Any exception raised while writing is logged and not re-raised.
    """
    target_dir = create_sub_directory(performers_dir, person)
    performer_id = extract_id_from_href(href)
    # Spaces in the name become dashes in the file name.
    file_name = "{}({}).json".format(person.replace(' ', '-'), performer_id)
    out_path = os.path.join(target_dir, file_name)

    try:
        with open(out_path, 'w', encoding='utf-8') as handle:
            json.dump(data, handle, indent=4, ensure_ascii=False)
    except Exception as exc:
        logging.error(f"Error writing file {out_path}: {exc}")
|
||
|
||
|
||
# Write one JSON file per movie.
def write_movie_json(href, data):
    """Dump *data* as JSON into the movie's bucket directory.

    The file is named ``<id>.json``, where ``<id>`` comes from
    ``extract_id_from_href(href)``, and is placed in the bucket returned by
    ``create_sub_directory(movies_dir, <id>)``. Any exception raised while
    writing is logged and not re-raised.
    """
    movie_id = extract_id_from_href(href)
    target_dir = create_sub_directory(movies_dir, movie_id)
    out_path = os.path.join(target_dir, "{}.json".format(movie_id))

    try:
        with open(out_path, 'w', encoding='utf-8') as handle:
            json.dump(data, handle, indent=4, ensure_ascii=False)
    except Exception as exc:
        logging.error(f"Error writing file {out_path}: {exc}")
|
||
|
||
# Save the fetched raw HTML so it can be checked later.
def write_raw_html(href, html_text):
    """Store *html_text* on disk under a directory chosen from *href*.

    Links containing ``person.rme`` go under ``raw_performers`` and links
    containing ``title.rme`` go under ``raw_movies`` (both case-insensitive);
    any other link is ignored. The file is named ``<id>.html``. Write
    failures are logged as warnings and not re-raised.
    """
    page_id = extract_id_from_href(href)
    lowered = href.lower()

    # First matching marker decides the target sub-tree.
    markers = (('person.rme', 'raw_performers'), ('title.rme', 'raw_movies'))
    bucket = next((name for marker, name in markers if marker in lowered), None)
    if bucket is None:
        return

    target_dir = create_sub_directory(f"{update_dir}/{bucket}", page_id)
    full_path = os.path.join(target_dir, f"{page_id}.html")

    try:
        with open(full_path, 'w', encoding='utf-8') as handle:
            handle.write(html_text)
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限写入文件 {full_path}。")
    except Exception as exc:
        logging.warning(f"发生未知错误:{exc}")
|
||
|
||
|
||
# Load previously saved raw HTML from disk when it is recent enough.
def read_raw_html(href, expire_date="2025-03-01"):
    """Return the locally stored HTML for *href*, or ``None``.

    Links containing ``person.rme`` are looked up under ``raw_performers``
    and links containing ``title.rme`` under ``raw_movies`` (both
    case-insensitive); any other link returns ``None``.

    Args:
        href: Page URL; its ``id=`` value names the stored file.
        expire_date: Cutoff for the file's modification time, either a
            ``datetime`` or a ``"YYYY-MM-DD"`` string. Only files modified
            strictly after the cutoff are returned.

    Returns:
        The file content as ``str``, or ``None`` when the link type is not
        recognised, the cutoff string is malformed, the file is missing or
        too old, or reading fails (failures are logged as warnings).
    """
    page_id = extract_id_from_href(href)
    lowered = href.lower()
    if 'person.rme' in lowered:
        dir_prefix = 'raw_performers'
    elif 'title.rme' in lowered:
        dir_prefix = 'raw_movies'
    else:
        return None

    # Kept from the original flow: resolving the path also creates the
    # bucket directory when it does not exist yet.
    file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", page_id)
    full_path = os.path.join(file_dir, f"{page_id}.html")

    # Comparing a datetime with a plain string raises TypeError, which the
    # broad handler below would swallow, so a string cutoff (including the
    # default) could never produce a hit. Convert it to a datetime first.
    if isinstance(expire_date, str):
        try:
            expire_date = datetime.strptime(expire_date, "%Y-%m-%d")
        except ValueError:
            logging.warning(f"invalid expire_date {expire_date!r}, expected YYYY-MM-DD")
            return None

    try:
        if not os.path.exists(full_path):
            return None

        # Modification time of the stored file as a naive local datetime.
        last_modified_date = datetime.fromtimestamp(os.path.getmtime(full_path))
        if last_modified_date > expire_date:
            logging.debug(f"find local file on href {href}")
            with open(full_path, 'r', encoding='utf-8') as file:
                return file.read()

        logging.debug(f"expired file {last_modified_date} on href {href}")
        return None
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限读取文件 {full_path}。")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")
    return None
|
||
|
||
|
||
# Read a JSON file and return its content.
def read_json(file_path):
    """Load and return the JSON document stored at *file_path*.

    Returns ``None`` (after printing a message) when the file does not exist
    or its content is not valid JSON.
    """
    parsed = None
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            parsed = json.loads(handle.read())
    except FileNotFoundError:
        print(f"文件 {file_path} 未找到.")
    except json.JSONDecodeError:
        print(f"文件 {file_path} 解析错误.")
    return parsed
|