modify some scripts.
This commit is contained in:
92
scripts/iafd/src/utils.py
Normal file
92
scripts/iafd/src/utils.py
Normal file
@ -0,0 +1,92 @@
|
||||
import re
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import csv
|
||||
import logging
|
||||
|
||||
# 解析 height 和 weight(转换成数字)
|
||||
def parse_height(height_str):
    """Extract the height in centimetres from a height string.

    The value is read from a parenthesised centimetre group, e.g.
    ``"5 feet, 7 inches (170 cm)"`` yields ``170``.

    Args:
        height_str: Raw height text; may be empty or ``None``.

    Returns:
        The centimetre value as an ``int``, or ``0`` when the input is
        missing, not a string, or has no ``(<digits> cm)`` group.
    """
    # An unconditional early ``return 0`` used to make all parsing
    # unreachable.  0 is kept as the "unknown" value so the return type
    # stays int for existing callers.
    if not isinstance(height_str, str):
        return 0
    match = re.search(r"\((\d+)\s*cm\)", height_str)
    return int(match.group(1)) if match else 0
|
||||
|
||||
def parse_weight(weight_str):
    """Read the leading integer from a weight string.

    Only the first whitespace-separated token is used, e.g. ``"125 lbs"``
    yields ``125``.  The unit is not interpreted or converted.

    Args:
        weight_str: Raw weight text; may be empty or ``None``.

    Returns:
        The leading number as an ``int``, or ``0`` when the input is
        missing, not a string, or does not start with an integer.
    """
    # An unconditional early ``return 0`` used to make all parsing
    # unreachable.  0 is kept as the "unknown" value so the return type
    # stays int for existing callers.
    if not isinstance(weight_str, str):
        return 0
    tokens = weight_str.split()
    if not tokens:
        return 0
    try:
        return int(tokens[0])
    except ValueError:
        return 0
|
||||
|
||||
# Root output directory; a relative path, so it is resolved against the
# process's current working directory when files are written.
update_dir = '../result'
# Per-category output folders beneath the root.
performers_dir = update_dir + '/performers'
movies_dir = update_dir + '/movies'
|
||||
|
||||
def uniq_performers(new_performers):
    """Drop performers whose ``href`` was already seen, keeping first occurrences.

    Args:
        new_performers: List of mappings, each with a non-``None``
            ``"href"`` entry.

    Returns:
        A new list holding the first item for every distinct ``href``, in
        the original order.  If the argument is not a list, or any item is
        empty, lacks ``"href"`` or has ``href`` set to ``None``, the error
        is logged and an empty list is returned instead.
    """
    try:
        if not isinstance(new_performers, list):
            raise TypeError(f"new_performers should be a list, but got {type(new_performers)}")

        seen = set()
        unique_performers = []

        for item in new_performers:
            if not item or item['href'] is None:
                raise ValueError(f"Invalid item in new_performers: {item}")

            href = item["href"]
            if href not in seen:
                seen.add(href)
                unique_performers.append(item)

        return unique_performers

    except Exception as e:
        # Deliberately broad: any malformed input yields an empty list rather
        # than crashing the caller.  The message now names this function (it
        # previously referred to "remove_duplicate_performers").
        logging.error(f"Error in uniq_performers: {e}")
        return []
|
||||
|
||||
# 创建目录
|
||||
def create_sub_directory(base_dir, str):
    """Ensure the one-character bucket directory for a name exists.

    Args:
        base_dir: Directory under which the bucket directory is created.
        str: Name to bucket; only its first character, lower-cased, is
            used.  (The parameter name shadows the builtin but is kept so
            existing callers are unaffected.)

    Returns:
        The path ``<base_dir>/<first character>``.  For an empty name this
        is ``base_dir`` itself (with a trailing separator).

    Raises:
        OSError: If the directory cannot be created, including when the
            path already exists but is not a directory.
    """
    # First character only, lower-cased.
    sub_dir = str[:1].lower()
    full_path = os.path.join(base_dir, sub_dir)
    # exist_ok avoids the race between a separate existence check and the
    # creation when two writers target the same bucket at once.
    os.makedirs(full_path, exist_ok=True)
    return full_path
|
||||
|
||||
# 从 https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586 中抽取 id 的值
|
||||
def extract_id_from_href(href):
    """Return the value following ``id=`` in *href*.

    The value is the run of lowercase hex digits and hyphens directly after
    the first ``id=``; an empty string is returned when none is found.
    """
    found = re.search(r'id=([a-f0-9\-]+)', href)
    if found is None:
        return ''
    return found.group(1)
|
||||
|
||||
# 写入每个 performer 的单独 JSON 文件
|
||||
def write_person_json(person, href, data):
    """Write one performer's data to its own JSON file.

    The file is ``<performers_dir>/<first character>/<name>(<id>).json``,
    where spaces in the name become ``-`` and the id is extracted from
    *href*.  Failures while writing the file are logged, not raised.

    Args:
        person: Performer name; used for the bucket directory and file name.
        href: URL containing the performer's ``id=`` value.
        data: JSON-serialisable object to write.
    """
    # A path separator inside the name would be read as a directory
    # boundary, sending the file to a missing or unintended location.
    # Replace it so the name always stays a single path component.
    safe_person = person
    for sep in (os.sep, os.altsep):
        if sep:
            safe_person = safe_person.replace(sep, '-')

    person_dir = create_sub_directory(performers_dir, safe_person)
    person_id = extract_id_from_href(href)
    # Spaces are replaced with hyphens in the file name.
    person_filename = f"{safe_person.replace(' ', '-')}({person_id}).json"
    full_path = os.path.join(person_dir, person_filename)

    try:
        with open(full_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {full_path}: {e}")
|
||||
|
||||
|
||||
# Write each movie to its own JSON file
|
||||
def write_movie_json(href, data):
    """Write one movie's data to ``<movies_dir>/<first id character>/<id>.json``.

    The id is extracted from *href*.  Failures while writing the file are
    logged, not raised.

    Args:
        href: URL containing the movie's ``id=`` value.
        data: JSON-serialisable object to write.
    """
    movie_id = extract_id_from_href(href)
    # Movies are bucketed by the first character of their id.
    target_dir = create_sub_directory(movies_dir, movie_id)
    target_path = os.path.join(target_dir, movie_id + ".json")

    try:
        with open(target_path, 'w', encoding='utf-8') as out:
            json.dump(data, out, indent=4, ensure_ascii=False)
    except Exception as err:
        logging.error(f"Error writing file {target_path}: {err}")
|
||||
|
||||
Reference in New Issue
Block a user