This repository has been archived on 2026-01-07. You can view files and clone it, but cannot push or open issues or pull requests.
Files
resources/iafd/tools/data_merge.py
2025-03-17 11:30:35 +08:00

236 lines
8.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Script Name: data_merge.py
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
从而获取到一份完整的数据列表。
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import os
import json
import csv
import logging
# Logging setup: INFO level, each record prefixed with a timestamp and its level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Directory holding the input files; the output files are written into it too.
input_dir = 'data'

# Output paths, all placed directly under input_dir.
output_json_file, output_csv_file, output_person_txt = (
    f'{input_dir}/{output_name}'
    for output_name in ('iafd_merge.json', 'iafd_merge.csv', 'all_person.txt')
)
def _load_json_source(filename):
    """Load one JSON data source from ``input_dir``.

    Args:
        filename: Name of the JSON file inside ``input_dir``.

    Returns:
        The decoded JSON list, or an empty list when the file cannot be
        read or decoded, or when its top-level value is not a list.
        Failures are logged, never raised.
    """
    try:
        with open(os.path.join(input_dir, filename), 'r', encoding='utf-8') as file:
            data = json.load(file)
    except Exception as e:
        # Best effort: an unavailable source is treated as empty so the
        # remaining sources can still be merged.
        logger.error(f"Error loading {filename}: {e}")
        return []
    if not isinstance(data, list):
        # The rest of the script iterates each source as a list of records;
        # any other top-level JSON value would fail in those loops.
        logger.error(
            f"Error loading {filename}: expected a JSON array, "
            f"got {type(data).__name__}"
        )
        return []
    logger.info(f"Loaded {filename}")
    return data


# Load the four data sources; each falls back to [] on failure.
iafd_data = _load_json_source('iafd_meta.json')
stashdb_data = _load_json_source('stashdb.json')
javhd_data = _load_json_source('javhd_meta.json')
lordporn_data = _load_json_source('thelordofporn_meta.json')
# Set of every distinct name found in any source; using a set removes
# names that occur in more than one source.
all_meta_data = set()
# Each source stores the name under its own key. The key is indexed
# directly, so a record without it raises KeyError.
all_meta_data.update(record['person'] for record in iafd_data)
all_meta_data.update(record['name'] for record in stashdb_data)
all_meta_data.update(record['ja_name'] for record in javhd_data)
all_meta_data.update(record['pornstar'] for record in lordporn_data)
# Output-field -> source-field maps, one per source. The dict order fixes the
# key order of every merged entry, so entries keep the same column layout.
_STASHDB_FIELDS = {
    'stashdb_gender': 'gender',
    'stashdb_birthdate': 'birthdate',
    'stashdb_ethnicity': 'ethnicity',
    'stashdb_country': 'country',
    'stashdb_height': 'height',
    'stashdb_measurements': 'measurements',
    'stashdb_fake_tits': 'fake_tits',
    'stashdb_career_length': 'career_length',
    'stashdb_aliases': 'aliases',
}
_JAVHD_FIELDS = {
    'javhd_rank': 'rank',
    'javhd_height': 'height',
    'javhd_weight': 'weight',
    'javhd_breast_size': 'breast size',
    'javhd_breast_factor': 'breast factor',
    'javhd_birth_date': 'birth date',
    'javhd_ethnicity': 'ethnicity',
}
_LORDPORN_FIELDS = {
    'lordporn_rating': 'rating',
    'lordporn_rank': 'rank',
    'lordporn_career_start': 'career_start',
    'lordporn_measurements': 'measurements',
    'lordporn_born': 'born',
    'lordporn_height': 'height',
    'lordporn_weight': 'weight',
}


def _index_by_name(records, name_key):
    """Map each name to the first record that carries it under ``name_key``.

    Replaces a linear scan per person with a single pass over ``records``.
    ``setdefault`` keeps the earliest record for a name, which matches a
    first-match scan over the list.
    """
    index = {}
    for record in records:
        index.setdefault(record.get(name_key), record)
    return index


def _prefixed_fields(match, field_map):
    """Return the output fields of one source for a single person.

    Every output key of ``field_map`` is always present. Its value is taken
    from ``match`` when a (non-empty) record was found, and is '' otherwise
    or when the source field is absent from the record.
    """
    if not match:
        return {out_key: '' for out_key in field_map}
    return {out_key: match.get(src_key, '') for out_key, src_key in field_map.items()}


# Build the lookup tables once, before the merge loop.
_iafd_by_name = _index_by_name(iafd_data, 'person')
_stashdb_by_name = _index_by_name(stashdb_data, 'name')
_javhd_by_name = _index_by_name(javhd_data, 'ja_name')
_lordporn_by_name = _index_by_name(lordporn_data, 'pornstar')

# One merged entry per distinct name.
merged_data = []
# Set iteration order is arbitrary; sorting makes the output order
# reproducible between runs. key=str tolerates non-string names.
for person in sorted(all_meta_data, key=str):
    stashdb_match = _stashdb_by_name.get(person)
    javhd_match = _javhd_by_name.get(person)
    lordporn_match = _lordporn_by_name.get(person)

    # Name first, then the Y/N presence flags (always emitted), then the
    # per-source detail fields.
    merged_entry = {
        'person': person,
        'in_iafd': 'Y' if _iafd_by_name.get(person) else 'N',
        'in_stashdb': 'Y' if stashdb_match else 'N',
        'in_javhd': 'Y' if javhd_match else 'N',
        'in_lordporn': 'Y' if lordporn_match else 'N',
    }
    merged_entry.update(_prefixed_fields(stashdb_match, _STASHDB_FIELDS))
    merged_entry.update(_prefixed_fields(javhd_match, _JAVHD_FIELDS))
    merged_entry.update(_prefixed_fields(lordporn_match, _LORDPORN_FIELDS))

    merged_data.append(merged_entry)
# Write the merged entries to iafd_merge.json (indented, non-ASCII text
# written as-is instead of \u escapes).
try:
    with open(output_json_file, 'w', encoding='utf-8') as json_file:
        json.dump(merged_data, json_file, ensure_ascii=False, indent=4)
    logger.info(f"Data successfully written to {output_json_file}")
except Exception as e:
    logger.error(f"Error writing {output_json_file}: {e}")

# Write the merged entries to iafd_merge.csv. Note the file is tab-delimited
# despite its .csv extension.
try:
    with open(output_csv_file, 'w', newline='', encoding='utf-8') as csv_file:
        # The header comes from the first entry's keys, so with no entries
        # there is nothing to write and the file is left empty.
        if merged_data:
            writer = csv.DictWriter(
                csv_file,
                fieldnames=list(merged_data[0]),
                delimiter='\t',
            )
            writer.writeheader()
            writer.writerows(merged_data)
    if merged_data:
        logger.info(f"Data successfully written to {output_csv_file}")
    else:
        logger.warning(f"No merged data; {output_csv_file} was written empty")
except Exception as e:
    logger.error(f"Error writing {output_csv_file}: {e}")

# Write every name to all_person.txt, sorted and comma-separated.
try:
    all_meta_data_list = sorted(all_meta_data)
    all_meta_data_str = ','.join(all_meta_data_list)
    with open(output_person_txt, 'w', encoding='utf-8') as txt_file:
        txt_file.write(all_meta_data_str)
    logger.info("all_meta_data successfully written to all_person.txt")
except Exception as e:
    logger.error(f"Error writing all_person.txt: {e}")