modify scripts

This commit is contained in:
oscarz
2025-03-17 11:30:35 +08:00
parent e6327fbe73
commit d5dc76b87f
178 changed files with 44 additions and 184447 deletions

iafd/src_json/config.py Normal file
View File

@@ -0,0 +1,26 @@
import logging
import os
import inspect
from datetime import datetime
global_share_data_dir = '/root/sharedata'
global_host_data_dir = '/root/hostdir/scripts_data'
# Logging setup
def setup_logging(log_filename=None):
    # If no log_filename is passed in, use the calling script's name as the log file name
if log_filename is None:
        # Get the filename of the script that called setup_logging
        caller_frame = inspect.stack()[1]
        caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
        # Current date, formatted as yyyymmdd
        current_date = datetime.now().strftime('%Y%m%d')
        # Build the log file name, putting the date before the extension
log_filename = f'../log/{caller_filename}_{current_date}.log'
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
handlers=[
logging.FileHandler(log_filename),
logging.StreamHandler()
])
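
For reference, a minimal sketch of how the scripts below are expected to use this module; the caller name detail_fetch.py is just one of the scripts in this commit, and the resulting log path follows from the defaults above:

# Hypothetical caller, e.g. iafd/src_json/detail_fetch.py
import logging
import config

config.setup_logging()  # no filename passed, so the caller's script name is used
logging.info("scraper started")
# With today's date this logs to ../log/detail_fetch_<yyyymmdd>.log and to the console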

View File

@@ -0,0 +1,334 @@
import os
import json
import csv
import time
import logging
import sys
import signal
import re
import cloudscraper
from bs4 import BeautifulSoup
import config
config.setup_logging()
# Base URL and variable parameters
host_url = "https://www.iafd.com"
# Directories and file paths
RESULT_DIR = "../result"
OUTPUT_DIR = f"{config.global_share_data_dir}/iafd"
INPUT_FILE = os.path.join(RESULT_DIR, "movie_list.json")
OUTPUT_JSON = os.path.join(OUTPUT_DIR, "movie_details.json")
OUTPUT_CSV = os.path.join(OUTPUT_DIR, "movie_details.csv")
BATCH_SIZE = 100  # Write to the output files every 100 records
movies_dir = f'{RESULT_DIR}/movies'
# Initialize the Cloudflare bypass scraper
scraper = cloudscraper.create_scraper()
# All collected movie data
all_movies = []
def load_existing_data():
"""加载已处理的数据,支持续传"""
if os.path.exists(OUTPUT_JSON):
with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
try:
return json.load(f)
except json.JSONDecodeError:
return []
return []
def save_data():
"""保存数据到 JSON 和 CSV 文件"""
logging.info("Saving data...")
global all_movies
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
json.dump(all_movies, f, indent=4, ensure_ascii=False)
with open(OUTPUT_CSV, "w", encoding="utf-8", newline="") as f:
writer = csv.writer(f)
writer.writerow(["href", "title", "Minutes", "Distributor", "Studio", "ReleaseDate",
"AddedtoIAFDDate", "All-Girl", "All-Male", "Compilation", "Webscene", "Director"])
for movie in all_movies:
writer.writerow([movie["href"], movie["title"], movie["Minutes"], movie["Distributor"],
movie["Studio"], movie["ReleaseDate"], movie["AddedtoIAFDDate"], movie["All-Girl"],
movie["All-Male"], movie["Compilation"], movie["Webscene"], movie["Director"]])
# Fetch a page and return its HTML
def fetch_html(href):
    """Fetch a page and return its HTML."""
for attempt in range(3):
try:
response = scraper.get(href, timeout=10)
if response.status_code == 200:
return response.text
except Exception as e:
logging.warning(f"Error fetching {href}: {e}")
time.sleep(2)
logging.error(f"Failed to fetch {href} after 3 attempts")
return None
# Parse the page HTML and extract the movie information
def parse_movie_details(html, href, title):
    """Parse the page HTML and extract the movie information."""
    soup = BeautifulSoup(html, "html.parser")
    # Parse the basic movie info
movie_data = {}
info_div = soup.find("div", class_="col-xs-12 col-sm-3")
if info_div:
labels = info_div.find_all("p", class_="bioheading")
values = info_div.find_all("p", class_="biodata")
for label, value in zip(labels, values):
key = label.text.strip()
val = value.text.strip()
if key in ["Distributor", "Studio", "Director"]:
link = value.find("a")
if link:
val = link.text.strip()
movie_data[f'{key}Href'] = host_url + link['href']
movie_data[key] = val
else:
return None
    # Parse the cast information
performers = []
cast_divs = soup.find_all("div", class_="castbox")
for cast in cast_divs:
performer = {}
link = cast.find("a")
if link:
performer["name"] = link.text.strip()
performer["href"] = host_url + link["href"]
performer["tags"] = [
tag.strip() for br in cast.find_all("br")
if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
]
#performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
performers.append(performer)
    # Parse the scene breakdowns
scene_breakdowns = []
scene_table = soup.find("div", id="sceneinfo")
if scene_table:
rows = scene_table.find_all("tr")
for row in rows:
cols = row.find_all("td")
if len(cols) >= 2:
                scene = cols[0].text.strip()  # Scene number
                performer_info = cols[1]  # Cell containing the performers and their links
                # Take the full HTML before the first <br> (preserves markup such as <i> tags)
                performer_html = str(performer_info)  # Full HTML of the cell
                # BeautifulSoup normalizes <br> to <br/>, so split on either form; when no
                # <br> is present, split() returns the whole string as its only element.
                performers_html = re.split(r'<br\s*/?>', performer_html)[0].strip()
                # Parse as plain text (strip the HTML tags, keep only the text content)
performers_soup = BeautifulSoup(performers_html, "html.parser")
performers_text = performers_soup.get_text()
                # Extract the performers
scene_performers = [p.strip() for p in performers_text.split(",")]
                # Try to extract the webscene and studio links
links_data = {}
links = performer_info.find_all("a")
if links:
webscene_title = links[0].text.strip() if len(links)>0 else None
webscene = links[0]["href"] if len(links)>0 else None
studio = links[1].text.strip() if len(links)>1 else None
studio_lnk = links[1]["href"] if len(links)>1 else None
links_data = {
"title": webscene_title,
"webscene": webscene,
"studio": studio,
"studio_lnk": studio_lnk,
}
scene_data = {
"scene": scene,
"performers": scene_performers,
**links_data,
}
scene_breakdowns.append(scene_data)
appears_in = []
appears_divs = soup.find("div", id="appearssection")
if appears_divs:
rows = appears_divs.find_all("li")
for row in rows:
lnk = row.find("a")
if lnk:
appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
return {
"href": href,
"title": title,
"Minutes": movie_data.get("Minutes", ""),
"Distributor": movie_data.get("Distributor", ""),
"Studio": movie_data.get("Studio", ""),
"ReleaseDate": movie_data.get("Release Date", ""),
"AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
"All-Girl": movie_data.get("All-Girl", ""),
"All-Male": movie_data.get("All-Male", ""),
"Compilation": movie_data.get("Compilation", ""),
"Webscene": movie_data.get("Webscene", ""),
"Director": movie_data.get("Director", ""),
"DirectorHref": movie_data.get("DirectorHref", ""),
"DistributorHref": movie_data.get("DistributorHref", ""),
"StudioHref": movie_data.get("StudioHref", ""),
"Performers": performers,
"SceneBreakdowns": scene_breakdowns,
"AppearsIn": appears_in,
}
# Create a sub-directory sharded by the given name or id
def create_sub_directory(base_dir, name):
    # Use the first character (lower-cased) as the shard directory
    sub_dir = name[:1].lower()
full_path = os.path.join(base_dir, sub_dir)
if not os.path.exists(full_path):
os.makedirs(full_path)
return full_path
# Extract the id value from a URL such as https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
    """Extract the id parameter from an href."""
    match = re.search(r'id=([a-f0-9\-]+)', href)
    return match.group(1) if match else ''
# Write each movie to its own JSON file
def write_movie_json(href, data):
    # Resolve the output directory
    movie_id = extract_id_from_href(href)
    person_dir = create_sub_directory(movies_dir, movie_id)
    person_filename = f"{movie_id}.json"  # One JSON file per movie id
full_path = os.path.join(person_dir, person_filename)
try:
with open(full_path, 'w', encoding='utf-8') as json_file:
json.dump(data, json_file, indent=4, ensure_ascii=False)
except Exception as e:
logging.error(f"Error writing file {full_path}: {e}")
def process_movies():
"""处理电影数据"""
global all_movies
all_movies = load_existing_data()
processed_hrefs = {movie["href"] for movie in all_movies}
# 读取 distributors.json 文件
with open(INPUT_FILE, "r", encoding="utf-8") as f:
movies = json.load(f)
count = 0
for entry in movies:
href = entry["href"]
title = entry["title"]
        if href in processed_hrefs:
            logging.info(f"Skipping existing: {title} ({href})")
            continue  # Skip entries that have already been processed
        logging.info(f"Processing: {title} ({href})")
        while True:
            html = fetch_html(href)
            if not html:
                logging.warning(f'Retrying {title} ({href}) ')
                continue  # Fetch failed, try again
else:
movie = parse_movie_details(html, href, title)
if not movie:
                    logging.warning(f'Retrying {title} ({href}) ')
continue
else:
all_movies.append(movie)
count += 1
                    # Write the per-movie JSON file locally
                    write_movie_json(href, movie)
                    break
        # Flush to the output files every BATCH_SIZE records
        if count % BATCH_SIZE == 0:
            save_data()
    # Final save
save_data()
logging.info("Task completed.")
# Fetch a single specified URL
def process_one(href):
    # Initialize cloudscraper
    scraper = cloudscraper.create_scraper()
    # Fetch and parse the data
movie = {}
while True:
html = fetch_html(href)
if not html:
            logging.warning(f'Error fetching {href}. Retrying...')
            continue  # Fetch failed, try again
movie = parse_movie_details(html, href, 'title')
if movie:
break
else:
            logging.warning(f'Error parsing {href}. Retrying...')
            continue  # Parse failed, try again
if movie:
write_movie_json(href, movie)
        print(f'Fetch succeeded. Result saved in {movies_dir}')
# Save pending data when the process is terminated
def handle_exit_signal(signal, frame):
logging.info("Gracefully exiting... Saving remaining data to Json and CSV.")
save_data()
sys.exit(0)
# Full crawl
def main():
    try:
        # Register the exit signal handlers
signal.signal(signal.SIGINT, handle_exit_signal) # Handle Ctrl+C
signal.signal(signal.SIGTERM, handle_exit_signal) # Handle kill signal
process_movies()
finally:
        # Cleanup that also runs on normal exit
save_data()
logging.info("Data processing completed.")
# Entry point: read command-line arguments
if __name__ == "__main__":
if len(sys.argv) > 1:
url = sys.argv[1]
process_one(url)
else:
main()
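
As a quick illustration of how the per-movie output path above is derived (this mirrors extract_id_from_href and create_sub_directory; the id is the example from the comment in the code, and RESULT_DIR is "../result" as defined above):

import os
import re

href = "https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586"
movie_id = re.search(r'id=([a-f0-9\-]+)', href).group(1)
# The first character of the id selects the shard sub-directory
print(os.path.join("../result/movies", movie_id[:1].lower(), f"{movie_id}.json"))
# -> ../result/movies/2/21898a3c-1ddd-4793-8d93-375d6db20586.json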

View File

@@ -0,0 +1,255 @@
"""
Script Name:
Description: Fetches information from https://www.iafd.com, using cloudscraper to bypass Cloudflare.
detail_fetch.py Pulls details one by one from the locally saved list data and writes them to output files.
list_fetch_astro.py Fetches performer lists by astrological sign. Moderate data volume; the detail fields are fairly complete.
list_fetch_birth.py Fetches performer lists by birthday. Moderate data volume; the detail fields are fairly complete.
list_fetch_ethnic.py Fetches performer lists by ethnicity. Large data volume, but many of the detail fields are useless.
list_merge.py Takes the intersection of the three lists above to produce the overall dataset.
iafd_scrape.py Built on https://github.com/stashapp/CommunityScrapers; given a performer's IAFD link it returns data in a stashapp-compatible format. (Of limited use, since fields such as nationality and photos do not match.)
html_format.py Reads the saved HTML directory, extracts the information, and writes formatted output.
data_merge.py Merges data: it combines the performer data updated from iafd, javhd, thelordofporn and a self-hosted stashapp instance (which must be exported first);
stashdb_merge.py Batch-merges the individual performer JSON files exported from stashapp; typically the exported batch is compressed, transferred to the data/tmp directory, and unpacked there before merging,
which yields a complete data list.
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import cloudscraper
import json
import time
import csv
import argparse
from bs4 import BeautifulSoup
import logging
import config
config.setup_logging()
# Base URL and variable parameters
host_url = "https://www.iafd.com"
# Result paths
res_dir = f"{config.global_share_data_dir}/iafd"
fetch_config = {
'dist': {
'base_url': f"{host_url}/distrib.rme/distrib=",
'list_page_url': f"{host_url}/distrib.asp",
'html_table_id': 'distable',
'html_select_name': 'Distrib',
'output_key_id': 'distributors',
'json_file': f'{res_dir}/distributors.json',
'csv_file': f'{res_dir}/distributors.csv',
},
'stu': {
'base_url': f"{host_url}/studio.rme/studio=",
'list_page_url': f"{host_url}/studio.asp",
'html_table_id': 'studio',
'html_select_name': 'Studio',
'output_key_id': 'studios',
'json_file': f'{res_dir}/studios.json',
'csv_file': f'{res_dir}/studios.csv',
}
}
distr_map = {
6812 : 'nubilefilms.com',
8563 : 'teenmegaworld network',
6779 : 'x-art.com',
7133 : 'tushy.com',
6496 : 'blacked.com',
7758 : 'vixen.com',
6791 : 'teamskeet.com',
12454: 'vip4k.com',
13541: 'wow network',
9702 : 'cum4k.com',
6778 : 'tiny4k.com',
12667: 'anal4k.com',
7419 : 'exotic4k.com',
13594: 'facials4k.com',
13633: 'mom4k.com',
12335: 'slim4k.com',
16709: 'strippers4k.com',
}
studio_map = {
6812 : 'nubilefilms.com',
9811 : 'Teen Mega World',
6779 : 'x-art.com',
7133 : 'tushy.com',
6496 : 'blacked.com',
7758 : 'vixen.com',
6791 : 'teamskeet.com',
8052: 'wowgirls.com',
9702 : 'cum4k.com',
6778 : 'tiny4k.com',
12667: 'anal4k.com',
7419 : 'exotic4k.com',
13594: 'facials4k.com',
13633: 'mom4k.com',
12335: 'slim4k.com',
16709: 'strippers4k.com',
}
# Set up the headers and the scraper
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
all_data = []
# Make the network request and return the HTML
def fetch_page(url):
try:
response = scraper.get(url, headers=headers)
response.raise_for_status()
return response.text
except Exception as e:
logging.error(f"Failed to fetch {url}: {e}")
return None
# Parse the HTML content and extract the needed data
def parse_page(html, name, config):
table_id = config['html_table_id']
key_id = config['output_key_id']
soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", id=table_id)
if not table:
logging.warning(f"Warning: No {table_id} table found in {name}")
return None
    # Find the thead and drop it
    thead = table.find('thead')
    if thead:
        thead.decompose()  # Remove the thead; it does not need parsing
    # Only the tbody remains now
tbody = table.find('tbody')
rows = tbody.find_all('tr') if tbody else []
global all_data
for row in rows:
cols = row.find_all('td')
if len(cols) >= 5:
title = cols[0].text.strip()
label = cols[1].text.strip()
year = cols[2].text.strip()
rev = cols[3].text.strip()
a_href = cols[0].find('a')
href = host_url + a_href['href'] if a_href else ''
all_data.append({
key_id: name,
'title': title,
'label': label,
'year': year,
'rev': rev,
'href': href
})
return soup
# Handle pagination (these list pages do not paginate)
def handle_pagination(soup, astro):
    return None
# Fetch the list page
def process_list_gage(config):
list_page_url=config['list_page_url']
select_name = config['html_select_name']
list_map = {}
logging.info(f"Fetching data for {list_page_url} ...")
select_element = None
while True:
html = fetch_page(list_page_url)
if html:
soup = BeautifulSoup(html, "html.parser")
select_element = soup.find('select', {'name': select_name})
if select_element :
break
            else:
                logging.info(f"Unexpected HTML content. Retrying {list_page_url} ...")
        else:
            logging.info(f"Unexpected HTML content. Retrying {list_page_url} ...")
if not select_element:
return None
options = select_element.find_all('option')
for option in options:
        value = option.get('value')  # The option's value attribute
        text = option.text.strip()  # The option's text
        list_map[int(value)] = text
    logging.info(f'Fetched {list_page_url} successfully. Total entries: {len(list_map)}')
return list_map
# Main logic: loop over every distributor/studio in the list
def process_main_data(list_data, config):
base_url = config['base_url']
for key, name in list_data.items():
url = base_url + str(key)
next_url = url
logging.info(f"Fetching data for {name}, url {url} ...")
while next_url:
html = fetch_page(next_url)
if html:
soup = parse_page(html, name, config)
if soup:
next_url = handle_pagination(soup, name)
else:
logging.info(f"wrong html content. retring {next_url} ...")
# 定期保存结果
save_data(config)
time.sleep(2) # 控制访问频率
else:
logging.info(f"Retrying {next_url} ...")
                time.sleep(5)  # Wait before retrying
# Save to the output files
def save_data(config):
with open(config['json_file'], 'w', encoding='utf-8') as json_file:
json.dump(all_data, json_file, indent=4, ensure_ascii=False)
with open(config['csv_file'], 'w', newline='', encoding='utf-8') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=[config['output_key_id'], 'title', 'label', 'year', 'rev', 'href'])
writer.writeheader()
writer.writerows(all_data)
# Run the main logic
if __name__ == '__main__':
    # Command-line argument handling
parser = argparse.ArgumentParser(description='fetch movie list from iafd.com')
parser.add_argument('--type', type=str, default='dist', help='fetch by ... (dist , stu)')
parser.add_argument('--kind', type=str, default='parts', help='fetch all or parts (parts , all)')
args = parser.parse_args()
    config = fetch_config.get(args.type)
    if not config:
        logging.warning(f'unknown type: {args.type} {args.kind}')
else:
list_data = {}
if args.kind == 'all':
list_data = process_list_gage(config)
elif args.type == 'dist':
list_data = distr_map
else:
list_data = studio_map
process_main_data(list_data, config)
logging.info("Data fetching and saving completed.")

View File

@@ -0,0 +1,393 @@
"""
Script Name:
Description: Fetches information from https://www.iafd.com, using cloudscraper to bypass Cloudflare.
detail_fetch.py Pulls details one by one from the locally saved list data and writes them to output files.
list_fetch_astro.py Fetches performer lists by astrological sign. Moderate data volume; the detail fields are fairly complete.
list_fetch_birth.py Fetches performer lists by birthday. Moderate data volume; the detail fields are fairly complete.
list_fetch_ethnic.py Fetches performer lists by ethnicity. Large data volume, but many of the detail fields are useless.
list_merge.py Takes the intersection of the three lists above to produce the overall dataset.
iafd_scrape.py Built on https://github.com/stashapp/CommunityScrapers; given a performer's IAFD link it returns data in a stashapp-compatible format. (Of limited use, since fields such as nationality and photos do not match.)
html_format.py Reads the saved HTML directory, extracts the information, and writes formatted output.
data_merge.py Merges data: it combines the performer data updated from iafd, javhd, thelordofporn and a self-hosted stashapp instance (which must be exported first);
stashdb_merge.py Batch-merges the individual performer JSON files exported from stashapp; typically the exported batch is compressed, transferred to the data/tmp directory, and unpacked there before merging,
which yields a complete data list.
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import config
# Configure logging
config.setup_logging()
# Result paths
res_dir = '../result'
res_json_file = f'{res_dir}/detail.json'
res_csv_file = f'{res_dir}/detail.csv'
input_json_file = f'{res_dir}/merged.json'
performers_dir = f'{res_dir}/performers'
# Collected results
final_data = []
# Read the data already in detail.json so the run can resume where it left off
def load_existing_hrefs():
existing_hrefs = set()
global final_data
try:
with open(res_json_file, 'r') as file:
final_data = json.load(file)
for entry in final_data:
existing_hrefs.add(entry['href'])
except FileNotFoundError:
logging.info("detail.json not found, starting fresh.")
return existing_hrefs
# Parse a credits table (there are both performer and director credits)
def parse_credits_table(table, distributor_list):
    # Find the thead and drop it
    thead = table.find('thead')
    if thead:
        thead.decompose()  # Remove the thead; it does not need parsing
    # Only the tbody remains now
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody else []
    movies = []
    distributor_count = {key: 0 for key in distributor_list}  # Initialize a counter for each distributor
# rows = table.find_all('tr', class_='we')
for row in rows:
cols = row.find_all('td')
if len(cols) >= 6:
title = cols[0].text.strip()
year = cols[1].text.strip()
distributor = cols[2].text.strip().lower()
notes = cols[3].text.strip()
rev = cols[4].text.strip()
formats = cols[5].text.strip()
for key in distributor_list:
if key in distributor:
distributor_count[key] += 1
movies.append({
'title': title,
'year': year,
'distributor': distributor,
'notes': notes,
'rev': rev,
'formats': formats
})
return movies, distributor_count
# Fetch the page and extract the required data
def fetch_and_parse_page(url, scraper):
try:
response = scraper.get(url)
if response.status_code != 200:
logging.warning(f"Failed to fetch {url}, Status code: {response.status_code}")
return None, None
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extracted data
        data = {}
        # Map our field names to the corresponding labels in the HTML
fields = {
'performer_aka': 'Performer AKA',
'birthday': 'Birthday',
'astrology': 'Astrology',
'birthplace': 'Birthplace',
'gender': 'Gender',
'years_active': 'Years Active',
'ethnicity': 'Ethnicity',
'nationality': 'Nationality',
'hair_colors': 'Hair Colors',
'eye_color': 'Eye Color',
'height': 'Height',
'weight': 'Weight',
'measurements': 'Measurements',
'tattoos': 'Tattoos',
'piercings': 'Piercings'
}
reversed_map = {v: k for k, v in fields.items()}
        # Parse the credits tables to get the performer and director filmographies
        role_list = ['personal', 'directoral']
        distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
        credits_list = {}
        # Use a dict to keep the per-distributor statistics
        distributor_count = {key: 0 for key in distributor_list}  # Initialize a counter for each distributor
for role in role_list:
table = soup.find('table', id=role)
if table :
movies, stat_map = parse_credits_table(table, distributor_list)
credits_list[role] = movies
                # Update the distributor statistics
for distributor in distributor_list:
distributor_count[distributor] += stat_map.get(distributor, 0)
        # Count the total number of movies
        #movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
        movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))
        # Warn when no credits table was found
        if len(credits_list) == 0 :
            logging.warning(f"movie table empty. url: {url} ")
        # Walk every bioheading to collect the metadata
        bioheadings = soup.find_all('p', class_='bioheading')
        for bio in bioheadings:
            heading = bio.text.strip()
            biodata = None
            # Headings containing "Performer" need special handling
if 'Performer' in heading:
heading = 'Performer AKA'
biodata_div = bio.find_next('div', class_='biodata')
if biodata_div:
div_text = biodata_div.get_text(separator='|').strip()
biodata = [b.strip() for b in div_text.split('|') if b.strip()]
else:
biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''
            # Store the value
if heading in reversed_map:
kkey = reversed_map[heading]
data[kkey] = biodata
        # Add the statistics to data
data['movies_cnt'] = movies_cnt
data['vixen_cnt'] = distributor_count['vixen']
data['blacked_cnt'] = distributor_count['blacked']
data['tushy_cnt'] = distributor_count['tushy']
data['x_art_cnt'] = distributor_count['x-art']
return data, credits_list
except RequestException as e:
logging.error(f"Error fetching {url}: {e}")
return None, None
# Write detail.json
def write_to_detail_json(data):
with open(res_json_file, 'w', encoding='utf-8') as json_file:
json.dump(data, json_file, indent=4, ensure_ascii=False)
# Write the CSV file
def write_to_csv(data):
try:
with open(res_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile, delimiter=',')
header = ['person', 'href', 'performer_aka', 'birthday', 'astrology', 'birthplace', 'gender', 'years_active', 'ethnicity',
'nationality', 'hair_colors', 'eye_color', 'height', 'weight', 'measurements', 'tattoos', 'piercings',
'movies_cnt', 'vixen_cnt', 'blacked_cnt', 'tushy_cnt', 'x_art_cnt']
writer.writerow(header)
for entry in data:
                # Ensure performer_aka is always a list
                performer_aka = entry.get('performer_aka', [])
                # None becomes an empty list; a scalar becomes a one-element list
if performer_aka is None:
performer_aka = []
elif not isinstance(performer_aka, list):
performer_aka = [performer_aka]
writer.writerow([
entry.get('person', ''),
entry.get('href', ''),
'|'.join(performer_aka),
entry.get('birthday', ''),
entry.get('astrology', ''),
entry.get('birthplace', ''),
entry.get('gender', ''),
entry.get('years_active', ''),
entry.get('ethnicity', ''),
entry.get('nationality', ''),
entry.get('hair_colors', ''),
entry.get('eye_color', ''),
entry.get('height', ''),
entry.get('weight', ''),
entry.get('measurements', ''),
entry.get('tattoos', ''),
entry.get('piercings', ''),
entry.get('movies_cnt', 0),
entry.get('vixen_cnt', 0),
entry.get('blacked_cnt', 0),
entry.get('tushy_cnt', 0),
entry.get('x_art_cnt', 0)
])
except Exception as e:
logging.error(f"Error writing to CSV: {e}")
def handle_exit_signal(signal, frame):
logging.info("Gracefully exiting... Saving remaining data to Json and CSV.")
write_to_csv(final_data) # Ensure final data is written when exiting
write_to_detail_json(final_data)
sys.exit(0)
# Create the output directory for a performer
def create_directory_for_person(person):
    # Use the first character of the name (lower-cased) as the shard directory
    person_dir = person[:1].lower()
full_path = os.path.join(performers_dir, person_dir)
if not os.path.exists(full_path):
os.makedirs(full_path)
return full_path
# Extract the id value from a URL such as https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
    """Extract the id parameter from an href."""
    match = re.search(r'id=([a-f0-9\-]+)', href)
    return match.group(1) if match else ''
# Write each performer to its own JSON file
def write_person_json(person, href, data):
    # Resolve the output directory
    person_dir = create_directory_for_person(person)
    person_id = extract_id_from_href(href)
    person_filename = f"{person.replace(' ', '-')}({person_id}).json"  # Spaces replaced with '-'
full_path = os.path.join(person_dir, person_filename)
try:
with open(full_path, 'w', encoding='utf-8') as json_file:
json.dump(data, json_file, indent=4, ensure_ascii=False)
except Exception as e:
logging.error(f"Error writing file {full_path}: {e}")
# Fetch a single specified URL
def process_one(href):
    # Initialize cloudscraper
    scraper = cloudscraper.create_scraper()
    # Fetch and parse the data
while True:
data, movies = fetch_and_parse_page(href, scraper)
if data is None:
            logging.warning(f'Retrying {href} ')
            time.sleep(3)
        else:
            break
    # Write the performer's standalone JSON file
full_data = {
**data,
'credits': movies if movies else {}
}
person_id = extract_id_from_href(href)
person_filename = f"{person_id}.json" # 用 - 替换空格
try:
with open(person_filename, 'w', encoding='utf-8') as json_file:
json.dump(full_data, json_file, indent=4, ensure_ascii=False)
except Exception as e:
logging.error(f"Error writing file {person_filename}: {e}")
    print(f'Fetch succeeded. Result saved in {person_filename}')
def process_all():
    # Initialize cloudscraper
    scraper = cloudscraper.create_scraper()
    # Load the set of hrefs that have already been processed
global final_data
existing_hrefs = load_existing_hrefs()
logging.info(f"load data from {res_json_file}, count: {len(final_data)}")
    # Read merged.json
    with open(input_json_file, 'r') as file:
        merged_data = json.load(file)
    # Iterate over the entries in merged.json
loop = 0
for entry in merged_data:
href = entry.get('href')
person = entry.get('person')
if href in existing_hrefs:
logging.info(f"Skipping {href} - already processed")
continue
logging.info(f"Processing {href} - {person}")
        # Fetch and parse the data
        while True:
            data, credits = fetch_and_parse_page(href, scraper)
            if data is None:
                logging.warning(f'Retrying {href} - {person} ')
                time.sleep(3)
            else:
                break
        # The data looks good; append it to final_data
final_data.append({
'href': href,
'person': person,
**data
})
        # Write the performer's standalone JSON file
full_data = {
'href': href,
'person': person,
**data,
'credits': credits if credits else {}
}
write_person_json(person.strip(), href, full_data)
        # Periodically flush detail.json and the CSV
loop = loop + 1
if loop % 100 == 0:
logging.info(f'flush data to json file. now fetched data count: {loop}, total count: {len(final_data)}')
write_to_detail_json(final_data)
write_to_csv(final_data)
        # Mark this href as processed
        existing_hrefs.add(href)
        # Throttle so requests are not sent too quickly and blocked
        time.sleep(1)
# Full crawl
def main():
    try:
        # Register the exit signal handlers
signal.signal(signal.SIGINT, handle_exit_signal) # Handle Ctrl+C
signal.signal(signal.SIGTERM, handle_exit_signal) # Handle kill signal
process_all()
finally:
        # Cleanup that also runs on normal exit
write_to_csv(final_data) # Write to CSV or other necessary tasks
write_to_detail_json(final_data) # Save data to JSON
logging.info("Data processing completed.")
if __name__ == "__main__":
if len(sys.argv) > 1:
url = sys.argv[1]
process_one(url)
else:
main()
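
For clarity, the distributor counting inside parse_credits_table is a plain substring match on the lower-cased distributor cell; a minimal standalone sketch with invented row values:

# The distributor strings below are made up for illustration only
distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
distributor_count = {key: 0 for key in distributor_list}

cells = ['Vixen Video', 'Blacked Raw', 'Some Other Studio']
for cell in cells:
    distributor = cell.strip().lower()
    for key in distributor_list:
        if key in distributor:
            distributor_count[key] += 1

print(distributor_count)
# {'vixen': 1, 'blacked': 1, 'tushy': 0, 'x-art': 0}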

View File

@@ -0,0 +1,140 @@
"""
Script Name:
Description: Fetches information from https://www.iafd.com, using cloudscraper to bypass Cloudflare.
detail_fetch.py Pulls details one by one from the locally saved list data and writes them to output files.
list_fetch_astro.py Fetches performer lists by astrological sign. Moderate data volume; the detail fields are fairly complete.
list_fetch_birth.py Fetches performer lists by birthday. Moderate data volume; the detail fields are fairly complete.
list_fetch_ethnic.py Fetches performer lists by ethnicity. Large data volume, but many of the detail fields are useless.
list_merge.py Takes the intersection of the three lists above to produce the overall dataset.
iafd_scrape.py Built on https://github.com/stashapp/CommunityScrapers; given a performer's IAFD link it returns data in a stashapp-compatible format. (Of limited use, since fields such as nationality and photos do not match.)
html_format.py Reads the saved HTML directory, extracts the information, and writes formatted output.
data_merge.py Merges data: it combines the performer data updated from iafd, javhd, thelordofporn and a self-hosted stashapp instance (which must be exported first);
stashdb_merge.py Batch-merges the individual performer JSON files exported from stashapp; typically the exported batch is compressed, transferred to the data/tmp directory, and unpacked there before merging,
which yields a complete data list.
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import cloudscraper
import json
import time
import csv
from bs4 import BeautifulSoup
import logging
import config
config.setup_logging()
# Base URL and variable parameters
host_url = "https://www.iafd.com"
base_url = f"{host_url}/astrology.rme/sign="
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
# Set up the headers and the scraper
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
# Result paths
res_dir = '../result'
# Collected astro_map entries
astro_map = []
# Make the network request and return the HTML
def fetch_page(url):
try:
response = scraper.get(url, headers=headers)
response.raise_for_status()
return response.text
except Exception as e:
logging.error(f"Failed to fetch {url}: {e}")
return None
# Parse the HTML content and extract the needed data
def parse_page(html, astro):
soup = BeautifulSoup(html, "html.parser")
astro_div = soup.find("div", id="astro")
if not astro_div:
logging.warning(f"Warning: No 'astro' div found in {astro}")
return None
flag = False
list_cnt = 0
birth_date = None
for elem in astro_div.find_all(recursive=False):
if elem.name == "h3" and "astroday" in elem.get("class", []):
birth_date = elem.get_text(strip=True)
elif elem.name == "div" and "perficon" in elem.get("class", []):
a_tag = elem.find("a")
if a_tag:
href = host_url + a_tag["href"]
name = a_tag.find("span", class_="perfname")
if name:
astro_map.append({
"astrology": astro,
"birth_date": birth_date,
"person": name.get_text(strip=True),
"href": href
})
flag = True
list_cnt = list_cnt +1
if flag:
logging.info(f"get {list_cnt} persons from this page. total persons: {len(astro_map)}")
return soup
else:
return None
# Handle pagination (astrology pages have no pagination)
def handle_pagination(soup, astro):
    return None
# Main logic: loop over every astrological sign
def process_astro_data():
for astro in astro_list:
url = base_url + astro
next_url = url
logging.info(f"Fetching data for {astro}, url {url} ...")
while next_url:
html = fetch_page(next_url)
if html:
soup = parse_page(html, astro)
if soup:
next_url = handle_pagination(soup, astro)
else:
logging.info(f"wrong html content. retring {next_url} ...")
# 定期保存结果
save_data()
time.sleep(2) # 控制访问频率
else:
logging.info(f"Retrying {next_url} ...")
                time.sleep(5)  # Wait before retrying
# Save to the output files
def save_data():
with open(f'{res_dir}/astro.json', 'w', encoding='utf-8') as json_file:
json.dump(astro_map, json_file, indent=4, ensure_ascii=False)
with open(f'{res_dir}/astro.csv', 'w', newline='', encoding='utf-8') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=['astrology', 'birth_date', 'person', 'href'])
writer.writeheader()
writer.writerows(astro_map)
# Run the main logic
if __name__ == '__main__':
process_astro_data()
save_data()
logging.info("Data fetching and saving completed.")

View File

@@ -0,0 +1,152 @@
"""
Script Name:
Description: Fetches information from https://www.iafd.com, using cloudscraper to bypass Cloudflare.
detail_fetch.py Pulls details one by one from the locally saved list data and writes them to output files.
list_fetch_astro.py Fetches performer lists by astrological sign. Moderate data volume; the detail fields are fairly complete.
list_fetch_birth.py Fetches performer lists by birthday. Moderate data volume; the detail fields are fairly complete.
list_fetch_ethnic.py Fetches performer lists by ethnicity. Large data volume, but many of the detail fields are useless.
list_merge.py Takes the intersection of the three lists above to produce the overall dataset.
iafd_scrape.py Built on https://github.com/stashapp/CommunityScrapers; given a performer's IAFD link it returns data in a stashapp-compatible format. (Of limited use, since fields such as nationality and photos do not match.)
html_format.py Reads the saved HTML directory, extracts the information, and writes formatted output.
data_merge.py Merges data: it combines the performer data updated from iafd, javhd, thelordofporn and a self-hosted stashapp instance (which must be exported first);
stashdb_merge.py Batch-merges the individual performer JSON files exported from stashapp; typically the exported batch is compressed, transferred to the data/tmp directory, and unpacked there before merging,
which yields a complete data list.
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import requests
import cloudscraper
import json
import time
import csv
from bs4 import BeautifulSoup
import logging
import config
config.setup_logging()
# Create a cloudscraper session
# Set up the headers and the scraper
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
# Result paths
res_dir = '../result'
# Mapping of birth dates to performers
birth_map = []
# Base URLs
host_url = "https://www.iafd.com"
base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
# Fetch the content of a calendar page
def fetch_page(month, day):
url = base_url.format(month=month, day=day)
retries = 3
while retries > 0:
try:
            # Send the request and fetch the page
logging.info(f"Fetching URL: {url}")
response = scraper.get(url)
response.raise_for_status()
return response.text
except requests.exceptions.RequestException as e:
logging.error(f"Request failed: {e}")
retries -= 1
            time.sleep(2)  # Wait 2 seconds before retrying
return None
# Parse the page content and update birth_map
def parse_page(html, month, day):
soup = BeautifulSoup(html, 'html.parser')
datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
if not datarows:
return None
flag = False
list_cnt = 0
rows = datarows[0].find_all('div', class_='col-sm-4')
for row in rows:
link_tag = row.find('a')
person = link_tag.text.strip() if link_tag else ''
href = link_tag['href'] if link_tag else ''
href = host_url + href
        # Skip hrefs that are already in birth_map
        flag = True
        if any(entry['href'] == href for entry in birth_map):
            continue
        # Add the entry to birth_map
birth_map.append({
'month': month,
'day': day,
'person': person,
'href': href
})
list_cnt = list_cnt +1
if flag:
logging.info(f"get {list_cnt} persons from this page. total persons: {len(birth_map)}")
return soup
else:
return None
# Loop over every date
def fetch_birthdays():
    for month in range(1, 13):  # Months 1 through 12
        for day in range(1, 32):  # Days 1 through 31
logging.info(f"Processing: Month {month}, Day {day}")
while True:
html = fetch_page(month, day)
if html:
soup = parse_page(html, month, day)
if soup:
                        # Save the results periodically
                        save_data()
                        # Break out of this while loop and move on to the next date
                        time.sleep(2)  # Throttle the request rate
break
else:
logging.warning(f"No data. Retrying: Month {month}, Day {day}")
                        time.sleep(3)  # Wait before retrying
else:
logging.warning(f"Network error. Retrying: Month {month}, Day {day}")
                    time.sleep(3)  # Wait before retrying
# Save birth_map to the JSON and CSV files
def save_data():
with open(f'{res_dir}/birth.json', 'w', encoding='utf-8') as f:
json.dump(birth_map, f, ensure_ascii=False, indent=4)
with open(f'{res_dir}/birth.csv', 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=['month', 'day', 'person', 'href'])
writer.writeheader()
for entry in birth_map:
writer.writerow(entry)
# Main function
def main():
    # Fetch the data
    fetch_birthdays()
    # Save the results
save_data()
if __name__ == '__main__':
main()
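
For reference, the calendar URL template above expands like this; note that the nested loops issue all 12 x 31 = 372 month/day combinations, including dates that do not exist on the calendar:

base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"

print(base_url.format(month=1, day=1))    # https://www.iafd.com/calendar.asp?calmonth=1&calday=1
print(base_url.format(month=12, day=31))  # https://www.iafd.com/calendar.asp?calmonth=12&calday=31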

View File

@@ -0,0 +1,166 @@
"""
Script Name:
Description: Fetches information from https://www.iafd.com, using cloudscraper to bypass Cloudflare.
detail_fetch.py Pulls details one by one from the locally saved list data and writes them to output files.
list_fetch_astro.py Fetches performer lists by astrological sign. Moderate data volume; the detail fields are fairly complete.
list_fetch_birth.py Fetches performer lists by birthday. Moderate data volume; the detail fields are fairly complete.
list_fetch_ethnic.py Fetches performer lists by ethnicity. Large data volume, but many of the detail fields are useless.
list_merge.py Takes the intersection of the three lists above to produce the overall dataset.
iafd_scrape.py Built on https://github.com/stashapp/CommunityScrapers; given a performer's IAFD link it returns data in a stashapp-compatible format. (Of limited use, since fields such as nationality and photos do not match.)
html_format.py Reads the saved HTML directory, extracts the information, and writes formatted output.
data_merge.py Merges data: it combines the performer data updated from iafd, javhd, thelordofporn and a self-hosted stashapp instance (which must be exported first);
stashdb_merge.py Batch-merges the individual performer JSON files exported from stashapp; typically the exported batch is compressed, transferred to the data/tmp directory, and unpacked there before merging,
which yields a complete data list.
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import cloudscraper
import json
import time
import csv
from bs4 import BeautifulSoup
import logging
import config
config.setup_logging()
# Base URL and variable parameters
host_url = "https://www.iafd.com"
base_url = f"{host_url}/lookupethnic.rme/ethnic="
ethnic_list = ['caucasian', 'black', 'asian', 'latin', 'native american', 'middle eastern', 'mediteranean', 'indian', 'polynesian', 'multi-ethnic', 'ethnic', 'romani', 'eurasian', 'north african', 'south asian']
# Set up the headers and the scraper
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
# Result paths
res_dir = '../result'
# Collected ethnic_map entries
ethnic_map = []
# Make the network request and return the HTML
def fetch_page(url):
try:
response = scraper.get(url, headers=headers)
response.raise_for_status()
return response.text
except Exception as e:
logging.error(f"Failed to fetch {url}: {e}")
return None
# Parse the HTML content and extract the needed data
def parse_page(html, ethnic):
    # Manually patch the HTML
    html = html.replace('<br>', '').replace('<a ', '<a target="_blank" ')  # Work around some malformed tags
    soup = BeautifulSoup(html, 'lxml')  # Use the lxml parser
#soup = BeautifulSoup(html, 'html.parser')
rows = soup.find_all('div', class_='row headshotrow')
flag = False
list_cnt = 0
for row in rows:
for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
link_tag = col.find('a')
img_tag = col.find('div', class_='pictag')
flag = True
if link_tag and img_tag:
href = host_url + link_tag['href']
person = img_tag.text.strip()
                # Store the entry in ethnic_map
ethnic_map.append({
'ethnic': ethnic,
'person': person,
'href': href
})
list_cnt = list_cnt +1
if flag:
logging.info(f"get {list_cnt} persons from this page. total persons: {len(ethnic_map)}")
return soup
else:
return None
# Handle pagination
def handle_pagination(soup, ethnic):
next_page = soup.find('a', rel='next')
if next_page:
next_url = host_url + next_page['href']
logging.info(f"Found next page: {next_url}")
return next_url
else:
logging.info(f"All pages fetched for {ethnic}.")
return None
# URL-encode ethnicity names that contain spaces
def format_ethnic(ethnic):
    return ethnic.replace(' ', '+')
# Main logic: loop over every ethnicity
def process_ethnic_data():
    all_person = len(ethnic_map)  # Should be 0 at this point
all_pages = 0
for ethnic in ethnic_list:
url = base_url + format_ethnic(ethnic)
next_url = url
cursor = int(all_person / 100)
pages = 0
logging.info(f"--------Fetching data for {ethnic}, url {url} ...")
while next_url:
html = fetch_page(next_url)
if html:
soup = parse_page(html, ethnic)
if soup:
next_url = handle_pagination(soup, ethnic)
pages = pages + 1
else:
logging.info(f"wrong html content. retring {next_url} ...")
# 统计,并定期保存结果
if len(ethnic_map) / 100 > cursor:
cursor = int(len(ethnic_map) / 100)
save_data()
                time.sleep(2)  # Throttle the request rate
else:
logging.info(f"Retrying {next_url} ...")
                time.sleep(5)  # Wait before retrying
        # Per-ethnicity statistics
ethnic_person = len(ethnic_map) - all_person
all_person = len(ethnic_map)
all_pages = all_pages + pages
logging.info(f"--------Fetching data for {ethnic} end. total pages: {pages}, total persons: {ethnic_person}, all persons fetched: {all_person}")
    # Final overall statistics
logging.info(f"--------Fetching all data end. total ethnic: {len(ethnic_list)}, total pages: {all_pages}, total persons: {all_person}")
# Save to the output files
def save_data():
with open(f'{res_dir}/ethnic.json', 'w', encoding='utf-8') as json_file:
json.dump(ethnic_map, json_file, indent=4, ensure_ascii=False)
with open(f'{res_dir}/ethnic.csv', 'w', newline='', encoding='utf-8') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=['ethnic', 'person', 'href'])
writer.writeheader()
writer.writerows(ethnic_map)
# Run the main logic
if __name__ == '__main__':
process_ethnic_data()
save_data()
logging.info("Data fetching and saving completed.")

View File

@@ -0,0 +1,120 @@
"""
Script Name:
Description: Fetches information from https://www.iafd.com, using cloudscraper to bypass Cloudflare.
detail_fetch.py Pulls details one by one from the locally saved list data and writes them to output files.
list_fetch_astro.py Fetches performer lists by astrological sign. Moderate data volume; the detail fields are fairly complete.
list_fetch_birth.py Fetches performer lists by birthday. Moderate data volume; the detail fields are fairly complete.
list_fetch_ethnic.py Fetches performer lists by ethnicity. Large data volume, but many of the detail fields are useless.
list_merge.py Takes the intersection of the three lists above to produce the overall dataset.
iafd_scrape.py Built on https://github.com/stashapp/CommunityScrapers; given a performer's IAFD link it returns data in a stashapp-compatible format. (Of limited use, since fields such as nationality and photos do not match.)
html_format.py Reads the saved HTML directory, extracts the information, and writes formatted output.
data_merge.py Merges data: it combines the performer data updated from iafd, javhd, thelordofporn and a self-hosted stashapp instance (which must be exported first);
stashdb_merge.py Batch-merges the individual performer JSON files exported from stashapp; typically the exported batch is compressed, transferred to the data/tmp directory, and unpacked there before merging,
which yields a complete data list.
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import json
import csv
import os
import argparse
from collections import defaultdict
# Result paths
res_dir = '../result'
# Read a JSON file and return its contents
def read_json(file_path):
try:
with open(file_path, 'r', encoding='utf-8') as f:
return json.load(f)
except FileNotFoundError:
print(f"文件 {file_path} 未找到.")
return []
except json.JSONDecodeError:
print(f"文件 {file_path} 解析错误.")
return []
# Process the data: de-duplicate and merge the person field
def process_data(files):
    href_map = defaultdict(list)
    # Read and process each file
for file in files:
data = read_json(file['path'])
for entry in data:
href = entry.get('href')
person = entry.get('person')
if href:
href_map[href].append(person)
    # Merge the persons that share the same href, joining them with "|"
    result = []
    for href, persons in href_map.items():
        person = '|'.join(set(persons))  # De-duplicate, then join
result.append({'href': href, 'person': person})
return result
# Save the result to a JSON file
def save_to_json(data, output_file):
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)
# Save the result to a CSV file
def save_to_csv(data, output_file):
with open(output_file, 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=['href', 'person'])
writer.writeheader()
writer.writerows(data)
# Main function: process the data and save the output
def main():
    # Get the command-line arguments with argparse
    parser = argparse.ArgumentParser(description="Merge multiple JSON files and output a new JSON and CSV file")
    parser.add_argument('files', nargs='+', choices=['birth', 'astro', 'ethnic'],
                        help="Files to merge; at least two, at most three of: birth, astro, ethnic")
args = parser.parse_args()
    # Require at least two files
    if len(args.files) < 2:
        print("Please select at least two files to merge.")
        return
    # Files that can be merged
file_map = {
'birth': f'{res_dir}/birth.json',
'astro': f'{res_dir}/astro.json',
'ethnic': f'{res_dir}/ethnic.json'
}
files = [{'path': file_map[file], 'name': file} for file in args.files]
    # Process the data
    processed_data = process_data(files)
    # Build the merged output file names from the selected inputs
    output_json_file = f'{res_dir}/merged_{"_".join(args.files)}.json'
    output_csv_file = f'{res_dir}/merged_{"_".join(args.files)}.csv'
    # Make sure the result directory exists
    os.makedirs(f'{res_dir}', exist_ok=True)
    # Write the results to the JSON and CSV files
    save_to_json(processed_data, output_json_file)
    save_to_csv(processed_data, output_csv_file)
    print(f"Processing complete. Results saved to {output_json_file} and {output_csv_file}")
if __name__ == "__main__":
main()
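
A small worked example of the merge rule implemented by process_data above; the hrefs and names are invented:

from collections import defaultdict

entries = [
    {'href': 'https://www.iafd.com/person.rme/id=aaa', 'person': 'Jane Doe'},
    {'href': 'https://www.iafd.com/person.rme/id=aaa', 'person': 'Jane D.'},
    {'href': 'https://www.iafd.com/person.rme/id=bbb', 'person': 'John Roe'},
]

href_map = defaultdict(list)
for entry in entries:
    href_map[entry['href']].append(entry['person'])

merged = [{'href': href, 'person': '|'.join(set(persons))}
          for href, persons in href_map.items()]
# id=aaa keeps both spellings joined with '|' (set order is not guaranteed),
# id=bbb stays as 'John Roe'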