resources/scripts/iafd/detail_fetch.py
"""
Script Name:
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
从而获取到一份完整的数据列表。
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import config

# Configure logging
config.setup_logging()

# Result paths
res_dir = './result'
res_json_file = f'{res_dir}/detail.json'
res_csv_file = f'{res_dir}/detail.csv'
input_json_file = f'{res_dir}/merged.json'

# Accumulated results
final_data = []

# Load the hrefs already present in detail.json so they can be skipped,
# and keep the previously saved entries so rewriting detail.json does not drop them.
def load_existing_hrefs():
    existing_hrefs = set()
    try:
        with open(res_json_file, 'r') as file:
            data = json.load(file)
            for entry in data:
                existing_hrefs.add(entry['href'])
            # Preserve already-fetched records: detail.json is rewritten from
            # final_data, so without this, earlier results would be lost on re-runs.
            final_data.extend(data)
    except FileNotFoundError:
        logging.info("detail.json not found, starting fresh.")
    return existing_hrefs
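
# The parser in fetch_and_parse_page below assumes IAFD bio pages roughly follow
# this markup (a simplified sketch, not verified against every page layout):
#   <p class="bioheading">Birthday</p>
#   <p class="biodata">January 1, 1990</p>
#   ...
#   <p class="bioheading">Performer AKA</p>
#   <div class="biodata">Alias One<br>Alias Two</div>
# Headings are matched against the `fields` map; "Performer AKA" is split into a list.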

# Fetch a page and extract the fields we need
def fetch_and_parse_page(url, scraper):
    try:
        response = scraper.get(url)
        if response.status_code != 200:
            logging.warning(f"Failed to fetch {url}, Status code: {response.status_code}")
            return None
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extracted data
        data = {}
        # Map our field names to the headings used in the HTML
        fields = {
            'performer_aka': 'Performer AKA',
            'birthday': 'Birthday',
            'astrology': 'Astrology',
            'birthplace': 'Birthplace',
            'gender': 'Gender',
            'years_active': 'Years Active',
            'ethnicity': 'Ethnicity',
            'nationality': 'Nationality',
            'hair_colors': 'Hair Colors',
            'eye_color': 'Eye Color',
            'height': 'Height',
            'weight': 'Weight',
            'measurements': 'Measurements',
            'tattoos': 'Tattoos',
            'piercings': 'Piercings'
        }
        reversed_map = {v: k for k, v in fields.items()}
        # Walk every bioheading
        bioheadings = soup.find_all('p', class_='bioheading')
        for bio in bioheadings:
            heading = bio.text.strip()
            biodata = None
            # Headings containing "Performer" need special handling:
            # the aliases live in a div and are collected into a list
            if 'Performer' in heading:
                heading = 'Performer AKA'
                biodata_div = bio.find_next('div', class_='biodata')
                if biodata_div:
                    div_text = biodata_div.get_text(separator='|').strip()
                    biodata = [b.strip() for b in div_text.split('|') if b.strip()]
            else:
                biodata_p = bio.find_next('p', class_='biodata')
                biodata = biodata_p.text.strip() if biodata_p else ''
            # Store the value under our field name
            if heading in reversed_map:
                kkey = reversed_map[heading]
                data[kkey] = biodata
        return data
    except RequestException as e:
        logging.error(f"Error fetching {url}: {e}")
        return None

# Write detail.json
def write_to_detail_json(data):
    with open(res_json_file, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4, ensure_ascii=False)
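
# Illustrative shape of one record in detail.json (values are made up; field
# names follow the `fields` map in fetch_and_parse_page):
# {
#     "href": "<performer detail URL taken from merged.json>",
#     "person": "<performer name>",
#     "performer_aka": ["<alias 1>", "<alias 2>"],
#     "birthday": "...",
#     "nationality": "...",
#     ...
# }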

# Write the CSV file
def write_to_csv(data):
    try:
        with open(res_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            header = ['person', 'href', 'performer_aka', 'birthday', 'astrology', 'birthplace', 'gender', 'years_active', 'ethnicity', 'nationality', 'hair_colors', 'eye_color', 'height', 'weight', 'measurements', 'tattoos', 'piercings']
            writer.writerow(header)
            for entry in data:
                writer.writerow([
                    entry.get('person', ''),
                    entry.get('href', ''),
                    # performer_aka is a list (or may be missing/None); join with "|"
                    '|'.join(entry.get('performer_aka') or []),
                    entry.get('birthday', ''),
                    entry.get('astrology', ''),
                    entry.get('birthplace', ''),
                    entry.get('gender', ''),
                    entry.get('years_active', ''),
                    entry.get('ethnicity', ''),
                    entry.get('nationality', ''),
                    entry.get('hair_colors', ''),
                    entry.get('eye_color', ''),
                    entry.get('height', ''),
                    entry.get('weight', ''),
                    entry.get('measurements', ''),
                    entry.get('tattoos', ''),
                    entry.get('piercings', '')
                ])
    except Exception as e:
        logging.error(f"Error writing to CSV: {e}")

# Flush results to disk and exit cleanly on SIGINT/SIGTERM
def handle_exit_signal(signum, frame):
    logging.info("Gracefully exiting... Saving remaining data to JSON and CSV.")
    write_to_csv(final_data)  # Ensure final data is written when exiting
    write_to_detail_json(final_data)
    sys.exit(0)

def main():
    # Initialise cloudscraper
    scraper = cloudscraper.create_scraper()
    # Load the set of hrefs that have already been processed
    existing_hrefs = load_existing_hrefs()
    # Read merged.json
    with open(input_json_file, 'r') as file:
        merged_data = json.load(file)
    # Register exit signal handlers
    signal.signal(signal.SIGINT, handle_exit_signal)   # Handle Ctrl+C
    signal.signal(signal.SIGTERM, handle_exit_signal)  # Handle kill signal
    # Walk the entries in merged.json
    loop = 0
    for entry in merged_data:
        href = entry.get('href')
        person = entry.get('person')
        if href in existing_hrefs:
            logging.info(f"Skipping {href} - already processed")
            continue
        logging.info(f"Processing {href} - {person}")
        # Fetch and parse the detail page
        data = fetch_and_parse_page(href, scraper)
        if data:
            # If the fetch succeeded, append the record to final_data
            final_data.append({
                'href': href,
                'person': person,
                **data
            })
            loop += 1
            if loop % 100 == 0:
                # Periodically flush detail.json
                logging.info(f'Flushing data to JSON file. Current record count: {loop}')
                write_to_detail_json(final_data)
            # Mark this href as processed
            existing_hrefs.add(href)
        # Throttle requests to avoid being blocked
        time.sleep(1)
    # Write JSON and CSV once everything has been processed
    write_to_detail_json(final_data)
    write_to_csv(final_data)
    logging.info("Data processing completed.")
if __name__ == "__main__":
main()