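"""Fetch movie detail pages from www.iafd.com and export them to JSON/CSV.

stock/scripts/iafd/src_json/movie_detail_fetch.py

Reads movie hrefs from ../result/movie_list.json, scrapes each detail page
via cloudscraper, writes one JSON file per movie under ../result/movies, and
aggregates everything into movie_details.json and movie_details.csv in the
shared data directory. Hrefs already present in the output are skipped, so
interrupted runs can resume.
"""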

import os
import json
import csv
import time
import logging
import sys
import signal
import re

import cloudscraper
from bs4 import BeautifulSoup

import config

config.setup_logging()

# Base URL and variable parameters
host_url = "https://www.iafd.com"

# Directories and file paths
RESULT_DIR = "../result"
OUTPUT_DIR = f"{config.global_share_data_dir}/iafd"
INPUT_FILE = os.path.join(RESULT_DIR, "movie_list.json")
OUTPUT_JSON = os.path.join(OUTPUT_DIR, "movie_details.json")
OUTPUT_CSV = os.path.join(OUTPUT_DIR, "movie_details.csv")
BATCH_SIZE = 100  # flush to the output files every 100 records
movies_dir = f'{RESULT_DIR}/movies'

# Cloudflare bypass client
scraper = cloudscraper.create_scraper()

# Accumulated results
all_movies = []
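# Each entry in INPUT_FILE is expected to carry at least "title" and "href"
# keys (inferred from process_movies below), e.g. roughly:
#   {"title": "<movie title>", "href": "https://www.iafd.com/...id=<uuid>"}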
def load_existing_data():
    """Load already-processed data so that interrupted runs can resume."""
    if os.path.exists(OUTPUT_JSON):
        with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
            try:
                return json.load(f)
            except json.JSONDecodeError:
                return []
    return []
def save_data():
    """Save the accumulated data to the JSON and CSV files."""
    logging.info("Saving data...")
    global all_movies
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(all_movies, f, indent=4, ensure_ascii=False)
    with open(OUTPUT_CSV, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["href", "title", "Minutes", "Distributor", "Studio", "ReleaseDate",
                         "AddedtoIAFDDate", "All-Girl", "All-Male", "Compilation", "Webscene", "Director"])
        for movie in all_movies:
            writer.writerow([movie["href"], movie["title"], movie["Minutes"], movie["Distributor"],
                             movie["Studio"], movie["ReleaseDate"], movie["AddedtoIAFDDate"], movie["All-Girl"],
                             movie["All-Male"], movie["Compilation"], movie["Webscene"], movie["Director"]])
# Request a page and return its HTML
def fetch_html(href):
    """Request a page and return its HTML, retrying up to three times."""
    for attempt in range(3):
        try:
            response = scraper.get(href, timeout=10)
            if response.status_code == 200:
                return response.text
        except Exception as e:
            logging.warning(f"Error fetching {href}: {e}")
        time.sleep(2)
    logging.error(f"Failed to fetch {href} after 3 attempts")
    return None
# Parse the movie page HTML and extract the movie's details
def parse_movie_details(html, href, title):
    """Parse the movie page HTML and extract the movie's details."""
    soup = BeautifulSoup(html, "html.parser")
    # Basic movie information
    movie_data = {}
    info_div = soup.find("div", class_="col-xs-12 col-sm-3")
    if info_div:
        labels = info_div.find_all("p", class_="bioheading")
        values = info_div.find_all("p", class_="biodata")
        for label, value in zip(labels, values):
            key = label.text.strip()
            val = value.text.strip()
            if key in ["Distributor", "Studio", "Director"]:
                link = value.find("a")
                if link:
                    val = link.text.strip()
                    movie_data[f'{key}Href'] = host_url + link['href']
            movie_data[key] = val
    else:
        return None
    # Cast information
    performers = []
    cast_divs = soup.find_all("div", class_="castbox")
    for cast in cast_divs:
        performer = {}
        link = cast.find("a")
        if link:
            performer["name"] = link.text.strip()
            performer["href"] = host_url + link["href"]
            performer["tags"] = [
                tag.strip() for br in cast.find_all("br")
                if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
            ]
            performers.append(performer)
    # Scene breakdowns
    scene_breakdowns = []
    scene_table = soup.find("div", id="sceneinfo")
    if scene_table:
        rows = scene_table.find_all("tr")
        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 2:
                scene = cols[0].text.strip()  # scene number
                performer_info = cols[1]      # performers plus related links
                # Take the HTML before the first <br>/<br/> (keeps inline tags such as <i>)
                performer_html = str(performer_info)
                performers_html = re.split(r"<br\s*/?>", performer_html, maxsplit=1)[0].strip()
                # Strip the remaining HTML tags and keep only the text
                performers_soup = BeautifulSoup(performers_html, "html.parser")
                performers_text = performers_soup.get_text()
                # Extract the performers
                scene_performers = [p.strip() for p in performers_text.split(",")]
                # Try to extract the webscene and studio links
                links_data = {}
                links = performer_info.find_all("a")
                if links:
                    webscene_title = links[0].text.strip() if len(links) > 0 else None
                    webscene = links[0]["href"] if len(links) > 0 else None
                    studio = links[1].text.strip() if len(links) > 1 else None
                    studio_lnk = links[1]["href"] if len(links) > 1 else None
                    links_data = {
                        "title": webscene_title,
                        "webscene": webscene,
                        "studio": studio,
                        "studio_lnk": studio_lnk,
                    }
                scene_data = {
                    "scene": scene,
                    "performers": scene_performers,
                    **links_data,
                }
                scene_breakdowns.append(scene_data)
    # "Appears In" list
    appears_in = []
    appears_divs = soup.find("div", id="appearssection")
    if appears_divs:
        rows = appears_divs.find_all("li")
        for row in rows:
            lnk = row.find("a")
            if lnk:
                appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
    return {
        "href": href,
        "title": title,
        "Minutes": movie_data.get("Minutes", ""),
        "Distributor": movie_data.get("Distributor", ""),
        "Studio": movie_data.get("Studio", ""),
        "ReleaseDate": movie_data.get("Release Date", ""),
        "AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
        "All-Girl": movie_data.get("All-Girl", ""),
        "All-Male": movie_data.get("All-Male", ""),
        "Compilation": movie_data.get("Compilation", ""),
        "Webscene": movie_data.get("Webscene", ""),
        "Director": movie_data.get("Director", ""),
        "DirectorHref": movie_data.get("DirectorHref", ""),
        "DistributorHref": movie_data.get("DistributorHref", ""),
        "StudioHref": movie_data.get("StudioHref", ""),
        "Performers": performers,
        "SceneBreakdowns": scene_breakdowns,
        "AppearsIn": appears_in,
    }
# Create a sub-directory keyed by the first character of the name
def create_sub_directory(base_dir, name):
    # Use the first character of the name, lower-cased, as the sub-directory
    sub_dir = name[:1].lower()
    full_path = os.path.join(base_dir, sub_dir)
    if not os.path.exists(full_path):
        os.makedirs(full_path)
    return full_path


# Extract the id value from a URL such as
# https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
    """Extract the id parameter from an href."""
    match = re.search(r'id=([a-f0-9\-]+)', href)
    return match.group(1) if match else ''
# Write each movie to its own JSON file
def write_movie_json(href, data):
    # Work out the target directory and file name from the movie id
    movie_id = extract_id_from_href(href)
    movie_dir = create_sub_directory(movies_dir, movie_id)
    movie_filename = f"{movie_id}.json"
    full_path = os.path.join(movie_dir, movie_filename)
    try:
        with open(full_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {full_path}: {e}")
def process_movies():
    """Process every movie in the input list."""
    global all_movies
    all_movies = load_existing_data()
    processed_hrefs = {movie["href"] for movie in all_movies}
    # Read the movie list file
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        movies = json.load(f)
    count = 0
    for entry in movies:
        href = entry["href"]
        title = entry["title"]
        if href in processed_hrefs:
            logging.info(f"Skipping existing: {title} ({href})")
            continue  # skip already-processed entries
        logging.info(f"Processing: {title} ({href})")
        while True:
            html = fetch_html(href)
            if not html:
                logging.warning(f'Retrying {title} ({href})')
                continue  # fetch failed, try again
            movie = parse_movie_details(html, href, title)
            if not movie:
                logging.warning(f'Retrying {title} ({href})')
                continue  # parse failed, try again
            all_movies.append(movie)
            count += 1
            # Write the per-movie JSON file
            write_movie_json(href, movie)
            break
        # Flush to the JSON/CSV files every BATCH_SIZE records
        if count % BATCH_SIZE == 0:
            save_data()
    # Final save
    save_data()
    logging.info("Task completed.")
# Fetch a single specified URL
def process_one(href):
    # Fetch and parse the page, retrying until it succeeds
    # (fetch_html reuses the module-level cloudscraper client)
    movie = {}
    while True:
        html = fetch_html(href)
        if not html:
            logging.warning(f'Fetching {href} failed. Retrying...')
            continue  # fetch failed, try again
        movie = parse_movie_details(html, href, 'title')  # placeholder title
        if movie:
            break
        logging.warning(f'Parsing {href} failed. Retrying...')
    if movie:
        write_movie_json(href, movie)
        print(f'Fetch succeeded. Result saved in {movies_dir}')
# Save whatever has been collected when the program is terminated
def handle_exit_signal(signum, frame):
    logging.info("Gracefully exiting... Saving remaining data to JSON and CSV.")
    save_data()
    sys.exit(0)


# Full crawl
def main():
    try:
        # Register exit-signal handlers
        signal.signal(signal.SIGINT, handle_exit_signal)   # Handle Ctrl+C
        signal.signal(signal.SIGTERM, handle_exit_signal)  # Handle kill signal
        process_movies()
    finally:
        # Cleanup: make sure data is also saved on normal exit
        save_data()
        logging.info("Data processing completed.")
# Entry point: if a URL is given on the command line, fetch just that one,
# otherwise run the full crawl
if __name__ == "__main__":
    if len(sys.argv) > 1:
        url = sys.argv[1]
        process_one(url)
    else:
        main()
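# Usage (a sketch, assuming the script is run from its own directory so the
# relative ../result paths resolve):
#   python movie_detail_fetch.py                    # full crawl of movie_list.json
#   python movie_detail_fetch.py <iafd movie URL>   # fetch a single movie page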