stock/scripts/iafd/movie_meta_fetch.py

import os
import json
import csv
import time
import logging
import sys
import signal
import re
import cloudscraper
from bs4 import BeautifulSoup
import config
config.setup_logging()
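# Scrapes movie detail pages from IAFD: reads result/movie_list.json, fetches each
# page through cloudscraper (to get past Cloudflare), parses it with BeautifulSoup,
# and writes the collected details to result/movie_details.json / .csv in batches.
# Passing a single movie URL on the command line fetches just that page instead.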
# Base URL and variable parameters
host_url = "https://www.iafd.com"
# Directory and file paths
RESULT_DIR = "result"
INPUT_FILE = os.path.join(RESULT_DIR, "movie_list.json")
OUTPUT_JSON = os.path.join(RESULT_DIR, "movie_details.json")
OUTPUT_CSV = os.path.join(RESULT_DIR, "movie_details.csv")
BATCH_SIZE = 100  # flush results to disk every 100 records
# Initialize the Cloudflare bypass client
scraper = cloudscraper.create_scraper()
# Crawl state shared with the exit handlers so an interrupted run can still be flushed
all_movies = []
new_movies = []
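# NOTE: INPUT_FILE (result/movie_list.json) is expected to be a JSON array of
# objects that each carry at least "href" and "title" keys; process_movies()
# reads exactly those two fields from every entry.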
def load_existing_data():
    """Load previously processed data so an interrupted run can resume."""
    if os.path.exists(OUTPUT_JSON):
        with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
            try:
                return json.load(f)
            except json.JSONDecodeError:
                return []
    return []
def save_data(movies):
    """Write the collected data to the JSON and CSV output files."""
    logging.info("Saving data...")
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(movies, f, indent=4, ensure_ascii=False)
    with open(OUTPUT_CSV, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["href", "title", "Minutes", "Distributor", "Studio", "ReleaseDate",
                         "AddedtoIAFDDate", "All-Girl", "All-Male", "Compilation", "Webscene", "Director"])
        for movie in movies:
            writer.writerow([movie["href"], movie["title"], movie["Minutes"], movie["Distributor"],
                             movie["Studio"], movie["ReleaseDate"], movie["AddedtoIAFDDate"], movie["All-Girl"],
                             movie["All-Male"], movie["Compilation"], movie["Webscene"], movie["Director"]])
def fetch_html(href):
    """Fetch the page and return its HTML, retrying up to three times."""
    for attempt in range(3):
        try:
            response = scraper.get(href, timeout=10)
            if response.status_code == 200:
                return response.text
        except Exception as e:
            logging.warning(f"Error fetching {href}: {e}")
        time.sleep(2)
    logging.error(f"Failed to fetch {href} after 3 attempts")
    return None
def parse_movie_details(html, href, title):
    """Parse the page HTML and extract the movie details."""
    soup = BeautifulSoup(html, "html.parser")
    # Parse the basic movie info
    movie_data = {}
    director_href = ''
    info_div = soup.find("div", class_="col-xs-12 col-sm-3")
    if info_div:
        labels = info_div.find_all("p", class_="bioheading")
        values = info_div.find_all("p", class_="biodata")
        for label, value in zip(labels, values):
            key = label.text.strip()
            val = value.text.strip()
            if key in ["Distributor", "Studio", "Director"]:
                link = value.find("a")
                if link:
                    val = link.text.strip()
                    if key == 'Director':
                        director_href = host_url + link['href']
            movie_data[key] = val
    else:
        return None
    # Parse the cast / performer info
    performers = []
    cast_divs = soup.find_all("div", class_="castbox")
    for cast in cast_divs:
        performer = {}
        link = cast.find("a")
        if link:
            performer["name"] = link.text.strip()
            performer["href"] = host_url + link["href"]
            performer["tags"] = [
                tag.strip() for br in cast.find_all("br")
                if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
            ]
            performers.append(performer)
    # Parse the scene breakdowns
    scene_breakdowns = []
    scene_table = soup.find("div", id="sceneinfo")
    if scene_table:
        rows = scene_table.find_all("tr")
        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 2:
                scene = cols[0].text.strip()
                scene_performers = [p.strip() for p in cols[1].text.split(",")]
                scene_breakdowns.append({"scene": scene, "performers": scene_performers})
    appears_in = []
    appears_divs = soup.find("div", id="appearssection")
    if appears_divs:
        rows = appears_divs.find_all("li")
        for row in rows:
            lnk = row.find("a")
            if lnk:
                appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
    return {
        "href": href,
        "title": title,
        "Minutes": movie_data.get("Minutes", ""),
        "Distributor": movie_data.get("Distributor", ""),
        "Studio": movie_data.get("Studio", ""),
        "ReleaseDate": movie_data.get("Release Date", ""),
        "AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
        "All-Girl": movie_data.get("All-Girl", ""),
        "All-Male": movie_data.get("All-Male", ""),
        "Compilation": movie_data.get("Compilation", ""),
        "Webscene": movie_data.get("Webscene", ""),
        "Director": movie_data.get("Director", ""),
        "DirectorHref": director_href,
        "Performers": performers,
        "SceneBreakdowns": scene_breakdowns,
        "AppearsIn": appears_in,
    }
def process_movies():
    """Walk the movie list, fetch and parse each detail page, and save in batches."""
    global all_movies, new_movies
    all_movies = load_existing_data()
    processed_hrefs = {movie["href"] for movie in all_movies}
    # Read the movie list produced by the earlier crawl step
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        movies = json.load(f)
    new_movies = []
    count = 0
    for entry in movies:
        href = entry["href"]
        title = entry["title"]
        if href in processed_hrefs:
            continue  # skip entries that were already processed
        logging.info(f"Processing: {title} ({href})")
        html = fetch_html(href)
        if not html:
            continue  # fetch failed, skip
        movie = parse_movie_details(html, href, title)
        if not movie:
            logging.warning(f"Could not parse details for {href}, skipping")
            continue
        new_movies.append(movie)
        count += 1
        # Flush to disk every BATCH_SIZE records
        if count % BATCH_SIZE == 0:
            save_data(all_movies + new_movies)
    # Final save
    all_movies.extend(new_movies)
    new_movies = []
    save_data(all_movies)
    logging.info("Task completed.")
def extract_id_from_href(href):
    """Extract the id parameter from an href such as
    https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586."""
    match = re.search(r'id=([a-f0-9\-]+)', href)
    return match.group(1) if match else ''
def process_one(href):
    """Fetch and parse a single specified movie URL and save the result to <id>.json."""
    movie = {}
    while True:
        html = fetch_html(href)
        if not html:
            logging.warning(f'fetching {href} error. retrying...')
            continue  # fetch failed, try again
        movie = parse_movie_details(html, href, 'title')
        if movie:
            break
        logging.warning(f'parsing {href} error. retrying...')
    movie_id = extract_id_from_href(href)
    filename = f"{movie_id}.json"
    try:
        with open(filename, 'w', encoding='utf-8') as json_file:
            json.dump(movie, json_file, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {filename}: {e}")
    print(f'Fetch succeeded. Result saved to {filename}')
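# Signal handler registered in main() for SIGINT/SIGTERM: flushes whatever has
# been collected so far before the process exits.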
def handle_exit_signal(signum, frame):
    logging.info("Gracefully exiting... Saving remaining data to JSON and CSV.")
    save_data(all_movies + new_movies)
    sys.exit(0)
# Full crawl
def main():
    try:
        # Register exit signal handlers
        signal.signal(signal.SIGINT, handle_exit_signal)   # Handle Ctrl+C
        signal.signal(signal.SIGTERM, handle_exit_signal)  # Handle kill signal
        process_movies()
    finally:
        # Cleanup: make sure whatever was collected is persisted on normal exit
        save_data(all_movies + new_movies)
        logging.info("Data processing completed.")
if __name__ == "__main__":
    if len(sys.argv) > 1:
        url = sys.argv[1]
        process_one(url)
    else:
        main()
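
The script imports a sibling config module and calls config.setup_logging(); that file is not part of this listing. A minimal sketch of what it presumably provides, assuming plain stdlib logging (an assumption; the real config.py may differ):

import logging

def setup_logging():
    # Assumed minimal setup: INFO-level logging with timestamps.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )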