"""Crawl movie detail pages from IAFD.

Run without arguments to process every entry in result/movie_list.json,
or pass a single URL to fetch just that page and write <id>.json.
"""

import os
import json
import csv
import time
import logging
import sys
import signal
import re

import cloudscraper
from bs4 import BeautifulSoup

import config

config.setup_logging()

# Base URL for building absolute links
host_url = "https://www.iafd.com"

# Directories and file paths
RESULT_DIR = "result"
INPUT_FILE = os.path.join(RESULT_DIR, "movie_list.json")
OUTPUT_JSON = os.path.join(RESULT_DIR, "movie_details.json")
OUTPUT_CSV = os.path.join(RESULT_DIR, "movie_details.csv")
BATCH_SIZE = 100  # Flush results to disk every 100 records

# Initialize the Cloudflare bypass client
scraper = cloudscraper.create_scraper()

# In-memory store of all scraped movies, shared so the exit handlers can flush it
collected_movies = []


def load_existing_data():
    """Load previously processed data so an interrupted crawl can resume."""
    if os.path.exists(OUTPUT_JSON):
        with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
            try:
                return json.load(f)
            except json.JSONDecodeError:
                return []
    return []


def save_data(all_movies):
    """Save the collected data to the JSON and CSV output files."""
    logging.info("Saving data...")
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(all_movies, f, indent=4, ensure_ascii=False)
    with open(OUTPUT_CSV, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["href", "title", "Minutes", "Distributor", "Studio", "ReleaseDate",
                         "AddedtoIAFDDate", "All-Girl", "All-Male", "Compilation", "Webscene", "Director"])
        for movie in all_movies:
            writer.writerow([movie["href"], movie["title"], movie["Minutes"], movie["Distributor"],
                             movie["Studio"], movie["ReleaseDate"], movie["AddedtoIAFDDate"],
                             movie["All-Girl"], movie["All-Male"], movie["Compilation"],
                             movie["Webscene"], movie["Director"]])


def fetch_html(href):
    """Request a page and return its HTML, retrying up to three times."""
    for attempt in range(3):
        try:
            response = scraper.get(href, timeout=10)
            if response.status_code == 200:
                return response.text
        except Exception as e:
            logging.warning(f"Error fetching {href}: {e}")
        time.sleep(2)
    logging.error(f"Failed to fetch {href} after 3 attempts")
    return None


def parse_movie_details(html, href, title):
    """Parse the page HTML and extract the movie's details."""
    soup = BeautifulSoup(html, "html.parser")

    # Parse the basic movie info block
    movie_data = {}
    director_href = ""
    info_div = soup.find("div", class_="col-xs-12 col-sm-3")
    if info_div:
        labels = info_div.find_all("p", class_="bioheading")
        values = info_div.find_all("p", class_="biodata")
        for label, value in zip(labels, values):
            key = label.text.strip()
            val = value.text.strip()
            if key in ["Distributor", "Studio", "Director"]:
                link = value.find("a")
                if link:
                    val = link.text.strip()
                    if key == "Director":
                        director_href = host_url + link["href"]
            movie_data[key] = val
    else:
        return None

    # Parse the cast list
    performers = []
    cast_divs = soup.find_all("div", class_="castbox")
    for cast in cast_divs:
        performer = {}
        link = cast.find("a")
        if link:
            performer["name"] = link.text.strip()
            performer["href"] = host_url + link["href"]
            # The performer tags are plain text nodes following <br> elements
            performer["tags"] = [
                tag.strip() for br in cast.find_all("br")
                if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
            ]
            performers.append(performer)

    # Parse the scene breakdowns
    scene_breakdowns = []
    scene_table = soup.find("div", id="sceneinfo")
    if scene_table:
        rows = scene_table.find_all("tr")
        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 2:
                scene = cols[0].text.strip()
                scene_performers = [p.strip() for p in cols[1].text.split(",")]
                scene_breakdowns.append({"scene": scene, "performers": scene_performers})

    # Parse the "appears in" cross-references
    appears_in = []
    appears_div = soup.find("div", id="appearssection")
    if appears_div:
        rows = appears_div.find_all("li")
        for row in rows:
            lnk = row.find("a")
            if lnk:
                appears_in.append({"title": lnk.text.strip(), "href": host_url + lnk["href"]})

    return {
        "href": href,
        "title": title,
        "Minutes": movie_data.get("Minutes", ""),
        "Distributor": movie_data.get("Distributor", ""),
        "Studio": movie_data.get("Studio", ""),
        "ReleaseDate": movie_data.get("Release Date", ""),
        "AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
        "All-Girl": movie_data.get("All-Girl", ""),
        "All-Male": movie_data.get("All-Male", ""),
        "Compilation": movie_data.get("Compilation", ""),
        "Webscene": movie_data.get("Webscene", ""),
        "Director": movie_data.get("Director", ""),
        "DirectorHref": director_href,
        "Performers": performers,
        "SceneBreakdowns": scene_breakdowns,
        "AppearsIn": appears_in,
    }


def process_movies():
    """Process every movie in the input list, skipping entries already scraped."""
    global collected_movies
    collected_movies = load_existing_data()
    processed_hrefs = {movie["href"] for movie in collected_movies}

    # Read the movie list produced by the listing crawler
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        movies = json.load(f)

    count = 0
    for entry in movies:
        href = entry["href"]
        title = entry["title"]
        if href in processed_hrefs:
            continue  # Already processed, skip
        logging.info(f"Processing: {title} ({href})")
        html = fetch_html(href)
        if not html:
            continue  # Fetch failed, skip
        movie = parse_movie_details(html, href, title)
        if not movie:
            continue  # Parse failed, skip
        collected_movies.append(movie)
        count += 1
        # Flush to the output files every BATCH_SIZE records
        if count % BATCH_SIZE == 0:
            save_data(collected_movies)

    # Final save
    save_data(collected_movies)
    logging.info("Task completed.")


# Extract the id value from a URL such as
# https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
    """Extract the id parameter from an href."""
    match = re.search(r"id=([a-f0-9\-]+)", href)
    return match.group(1) if match else ""


# Fetch a single URL passed on the command line
def process_one(href):
    # Fetch and parse, retrying until both succeed
    movie = None
    while True:
        html = fetch_html(href)
        if not html:
            logging.warning(f"Fetching {href} failed, retrying...")
            time.sleep(2)
            continue
        movie = parse_movie_details(html, href, "title")
        if movie:
            break
        logging.warning(f"Parsing {href} failed, retrying...")
        time.sleep(2)

    movie_id = extract_id_from_href(href)
    filename = f"{movie_id}.json"
    try:
        with open(filename, "w", encoding="utf-8") as json_file:
            json.dump(movie, json_file, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {filename}: {e}")
        return
    print(f"Fetch succeeded, result saved to {filename}")


def handle_exit_signal(signum, frame):
    logging.info("Gracefully exiting... Saving remaining data to JSON and CSV.")
    if collected_movies:
        save_data(collected_movies)
    sys.exit(0)


# Full crawl over the whole input list
def main():
    try:
        # Register exit signal handlers
        signal.signal(signal.SIGINT, handle_exit_signal)   # Handle Ctrl+C
        signal.signal(signal.SIGTERM, handle_exit_signal)  # Handle kill
        process_movies()
    finally:
        # Cleanup: make sure data is persisted on normal exit as well
        if collected_movies:
            save_data(collected_movies)
        logging.info("Data processing completed.")


if __name__ == "__main__":
    if len(sys.argv) > 1:
        url = sys.argv[1]
        process_one(url)
    else:
        main()