modify scripts
334 iafd/src_json/movie_detail_fetch.py Normal file
@@ -0,0 +1,334 @@
import os
import json
import csv
import time
import logging
import sys
import signal
import re
import cloudscraper
from bs4 import BeautifulSoup
import config

config.setup_logging()

# Base URL
host_url = "https://www.iafd.com"

# Directories and file paths
RESULT_DIR = "../result"
OUTPUT_DIR = f"{config.global_share_data_dir}/iafd"
INPUT_FILE = os.path.join(RESULT_DIR, "movie_list.json")
OUTPUT_JSON = os.path.join(OUTPUT_DIR, "movie_details.json")
OUTPUT_CSV = os.path.join(OUTPUT_DIR, "movie_details.csv")
BATCH_SIZE = 100  # flush results to disk every 100 records
movies_dir = f'{RESULT_DIR}/movies'

# Make sure the output directory exists before the first save
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Cloudflare bypass client
scraper = cloudscraper.create_scraper()

# All collected movie records
all_movies = []

def load_existing_data():
    """Load previously processed data so an interrupted run can resume."""
    if os.path.exists(OUTPUT_JSON):
        with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
            try:
                return json.load(f)
            except json.JSONDecodeError:
                return []
    return []


def save_data():
    """Save the collected data to the JSON and CSV files."""
    logging.info("Saving data...")
    global all_movies

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(all_movies, f, indent=4, ensure_ascii=False)

    with open(OUTPUT_CSV, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["href", "title", "Minutes", "Distributor", "Studio", "ReleaseDate",
                         "AddedtoIAFDDate", "All-Girl", "All-Male", "Compilation", "Webscene", "Director"])
        for movie in all_movies:
            writer.writerow([movie["href"], movie["title"], movie["Minutes"], movie["Distributor"],
                             movie["Studio"], movie["ReleaseDate"], movie["AddedtoIAFDDate"], movie["All-Girl"],
                             movie["All-Male"], movie["Compilation"], movie["Webscene"], movie["Director"]])

# Request a page and return its HTML
def fetch_html(href):
    """Request the page and return its HTML, retrying up to three times."""
    for attempt in range(3):
        try:
            response = scraper.get(href, timeout=10)
            if response.status_code == 200:
                return response.text
        except Exception as e:
            logging.warning(f"Error fetching {href}: {e}")
        time.sleep(2)

    logging.error(f"Failed to fetch {href} after 3 attempts")
    return None

# Parse the page HTML and extract the movie details
def parse_movie_details(html, href, title):
    """Parse the page HTML and extract the movie details."""
    soup = BeautifulSoup(html, "html.parser")

    # Basic movie information
    movie_data = {}
    info_div = soup.find("div", class_="col-xs-12 col-sm-3")
    if info_div:
        labels = info_div.find_all("p", class_="bioheading")
        values = info_div.find_all("p", class_="biodata")
        for label, value in zip(labels, values):
            key = label.text.strip()
            val = value.text.strip()
            if key in ["Distributor", "Studio", "Director"]:
                link = value.find("a")
                if link:
                    val = link.text.strip()
                    movie_data[f'{key}Href'] = host_url + link['href']
            movie_data[key] = val
    else:
        return None

    # Cast information
    performers = []
    cast_divs = soup.find_all("div", class_="castbox")
    for cast in cast_divs:
        performer = {}
        link = cast.find("a")
        if link:
            performer["name"] = link.text.strip()
            performer["href"] = host_url + link["href"]

        # Tags are the plain-text nodes that directly follow each <br>
        performer["tags"] = [
            tag.strip() for br in cast.find_all("br")
            if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
        ]

        performers.append(performer)

    # Scene breakdowns
    scene_breakdowns = []
    scene_table = soup.find("div", id="sceneinfo")
    if scene_table:
        rows = scene_table.find_all("tr")

        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 2:
                scene = cols[0].text.strip()  # scene number
                performer_info = cols[1]      # performers plus link info

                # Keep the HTML before the first <br> or <br/> (preserves inline tags such as <i>)
                performer_html = str(performer_info)
                split_html = re.split(r"<br\s*/?>", performer_html, maxsplit=1)
                performers_html = split_html[0].strip()

                # Convert to plain text (strip the HTML tags, keep only the text content)
                performers_soup = BeautifulSoup(performers_html, "html.parser")
                performers_text = performers_soup.get_text()

                # Extract the performer names
                scene_performers = [p.strip() for p in performers_text.split(",")]

                # Try to pick up the webscene and studio links
                links_data = {}
                links = performer_info.find_all("a")
                if links:
                    webscene_title = links[0].text.strip() if len(links) > 0 else None
                    webscene = links[0]["href"] if len(links) > 0 else None
                    studio = links[1].text.strip() if len(links) > 1 else None
                    studio_lnk = links[1]["href"] if len(links) > 1 else None
                    links_data = {
                        "title": webscene_title,
                        "webscene": webscene,
                        "studio": studio,
                        "studio_lnk": studio_lnk,
                    }

                scene_data = {
                    "scene": scene,
                    "performers": scene_performers,
                    **links_data,
                }
                scene_breakdowns.append(scene_data)

    # Compilations this title appears in
    appears_in = []
    appears_divs = soup.find("div", id="appearssection")
    if appears_divs:
        rows = appears_divs.find_all("li")
        for row in rows:
            lnk = row.find("a")
            if lnk:
                appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})

    return {
        "href": href,
        "title": title,
        "Minutes": movie_data.get("Minutes", ""),
        "Distributor": movie_data.get("Distributor", ""),
        "Studio": movie_data.get("Studio", ""),
        "ReleaseDate": movie_data.get("Release Date", ""),
        "AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
        "All-Girl": movie_data.get("All-Girl", ""),
        "All-Male": movie_data.get("All-Male", ""),
        "Compilation": movie_data.get("Compilation", ""),
        "Webscene": movie_data.get("Webscene", ""),
        "Director": movie_data.get("Director", ""),
        "DirectorHref": movie_data.get("DirectorHref", ""),
        "DistributorHref": movie_data.get("DistributorHref", ""),
        "StudioHref": movie_data.get("StudioHref", ""),
        "Performers": performers,
        "SceneBreakdowns": scene_breakdowns,
        "AppearsIn": appears_in,
    }

# Create a sub-directory for a record
def create_sub_directory(base_dir, name):
    # Use the first character of the id, lower-cased, as the sub-directory name
    sub_dir = name[:1].lower()
    full_path = os.path.join(base_dir, sub_dir)
    if not os.path.exists(full_path):
        os.makedirs(full_path)
    return full_path


# Extract the id value from a URL such as
# https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
    """Extract the id parameter from the href."""
    match = re.search(r'id=([a-f0-9\-]+)', href)
    return match.group(1) if match else ''
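# A quick worked example of the helper above, using the URL from its comment
# (a sketch of the expected behaviour, not part of the original script):
#   extract_id_from_href("https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586")
#   returns "21898a3c-1ddd-4793-8d93-375d6db20586"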

# Write each movie to its own JSON file
def write_movie_json(href, data):
    # Resolve the target directory and file name
    movie_id = extract_id_from_href(href)
    movie_dir = create_sub_directory(movies_dir, movie_id)
    movie_filename = f"{movie_id}.json"
    full_path = os.path.join(movie_dir, movie_filename)

    try:
        with open(full_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {full_path}: {e}")

def process_movies():
    """Process the full movie list."""
    global all_movies
    all_movies = load_existing_data()
    processed_hrefs = {movie["href"] for movie in all_movies}

    # Read the movie list (movie_list.json)
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        movies = json.load(f)

    count = 0

    for entry in movies:
        href = entry["href"]
        title = entry["title"]

        if href in processed_hrefs:
            logging.info(f"Skipping existing: {title} ({href})")
            continue  # already processed

        logging.info(f"Processing: {title} ({href})")

        while True:
            html = fetch_html(href)
            if not html:
                logging.warning(f'Retrying {title} ({href})')
                continue  # fetch failed, try again
            else:
                movie = parse_movie_details(html, href, title)
                if not movie:
                    logging.warning(f'Retrying {title} ({href})')
                    continue  # parse failed, try again
                else:
                    all_movies.append(movie)
                    count += 1

                    # Write the per-movie JSON file
                    write_movie_json(href, movie)
                    break

        # Flush to the JSON/CSV files every BATCH_SIZE records
        if count % BATCH_SIZE == 0:
            save_data()

    # Final save
    save_data()

    logging.info("Task completed.")

# Fetch a single URL passed on the command line
def process_one(href):
    # Fetch and parse the page, retrying until it succeeds
    movie = {}
    while True:
        html = fetch_html(href)
        if not html:
            logging.warning(f'fetching {href} error. retrying...')
            continue  # fetch failed, try again

        movie = parse_movie_details(html, href, 'title')
        if movie:
            break
        else:
            logging.warning(f'parsing {href} error. retrying...')
            continue  # parse failed, try again

    if movie:
        write_movie_json(href, movie)

    print(f'fetch succeeded. saved result in {movies_dir}')

# Save whatever has been collected if the program is terminated
def handle_exit_signal(signum, frame):
    logging.info("Gracefully exiting... Saving remaining data to JSON and CSV.")
    save_data()
    sys.exit(0)


# Full crawl
def main():
    try:
        # Register exit-signal handlers
        signal.signal(signal.SIGINT, handle_exit_signal)   # Handle Ctrl+C
        signal.signal(signal.SIGTERM, handle_exit_signal)  # Handle kill signal
        process_movies()
    finally:
        # Cleanup that also runs on normal exit
        save_data()
        logging.info("Data processing completed.")


# Entry point: read the command-line arguments
if __name__ == "__main__":
    if len(sys.argv) > 1:
        url = sys.argv[1]
        process_one(url)
    else:
        main()
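# Usage sketch (how the entry point above is meant to be invoked; the single-URL
# example path is hypothetical and only illustrates the expected id= query format):
#   python movie_detail_fetch.py
#       full crawl of ../result/movie_list.json, resuming from movie_details.json
#   python movie_detail_fetch.py "https://www.iafd.com/title.rme/id=<movie-id>"
#       fetch one title page and write <movie-id>.json under ../result/movies/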