add some scripts.
192  scripts/iafd/distributors_list_fetch.py  Normal file
@@ -0,0 +1,192 @@
"""
Script Name:
Description: Fetch information from https://www.iafd.com, using cloudscraper to get past Cloudflare.
    detail_fetch.py       Pulls details one by one from the locally saved list data and writes them to a file.
    list_fetch_astro.py   Fetches performer lists by astrological sign. Moderate volume; most detail fields are populated.
    list_fetch_birth.py   Fetches performer lists by birthday. Moderate volume; most detail fields are populated.
    list_fetch_ethnic.py  Fetches performer lists by ethnicity. Large volume, but many detail fields are invalid.
    list_merge.py         Intersects the three lists above to produce the combined data set.
    iafd_scrape.py        Built on https://github.com/stashapp/CommunityScrapers; given a performer's IAFD link it returns
                          data in a stashapp-compatible format. (Of limited use, since fields such as nationality and photos do not match.)

    html_format.py        Reads the saved HTML directory, extracts the information, and writes formatted output.
    data_merge.py         Merges data: it combines the performer data pulled from iafd, javhd, thelordofporn and from a
                          self-hosted stashapp instance (which must be exported first).
    stashdb_merge.py      Batch-merges the per-performer JSON files exported from stashapp and writes the combined output.
                          Typically the exported files are compressed, copied to data/tmp, unpacked, and then merged,
                          which yields one complete data list.

Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0

Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import cloudscraper
import json
import time
import csv
from bs4 import BeautifulSoup
import logging
import config

config.setup_logging()

# Base URL and variable parameters
host_url = "https://www.iafd.com"
base_url = f"{host_url}/distrib.rme/distrib="
dist_list_url = f'{base_url}/distrib.asp'

distr_map = {
    6812 : 'nubilefilms.com',
    8563 : 'teenmegaworld network',
    6779 : 'x-art.com',
    7133 : 'tushy.com',
    6496 : 'blacked.com',
    7758 : 'vixen.com',
    6791 : 'teamskeet.com',
    12454: 'vip4k.com',
    13541: 'wow network',
    9702 : 'cum4k.com',
    6778 : 'tiny4k.com',
    12667: 'anal4k.com',
    7419 : 'exotic4k.com',
    13594: 'facials4k.com',
    13633: 'mom4k.com',
    12335: 'slim4k.com',
    16709: 'strippers4k.com',
}

# Request headers and the cloudscraper session
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()

# Output directory and accumulated results
res_dir = './result'
all_data = []

# Fetch a page over the network
def fetch_page(url):
    try:
        response = scraper.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except Exception as e:
        logging.error(f"Failed to fetch {url}: {e}")
        return None

# Parse the HTML and extract the rows we need
def parse_page(html, name):
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", id="distable")

    if not table:
        logging.warning(f"Warning: No 'distable' table found in {name}")
        return None

    # Drop the thead; only the body rows need parsing
    thead = table.find('thead')
    if thead:
        thead.decompose()

    # Only the tbody is left now
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody else []

    global all_data
    for row in rows:
        cols = row.find_all('td')
        if len(cols) >= 5:
            title = cols[0].text.strip()
            label = cols[1].text.strip()
            year = cols[2].text.strip()
            rev = cols[3].text.strip()
            a_href = cols[0].find('a')
            href = host_url + a_href['href'] if a_href else ''

            all_data.append({
                'distributors': name,
                'title': title,
                'label': label,
                'year': year,
                'rev': rev,
                'href': href
            })
    return soup

# Pagination handling. The distributor listing fits on one page, so nothing to do here.
def handle_pagination(soup, name):
    return None

# Fetch the listing page and fill distr_map from the <select> options
def process_list_gage():
    global distr_map

    logging.info(f"Fetching data for {dist_list_url} ...")
    select_element = None
    while True:
        html = fetch_page(dist_list_url)
        if html:
            soup = BeautifulSoup(html, "html.parser")
            select_element = soup.find('select', {'name': 'Distrib'})
            if select_element:
                break
            else:
                logging.info(f"Unexpected HTML content. Retrying {dist_list_url} ...")
        else:
            logging.info(f"Fetch failed. Retrying {dist_list_url} ...")

    if not select_element:
        return None

    options = select_element.find_all('option')
    for option in options:
        value = option.get('value')  # the distributor id
        text = option.text.strip()   # the distributor name
        distr_map[int(value)] = text
    logging.info(f'Fetched {dist_list_url} successfully. Total distributors: {len(distr_map)}')
    return True
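
# Illustrative only: a minimal sketch of the markup process_list_gage() expects on the
# listing page -- a <select name="Distrib"> whose option values are the numeric ids used
# in distr_map. The sample HTML is made up; the ids and names mirror entries of distr_map.
def _example_parse_distrib_select():
    sample = ('<select name="Distrib">'
              '<option value="6496">blacked.com</option>'
              '<option value="7758">vixen.com</option>'
              '</select>')
    sel = BeautifulSoup(sample, "html.parser").find('select', {'name': 'Distrib'})
    return {int(o.get('value')): o.text.strip() for o in sel.find_all('option')}
    # -> {6496: 'blacked.com', 7758: 'vixen.com'}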

# Main logic: loop over every distributor
def process_main_data():
    for dis_key, dis_name in distr_map.items():
        url = base_url + str(dis_key)
        next_url = url
        logging.info(f"Fetching data for {dis_name}, url {url} ...")

        while next_url:
            html = fetch_page(next_url)
            if html:
                soup = parse_page(html, dis_name)
                if soup:
                    next_url = handle_pagination(soup, dis_name)
                else:
                    logging.info(f"Unexpected HTML content. Retrying {next_url} ...")
                # Save intermediate results periodically
                save_data()
                time.sleep(2)  # throttle the request rate
            else:
                logging.info(f"Retrying {next_url} ...")
                time.sleep(5)  # wait before retrying

# Save the collected rows to file
def save_data():
    with open(f'{res_dir}/distributors.json', 'w', encoding='utf-8') as json_file:
        json.dump(all_data, json_file, indent=4, ensure_ascii=False)

    with open(f'{res_dir}/distributors.csv', 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=['distributors', 'title', 'label', 'year', 'rev', 'href'])
        writer.writeheader()
        writer.writerows(all_data)

# Entry point
if __name__ == '__main__':
    #process_list_gage()
    process_main_data()
    save_data()
    logging.info("Data fetching and saving completed.")
101  scripts/iafd/merge/auto_tag.py  Normal file
@@ -0,0 +1,101 @@
import sqlite3
import json
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Database connection
DB_PATH = 'your_database.db'  # path to the database; change to the real path
# Predefined tags, kept here so they are easy to change
TAG_LIST = ['vixen', 'blacked', 'tushy', 'x-art']

# Preload the tag ids
def get_all_tag_ids():
    try:
        with sqlite3.connect(DB_PATH) as conn:
            cursor = conn.cursor()
            #cursor.execute("SELECT id, name FROM tags WHERE name IN ('vixen', 'blacked', 'tushy', 'x-art')")
            cursor.execute("SELECT id, name FROM tags WHERE name IN ({})".format(', '.join(['?'] * len(TAG_LIST))), TAG_LIST)
            tags = cursor.fetchall()
            # Map tag name (lower-cased) to tag_id
            return {tag_name.lower(): tag_id for tag_id, tag_name in tags}
    except Exception as e:
        logger.error(f"Error fetching tag IDs: {e}")
        return {}
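
# For the default TAG_LIST of four names, the statement above expands to
#   SELECT id, name FROM tags WHERE name IN (?, ?, ?, ?)
# with TAG_LIST passed as the bound parameters, so a call such as
#   get_all_tag_ids()
# returns something like {'vixen': 1, 'blacked': 2, 'tushy': 3, 'x-art': 4}
# (the actual ids depend on the database).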

# Look up performer ids for a batch of performer names
def get_performers_ids(performer_names):
    try:
        with sqlite3.connect(DB_PATH) as conn:
            cursor = conn.cursor()
            query = "SELECT id, name FROM performers WHERE LOWER(name) IN ({})".format(
                ','.join(['?'] * len(performer_names))
            )
            cursor.execute(query, [name.lower() for name in performer_names])
            performers = cursor.fetchall()
            return {performer_name.lower(): performer_id for performer_id, performer_name in performers}
    except Exception as e:
        logger.error(f"Error fetching performer IDs: {e}")
        return {}

# Insert a row into the performers_tags table
def insert_performer_tag(performer_id, tag_id):
    try:
        with sqlite3.connect(DB_PATH) as conn:
            cursor = conn.cursor()
            # Check whether performers_tags already has this pair
            cursor.execute("SELECT 1 FROM performers_tags WHERE performer_id = ? AND tag_id = ?", (performer_id, tag_id))
            if not cursor.fetchone():
                cursor.execute("INSERT INTO performers_tags (performer_id, tag_id) VALUES (?, ?)", (performer_id, tag_id))
                conn.commit()
                logger.info(f"Inserted performer_id {performer_id} and tag_id {tag_id} into performers_tags.")
            else:
                logger.info(f"Entry for performer_id {performer_id} and tag_id {tag_id} already exists in performers_tags.")
    except Exception as e:
        logger.error(f"Error inserting into performers_tags: {e}")

# Process the detail.json file
def process_detail_json(detail_file):
    try:
        with open(detail_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Fetch all tag ids
        tag_ids = get_all_tag_ids()

        # Collect the performer names that need to be looked up
        performer_names = [entry.get('person') for entry in data if entry.get('person')]

        # Look up the performer ids in a single query
        performer_ids = get_performers_ids(performer_names)

        for entry in data:
            person = entry.get('person')
            vixen_cnt = entry.get('vixen_cnt', 0)
            blacked_cnt = entry.get('blacked_cnt', 0)
            tushy_cnt = entry.get('tushy_cnt', 0)
            x_art_cnt = entry.get('x_art_cnt', 0)

            # Resolve the performer_id
            performer_id = performer_ids.get(person.lower()) if person else None
            if not performer_id:
                continue  # skip entries without a matching performer_id

            # Handle each tag (vixen, blacked, tushy, x-art)
            for tag_name, count in zip(TAG_LIST, [vixen_cnt, blacked_cnt, tushy_cnt, x_art_cnt]):
                if count > 0:
                    tag_id = tag_ids.get(tag_name)
                    if tag_id:
                        insert_performer_tag(performer_id, tag_id)
    except Exception as e:
        logger.error(f"Error processing {detail_file}: {e}")
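
# A sketch of one detail.json entry this function expects; the keys are taken from the
# .get() calls above, the name and counts are illustrative:
#
#   {
#       "person": "Example Performer",
#       "vixen_cnt": 2,
#       "blacked_cnt": 0,
#       "tushy_cnt": 1,
#       "x_art_cnt": 0
#   }
#
# Every tag whose count is greater than zero is linked to the performer in performers_tags.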

# Entry point
def main():
    detail_file = 'detail.json'  # input file path; replace with the real path if needed
    process_detail_json(detail_file)

if __name__ == "__main__":
    main()
34803  scripts/iafd/merge/detail_birth.csv  Normal file
File diff suppressed because it is too large
730214  scripts/iafd/merge/detail_birth.json  Normal file
File diff suppressed because it is too large
72  scripts/iafd/merge/json2csv.py  Normal file
@@ -0,0 +1,72 @@
import json
import csv

# Read the detail_birth.json file
def read_json(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"File {file_path} not found.")
        return []
    except json.JSONDecodeError:
        print(f"Failed to parse {file_path}.")
        return []

# Write the data to a CSV file
def write_to_csv(data, output_file):
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=[
            'person', 'href', 'performer_aka', 'birthday', 'astrology', 'birthplace', 'gender',
            'years_active', 'ethnicity', 'nationality', 'hair_colors', 'eye_color', 'height',
            'weight', 'measurements', 'tattoos', 'piercings'
        ])
        writer.writeheader()
        for entry in data:
            # Make sure performer_aka is always a list
            performer_aka = entry.get('performer_aka', [])

            # Normalize None or non-list values to a list
            if performer_aka is None:
                performer_aka = []
            elif not isinstance(performer_aka, list):
                performer_aka = [performer_aka]

            # Write one row per entry
            writer.writerow({
                'person': entry.get('person', ''),
                'href': entry.get('href', ''),
                'performer_aka': performer_aka,
                'birthday': entry.get('birthday', ''),
                'astrology': entry.get('astrology', ''),
                'birthplace': entry.get('birthplace', ''),
                'gender': entry.get('gender', ''),
                'years_active': entry.get('years_active', ''),
                'ethnicity': entry.get('ethnicity', ''),
                'nationality': entry.get('nationality', ''),
                'hair_colors': entry.get('hair_colors', ''),
                'eye_color': entry.get('eye_color', ''),
                'height': entry.get('height', ''),
                'weight': entry.get('weight', ''),
                'measurements': entry.get('measurements', ''),
                'tattoos': entry.get('tattoos', ''),
                'piercings': entry.get('piercings', '')
            })
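
# Note: csv.DictWriter stringifies non-string values, so performer_aka (a list) ends up
# in the CSV cell as its Python repr, e.g. "['Alias One', 'Alias Two']". If a flat,
# comma-separated cell were preferred, one possible variant (not what this script does)
# would be to replace the field above with:
#
#   'performer_aka': ', '.join(str(a) for a in performer_aka),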

# Entry point: run the conversion
def main():
    # Input JSON file path
    input_json_file = 'detail_birth.json'
    # Output CSV file path
    output_csv_file = 'detail_birth.csv'

    # Read the JSON file
    data = read_json(input_json_file)

    # Write the data to the CSV file
    write_to_csv(data, output_csv_file)

    print(f"Data saved to {output_csv_file}")

if __name__ == "__main__":
    main()
34867  scripts/iafd/merge/merged.csv  Normal file
File diff suppressed because it is too large
139466  scripts/iafd/merge/merged.json  Normal file
File diff suppressed because it is too large
518899  scripts/iafd/merge/result.json  Normal file
File diff suppressed because it is too large
21945  scripts/iafd/merge/stashdb.csv  Normal file
File diff suppressed because it is too large
518859  scripts/iafd/merge/stashdb.json  Normal file
File diff suppressed because it is too large
120  scripts/iafd/merge/url_match.py  Normal file
@@ -0,0 +1,120 @@
import json
import logging
import cloudscraper
import time
from requests.exceptions import RequestException

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

test_flag = True

# Read stashdb.json
def read_json(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        logger.error(f"File {file_path} not found.")
        return []
    except json.JSONDecodeError:
        logger.error(f"Error decoding JSON from {file_path}.")
        return []

# Request a URL and return the URL it redirects to
def fetch_real_url_2(url, scraper):
    try:
        response = scraper.get(url, allow_redirects=True)
        if response.status_code == 200:
            return response.url  # the final URL after redirects
        else:
            logger.warning(f"Failed to fetch {url}, Status code: {response.status_code}")
            return None
    except RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

def fetch_real_url(url, scraper):
    try:
        # Request the URL without following redirects
        response = scraper.get(url, allow_redirects=False)

        # On a 301/302 response, take the URL from the Location header
        if response.status_code == 302 or response.status_code == 301:
            redirect_url = response.headers.get("Location")
            if redirect_url:
                logger.info(f"Redirected to: {redirect_url}")
                return redirect_url
            else:
                logger.warning(f"Redirect response received, but no Location header found for {url}")
                return None
        else:
            logger.warning(f"Failed to fetch {url}, Status code: {response.status_code}")
            return None
    except RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None
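
# A hedged usage sketch. The perfid-style URL below is made up; real ones come from the
# iafd_urls lists in stashdb.json (see process_urls below).
#
#   scraper = cloudscraper.create_scraper()
#   fetch_real_url("https://www.iafd.com/person.rme/perfid=exampleperformer", scraper)
#   # -> the canonical URL taken from the Location header of the 301/302 response,
#   #    or None if the request fails or no redirect is returned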

# Process every URL
def process_urls(data, scraper):
    loop = 0
    global test_flag

    for entry in data:
        iafd_urls = entry.get('iafd_urls', [])
        real_urls = []

        for url in iafd_urls:
            if 'perfid=' in url:
                # perfid links are redirects; fetch them to resolve the real URL
                real_url = fetch_real_url(url, scraper)
                if real_url:
                    real_urls.append(real_url)
                # During testing, stop after a small batch
                loop = loop + 1
                if test_flag and loop > 10:
                    return data

            elif 'person.rme/id=' in url:
                # id-style links are already canonical; keep them as-is
                real_urls.append(url)
            else:
                # Unknown format; keep it as-is but log a warning
                real_urls.append(url)
                logger.warning(f"Unknown url format: {url}")

        # Update the iafd_real_url field
        entry['iafd_real_url'] = real_urls

    return data

# Save the processed result to result.json
def save_to_json(data, output_file):
    try:
        with open(output_file, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=4)
        logger.info(f"Data saved to {output_file}")
    except Exception as e:
        logger.error(f"Error saving to {output_file}: {e}")

# Entry point
def main():
    # Input and output files
    input_file = 'stashdb.json'
    output_file = 'result.json'

    # Create the cloudscraper session
    scraper = cloudscraper.create_scraper()

    # Read the data from stashdb.json
    data = read_json(input_file)

    # Resolve the redirect target for every URL
    processed_data = process_urls(data, scraper)

    # Save the result to result.json
    save_to_json(processed_data, output_file)

if __name__ == "__main__":
    main()
254  scripts/iafd/movie_meta_fetch.py  Normal file
@@ -0,0 +1,254 @@
import os
import json
import csv
import time
import logging
import sys
import signal
import re
import cloudscraper
from bs4 import BeautifulSoup
import config

config.setup_logging()

# Base URL
host_url = "https://www.iafd.com"

# Directories and file paths
RESULT_DIR = "result"
INPUT_FILE = os.path.join(RESULT_DIR, "movie_list.json")
OUTPUT_JSON = os.path.join(RESULT_DIR, "movie_details.json")
OUTPUT_CSV = os.path.join(RESULT_DIR, "movie_details.csv")
BATCH_SIZE = 100  # flush to disk every 100 records

# Cloudflare bypass client
scraper = cloudscraper.create_scraper()

# Accumulated results, kept at module level so the exit handlers can save them
all_movies = []

def load_existing_data():
    """Load already-processed data so the script can resume where it left off."""
    if os.path.exists(OUTPUT_JSON):
        with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
            try:
                return json.load(f)
            except json.JSONDecodeError:
                return []
    return []


def save_data(movies=None):
    """Save the collected data to the JSON and CSV files."""
    if movies is None:
        movies = all_movies
    logging.info("Saving data...")

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(movies, f, indent=4, ensure_ascii=False)

    with open(OUTPUT_CSV, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["href", "title", "Minutes", "Distributor", "Studio", "ReleaseDate",
                         "AddedtoIAFDDate", "All-Girl", "All-Male", "Compilation", "Webscene", "Director"])
        for movie in movies:
            writer.writerow([movie["href"], movie["title"], movie["Minutes"], movie["Distributor"],
                             movie["Studio"], movie["ReleaseDate"], movie["AddedtoIAFDDate"], movie["All-Girl"],
                             movie["All-Male"], movie["Compilation"], movie["Webscene"], movie["Director"]])

def fetch_html(href):
    """Fetch a page and return its HTML."""
    for attempt in range(3):
        try:
            response = scraper.get(href, timeout=10)
            if response.status_code == 200:
                return response.text
        except Exception as e:
            logging.warning(f"Error fetching {href}: {e}")
        time.sleep(2)

    logging.error(f"Failed to fetch {href} after 3 attempts")
    return None

def parse_movie_details(html, href, title):
    """Parse the page HTML and extract the movie information."""
    soup = BeautifulSoup(html, "html.parser")

    # Basic movie information
    movie_data = {}
    director_href = ''
    info_div = soup.find("div", class_="col-xs-12 col-sm-3")
    if info_div:
        labels = info_div.find_all("p", class_="bioheading")
        values = info_div.find_all("p", class_="biodata")
        for label, value in zip(labels, values):
            key = label.text.strip()
            val = value.text.strip()
            if key in ["Distributor", "Studio", "Director"]:
                link = value.find("a")
                if link:
                    val = link.text.strip()
                    if key == 'Director':
                        director_href = host_url + link['href']
            movie_data[key] = val
    else:
        return None

    # Cast information
    performers = []
    cast_divs = soup.find_all("div", class_="castbox")
    for cast in cast_divs:
        performer = {}
        link = cast.find("a")
        if link:
            performer["name"] = link.text.strip()
            performer["href"] = host_url + link["href"]

        performer["tags"] = [
            tag.strip() for br in cast.find_all("br")
            if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
        ]

        #performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
        performers.append(performer)

    # Scene breakdowns
    scene_breakdowns = []
    scene_table = soup.find("div", id="sceneinfo")
    if scene_table:
        rows = scene_table.find_all("tr")
        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 2:
                scene = cols[0].text.strip()
                scene_performers = [p.strip() for p in cols[1].text.split(",")]
                scene_breakdowns.append({"scene": scene, "performers": scene_performers})

    appears_in = []
    appears_divs = soup.find("div", id="appearssection")
    if appears_divs:
        rows = appears_divs.find_all("li")
        for row in rows:
            lnk = row.find("a")
            if lnk:
                appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})

    return {
        "href": href,
        "title": title,
        "Minutes": movie_data.get("Minutes", ""),
        "Distributor": movie_data.get("Distributor", ""),
        "Studio": movie_data.get("Studio", ""),
        "ReleaseDate": movie_data.get("Release Date", ""),
        "AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
        "All-Girl": movie_data.get("All-Girl", ""),
        "All-Male": movie_data.get("All-Male", ""),
        "Compilation": movie_data.get("Compilation", ""),
        "Webscene": movie_data.get("Webscene", ""),
        "Director": movie_data.get("Director", ""),
        "DirectorHref": director_href,
        "Performers": performers,
        "SceneBreakdowns": scene_breakdowns,
        "AppearsIn": appears_in,
    }
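
# The performer extraction above assumes castbox markup roughly like the sketch below
# (illustrative, not copied from the site): one <a> carrying the performer link, followed
# by <br>-separated text fragments that become the "tags" list.
#
#   <div class="castbox">
#     <a href="/person.rme/id=...">Performer Name</a>
#     <br>Tag One<br>Tag Two
#   </div>
#
# For that sample, performer["tags"] would be ['Tag One', 'Tag Two'].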


def process_movies():
    """Walk the movie list and fetch details for every entry not processed yet."""
    global all_movies
    all_movies = load_existing_data()
    processed_hrefs = {movie["href"] for movie in all_movies}

    # Read the movie list (INPUT_FILE) produced by the list-fetch scripts
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        movies = json.load(f)

    count = 0

    for entry in movies:
        href = entry["href"]
        title = entry["title"]

        if href in processed_hrefs:
            continue  # skip entries that were already processed

        logging.info(f"Processing: {title} ({href})")

        html = fetch_html(href)
        if not html:
            continue  # fetch failed, skip

        movie = parse_movie_details(html, href, title)
        if not movie:
            continue  # page could not be parsed, skip

        all_movies.append(movie)
        count += 1

        # Flush to disk every BATCH_SIZE records
        if count % BATCH_SIZE == 0:
            save_data(all_movies)

    # Final save
    save_data(all_movies)
    logging.info("Task completed.")


# Extract the id value from a URL such as
# https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
    """Extract the id parameter from the href."""
    match = re.search(r'id=([a-f0-9\-]+)', href)
    return match.group(1) if match else ''
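
# Worked example, using the URL quoted in the comment above:
#   extract_id_from_href("https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586")
#   -> '21898a3c-1ddd-4793-8d93-375d6db20586'
# An href without an id= parameter returns '' instead.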

# Fetch a single, explicitly given URL
def process_one(href):
    # Fetch and parse the page (fetch_html uses the module-level scraper)
    movie = {}
    while True:
        html = fetch_html(href)
        if not html:
            logging.warning(f'Fetching {href} failed. Retrying...')
            continue  # fetch failed, try again

        movie = parse_movie_details(html, href, 'title')
        if movie:
            break
        else:
            logging.warning(f'Parsing {href} failed. Retrying...')
            continue  # parse failed, try again

    id = extract_id_from_href(href)
    filename = f"{id}.json"  # name the output file after the extracted id

    try:
        with open(filename, 'w', encoding='utf-8') as json_file:
            json.dump(movie, json_file, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {filename}: {e}")
    print(f'Fetch succeeded. Result saved in {filename}')


def handle_exit_signal(signum, frame):
    logging.info("Gracefully exiting... Saving remaining data to JSON and CSV.")
    save_data()
    sys.exit(0)

# Full crawl
def main():
    try:
        # Register exit handlers
        signal.signal(signal.SIGINT, handle_exit_signal)   # Handle Ctrl+C
        signal.signal(signal.SIGTERM, handle_exit_signal)  # Handle kill signal
        process_movies()
    finally:
        # Cleanup: make sure data is also saved on normal exit
        save_data()
        logging.info("Data processing completed.")

if __name__ == "__main__":
    if len(sys.argv) > 1:
        url = sys.argv[1]
        process_one(url)
    else:
        main()
21243  scripts/iafd/result/distributors.csv  Normal file
File diff suppressed because it is too large
169938  scripts/iafd/result/distributors.json  Normal file
File diff suppressed because it is too large
6484  scripts/iafd/result/studios.csv  Normal file
File diff suppressed because it is too large
51866  scripts/iafd/result/studios.json  Normal file
File diff suppressed because it is too large
191  scripts/iafd/studios_list_fetch.py  Normal file
@@ -0,0 +1,191 @@
"""
Script Name:
Description: Fetch information from https://www.iafd.com, using cloudscraper to get past Cloudflare.
    detail_fetch.py       Pulls details one by one from the locally saved list data and writes them to a file.
    list_fetch_astro.py   Fetches performer lists by astrological sign. Moderate volume; most detail fields are populated.
    list_fetch_birth.py   Fetches performer lists by birthday. Moderate volume; most detail fields are populated.
    list_fetch_ethnic.py  Fetches performer lists by ethnicity. Large volume, but many detail fields are invalid.
    list_merge.py         Intersects the three lists above to produce the combined data set.
    iafd_scrape.py        Built on https://github.com/stashapp/CommunityScrapers; given a performer's IAFD link it returns
                          data in a stashapp-compatible format. (Of limited use, since fields such as nationality and photos do not match.)

    html_format.py        Reads the saved HTML directory, extracts the information, and writes formatted output.
    data_merge.py         Merges data: it combines the performer data pulled from iafd, javhd, thelordofporn and from a
                          self-hosted stashapp instance (which must be exported first).
    stashdb_merge.py      Batch-merges the per-performer JSON files exported from stashapp and writes the combined output.
                          Typically the exported files are compressed, copied to data/tmp, unpacked, and then merged,
                          which yields one complete data list.

Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0

Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""

import cloudscraper
import json
import time
import csv
from bs4 import BeautifulSoup
import logging
import config

config.setup_logging()

# Base URL and variable parameters
host_url = "https://www.iafd.com"
base_url = f"{host_url}/studio.rme/studio="
list_page_url = f'{base_url}/studio.asp'

studio_map = {
    6812 : 'nubilefilms.com',
    9811 : 'Teen Mega World',
    6779 : 'x-art.com',
    7133 : 'tushy.com',
    6496 : 'blacked.com',
    7758 : 'vixen.com',
    6791 : 'teamskeet.com',
    8052 : 'wowgirls.com',
    9702 : 'cum4k.com',
    6778 : 'tiny4k.com',
    12667: 'anal4k.com',
    7419 : 'exotic4k.com',
    13594: 'facials4k.com',
    13633: 'mom4k.com',
    12335: 'slim4k.com',
    16709: 'strippers4k.com',
}

# Request headers and the cloudscraper session
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()

# Output directory and accumulated results
res_dir = './result'
all_data = []

# Fetch a page over the network
def fetch_page(url):
    try:
        response = scraper.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except Exception as e:
        logging.error(f"Failed to fetch {url}: {e}")
        return None

# Parse the HTML and extract the rows we need
def parse_page(html, name):
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", id="studio")

    if not table:
        logging.warning(f"Warning: No 'studio' table found in {name}")
        return None

    # Drop the thead; only the body rows need parsing
    thead = table.find('thead')
    if thead:
        thead.decompose()

    # Only the tbody is left now
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody else []

    global all_data
    for row in rows:
        cols = row.find_all('td')
        if len(cols) >= 5:
            title = cols[0].text.strip()
            label = cols[1].text.strip()
            year = cols[2].text.strip()
            rev = cols[3].text.strip()
            a_href = cols[0].find('a')
            href = host_url + a_href['href'] if a_href else ''

            all_data.append({
                'studios': name,
                'title': title,
                'label': label,
                'year': year,
                'rev': rev,
                'href': href
            })
    return soup

# Pagination handling. The studio listing fits on one page, so nothing to do here.
def handle_pagination(soup, name):
    return None

# Fetch the listing page and fill studio_map from the <select> options
def process_list_gage():
    global studio_map

    logging.info(f"Fetching data for {list_page_url} ...")
    select_element = None
    while True:
        html = fetch_page(list_page_url)
        if html:
            soup = BeautifulSoup(html, "html.parser")
            select_element = soup.find('select', {'name': 'Studio'})
            if select_element:
                break
            else:
                logging.info(f"Unexpected HTML content. Retrying {list_page_url} ...")
        else:
            logging.info(f"Fetch failed. Retrying {list_page_url} ...")

    if not select_element:
        return None

    options = select_element.find_all('option')
    for option in options:
        value = option.get('value')  # the studio id
        text = option.text.strip()   # the studio name
        studio_map[int(value)] = text
    logging.info(f'Fetched {list_page_url} successfully. Total studios: {len(studio_map)}')
    return True

# Main logic: loop over every studio
def process_main_data():
    for key, name in studio_map.items():
        url = base_url + str(key)
        next_url = url
        logging.info(f"Fetching data for {name}, url {url} ...")

        while next_url:
            html = fetch_page(next_url)
            if html:
                soup = parse_page(html, name)
                if soup:
                    next_url = handle_pagination(soup, name)
                else:
                    logging.info(f"Unexpected HTML content. Retrying {next_url} ...")
                # Save intermediate results periodically
                save_data()
                time.sleep(2)  # throttle the request rate
            else:
                logging.info(f"Retrying {next_url} ...")
                time.sleep(5)  # wait before retrying

# Save the collected rows to file
def save_data():
    with open(f'{res_dir}/studios.json', 'w', encoding='utf-8') as json_file:
        json.dump(all_data, json_file, indent=4, ensure_ascii=False)

    with open(f'{res_dir}/studios.csv', 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=['studios', 'title', 'label', 'year', 'rev', 'href'])
        writer.writeheader()
        writer.writerows(all_data)

# Entry point
if __name__ == '__main__':
    #process_list_gage()
    process_main_data()
    save_data()
    logging.info("Data fetching and saving completed.")