modify scripts
iafd/merge/auto_tag.py (new file, 101 lines)
@@ -0,0 +1,101 @@
import sqlite3
import json
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Database connection
DB_PATH = 'your_database.db'  # database path; change to the actual path

# Predefined tags, easy to modify
TAG_LIST = ['vixen', 'blacked', 'tushy', 'x-art']

# Preload the tag IDs
def get_all_tag_ids():
    try:
        with sqlite3.connect(DB_PATH) as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT id, name FROM tags WHERE name IN ({})".format(', '.join(['?'] * len(TAG_LIST))), TAG_LIST)
            tags = cursor.fetchall()
            # Map tag name to tag_id
            return {tag_name.lower(): tag_id for tag_id, tag_name in tags}
    except Exception as e:
        logger.error(f"Error fetching tag IDs: {e}")
        return {}

# Look up performer_id for a batch of performer names
def get_performers_ids(performer_names):
    try:
        with sqlite3.connect(DB_PATH) as conn:
            cursor = conn.cursor()
            query = "SELECT id, name FROM performers WHERE LOWER(name) IN ({})".format(
                ','.join(['?'] * len(performer_names))
            )
            cursor.execute(query, [name.lower() for name in performer_names])
            performers = cursor.fetchall()
            return {performer_name.lower(): performer_id for performer_id, performer_name in performers}
    except Exception as e:
        logger.error(f"Error fetching performer IDs: {e}")
        return {}

# Insert a row into the performers_tags table
def insert_performer_tag(performer_id, tag_id):
    try:
        with sqlite3.connect(DB_PATH) as conn:
            cursor = conn.cursor()
            # Check whether this pair already exists in performers_tags
            cursor.execute("SELECT 1 FROM performers_tags WHERE performer_id = ? AND tag_id = ?", (performer_id, tag_id))
            if not cursor.fetchone():
                cursor.execute("INSERT INTO performers_tags (performer_id, tag_id) VALUES (?, ?)", (performer_id, tag_id))
                conn.commit()
                logger.info(f"Inserted performer_id {performer_id} and tag_id {tag_id} into performers_tags.")
            else:
                logger.info(f"Entry for performer_id {performer_id} and tag_id {tag_id} already exists in performers_tags.")
    except Exception as e:
        logger.error(f"Error inserting into performers_tags: {e}")

# Process the detail.json file
def process_detail_json(detail_file):
    try:
        with open(detail_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Fetch all tag IDs
        tag_ids = get_all_tag_ids()

        # Collect the performers.name values to query
        performer_names = [entry.get('person') for entry in data]

        # Query performers.id in one batch
        performer_ids = get_performers_ids(performer_names)

        for entry in data:
            person = entry.get('person')
            vixen_cnt = entry.get('vixen_cnt', 0)
            blacked_cnt = entry.get('blacked_cnt', 0)
            tushy_cnt = entry.get('tushy_cnt', 0)
            x_art_cnt = entry.get('x_art_cnt', 0)

            # Look up performer_id
            performer_id = performer_ids.get(person.lower())
            if not performer_id:
                continue  # skip this entry if no performer_id is found

            # Handle each tag (vixen, blacked, tushy, x-art)
            for tag_name, count in zip(TAG_LIST, [vixen_cnt, blacked_cnt, tushy_cnt, x_art_cnt]):
                if count > 0:
                    tag_id = tag_ids.get(tag_name)
                    if tag_id:
                        insert_performer_tag(performer_id, tag_id)
    except Exception as e:
        logger.error(f"Error processing {detail_file}: {e}")

# Main entry point
def main():
    detail_file = 'detail.json'  # input file path; replace with the actual path
    process_detail_json(detail_file)

if __name__ == "__main__":
    main()
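Note: auto_tag.py only reads a handful of fields from each detail.json entry (person plus the per-distributor counters). A minimal sketch of one entry, with made-up values, might look like:

    {
        "person": "Example Name",
        "vixen_cnt": 2,
        "blacked_cnt": 0,
        "tushy_cnt": 1,
        "x_art_cnt": 0
    }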
iafd/merge/json2csv.py (new file, 72 lines)
@@ -0,0 +1,72 @@
import json
import csv

# Read the detail_birth.json file
def read_json(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"File {file_path} not found.")
        return []
    except json.JSONDecodeError:
        print(f"Failed to parse {file_path}.")
        return []

# Write the CSV file
def write_to_csv(data, output_file):
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=[
            'person', 'href', 'performer_aka', 'birthday', 'astrology', 'birthplace', 'gender',
            'years_active', 'ethnicity', 'nationality', 'hair_colors', 'eye_color', 'height',
            'weight', 'measurements', 'tattoos', 'piercings'
        ])
        writer.writeheader()
        for entry in data:
            # Make sure performer_aka is always a list
            performer_aka = entry.get('performer_aka', [])

            # Normalize None to an empty list and wrap non-list values in a list
            if performer_aka is None:
                performer_aka = []
            elif not isinstance(performer_aka, list):
                performer_aka = [performer_aka]

            # Write one row
            writer.writerow({
                'person': entry.get('person', ''),
                'href': entry.get('href', ''),
                'performer_aka': performer_aka,
                'birthday': entry.get('birthday', ''),
                'astrology': entry.get('astrology', ''),
                'birthplace': entry.get('birthplace', ''),
                'gender': entry.get('gender', ''),
                'years_active': entry.get('years_active', ''),
                'ethnicity': entry.get('ethnicity', ''),
                'nationality': entry.get('nationality', ''),
                'hair_colors': entry.get('hair_colors', ''),
                'eye_color': entry.get('eye_color', ''),
                'height': entry.get('height', ''),
                'weight': entry.get('weight', ''),
                'measurements': entry.get('measurements', ''),
                'tattoos': entry.get('tattoos', ''),
                'piercings': entry.get('piercings', '')
            })

# Main entry point: run the conversion
def main():
    # Input JSON file path
    input_json_file = 'detail_birth.json'
    # Output CSV file path
    output_csv_file = 'detail_birth.csv'

    # Read the JSON file
    data = read_json(input_json_file)

    # Write the data to the CSV file
    write_to_csv(data, output_csv_file)

    print(f"Data saved to {output_csv_file}")

if __name__ == "__main__":
    main()
iafd/merge/url_match.py (new file, 120 lines)
@@ -0,0 +1,120 @@
import json
import logging
import cloudscraper
import time
from requests.exceptions import RequestException

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

test_flag = True

# Read stashdb.json
def read_json(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        logger.error(f"File {file_path} not found.")
        return []
    except json.JSONDecodeError:
        logger.error(f"Error decoding JSON from {file_path}.")
        return []

# Request a URL and return the final URL after redirects
def fetch_real_url_2(url, scraper):
    try:
        response = scraper.get(url, allow_redirects=True)
        if response.status_code == 200:
            return response.url  # final URL after redirects
        else:
            logger.warning(f"Failed to fetch {url}, Status code: {response.status_code}")
            return None
    except RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

def fetch_real_url(url, scraper):
    try:
        # Request the URL with automatic redirects disabled
        response = scraper.get(url, allow_redirects=False)

        # Check for a 301/302 response and read the Location header
        if response.status_code == 302 or response.status_code == 301:
            redirect_url = response.headers.get("Location")
            if redirect_url:
                logger.info(f"Redirected to: {redirect_url}")
                return redirect_url
            else:
                logger.warning(f"Redirect response received, but no Location header found for {url}")
                return None
        else:
            logger.warning(f"Failed to fetch {url}, Status code: {response.status_code}")
            return None
    except RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

# Process each URL
def process_urls(data, scraper):
    loop = 0
    global test_flag

    for entry in data:
        iafd_urls = entry.get('iafd_urls', [])
        real_urls = []

        for url in iafd_urls:
            if 'perfid=' in url:
                # Redirect-style link: request it and record the redirected URL
                real_url = fetch_real_url(url, scraper)
                if real_url:
                    real_urls.append(real_url)
                # During testing, only process a small batch
                loop = loop + 1
                if test_flag and loop > 10:
                    return data

            elif 'person.rme/id=' in url:
                # Non-perfid link: add it directly
                real_urls.append(url)
            else:
                # Unknown format: add it directly and log a warning
                real_urls.append(url)
                logger.warning(f"unknown url format: {url}")

        # Update the iafd_real_url field
        entry['iafd_real_url'] = real_urls

    return data

# Save the processed result to result.json
def save_to_json(data, output_file):
    try:
        with open(output_file, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=4)
        logger.info(f"Data saved to {output_file}")
    except Exception as e:
        logger.error(f"Error saving to {output_file}: {e}")

# Main entry point
def main():
    # Input and output files
    input_file = 'stashdb.json'
    output_file = 'result.json'

    # Create a cloudscraper session
    scraper = cloudscraper.create_scraper()

    # Read the data from stashdb.json
    data = read_json(input_file)

    # Process each URL and resolve redirects
    processed_data = process_urls(data, scraper)

    # Save the result to result.json
    save_to_json(processed_data, output_file)

if __name__ == "__main__":
    main()
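Note: url_match.py only touches the iafd_urls field of each stashdb.json entry and writes the resolved links back to iafd_real_url. A minimal input entry, with a purely illustrative URL, could look like:

    {
        "iafd_urls": [
            "https://www.iafd.com/person.rme/perfid=example/example.htm"
        ]
    }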
iafd/src/config.py (new file, 87 lines)
@@ -0,0 +1,87 @@
import logging
import os
import inspect
import time
from datetime import datetime
from logging.handlers import RotatingFileHandler
from collections import defaultdict

home_dir = os.path.expanduser("~")
global_host_data_dir = f'{home_dir}/hostdir/scripts_data'
global_share_data_dir = f'{home_dir}/sharedata'

# Log-frequency bookkeeping
log_count = defaultdict(int)        # how many times each message has been logged
last_log_time = defaultdict(float)  # timestamp of the last write for each message

class RateLimitFilter(logging.Filter):
    """
    Rate-limit filter: within a 60-second window, the same message is written
    at most LOG_LIMIT times; anything beyond that is dropped.
    """
    LOG_LIMIT = 60  # at most 60 identical messages per minute

    def filter(self, record):
        global log_count, last_log_time
        message_key = record.getMessage()  # the log message text

        # Current time and time since this message was last written
        now = time.time()
        elapsed = now - last_log_time[message_key]

        # Limit how often the same message is written
        if elapsed < 60:  # still inside the 60-second window
            log_count[message_key] += 1
            if log_count[message_key] > self.LOG_LIMIT:
                print('reach limit.')
                return False  # drop the record
        else:
            log_count[message_key] = 1  # window expired, start counting again

        last_log_time[message_key] = now

        return True  # allow the record through


def setup_logging(log_filename=None):
    if log_filename is None:
        caller_frame = inspect.stack()[1]
        caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
        current_date = datetime.now().strftime('%Y%m%d')
        log_filename = f'../log/{caller_filename}_{current_date}.log'

    max_log_size = 100 * 1024 * 1024  # 100 MB per file
    max_log_files = 10  # keep at most 10 rotated log files

    file_handler = RotatingFileHandler(log_filename, maxBytes=max_log_size, backupCount=max_log_files)
    file_handler.setFormatter(logging.Formatter(
        '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
    ))

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(
        '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
    ))

    # Create the root logger
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logger.handlers = []  # avoid adding handlers twice
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    # Attach the rate limiter
    rate_limit_filter = RateLimitFilter()
    file_handler.addFilter(rate_limit_filter)
    console_handler.addFilter(rate_limit_filter)


# Example run
if __name__ == "__main__":
    setup_logging()

    for i in range(1000):
        logging.info("test log message, checking the rate limit")
        time.sleep(0.01)  # simulate rapid logging
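Note: the other scripts in this commit (fetch.py, load.py) initialize logging by importing this module and calling setup_logging() once before using the standard logging calls. A minimal sketch (the message text is illustrative):

    import logging
    import config

    config.setup_logging()           # file + console handlers with the rate-limit filter
    logging.info("fetch started")    # goes to ../log/<caller>_<date>.log and the console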
iafd/src/fetch.py (new file, 411 lines)
@@ -0,0 +1,411 @@
import json
import time
import csv
import argparse
import logging
from functools import partial
import config
import sqlite_utils as db_tools
import iafd_scraper as scraper
import utils

config.setup_logging()

debug = False
force = False

# Fetch the performer list by astrology sign (no pagination)
def fetch_performers_by_astro():
    for astro in scraper.astro_list:
        url = scraper.astr_base_url + astro
        logging.info(f"Fetching data for {astro}, url {url} ...")

        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_page_astro(soup, astro)
            if list_data:
                for row in list_data:
                    # Write to the performer index table
                    perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_astro_list=1)
                    if perfomer_id:
                        logging.debug(f"insert performer index to db. performer_id: {perfomer_id}, name: {row['person']}, href: {row['href']}")
                    else:
                        logging.warning(f"insert performer index failed. name: {row['person']}, href: {row['href']}")

            else:
                logging.warning(f'parse astro page error. {url} ...')
        elif status_code and status_code == 404:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
        else:
            logging.warning(f'fetch astro page error. {url} ...')

        # Break early when debugging
        if debug:
            break


# Fetch the performer list by birthday (no pagination)
def fetch_performers_by_birth():
    for month in range(1, 13):    # months 1 to 12
        for day in range(1, 32):  # days 1 to 31
            url = scraper.birth_base_url.format(month=month, day=day)
            logging.info(f"Fetching data for birth, url {url}")
            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
            if soup:
                list_data, next_url = scraper.parse_page_birth(soup, month, day)
                if list_data:
                    for row in list_data:
                        # Write to the performer index table
                        perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_birth_list=1)
                        if perfomer_id:
                            logging.debug(f"insert performer index to db. performer_id: {perfomer_id}, name: {row['person']}, href: {row['href']}")
                        else:
                            logging.warning(f"insert performer index failed. name: {row['person']}, href: {row['href']}")
                else:
                    logging.warning(f'parse birth page error. {url} ...')
            elif status_code and status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
            else:
                logging.warning(f'fetch birth page error. {url} ...')

            # Return early when debugging
            if debug:
                return True

# Refresh the ethnicity list
def fetch_ethic_list():
    url = scraper.ethnic_list_url
    logging.info(f"Fetching data for performer's ethnic list, url {url} ...")
    soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="ethnicity1", attr_type="id"))
    if soup:
        list_data = scraper.parse_page_ethnic_list(soup, url)
        if list_data:
            for row in list_data:
                dist_id = db_tools.insert_or_update_ethnic({'name': row['name'], 'href': row.get('href', '')})
                if dist_id:
                    logging.debug(f"insert one record into ethnic table. id: {dist_id}, name: {row['name']}, href: {row.get('href', '')}")
        else:
            logging.warning(f'parse ethnic list error. {url} ...')
    elif status_code and status_code == 404:
        logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
    else:
        logging.warning(f'fetch page error. {url} ...')


# Fetch the performer list by ethnicity (with pagination)
def fetch_performers_by_ethnic():
    # Refresh the ethnicity list first
    fetch_ethic_list()

    ethnic_list = db_tools.query_ethnic_hrefs()
    for row in ethnic_list:
        url = row['href']
        ethnic = row['name']
        next_url = url

        while next_url:
            logging.info(f"Fetching data for {ethnic}, url {next_url} ...")
            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="row headshotrow", attr_type="class"),
                                                   parser="lxml", preprocessor=scraper.preprocess_html)
            if soup:
                list_data, next_url = scraper.parse_page_ethnic(soup, ethnic)
                if list_data:
                    for row in list_data:
                        # Write to the performer index table
                        perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_ethnic_list=1)
                        if perfomer_id:
                            logging.debug(f"insert performer index to db. performer_id: {perfomer_id}, name: {row['person']}, href: {row['href']}")
                        else:
                            logging.warning(f"insert performer index failed. name: {row['person']}, href: {row['href']}")
                else:
                    logging.warning(f'parse ethnic page error. {url} ...')
            elif status_code and status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}, Skipping...')
                break
            else:
                logging.warning(f'fetch ethnic page error. {url} ...')

        # Return early when debugging
        if debug:
            return True

# Fetch the distributors list
def fetch_distributors_list():
    url = scraper.distributors_list_url
    logging.info(f"Fetching data for distributors list, url {url} ...")
    soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Distrib", attr_type="name"))
    if soup:
        list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Distrib")
        if list_data:
            for row in list_data:
                dis_url = scraper.distributors_base_url + row['href']
                dist_id = db_tools.insert_or_update_distributor({'name': row['name'], 'href': dis_url})
                if dist_id:
                    logging.debug(f"insert one record into distributors table. id: {dist_id}, name: {row['name']}, href: {dis_url}")
        else:
            logging.warning(f'parse distributors list error. {url} ...')
    elif status_code and status_code == 404:
        logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
    else:
        logging.warning(f'fetch distributors list error. {url} ...')

# Fetch the studios list
def fetch_studios_list():
    url = scraper.studios_list_url
    logging.info(f"Fetching data for studios list, url {url} ...")
    soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Studio", attr_type="name"))
    if soup:
        list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Studio")
        if list_data:
            for row in list_data:
                stu_url = scraper.studios_base_url + row['href']
                stu_id = db_tools.insert_or_update_studio({'name': row['name'], 'href': stu_url})
                if stu_id:
                    logging.debug(f"insert one record into studios table. id: {stu_id}, name: {row['name']}, href: {stu_url}")
        else:
            logging.warning(f'parse studios list error. {url} ...')
    elif status_code and status_code == 404:
        logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
    else:
        logging.warning(f'fetch studios list error. {url} ...')


# Refresh the movie index from the distributor lists
def fetch_movies_by_dist():
    # Refresh the distributors list first
    fetch_distributors_list()

    url_list = db_tools.query_distributor_hrefs()
    if debug:
        url_list = db_tools.query_distributor_hrefs(name='vixen.com')
    for url in url_list:
        logging.info(f"Fetching data for distributor url {url} ...")
        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
            if list_data:
                for movie in list_data:
                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_dist_list=1)
                    if tmp_id:
                        logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
                    else:
                        logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
            else:
                logging.warning(f'parse_page_dist_stu error. url: {url}')
        elif status_code and status_code == 404:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
        else:
            logging.warning(f'fetching page error. {url}')
        # Break early when debugging
        if debug:
            break

# Refresh the movie index from the studio lists
def fetch_movies_by_stu():
    # Refresh the studios list first
    fetch_studios_list()

    url_list = db_tools.query_studio_hrefs()
    if debug:
        url_list = db_tools.query_studio_hrefs(name='vixen.com')
    for url in url_list:
        logging.info(f"Fetching data for studio url {url} ...")
        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
            if list_data:
                for movie in list_data:
                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_stu_list=1)
                    if tmp_id:
                        logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
                    else:
                        logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
            else:
                logging.warning(f'parse_page_dist_stu error. url: {url}')
        elif status_code and status_code == 404:
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
        else:
            logging.warning(f'fetching page error. {url}')
        # Break early when debugging
        if debug:
            break

# Update performer details, one batch
def fetch_performers_detail_once(perfomers_list):
    last_performer_id = 0
    for performer in perfomers_list:
        url = performer['href']
        person = performer['name']
        logging.info(f"Fetching data for performer ({person}), url {url} ...")
        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
        if soup:
            data = scraper.parse_page_performer(soup)
            if data:
                performer_id = db_tools.insert_or_update_performer({
                    'href': url,
                    'person': person,
                    **data
                })
                if performer_id:
                    logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
                    last_performer_id = performer_id
                else:
                    logging.warning(f'insert person: ({person}) {url} failed.')

                # Also write to a local json file
                utils.write_person_json(person, url, {
                    'href': url,
                    'person': person,
                    **data
                })
            else:
                logging.warning(f'parse_page_performer error. person: ({person}), url: {url}')
        elif status_code and status_code == 404:
            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url)
            logging.warning(f'404 page. id: {performer_id}, name: {person}, url: {url}, Skipping...')
        else:
            logging.warning(f'fetch_page error. person: ({person}), url: {url}')
        time.sleep(1)
    return last_performer_id

# Update performer details
def fetch_performers_detail():
    limit_count = 5 if debug else 100
    perfomers_list = []

    # Fetch the list of new performers
    while True:
        perfomers_list = db_tools.query_performer_hrefs(is_full_data=0, limit=limit_count)
        if len(perfomers_list) < 1:
            logging.info(f'all new performers fetched. ')
            break
        last_perfomer_id = fetch_performers_detail_once(perfomers_list)
        logging.info(f'insert {len(perfomers_list)} person. last performer id: {last_perfomer_id}')
        if debug:
            break

    # Fetch the list of performers that need an update
    while True:
        perfomers_list = db_tools.get_performers_needed_update(limit=limit_count)
        if len(perfomers_list) < 1:
            logging.info(f'all existed performers updated. ')
            break
        last_perfomer_id = fetch_performers_detail_once(perfomers_list)
        logging.info(f'insert {len(perfomers_list)} person. last performer id: {last_perfomer_id}')
        if debug:
            break

# Update movie details
def fetch_movies_detail():
    limit_count = 10 if debug else 100
    movies_list = []
    while True:
        movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
        if len(movies_list) < 1:
            logging.info(f'all movies fetched.')
            break
        last_movie_id = 0
        succ_count = 0
        for movie in movies_list:
            url = movie['href']
            title = movie['title']
            logging.debug(f"Fetching data for movie ({title}), url {url} ...")
            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
            if soup:
                movie_data = scraper.parse_page_movie(soup, url, title)
                if movie_data:
                    # Normalize malformed distributor/studio urls
                    if movie_data['DistributorHref']:
                        movie_data['DistributorHref'] = utils.dist_stu_href_rewrite(movie_data['DistributorHref'].lower())
                    if movie_data['StudioHref']:
                        movie_data['StudioHref'] = utils.dist_stu_href_rewrite(movie_data['StudioHref'].lower())
                    movie_id = db_tools.insert_or_update_movie(movie_data)
                    if movie_id:
                        logging.debug(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
                        last_movie_id = movie_id
                        succ_count += 1
                    else:
                        logging.warning(f'insert movie {url} failed.')

                    # Also write to a local json file
                    utils.write_movie_json(url, movie_data)
                else:
                    logging.warning(f'parse_page_movie error. url: {url}')
            elif status_code and status_code == 404:
                # Mark the entry as handled
                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url)
                logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skipping...')
            else:
                logging.warning(f'fetch_page error. url: {url}')
            time.sleep(1)
        logging.info(f'total request: {len(movies_list)}, succ: {succ_count}. last movie id: {last_movie_id}')
        # Return early when debugging
        if debug:
            return True


# Map shortcut names to functions
function_map = {
    "astro": fetch_performers_by_astro,
    "birth": fetch_performers_by_birth,
    "ethnic": fetch_performers_by_ethnic,
    "dist": fetch_movies_by_dist,
    "stu": fetch_movies_by_stu,
    "performers": fetch_performers_detail,
    "movies": fetch_movies_detail,
}

# Main entry point
def main(cmd, args_debug, args_force):
    global debug
    debug = args_debug

    global force
    force = args_force

    # Start a task log entry
    task_id = db_tools.insert_task_log()
    if task_id is None:
        logging.warning(f'insert task log error.')
        return None

    logging.info(f'running task. id: {task_id}, debug: {debug}, force: {force}, cmd: {cmd}')

    # Run the requested functions
    if cmd:
        function_names = cmd.split(",")  # split the comma-separated shortcuts
        for short_name in function_names:
            func = function_map.get(short_name.strip())  # look up the function for this shortcut
            if callable(func):
                db_tools.update_task_log(task_id, task_status=f'Running {func}')
                func()
            else:
                print(f"Warning: {short_name} is not a valid function shortcut.")
    else:  # run everything
        for name, func in function_map.items():
            if callable(func):
                db_tools.update_task_log(task_id, task_status=f'Running {func}')
                func()
            else:
                print(f"Warning: {name} is not a valid function shortcut.")

    logging.info(f'all process completed!')
    db_tools.finalize_task_log(task_id)

# TODO:
# 1. After movies are updated, set is_full_data = 0 on the related performers and refresh them.
# 2. Cross-check the movie lists between distributors and studios.
# 3. To work around inconsistent data, manually import all performers and movies first, then use this program to fetch new entries incrementally.

if __name__ == "__main__":
    # Command-line arguments
    keys_str = ",".join(function_map.keys())

    parser = argparse.ArgumentParser(description='fetch iafd data.')
    parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
    parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
    parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
    args = parser.parse_args()

    main(args.cmd, args.debug, args.force)
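Note: with the argument parser above, a small debug run that only refreshes the astrology lists and the performer details would look something like this (shortcut names come from function_map; run from the script's directory):

    python fetch.py --cmd astro,performers --debug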
iafd/src/iafd_scraper.py (new file, 562 lines)
@@ -0,0 +1,562 @@
import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
import config

# Base URLs and variable parameters
host_url = "https://www.iafd.com"

astr_base_url = f"{host_url}/astrology.rme/sign="
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']

birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"

distributors_list_url = f'{host_url}/distrib.asp'
distributors_base_url = f"{host_url}/distrib.rme/distrib="

studios_list_url = f"{host_url}/studio.asp"
studios_base_url = f"{host_url}/studio.rme/studio="

ethnic_list_url = f'{host_url}/advsearch.asp'

# Headers and scraper session
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()

# Fetch a page with CloudScraper and run a page validator; supports custom parsers and preprocessing
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    for attempt in range(max_retries):
        try:
            if host_url not in url.lower():
                logging.error(f'wrong url format: {url}')
                return None, None

            response = scraper.get(url, headers=headers)

            # Handle HTTP status codes
            if response.status_code == 404:
                logging.debug(f"Page not found (404): {url}")
                return None, 404  # return 404 directly so the caller can skip

            response.raise_for_status()  # raise on other HTTP errors

            # Outdated pages are treated the same as a 404
            if "invalid or outdated page" in response.text.lower():
                logging.debug(f"invalid or outdated page: {url}")
                return None, 404  # return 404 directly so the caller can skip

            # Preprocess the HTML if a preprocessor was supplied
            html_text = preprocessor(response.text) if preprocessor else response.text

            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # run the custom page check
                return soup, response.status_code

            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, Retrying...")

    logging.error(f'Fetching failed after max retries. {url}')
    return None, None  # still failing after the maximum number of retries

# Repair the HTML structure: drop stray <br> tags and patch <a> tags (needed for the ethnicity pages)
def preprocess_html(html):
    return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')

# Generic HTML structure validator
def generic_validator(soup, tag, identifier, attr_type="id"):
    if attr_type == "id":
        return soup.find(tag, id=identifier) is not None
    elif attr_type == "class":
        return bool(soup.find_all(tag, class_=identifier))
    elif attr_type == "name":
        return bool(soup.find('select', {'name': identifier}))
    return False

# Check that the movie table exists
def movie_validator(soup, table_id):
    return soup.find("table", id=table_id) is not None

# Parse the HTML and extract the ethnicity list
def parse_page_ethnic_list(soup, href):
    div_root = soup.find("select", id="ethnicity1")
    if not div_root:
        logging.warning(f"Warning: No 'ethnicity1' select found in {href}")
        return None

    list_data = []

    # Extract all <option> tags
    options = div_root.find_all('option')
    if options:
        # Parse the value and text of each option
        for option in options:
            href = option.get('value', None)
            text = option.text.strip()
            if href and href.lower() == 'none':
                continue
            list_data.append({
                "name": text,
                "href": host_url + href if href else ''
            })
    return list_data


# Parse the HTML and extract the astrology list
def parse_page_astro(soup, astro):
    astro_div = soup.find("div", id="astro")
    if not astro_div:
        logging.warning(f"Warning: No 'astro' div found in {astro}")
        return None, None

    flag = False
    list_cnt = 0
    list_data = []
    next_url = None

    birth_date = None
    for elem in astro_div.find_all(recursive=False):
        if elem.name == "h3" and "astroday" in elem.get("class", []):
            birth_date = elem.get_text(strip=True)
        elif elem.name == "div" and "perficon" in elem.get("class", []):
            a_tag = elem.find("a")
            if a_tag:
                href = host_url + a_tag["href"]
                name = a_tag.find("span", class_="perfname")
                if name:
                    list_data.append({
                        "astrology": astro,
                        "birth_date": birth_date,
                        "person": name.get_text(strip=True),
                        "href": href
                    })
                    flag = True
                    list_cnt = list_cnt + 1
    if flag:
        logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
        return list_data, next_url
    else:
        return None, None


# Parse the page content and collect birthday entries
def parse_page_birth(soup, month, day):
    datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
    if not datarows:
        return None, None

    flag = False
    list_cnt = 0
    list_data = []
    next_url = None
    rows = datarows[0].find_all('div', class_='col-sm-4')
    for row in rows:
        link_tag = row.find('a')
        person = link_tag.text.strip() if link_tag else ''
        href = link_tag['href'] if link_tag else ''
        href = host_url + href

        # Skip entries whose href has already been collected
        flag = True
        if any(entry['href'] == href for entry in list_data):
            continue

        # Add the entry to the list
        list_data.append({
            'month': month,
            'day': day,
            'person': person,
            'href': href
        })
        list_cnt = list_cnt + 1

    if flag:
        logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
        return list_data, next_url
    else:
        return None, None


# Parse the HTML and extract performers for an ethnicity (paginated)
def parse_page_ethnic(soup, ethnic):
    rows = soup.find_all('div', class_='row headshotrow')
    flag = False
    list_data = []
    next_url = None

    for row in rows:
        for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
            link_tag = col.find('a')
            img_tag = col.find('div', class_='pictag')
            flag = True

            if link_tag and img_tag:
                href = host_url + link_tag['href']
                person = img_tag.text.strip()

                # Store the entry
                list_data.append({
                    'ethnic': ethnic,
                    'person': person,
                    'href': href
                })
    if flag:
        logging.debug(f"get {len(list_data)} persons from this page.")

        next_page = soup.find('a', rel='next')
        if next_page:
            next_url = host_url + next_page['href']
            logging.debug(f"Found next page: {next_url}")
            return list_data, next_url
        else:
            logging.debug(f"All pages fetched for {ethnic}.")
            return list_data, None
    else:
        return None, None

# Parse the distributor/studio list page
def parse_page_dist_stu_list(soup, select_name):
    list_data = []
    next_url = None

    select_element = soup.find('select', {'name': select_name})
    if select_element:
        options = select_element.find_all('option')
        for option in options:
            value = option.get('value')  # the value attribute
            text = option.text.strip()   # the option text
            list_data.append({
                'name': text,
                'href': str(value)
            })
        return list_data, next_url
    else:
        return None, None

# Parse the HTML and extract the movie rows of a distributor/studio page
def parse_page_dist_stu(soup, table_id):
    table = soup.find("table", id=table_id)
    if not table:
        logging.warning(f"Warning: No {table_id} table found ")
        return None, None

    # Find and drop the thead; it does not need to be parsed
    thead = table.find('thead')
    if thead:
        thead.decompose()

    # Only the tbody is left now
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody else []

    list_data = []
    next_url = None
    for row in rows:
        cols = row.find_all('td')
        if len(cols) >= 5:
            title = cols[0].text.strip()
            label = cols[1].text.strip()
            year = cols[2].text.strip()
            rev = cols[3].text.strip()
            a_href = cols[0].find('a')
            href = host_url + a_href['href'] if a_href else ''

            list_data.append({
                'title': title,
                'label': label,
                'year': year,
                'rev': rev,
                'href': href
            })
    return list_data, next_url


# Parse a credits table (both the personal and the directorial one)
def parse_credits_table(table, distributor_list):
    # Find and drop the thead; it does not need to be parsed
    thead = table.find('thead')
    if thead:
        thead.decompose()

    # Only the tbody is left now
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody else []

    movies = []
    distributor_count = {key: 0 for key in distributor_list}  # initialize the per-distributor counters

    for row in rows:
        tr_class = ' '.join(row.get('class', []))  # class attribute, or an empty string if missing
        cols = row.find_all('td')
        if len(cols) >= 6:
            title = cols[0].text.strip()
            href_a = cols[0].find('a')
            href = href_a['href'] if href_a else ''
            year = cols[1].text.strip()
            distributor = cols[2].text.strip().lower()
            href_d = cols[2].find('a')
            href_dist = host_url + href_d['href'] if href_d else ''
            notes = cols[3].text.strip()
            rev = cols[4].text.strip()
            formats = cols[5].text.strip()

            for key in distributor_list:
                if key in distributor:
                    distributor_count[key] += 1

            movies.append({
                'title': title,
                'href': href,
                'year': year,
                'distributor': distributor,
                'distributor_href': href_dist,
                'notes': notes,
                'rev': rev,
                'formats': formats,
                'tr_class': tr_class
            })
    return movies, distributor_count


# Parse a performer page and extract the data we need
def parse_page_performer(soup):
    # Extracted data
    data = {}

    # The fields we need and their labels in the HTML
    fields = {
        'performer_aka': 'Performer AKA',
        'birthday': 'Birthday',
        'astrology': 'Astrology',
        'birthplace': 'Birthplace',
        'gender': 'Gender',
        'years_active': 'Years Active',
        'ethnicity': 'Ethnicity',
        'nationality': 'Nationality',
        'hair_colors': 'Hair Colors',
        'eye_color': 'Eye Color',
        'height': 'Height',
        'weight': 'Weight',
        'measurements': 'Measurements',
        'tattoos': 'Tattoos',
        'piercings': 'Piercings'
    }
    reversed_map = {v: k for k, v in fields.items()}

    # Parse the credits tables: performer credits and directorial credits
    role_list = ['personal', 'directoral']
    distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
    credits_list = {}

    # Per-distributor counters
    distributor_count = {key: 0 for key in distributor_list}
    for role in role_list:
        table = soup.find('table', id=role)
        if table:
            movies, stat_map = parse_credits_table(table, distributor_list)
            credits_list[role] = movies
            # Update the distributor counters
            for distributor in distributor_list:
                distributor_count[distributor] += stat_map.get(distributor, 0)

    # Count the movies
    movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))

    # Nothing found
    if len(credits_list) == 0:
        logging.warning("movie table empty.")

    # Walk every bioheading to collect the metadata
    bioheadings = soup.find_all('p', class_='bioheading')
    for bio in bioheadings:
        heading = bio.text.strip()
        biodata = None

        # Headings containing "Performer" need special handling
        if 'Performer' in heading:
            heading = 'Performer AKA'
            biodata_div = bio.find_next('div', class_='biodata')
            if biodata_div:
                div_text = biodata_div.get_text(separator='|').strip()
                biodata = [b.strip() for b in div_text.split('|') if b.strip()]
        else:
            biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''

        # Store the value
        if heading in reversed_map:
            kkey = reversed_map[heading]
            data[kkey] = biodata

    # Add the statistics to data
    data['movies_cnt'] = movies_cnt
    data['vixen_cnt'] = distributor_count['vixen']
    data['blacked_cnt'] = distributor_count['blacked']
    data['tushy_cnt'] = distributor_count['tushy']
    data['x_art_cnt'] = distributor_count['x-art']
    data['credits'] = credits_list

    return data


# Parse a movie page and extract the movie information
def parse_page_movie(soup, href, title):
    # Parse the basic movie info
    movie_data = {}
    info_div = soup.find("div", class_="col-xs-12 col-sm-3")
    if info_div:
        labels = info_div.find_all("p", class_="bioheading")
        values = info_div.find_all("p", class_="biodata")
        for label, value in zip(labels, values):
            key = label.text.strip()
            val = value.text.strip()
            if key in ["Distributor", "Studio", "Director"]:
                link = value.find("a")
                if link:
                    val = link.text.strip()
                    movie_data[f'{key}Href'] = host_url + link['href']
            movie_data[key] = val
    else:
        return None

    # Parse the cast
    performers = []
    cast_divs = soup.find_all("div", class_="castbox")
    for cast in cast_divs:
        performer = {}
        link = cast.find("a")
        if link:
            performer["name"] = link.text.strip()
            performer["href"] = host_url + link["href"]

            performer["tags"] = [
                tag.strip() for br in cast.find_all("br")
                if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
            ]

            performers.append(performer)

    # Parse the scene breakdowns
    scene_breakdowns = []
    scene_table = soup.find("div", id="sceneinfo")
    if scene_table:
        rows = scene_table.find_all("tr")

        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 2:
                scene = cols[0].text.strip()  # scene number
                performer_info = cols[1]      # performers and their links

                # Take the full HTML before the first <br> (keeps <i> tags and other formatting)
                performer_html = str(performer_info)        # the whole cell as HTML
                split_html = performer_html.split("<br/>")  # split on <br/>
                if split_html:
                    performers_html = split_html[0].strip()  # part before the <br/>
                else:
                    split_html = performer_html.split("<br>")  # split on <br>
                    if split_html:
                        performers_html = split_html[0].strip()  # part before the <br>
                    else:
                        performers_html = performer_html.strip()  # no <br>, take everything

                # Strip the HTML tags and keep only the text
                performers_soup = BeautifulSoup(performers_html, "html.parser")
                performers_text = performers_soup.get_text()

                # Extract the performers
                scene_performers = [p.strip() for p in performers_text.split(",")]

                # Try to extract `webscene` and `studio`
                links_data = {}
                links = performer_info.find_all("a")
                if links:
                    webscene_title = links[0].text.strip() if len(links) > 0 else None
                    webscene = links[0]["href"] if len(links) > 0 else None
                    studio = links[1].text.strip() if len(links) > 1 else None
                    studio_lnk = links[1]["href"] if len(links) > 1 else None
                    links_data = {
                        "title": webscene_title,
                        "webscene": webscene,
                        "studio": studio,
                        "studio_lnk": studio_lnk,
                    }

                scene_data = {
                    "scene": scene,
                    "performers": scene_performers,
                    **links_data,
                }
                scene_breakdowns.append(scene_data)

    appears_in = []
    appears_divs = soup.find("div", id="appearssection")
    if appears_divs:
        rows = appears_divs.find_all("li")
        for row in rows:
            lnk = row.find("a")
            if lnk:
                appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})


    return {
        "href": href,
        "title": title,
        "Minutes": movie_data.get("Minutes", ""),
        "Distributor": movie_data.get("Distributor", ""),
        "Studio": movie_data.get("Studio", ""),
        "ReleaseDate": movie_data.get("Release Date", ""),
        "AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
        "All-Girl": movie_data.get("All-Girl", ""),
        "All-Male": movie_data.get("All-Male", ""),
        "Compilation": movie_data.get("Compilation", ""),
        "Webscene": movie_data.get("Webscene", ""),
        "Director": movie_data.get("Director", ""),
        "DirectorHref": movie_data.get("DirectorHref", ""),
        "DistributorHref": movie_data.get("DistributorHref", ""),
        "StudioHref": movie_data.get("StudioHref", ""),
        "Performers": performers,
        "SceneBreakdowns": scene_breakdowns,
        "AppearsIn": appears_in,
    }


if __name__ == "__main__":

    for astro in astro_list:
        url = astr_base_url + astro
        next_url = url
        logging.info(f"Fetching data for {astro}, url {url} ...")

        while True:
            soup, status_code = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
            if soup:
                list_data, next_url = parse_page_astro(soup, astro)
                if list_data:
                    print(list_data[0] if len(list_data) > 0 else 'no data')
                break
            else:
                logging.info(f"Retrying {next_url} ...")
                time.sleep(5)  # wait before retrying

        time.sleep(2)  # throttle the request rate
iafd/src/load.py (new file, 107 lines)
@@ -0,0 +1,107 @@
import json
import time
import csv
import argparse
import logging
from functools import partial
import config
import sqlite_utils as db_tools
import iafd_scraper as scraper
import utils

config.setup_logging()

res_dir = '/root/hostdir/scripts_data/iafd_202503'

# Performer list
def load_performer_list(file, **from_fields):
    json_data = utils.read_json(file)
    if json_data is None:
        json_data = []

    total_rows = len(json_data)
    loaded_rows = 0
    succ = 0
    for row in json_data:
        row_id = db_tools.insert_performer_index(name=row.get('person', ''),
                                                 href=row.get('href', ''),
                                                 **from_fields
                                                 )
        if row_id:
            logging.debug(f"insert one person, id: {row_id}, person: {row['person']}, url: {row['href']}")
            succ += 1
        else:
            logging.warning(f"insert person failed. {row['person']}, {row['href']} failed.")
        loaded_rows += 1
        if loaded_rows % 10000 == 0:
            logging.info(f'loading file: {file}, total rows: {total_rows}, loaded rows: {loaded_rows}, succ rows: {succ}')

    logging.info(f'load data succ. file: {file}, rows: {total_rows}, succ rows: {succ}')

# Movie list
def load_movie_list(file, **from_fields):
    json_data = utils.read_json(file)
    if json_data is None:
        json_data = []

    total_rows = len(json_data)
    loaded_rows = 0
    succ = 0
    for row in json_data:
        row_id = db_tools.insert_movie_index(title=row.get('title', ''),
                                             href=row.get('href', ''),
                                             release_year=utils.to_number(row['year']),
                                             **from_fields
                                             )
        if row_id:
            logging.debug(f"insert one movie, id: {row_id}, title: {row['title']}, url: {row['href']}")
            succ += 1
        else:
            logging.warning(f"insert movie failed: {row['title']}, {row['href']} failed.")
        loaded_rows += 1
        if loaded_rows % 10000 == 0:
            logging.info(f'loading file: {file}, total rows: {total_rows}, loaded rows: {loaded_rows}, succ rows: {succ}')

    logging.info(f'load data succ. file: {file}, rows: {len(json_data)}, succ rows: {succ}')


# Performer details
def load_performers(file):
    json_data = utils.read_json(file)
    if json_data is None:
        json_data = []

    total_rows = len(json_data)
    loaded_rows = 0
    succ = 0
    for row in json_data:
        performer_id = db_tools.insert_or_update_performer(row)
        if performer_id:
            logging.debug(f"insert one person, id: {performer_id}, person: {row['person']}, url: {row['href']}")
            succ += 1
        else:
            logging.warning(f"insert person failed. {row['person']}, {row['href']} failed.")
        loaded_rows += 1
        if loaded_rows % 10000 == 0:
            logging.info(f'loading file: {file}, total rows: {total_rows}, loaded rows: {loaded_rows}, succ rows: {succ}')

    logging.info(f'load data succ. file: {file}, rows: {len(json_data)}, succ rows: {succ}')


if __name__ == "__main__":

    load_performer_list(f'{res_dir}/astro.json', from_astro_list=1)
    time.sleep(3)
    load_performer_list(f'{res_dir}/birth.json', from_birth_list=1)
    time.sleep(3)
    load_performer_list(f'{res_dir}/ethnic.json', from_ethnic_list=1)
    time.sleep(3)

    load_movie_list(f'{res_dir}/distributors.json', from_dist_list=1)
    time.sleep(3)
    load_movie_list(f'{res_dir}/studios.json', from_stu_list=1)
    time.sleep(3)

    load_performers(f'{res_dir}/performers.json')
848
iafd/src/sqlite_utils.py
Normal file
848
iafd/src/sqlite_utils.py
Normal file
@ -0,0 +1,848 @@
|
||||
import sqlite3
|
||||
import json
|
||||
import config
|
||||
import utils
|
||||
import logging
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
# 连接 SQLite 数据库
|
||||
DB_PATH = f"{config.global_share_data_dir}/shared.db" # 替换为你的数据库文件
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
# 获取当前时间
|
||||
def get_current_time():
|
||||
return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
|
||||
# """从指定表中通过 href 查找 id"""
|
||||
def get_id_by_href(table: str, href: str) -> int:
|
||||
if href is None:
|
||||
return None
|
||||
cursor.execute(f"SELECT id FROM {table} WHERE href = ?", (href,))
|
||||
row = cursor.fetchone()
|
||||
return row[0] if row else None
|
||||
|
||||
# 插入演员索引,来自于列表数据
|
||||
def insert_performer_index(name, href, from_astro_list=None, from_birth_list=None, from_ethnic_list=None, from_movie_list=None):
|
||||
try:
|
||||
# **查询是否已存在该演员**
|
||||
cursor.execute("""
|
||||
SELECT id, name, from_astro_list, from_birth_list, from_ethnic_list, from_movie_list
|
||||
FROM iafd_performers WHERE href = ?
|
||||
""", (href,))
|
||||
existing_performer = cursor.fetchone()
|
||||
|
||||
if existing_performer: # **如果演员已存在**
|
||||
performer_id, existing_name, existing_astro, existing_birth, existing_ethnic, existing_movie = existing_performer
|
||||
|
||||
# **如果没有传入值,则保持原有值**
|
||||
from_astro_list = from_astro_list if from_astro_list is not None else existing_astro
|
||||
from_birth_list = from_birth_list if from_birth_list is not None else existing_birth
|
||||
from_ethnic_list = from_ethnic_list if from_ethnic_list is not None else existing_ethnic
|
||||
from_movie_list = from_movie_list if from_movie_list is not None else existing_movie
|
||||
|
||||
cursor.execute("""
|
||||
UPDATE iafd_performers
|
||||
SET name = ?,
|
||||
from_astro_list = ?,
|
||||
from_birth_list = ?,
|
||||
from_ethnic_list = ?,
|
||||
from_movie_list = ?,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
WHERE href = ?
|
||||
""", (name, from_astro_list, from_birth_list, from_ethnic_list, from_movie_list, href))
|
||||
else: # **如果演员不存在,插入**
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_performers (href, name, from_astro_list, from_birth_list, from_ethnic_list, from_movie_list)
|
||||
VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0))
|
||||
""", (href, name, from_astro_list, from_birth_list, from_ethnic_list, from_movie_list))
|
||||
|
||||
conn.commit()
|
||||
|
||||
performer_id = get_id_by_href('iafd_performers', href)
|
||||
if performer_id:
|
||||
logging.debug(f'Inserted/Updated performer index, id: {performer_id}, name: {name}, href: {href}')
|
||||
|
||||
return performer_id
|
||||
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"数据库错误: {e}")
|
||||
return None
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error(f"未知错误: {e}")
|
||||
return None
|
||||
|
||||
# """插入电影索引,来自于列表数据"""
|
||||
def insert_movie_index(title, href, release_year=0, from_performer_list=None, from_dist_list=None, from_stu_list=None):
|
||||
try:
|
||||
# **查询是否已存在该电影**
|
||||
cursor.execute("""
|
||||
SELECT id, title, release_year, from_performer_list, from_dist_list, from_stu_list
|
||||
FROM iafd_movies WHERE href = ?
|
||||
""", (href,))
|
||||
existing_movie = cursor.fetchone()
|
||||
|
||||
if existing_movie: # **如果电影已存在**
|
||||
movie_id, existing_title, existing_year, existing_performer, existing_dist, existing_stu = existing_movie
|
||||
|
||||
# **如果没有传入值,则保持原有值**
|
||||
release_year = release_year if release_year != 0 else existing_year
|
||||
from_performer_list = from_performer_list if from_performer_list is not None else existing_performer
|
||||
from_dist_list = from_dist_list if from_dist_list is not None else existing_dist
|
||||
from_stu_list = from_stu_list if from_stu_list is not None else existing_stu
|
||||
|
||||
cursor.execute("""
|
||||
UPDATE iafd_movies
|
||||
SET title = ?,
|
||||
release_year = ?,
|
||||
from_performer_list = ?,
|
||||
from_dist_list = ?,
|
||||
from_stu_list = ?,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
WHERE href = ?
|
||||
""", (title, release_year, from_performer_list, from_dist_list, from_stu_list, href))
|
||||
else: # **如果电影不存在,插入**
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_movies (title, href, release_year, from_performer_list, from_dist_list, from_stu_list)
|
||||
VALUES (?, ?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0))
|
||||
""", (title, href, release_year, from_performer_list, from_dist_list, from_stu_list))
|
||||
|
||||
conn.commit()
|
||||
|
||||
movie_id = get_id_by_href('iafd_movies', href)
|
||||
if movie_id:
|
||||
logging.debug(f'Inserted/Updated movie index, id: {movie_id}, title: {title}, href: {href}')
|
||||
|
||||
return movie_id
|
||||
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"数据库错误: {e}")
|
||||
return None
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error(f"未知错误: {e}")
|
||||
return None
|
||||
|
||||
# 插入演员和电影的关联数据
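# Note: the ON CONFLICT(movie_id, performer_id) upsert below assumes iafd_performers_movies
# has a UNIQUE constraint (or primary key) on (movie_id, performer_id); SQLite rejects an
# ON CONFLICT target that does not match such a constraint.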
|
||||
def insert_performer_movie(performer_id, movie_id, role, notes):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_performers_movies (performer_id, movie_id, role, notes)
|
||||
VALUES (?, ?, ?, ?)
|
||||
ON CONFLICT(movie_id, performer_id) DO UPDATE SET notes=excluded.notes, role=excluded.role
|
||||
""",
|
||||
(performer_id, movie_id, role, notes)
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
#logging.debug(f'insert one performer_movie, performer_id: {performer_id}, movie_id: {movie_id}')
|
||||
|
||||
return performer_id
|
||||
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error("Error inserting movie: %s", e)
|
||||
return None
|
||||
|
||||
# 插入电影和电影的关联数据
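# Note: as above, the ON CONFLICT(movie_id, appears_in_id) upsert assumes a matching
# UNIQUE constraint on iafd_movies_appers_in.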
|
||||
def insert_movie_appears_in(movie_id, appears_in_id, gradation=0, notes=''):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_movies_appers_in (movie_id, appears_in_id, gradation, notes)
|
||||
VALUES (?, ?, ?, ?)
|
||||
ON CONFLICT(movie_id, appears_in_id) DO UPDATE SET notes=excluded.notes, gradation=excluded.gradation
|
||||
""",
|
||||
(movie_id, appears_in_id, gradation, notes)
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
#logging.debug(f'insert one movie_appears_in, movie_id: {movie_id}, appears_in_id: {appears_in_id}')
|
||||
|
||||
return movie_id
|
||||
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error("Error inserting movie: %s", e)
|
||||
return None
|
||||
|
||||
|
||||
# 插入演员信息
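# The ON CONFLICT(href) upsert below relies on iafd_performers.href being declared UNIQUE.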
|
||||
def insert_or_update_performer(data):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_performers (href, name, gender, birthday, astrology, birthplace, years_active, ethnicity, nationality, hair_colors,
|
||||
eye_color, height_str, weight_str, measurements, tattoos, piercings, weight, height, movies_cnt, vixen_cnt,
|
||||
blacked_cnt, tushy_cnt, x_art_cnt, is_full_data, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
name = excluded.name,
|
||||
gender = excluded.gender,
|
||||
birthday = excluded.birthday,
|
||||
astrology = excluded.astrology,
|
||||
birthplace = excluded.birthplace,
|
||||
years_active = excluded.years_active,
|
||||
ethnicity = excluded.ethnicity,
|
||||
nationality = excluded.nationality,
|
||||
hair_colors = excluded.hair_colors,
|
||||
eye_color = excluded.eye_color,
|
||||
height_str = excluded.height_str,
|
||||
weight_str = excluded.weight_str,
|
||||
measurements = excluded.measurements,
|
||||
tattoos = excluded.tattoos,
|
||||
piercings = excluded.piercings,
|
||||
weight = excluded.weight,
|
||||
height = excluded.height,
|
||||
movies_cnt = excluded.movies_cnt,
|
||||
vixen_cnt = excluded.vixen_cnt,
|
||||
blacked_cnt = excluded.blacked_cnt,
|
||||
tushy_cnt = excluded.tushy_cnt,
|
||||
x_art_cnt = excluded.x_art_cnt,
|
||||
is_full_data = 1,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
""", (
|
||||
data["href"], data["person"], data.get("gender"), data.get("birthday"), data.get("astrology"), data.get("birthplace"), data.get("years_active"),
|
||||
data.get("ethnicity"), data.get("nationality"), data.get("hair_colors"), data.get("eye_color"), data.get("height"),
|
||||
data.get("weight"), data.get("measurements"), data.get("tattoos"), data.get("piercings"), utils.parse_weight(data.get('weight')), utils.parse_height(data.get('height')),
|
||||
data.get("movies_cnt", 0), data.get("vixen_cnt", 0), data.get("blacked_cnt", 0), data.get("tushy_cnt", 0), data.get("x_art_cnt", 0)
|
||||
))
|
||||
|
||||
# 获取 performer_id
|
||||
performer_id = get_id_by_href('iafd_performers', data["href"])
|
||||
if performer_id is None:
|
||||
return None
|
||||
logging.debug(f"insert one performer, id: {performer_id}, name: {data['person']}, href: {data['href']}")
|
||||
|
||||
# 插入新的 alias
|
||||
for alias in data.get("performer_aka") or []:
|
||||
if alias.lower() != "no known aliases":
|
||||
cursor.execute("INSERT OR IGNORE INTO iafd_performer_aliases (performer_id, alias) VALUES (?, ?) ", (performer_id, alias))
|
||||
|
||||
conn.commit()
|
||||
|
||||
# 插入影片列表,可能有 personal 和 director 两个身份
|
||||
credits = data.get('credits', {})
|
||||
for role, movies in credits.items():
|
||||
if movies:
|
||||
for movie in movies:
|
||||
movie_id = get_id_by_href('iafd_movies', movie['href'])
|
||||
# 影片不存在,先插入
|
||||
if movie_id is None:
|
||||
movie_id = insert_movie_index(movie['title'], movie['href'], utils.to_number(movie['year']), from_performer_list=1)
|
||||
if movie_id:
|
||||
tmp_id = insert_performer_movie(performer_id, movie_id, role, movie['notes'])
|
||||
if tmp_id :
|
||||
logging.debug(f'insert one performer_movie, performer_id: {performer_id}, movie_id: {movie_id}, role: {role}')
|
||||
else:
|
||||
logging.warning(f"insert performer_movie failed. performer_id: {performer_id}, movie href: {movie['href']}")
|
||||
|
||||
return performer_id
|
||||
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"数据库错误: {e}")
|
||||
return None
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error(f"未知错误: {e}")
|
||||
return None
|
||||
|
||||
|
||||
# """插入或更新电影数据(异常url的处理,比如404链接)"""
|
||||
def insert_or_update_performer_404(name, href):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_performers (href, name, is_full_data, updated_at)
|
||||
VALUES (?, ?, 1, datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
name = excluded.name,
|
||||
is_full_data = 1,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
""", (
|
||||
href, name
|
||||
))
|
||||
|
||||
# 获取 performer_id
|
||||
performer_id = get_id_by_href('iafd_performers', href)
|
||||
if performer_id is None:
|
||||
return None
|
||||
logging.debug(f'insert one performer, id: {performer_id}, name: {name}, href: {href}')
|
||||
|
||||
return performer_id
|
||||
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"数据库错误: {e}")
|
||||
return None
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error(f"未知错误: {e}")
|
||||
return None
|
||||
|
||||
|
||||
# 按 id 或 href 删除演员
|
||||
def delete_performer(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("DELETE FROM iafd_performers WHERE id = ?", (identifier,))
|
||||
elif isinstance(identifier, str):
|
||||
cursor.execute("DELETE FROM iafd_performers WHERE href = ?", (identifier,))
|
||||
else:
|
||||
logging.warning("无效的删除参数")
|
||||
return
|
||||
conn.commit()
|
||||
logging.info(f"成功删除演员: {identifier}")
|
||||
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"删除失败: {e}")
|
||||
|
||||
# 按 id、href 或 name 查询演员信息
|
||||
def query_performer(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("SELECT * FROM iafd_performers WHERE id = ?", (identifier,))
|
||||
elif "http" in identifier:
|
||||
cursor.execute("SELECT * FROM iafd_performers WHERE href = ?", (identifier,))
|
||||
else:
|
||||
cursor.execute("SELECT * FROM iafd_performers WHERE name LIKE ?", (f"%{identifier}%",))
|
||||
|
||||
performer = cursor.fetchone()
if performer:
    # Build the performer dict first, while cursor.description still describes the iafd_performers row.
    result = dict(zip([desc[0] for desc in cursor.description], performer))
    cursor.execute("SELECT alias FROM iafd_performer_aliases WHERE performer_id = ?", (performer[0],))
    result["performer_aka"] = [row[0] for row in cursor.fetchall()]
    return result
|
||||
else:
|
||||
logging.warning(f"未找到演员: {identifier}")
|
||||
return None
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询失败: {e}")
|
||||
return None
|
||||
|
||||
# 按条件查询 href 列表
|
||||
def query_performer_hrefs(**filters):
|
||||
try:
|
||||
sql = "SELECT href, name FROM iafd_performers WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
sql += " AND id = ?"
|
||||
params.append(filters["id"])
|
||||
if "href" in filters:
|
||||
sql += " AND href = ?"
|
||||
params.append(filters["href"])
|
||||
if "name" in filters:
|
||||
sql += " AND name LIKE ?"
|
||||
params.append(f"%{filters['name']}%")
|
||||
if "is_full_data" in filters:
|
||||
sql += " AND is_full_data = ?"
|
||||
params.append(filters["is_full_data"])
|
||||
if 'limit' in filters:
|
||||
sql += " limit ?"
|
||||
params.append(filters["limit"])
|
||||
|
||||
|
||||
cursor.execute(sql, params)
|
||||
#return [row[0].lower() for row in cursor.fetchall()] # 返回小写
|
||||
return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return None
|
||||
|
||||
|
||||
# Insert or update ethnicity metadata (iafd_meta_ethnic)
|
||||
def insert_or_update_ethnic(data):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_meta_ethnic (name, href)
|
||||
VALUES (?, ?)
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
name = excluded.name
|
||||
""", (data["name"], data["href"]))
|
||||
conn.commit()
|
||||
|
||||
# Fetch the id of the ethnicity row just inserted/updated
|
||||
cursor.execute("SELECT id FROM iafd_meta_ethnic WHERE href = ?", (data["href"],))
|
||||
dist_id = cursor.fetchone()[0]
|
||||
if dist_id:
|
||||
logging.debug(f"成功插入/更新ethnic: {data['name']}")
|
||||
return dist_id
|
||||
else:
|
||||
return None
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"数据库错误: {e}")
|
||||
return None
|
||||
|
||||
# 按条件查询 href 列表
|
||||
def query_ethnic_hrefs(**filters):
|
||||
try:
|
||||
sql = "SELECT href, name FROM iafd_meta_ethnic WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
sql += " AND id = ?"
|
||||
params.append(filters["id"])
|
||||
if "url" in filters:
|
||||
sql += " AND href = ?"
|
||||
params.append(filters["href"])
|
||||
if "name" in filters:
|
||||
sql += " AND name LIKE ?"
|
||||
params.append(f"%{filters['name']}%")
|
||||
|
||||
cursor.execute(sql, params)
|
||||
#return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
||||
return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return None
|
||||
|
||||
# Insert or update a distributor
|
||||
def insert_or_update_distributor(data):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_distributors (name, href, updated_at)
|
||||
VALUES (?, ? , datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
name = excluded.name,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
""", (data["name"], data["href"]))
|
||||
conn.commit()
|
||||
|
||||
# Fetch the distributor id just inserted/updated
|
||||
cursor.execute("SELECT id FROM iafd_distributors WHERE href = ?", (data["href"],))
|
||||
dist_id = cursor.fetchone()[0]
|
||||
if dist_id:
|
||||
logging.debug(f"成功插入/更新发行商: {data['name']}")
|
||||
return dist_id
|
||||
else:
|
||||
return None
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"数据库错误: {e}")
|
||||
return None
|
||||
|
||||
# Delete a distributor (by id or name)
|
||||
def delete_distributor(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("DELETE FROM iafd_distributors WHERE id = ?", (identifier,))
|
||||
elif isinstance(identifier, str):
|
||||
cursor.execute("DELETE FROM iafd_distributors WHERE name = ?", (identifier,))
|
||||
conn.commit()
|
||||
logging.info(f"成功删除发行商: {identifier}")
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"删除失败: {e}")
|
||||
|
||||
# Query a distributor (by id or name)
|
||||
def query_distributor(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("SELECT * FROM iafd_distributors WHERE id = ?", (identifier,))
|
||||
else:
|
||||
cursor.execute("SELECT * FROM iafd_distributors WHERE name LIKE ?", (f"%{identifier}%",))
|
||||
|
||||
distributor = cursor.fetchone()
|
||||
if distributor:
|
||||
return dict(zip([desc[0] for desc in cursor.description], distributor))
|
||||
else:
|
||||
logging.warning(f"未找到发行商: {identifier}")
|
||||
return None
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询失败: {e}")
|
||||
return None
|
||||
|
||||
# 按条件查询 href 列表
|
||||
def query_distributor_hrefs(**filters):
|
||||
try:
|
||||
sql = "SELECT href FROM iafd_distributors WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
sql += " AND id = ?"
|
||||
params.append(filters["id"])
|
||||
if "url" in filters:
|
||||
sql += " AND href = ?"
|
||||
params.append(filters["href"])
|
||||
if "name" in filters:
|
||||
sql += " AND name LIKE ?"
|
||||
params.append(f"%{filters['name']}%")
|
||||
|
||||
cursor.execute(sql, params)
|
||||
return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return None
|
||||
|
||||
# """ 插入或更新制作公司 """
|
||||
def insert_or_update_studio(data):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_studios (name, href, updated_at)
|
||||
VALUES (?, ?, datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
name = excluded.name,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
""", (data["name"], data["href"]))
|
||||
conn.commit()
|
||||
|
||||
# Fetch the studio id just inserted/updated
|
||||
cursor.execute("SELECT id FROM iafd_studios WHERE href = ?", (data["href"],))
|
||||
stu_id = cursor.fetchone()[0]
|
||||
if stu_id:
|
||||
logging.debug(f"成功插入/更新发行商: {data['name']}")
|
||||
return stu_id
|
||||
else:
|
||||
return None
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"数据库错误: {e}")
|
||||
return None
|
||||
|
||||
# """ 删除制作公司(按 id 或 name) """
|
||||
def delete_studio(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("DELETE FROM iafd_studios WHERE id = ?", (identifier,))
|
||||
elif isinstance(identifier, str):
|
||||
cursor.execute("DELETE FROM iafd_studios WHERE name = ?", (identifier,))
|
||||
conn.commit()
|
||||
logging.info(f"成功删除制作公司: {identifier}")
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"删除失败: {e}")
|
||||
|
||||
# """ 查询制作公司(按 id 或 name) """
|
||||
def query_studio(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("SELECT * FROM iafd_studios WHERE id = ?", (identifier,))
|
||||
else:
|
||||
cursor.execute("SELECT * FROM iafd_studios WHERE name LIKE ?", (f"%{identifier}%",))
|
||||
|
||||
studio = cursor.fetchone()
|
||||
if studio:
|
||||
return dict(zip([desc[0] for desc in cursor.description], studio))
|
||||
else:
|
||||
logging.warning(f"未找到制作公司: {identifier}")
|
||||
return None
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询失败: {e}")
|
||||
return None
|
||||
|
||||
# 按条件查询 href 列表
|
||||
def query_studio_hrefs(**filters):
|
||||
try:
|
||||
sql = "SELECT href FROM iafd_studios WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
sql += " AND id = ?"
|
||||
params.append(filters["id"])
|
||||
if "href" in filters:
|
||||
sql += " AND href = ?"
|
||||
params.append(filters["href"])
|
||||
if "name" in filters:
|
||||
sql += " AND name LIKE ?"
|
||||
params.append(f"%{filters['name']}%")
|
||||
|
||||
cursor.execute(sql, params)
|
||||
return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return None
|
||||
|
||||
# """插入或更新电影数据"""
|
||||
def insert_or_update_movie(movie_data):
|
||||
try:
|
||||
# 获取相关 ID
|
||||
distributor_id = get_id_by_href('iafd_distributors', movie_data['DistributorHref'])
|
||||
studio_id = get_id_by_href('iafd_studios', movie_data['StudioHref'])
|
||||
director_id = get_id_by_href('iafd_performers', movie_data['DirectorHref'])
|
||||
# 导演不存在的话,插入一条
|
||||
if director_id is None:
|
||||
director_id = insert_performer_index( movie_data['Director'], movie_data['DirectorHref'], from_movie_list=1)
|
||||
if studio_id is None:
|
||||
studio_id = 0
|
||||
if distributor_id is None:
|
||||
distributor_id = 0
|
||||
|
||||
# 插入或更新电影信息
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO iafd_movies (title, minutes, distributor_id, studio_id, release_date, added_to_IAFD_date,
|
||||
all_girl, all_male, compilation, webscene, director_id, href, is_full_data, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
title=excluded.title, minutes=excluded.minutes, distributor_id=excluded.distributor_id,
|
||||
studio_id=excluded.studio_id, release_date=excluded.release_date,
|
||||
added_to_IAFD_date=excluded.added_to_IAFD_date, all_girl=excluded.all_girl,
|
||||
all_male=excluded.all_male, compilation=excluded.compilation, webscene=excluded.webscene,
|
||||
director_id=excluded.director_id, is_full_data=1, updated_at = datetime('now', 'localtime')
|
||||
""",
|
||||
(movie_data['title'], movie_data['Minutes'], distributor_id, studio_id, movie_data['ReleaseDate'],
|
||||
movie_data['AddedtoIAFDDate'], movie_data['All-Girl'], movie_data['All-Male'],
|
||||
movie_data['Compilation'], movie_data['Webscene'], director_id, movie_data['href'])
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
# 获取插入的 movie_id
|
||||
movie_id = get_id_by_href('iafd_movies', movie_data['href'])
|
||||
if movie_id is None:
|
||||
return None
|
||||
|
||||
logging.debug(f"insert one movie, id: {movie_id}, title: {movie_data['title']}, href: {movie_data['href']}")
|
||||
|
||||
# 插入 performers_movies 关系表
|
||||
for performer in movie_data.get('Performers', []):
|
||||
performer_id = get_id_by_href('iafd_performers', performer['href'])
|
||||
# 如果演员不存在,先插入
|
||||
if performer_id is None:
|
||||
performer_id = insert_performer_index(performer['name'], performer['href'], from_movie_list=1)
|
||||
if performer_id:
|
||||
notes = '|'.join(tag for tag in performer['tags'] if tag != performer['name'])
|
||||
tmp_id = insert_performer_movie(performer_id, movie_id, 'personal', notes)
|
||||
if tmp_id:
|
||||
logging.debug(f"insert one perfomer_movie. perfomer_id: {performer_id}, movie_id:{movie_id}")
|
||||
else:
|
||||
logging.warning(f"insert performer_movie failed. performer_id: {performer_id}, movie_id: {movie_id}")
|
||||
else:
|
||||
logging.warning(f"insert performer failed. name: {performer['name']}, href: {performer['href']}")
|
||||
|
||||
# 插入 movies_appers_in 表
|
||||
for appears in movie_data.get("AppearsIn", []):
|
||||
appears_in_id = get_id_by_href('iafd_movies', appears['href'])
|
||||
# 不存在,先插入
|
||||
if appears_in_id is None:
|
||||
appears_in_id = insert_movie_index( appears['title'], appears['href'])
|
||||
if appears_in_id:
|
||||
tmp_id = insert_movie_appears_in(movie_id, appears_in_id)
|
||||
if tmp_id:
|
||||
logging.debug(f'insert one movie_appears_in record. movie_id: {movie_id}, appears_in_id: {appears_in_id}')
|
||||
else:
|
||||
logging.warning(f'insert movie_appears_in failed. movie_id: {movie_id}, appears_in_id: {appears_in_id}')
|
||||
else:
|
||||
logging.warning(f"get appears_in_id failed. title: {appears['title']}, href: {appears['href']}")
|
||||
|
||||
return movie_id
|
||||
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error("Error inserting movie: %s", e)
|
||||
return None
|
||||
|
||||
|
||||
# """插入或更新电影数据(异常url的处理,比如404链接)"""
|
||||
def insert_or_update_movie_404(title, href):
|
||||
try:
|
||||
# 插入或更新电影信息
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO iafd_movies (title, href, is_full_data, updated_at)
|
||||
VALUES (?, ?, 1, datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
title=excluded.title, is_full_data=1, updated_at = datetime('now', 'localtime')
|
||||
""",
|
||||
(title, href)
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
# 获取插入的 movie_id
|
||||
movie_id = get_id_by_href('iafd_movies', href)
|
||||
if movie_id is None:
|
||||
return None
|
||||
|
||||
return movie_id
|
||||
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error("Error inserting movie: %s", e)
|
||||
return None
|
||||
|
||||
|
||||
# 删除电影数据"""
|
||||
def delete_movie(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("DELETE FROM iafd_movies WHERE id = ?", (identifier,))
|
||||
elif isinstance(identifier, str):
|
||||
cursor.execute("DELETE FROM iafd_movies WHERE href = ?", (identifier,))
|
||||
else:
|
||||
logging.warning("无效的删除参数")
|
||||
return
|
||||
conn.commit()
|
||||
logging.info(f"Deleted movie with {identifier}")
|
||||
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error("Error deleting movie: %s", e)
|
||||
|
||||
# 查找电影数据"""
|
||||
def query_movies(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("SELECT * FROM iafd_movies WHERE id = ?", (identifier,))
|
||||
elif "http" in identifier:
|
||||
cursor.execute("SELECT * FROM iafd_movies WHERE href = ?", (identifier,))
|
||||
else:
|
||||
cursor.execute("SELECT * FROM iafd_movies WHERE title LIKE ?", (f"%{identifier}%",))
|
||||
|
||||
movie = cursor.fetchone()
if movie:
    # Build the movie dict first, while cursor.description still describes the iafd_movies row.
    result = dict(zip([desc[0] for desc in cursor.description], movie))
    # The junction table is keyed by movie_id here, not performer_id.
    cursor.execute("SELECT performer_id FROM iafd_performers_movies WHERE movie_id = ?", (movie[0],))
    result["performers"] = [row[0] for row in cursor.fetchall()]
    return result
|
||||
else:
|
||||
logging.warning(f"find no data: {identifier}")
|
||||
return None
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询失败: {e}")
|
||||
return None
|
||||
|
||||
# 按条件查询 href 列表
|
||||
def query_movie_hrefs(**filters):
|
||||
try:
|
||||
sql = "SELECT href, title FROM iafd_movies WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
sql += " AND id = ?"
|
||||
params.append(filters["id"])
|
||||
if "href" in filters:
|
||||
sql += " AND href = ?"
|
||||
params.append(filters["href"])
|
||||
if "title" in filters:
|
||||
sql += " AND title LIKE ?"
|
||||
params.append(f"%{filters['title']}%")
|
||||
if "is_full_data" in filters:
|
||||
sql += " AND is_full_data = ?"
|
||||
params.append(filters["is_full_data"])
|
||||
if 'limit' in filters:
|
||||
sql += " limit ?"
|
||||
params.append(filters["limit"])
|
||||
|
||||
cursor.execute(sql, params)
|
||||
#return [row[0].lower() for row in cursor.fetchall()] # 链接使用小写
|
||||
return [{'href': row[0], 'title': row[1]} for row in cursor.fetchall()]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return []
|
||||
|
||||
# 获取 view_iafd_performers_movies 中数据 不匹配的演员信息。
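# Note: the optional limit is formatted directly into the SQL text below, so it must be a
# trusted integer (it is not passed as a bound parameter).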
|
||||
def get_performers_needed_update(limit=None):
|
||||
try:
|
||||
sql = """
|
||||
SELECT href, name FROM view_iafd_performers_movies where actual_movies_cnt != movies_cnt
|
||||
"""
|
||||
|
||||
if limit is not None:
|
||||
sql += f" LIMIT {limit}"
|
||||
|
||||
cursor.execute(sql)
|
||||
return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return []
|
||||
|
||||
# 插入一条任务日志
|
||||
def insert_task_log():
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO iafd_task_log (task_status) VALUES ('Start')
|
||||
""")
|
||||
conn.commit()
|
||||
|
||||
task_id = cursor.lastrowid
|
||||
if task_id is None:
|
||||
return None
|
||||
update_task_log(task_id=task_id, task_status='Start')
|
||||
|
||||
return task_id # 获取插入的 task_id
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"插入任务失败: {e}")
|
||||
return None
|
||||
|
||||
# 更新任务日志的字段
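# Caution: the column names in this UPDATE come straight from the kwargs keys and are
# interpolated into the SQL string; only the values are bound as parameters, so callers
# must pass trusted iafd_task_log column names.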
|
||||
def update_task_log_inner(task_id, **kwargs):
|
||||
try:
|
||||
fields = ", ".join(f"{key} = ?" for key in kwargs.keys())
|
||||
params = list(kwargs.values()) + [task_id]
|
||||
|
||||
sql = f"UPDATE iafd_task_log SET {fields}, updated_at = datetime('now', 'localtime') WHERE task_id = ?"
|
||||
cursor.execute(sql, params)
|
||||
conn.commit()
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"更新任务 {task_id} 失败: {e}")
|
||||
|
||||
# 更新任务日志的字段
|
||||
def update_task_log(task_id, task_status):
|
||||
try:
|
||||
# 获取 performers、studios 等表的最终行数
|
||||
cursor.execute("SELECT COUNT(*) FROM iafd_performers where is_full_data=1")
|
||||
full_data_performers = cursor.fetchone()[0]
|
||||
cursor.execute("SELECT COUNT(*) FROM iafd_performers")
|
||||
total_performers = cursor.fetchone()[0]
|
||||
|
||||
cursor.execute("SELECT COUNT(*) FROM iafd_movies where is_full_data=1")
|
||||
full_data_movies = cursor.fetchone()[0]
|
||||
cursor.execute("SELECT COUNT(*) FROM iafd_movies")
|
||||
total_movies = cursor.fetchone()[0]
|
||||
|
||||
cursor.execute("SELECT COUNT(*) FROM iafd_distributors")
|
||||
total_distributors = cursor.fetchone()[0]
|
||||
|
||||
cursor.execute("SELECT COUNT(*) FROM iafd_studios")
|
||||
total_studios = cursor.fetchone()[0]
|
||||
|
||||
# 更新 task_log
|
||||
update_task_log_inner(task_id,
|
||||
full_data_performers=full_data_performers,
|
||||
total_performers=total_performers,
|
||||
full_data_movies=full_data_movies,
|
||||
total_movies=total_movies,
|
||||
total_distributors=total_distributors,
|
||||
total_studios=total_studios,
|
||||
task_status=task_status)
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"更新任务 {task_id} 失败: {e}")
|
||||
|
||||
|
||||
# 任务结束,更新字段
|
||||
def finalize_task_log(task_id):
|
||||
try:
|
||||
# 更新 task_log
|
||||
update_task_log(task_id, task_status="Success")
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"任务 {task_id} 结束失败: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
try:
|
||||
with open('../result/detail.json', 'r') as file:
|
||||
performers = json.load(file)
|
||||
for performer in performers:
|
||||
insert_or_update_performer(performer)
|
||||
|
||||
print(query_performer("Kirsten"))
|
||||
#delete_performer("https://www.iafd.com/person.rme/id=ca699282-1b57-4ce7-9bcc-d7799a292e34")
|
||||
print(query_performer_hrefs())
|
||||
except FileNotFoundError:
|
||||
logging.info("detail.json not found, starting fresh.")
|
||||
101
iafd/src/utils.py
Normal file
@ -0,0 +1,101 @@
|
||||
import re
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import csv
|
||||
import logging
|
||||
import config
|
||||
|
||||
# 解析 height 和 weight(转换成数字)
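# Note: both parsers below short-circuit with an early "return 0", so the conversion code
# after it is unreachable and height/weight are always stored as 0. Removing the early
# return would re-enable parsing: height takes the number inside a trailing "(... cm)" and
# weight takes the leading integer token of the string.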
|
||||
def parse_height(height_str):
|
||||
return 0
|
||||
try:
|
||||
return int(height_str.split("(")[-1].replace(" cm)", ""))
|
||||
except:
|
||||
return None
|
||||
|
||||
def parse_weight(weight_str):
|
||||
return 0
|
||||
try:
|
||||
return int(weight_str.split(" ")[0])
|
||||
except:
|
||||
return None
|
||||
|
||||
update_dir = f'{config.global_host_data_dir}/iafd'
|
||||
performers_dir = f'{update_dir}/performers'
|
||||
movies_dir = f'{update_dir}/movies'
|
||||
|
||||
def to_number(value):
|
||||
"""将字符串转换为数字,如果无效则返回 0"""
|
||||
try:
|
||||
return float(value)
|
||||
except (ValueError, TypeError):
|
||||
return 0
|
||||
|
||||
def dist_stu_href_rewrite(href):
|
||||
# 提取 ID(适用于 distrib 或 studio)
|
||||
import re
|
||||
match = re.search(r"(distrib|studio)=(\d+)", href)
|
||||
if not match:
|
||||
return None # 不是目标 URL,返回 None
|
||||
|
||||
key, id_number = match.groups()
|
||||
new_url = f"https://www.iafd.com/{key}.rme/{key}={id_number}"
|
||||
return new_url
|
||||
|
||||
# 创建目录
|
||||
def create_sub_directory(base_dir, name):
|
||||
# Use the first character of the name (lower-cased) as the sub-directory
|
||||
sub_dir = name[:1].lower()
|
||||
full_path = os.path.join(base_dir, sub_dir)
|
||||
if not os.path.exists(full_path):
|
||||
os.makedirs(full_path)
|
||||
return full_path
|
||||
|
||||
# 从 https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586 中抽取 id 的值
|
||||
def extract_id_from_href(href):
|
||||
"""从href中提取id参数"""
|
||||
match = re.search(r'id=([a-f0-9\-]+)', href)
|
||||
return match.group(1) if match else ''
|
||||
|
||||
# 写入每个 performer 的单独 JSON 文件
|
||||
def write_person_json(person, href, data):
|
||||
# 获取目录
|
||||
person_dir = create_sub_directory(performers_dir, person)
|
||||
person_id = extract_id_from_href(href)
|
||||
person_filename = f"{person.replace(' ', '-')}({person_id}).json" # 用 - 替换空格
|
||||
full_path = os.path.join(person_dir, person_filename)
|
||||
|
||||
try:
|
||||
with open(full_path, 'w', encoding='utf-8') as json_file:
|
||||
json.dump(data, json_file, indent=4, ensure_ascii=False)
|
||||
except Exception as e:
|
||||
logging.error(f"Error writing file {full_path}: {e}")
|
||||
|
||||
|
||||
# Write each movie to its own JSON file
|
||||
def write_movie_json(href, data):
|
||||
# 获取目录
|
||||
movie_id = extract_id_from_href(href)
|
||||
person_dir = create_sub_directory(movies_dir, movie_id)
|
||||
person_filename = f"{movie_id}.json" # 用 - 替换空格
|
||||
full_path = os.path.join(person_dir, person_filename)
|
||||
|
||||
try:
|
||||
with open(full_path, 'w', encoding='utf-8') as json_file:
|
||||
json.dump(data, json_file, indent=4, ensure_ascii=False)
|
||||
except Exception as e:
|
||||
logging.error(f"Error writing file {full_path}: {e}")
|
||||
|
||||
|
||||
# 读取json文件并返回内容
|
||||
def read_json(file_path):
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
except FileNotFoundError:
|
||||
print(f"文件 {file_path} 未找到.")
|
||||
return None
|
||||
except json.JSONDecodeError:
|
||||
print(f"文件 {file_path} 解析错误.")
|
||||
return None
|
||||
26
iafd/src_json/config.py
Normal file
@ -0,0 +1,26 @@
|
||||
import logging
|
||||
import os
|
||||
import inspect
|
||||
from datetime import datetime
|
||||
|
||||
global_share_data_dir = '/root/sharedata'
|
||||
global_host_data_dir = '/root/hostdir/scripts_data'
|
||||
|
||||
# 设置日志配置
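# Call setup_logging() once at script start-up. If no log_filename is given, the log file
# defaults to ../log/<calling script name>_<yyyymmdd>.log, and log output is also echoed
# to the console.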
|
||||
def setup_logging(log_filename=None):
|
||||
# 如果未传入 log_filename,则使用当前脚本名称作为日志文件名
|
||||
if log_filename is None:
|
||||
# 获取调用 setup_logging 的脚本文件名
|
||||
caller_frame = inspect.stack()[1]
|
||||
caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
|
||||
|
||||
# 获取当前日期,格式为 yyyymmdd
|
||||
current_date = datetime.now().strftime('%Y%m%d')
|
||||
# 拼接 log 文件名,将日期加在扩展名前
|
||||
log_filename = f'../log/{caller_filename}_{current_date}.log'
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
|
||||
handlers=[
|
||||
logging.FileHandler(log_filename),
|
||||
logging.StreamHandler()
|
||||
])
|
||||
334
iafd/src_json/movie_detail_fetch.py
Normal file
@ -0,0 +1,334 @@
|
||||
import os
|
||||
import json
|
||||
import csv
|
||||
import time
|
||||
import logging
|
||||
import sys
|
||||
import signal
|
||||
import re
|
||||
import cloudscraper
|
||||
from bs4 import BeautifulSoup
|
||||
import config
|
||||
|
||||
config.setup_logging()
|
||||
|
||||
# 定义基础 URL 和可变参数
|
||||
host_url = "https://www.iafd.com"
|
||||
|
||||
# 目录和文件路径
|
||||
RESULT_DIR = "../result"
|
||||
OUTPUT_DIR = f"{config.global_share_data_dir}/iafd"
|
||||
INPUT_FILE = os.path.join(RESULT_DIR, "movie_list.json")
|
||||
OUTPUT_JSON = os.path.join(OUTPUT_DIR, "movie_details.json")
|
||||
OUTPUT_CSV = os.path.join(OUTPUT_DIR, "movie_details.csv")
|
||||
BATCH_SIZE = 100 # 每100条数据写入文件
|
||||
movies_dir = f'{RESULT_DIR}/movies'
|
||||
|
||||
# 初始化 Cloudflare 绕过工具
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
# 全量数据
|
||||
all_movies = []
|
||||
|
||||
def load_existing_data():
|
||||
"""加载已处理的数据,支持续传"""
|
||||
if os.path.exists(OUTPUT_JSON):
|
||||
with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
|
||||
try:
|
||||
return json.load(f)
|
||||
except json.JSONDecodeError:
|
||||
return []
|
||||
return []
|
||||
|
||||
|
||||
def save_data():
|
||||
"""保存数据到 JSON 和 CSV 文件"""
|
||||
logging.info("Saving data...")
|
||||
global all_movies
|
||||
|
||||
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
|
||||
json.dump(all_movies, f, indent=4, ensure_ascii=False)
|
||||
|
||||
with open(OUTPUT_CSV, "w", encoding="utf-8", newline="") as f:
|
||||
writer = csv.writer(f)
|
||||
writer.writerow(["href", "title", "Minutes", "Distributor", "Studio", "ReleaseDate",
|
||||
"AddedtoIAFDDate", "All-Girl", "All-Male", "Compilation", "Webscene", "Director"])
|
||||
for movie in all_movies:
|
||||
writer.writerow([movie["href"], movie["title"], movie["Minutes"], movie["Distributor"],
|
||||
movie["Studio"], movie["ReleaseDate"], movie["AddedtoIAFDDate"], movie["All-Girl"],
|
||||
movie["All-Male"], movie["Compilation"], movie["Webscene"], movie["Director"]])
|
||||
|
||||
# 请求网页并返回 HTML 内容
|
||||
def fetch_html(href):
|
||||
"""请求网页并返回 HTML 内容"""
|
||||
for attempt in range(3):
|
||||
try:
|
||||
response = scraper.get(href, timeout=10)
|
||||
if response.status_code == 200:
|
||||
return response.text
|
||||
except Exception as e:
|
||||
logging.warning(f"Error fetching {href}: {e}")
|
||||
time.sleep(2)
|
||||
|
||||
logging.error(f"Failed to fetch {href} after 3 attempts")
|
||||
return None
|
||||
|
||||
# 解析网页 HTML 并提取电影信息
|
||||
def parse_movie_details(html, href, title):
|
||||
"""解析网页 HTML 并提取电影信息"""
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
|
||||
# 解析电影基础信息
|
||||
movie_data = {}
|
||||
info_div = soup.find("div", class_="col-xs-12 col-sm-3")
|
||||
if info_div:
|
||||
labels = info_div.find_all("p", class_="bioheading")
|
||||
values = info_div.find_all("p", class_="biodata")
|
||||
for label, value in zip(labels, values):
|
||||
key = label.text.strip()
|
||||
val = value.text.strip()
|
||||
if key in ["Distributor", "Studio", "Director"]:
|
||||
link = value.find("a")
|
||||
if link:
|
||||
val = link.text.strip()
|
||||
movie_data[f'{key}Href'] = host_url + link['href']
|
||||
movie_data[key] = val
|
||||
else:
|
||||
return None
|
||||
|
||||
# 解析演职人员信息
|
||||
performers = []
|
||||
cast_divs = soup.find_all("div", class_="castbox")
|
||||
for cast in cast_divs:
|
||||
performer = {}
|
||||
link = cast.find("a")
|
||||
if link:
|
||||
performer["name"] = link.text.strip()
|
||||
performer["href"] = host_url + link["href"]
|
||||
|
||||
performer["tags"] = [
|
||||
tag.strip() for br in cast.find_all("br")
|
||||
if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
|
||||
]
|
||||
|
||||
#performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
|
||||
performers.append(performer)
|
||||
|
||||
# 解析场景拆解
|
||||
scene_breakdowns = []
|
||||
scene_table = soup.find("div", id="sceneinfo")
|
||||
if scene_table:
|
||||
rows = scene_table.find_all("tr")
|
||||
|
||||
for row in rows:
|
||||
cols = row.find_all("td")
|
||||
if len(cols) >= 2:
|
||||
scene = cols[0].text.strip() # 场景编号
|
||||
performer_info = cols[1] # 包含表演者及链接信息
|
||||
|
||||
# 获取 <br> 之前的完整 HTML(保留 <i> 标签等格式)
|
||||
performer_html = str(performer_info) # 获取所有HTML内容
|
||||
split_html = performer_html.split("<br/>") # 按 <br> 进行分割
|
||||
if split_html:
|
||||
performers_html = split_html[0].strip() # 取 <br> 之前的部分
|
||||
else:
|
||||
split_html = performer_html.split("<br>") # 按 <br> 进行分割
|
||||
if split_html:
|
||||
performers_html = split_html[0].strip() # 取 <br> 之前的部分
|
||||
else:
|
||||
performers_html = performer_html.strip() # 如果没有 <br>,取全部
|
||||
|
||||
# 解析为纯文本(去除HTML标签,仅提取文本内容)
|
||||
performers_soup = BeautifulSoup(performers_html, "html.parser")
|
||||
performers_text = performers_soup.get_text()
|
||||
|
||||
# 提取表演者
|
||||
scene_performers = [p.strip() for p in performers_text.split(",")]
|
||||
|
||||
# 尝试获取 `webscene` 和 `studio`
|
||||
links_data = {}
|
||||
links = performer_info.find_all("a")
|
||||
if links:
|
||||
webscene_title = links[0].text.strip() if len(links)>0 else None
|
||||
webscene = links[0]["href"] if len(links)>0 else None
|
||||
studio = links[1].text.strip() if len(links)>1 else None
|
||||
studio_lnk = links[1]["href"] if len(links)>1 else None
|
||||
links_data = {
|
||||
"title": webscene_title,
|
||||
"webscene": webscene,
|
||||
"studio": studio,
|
||||
"studio_lnk": studio_lnk,
|
||||
}
|
||||
|
||||
scene_data = {
|
||||
"scene": scene,
|
||||
"performers": scene_performers,
|
||||
**links_data,
|
||||
}
|
||||
scene_breakdowns.append(scene_data)
|
||||
|
||||
appears_in = []
|
||||
appears_divs = soup.find("div", id="appearssection")
|
||||
if appears_divs:
|
||||
rows = appears_divs.find_all("li")
|
||||
for row in rows:
|
||||
lnk = row.find("a")
|
||||
if lnk:
|
||||
appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
|
||||
|
||||
|
||||
return {
|
||||
"href": href,
|
||||
"title": title,
|
||||
"Minutes": movie_data.get("Minutes", ""),
|
||||
"Distributor": movie_data.get("Distributor", ""),
|
||||
"Studio": movie_data.get("Studio", ""),
|
||||
"ReleaseDate": movie_data.get("Release Date", ""),
|
||||
"AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
|
||||
"All-Girl": movie_data.get("All-Girl", ""),
|
||||
"All-Male": movie_data.get("All-Male", ""),
|
||||
"Compilation": movie_data.get("Compilation", ""),
|
||||
"Webscene": movie_data.get("Webscene", ""),
|
||||
"Director": movie_data.get("Director", ""),
|
||||
"DirectorHref": movie_data.get("DirectorHref", ""),
|
||||
"DistributorHref": movie_data.get("DistributorHref", ""),
|
||||
"StudioHref": movie_data.get("StudioHref", ""),
|
||||
"Performers": performers,
|
||||
"SceneBreakdowns": scene_breakdowns,
|
||||
"AppearsIn": appears_in,
|
||||
}
|
||||
|
||||
# 创建目录
|
||||
def create_sub_directory(base_dir, name):
|
||||
# Use the first character of the name (lower-cased) as the sub-directory
|
||||
sub_dir = name[:1].lower()
|
||||
full_path = os.path.join(base_dir, sub_dir)
|
||||
if not os.path.exists(full_path):
|
||||
os.makedirs(full_path)
|
||||
return full_path
|
||||
|
||||
# 从 https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586 中抽取 id 的值
|
||||
def extract_id_from_href(href):
|
||||
"""从href中提取id参数"""
|
||||
match = re.search(r'id=([a-f0-9\-]+)', href)
|
||||
return match.group(1) if match else ''
|
||||
|
||||
# Write each movie to its own JSON file
|
||||
def write_movie_json(href, data):
|
||||
# 获取目录
|
||||
movie_id = extract_id_from_href(href)
|
||||
person_dir = create_sub_directory(movies_dir, movie_id)
|
||||
person_filename = f"{movie_id}.json" # 用 - 替换空格
|
||||
full_path = os.path.join(person_dir, person_filename)
|
||||
|
||||
try:
|
||||
with open(full_path, 'w', encoding='utf-8') as json_file:
|
||||
json.dump(data, json_file, indent=4, ensure_ascii=False)
|
||||
except Exception as e:
|
||||
logging.error(f"Error writing file {full_path}: {e}")
|
||||
|
||||
def process_movies():
|
||||
"""处理电影数据"""
|
||||
global all_movies
|
||||
all_movies = load_existing_data()
|
||||
processed_hrefs = {movie["href"] for movie in all_movies}
|
||||
|
||||
# 读取 distributors.json 文件
|
||||
with open(INPUT_FILE, "r", encoding="utf-8") as f:
|
||||
movies = json.load(f)
|
||||
|
||||
count = 0
|
||||
|
||||
for entry in movies:
|
||||
href = entry["href"]
|
||||
title = entry["title"]
|
||||
|
||||
if href in processed_hrefs:
|
||||
logging.info(f"Skiping existed: {title} ({href})")
|
||||
continue # 跳过已处理数据
|
||||
|
||||
logging.info(f"Processing: {title} ({href})")
|
||||
|
||||
while True:
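# Keep retrying this movie until both the fetch and the parse succeed; fetch_html itself
# already retries up to 3 times per call, so a persistently bad page loops here indefinitely.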
|
||||
html = fetch_html(href)
|
||||
if not html:
|
||||
logging.warning(f'Retrying {title} ({href})')
|
||||
continue  # fetch failed, retry
|
||||
else:
|
||||
movie = parse_movie_details(html, href, title)
|
||||
if not movie:
|
||||
logging.warning(f'Retrying {title} ({href})')
|
||||
continue
|
||||
else:
|
||||
all_movies.append(movie)
|
||||
count += 1
|
||||
|
||||
# 写入本地文件
|
||||
write_movie_json(href, movie)
|
||||
break
|
||||
|
||||
# 每 BATCH_SIZE 条数据刷新一次文件
|
||||
if count % BATCH_SIZE == 0:
|
||||
save_data()
|
||||
|
||||
# 最终保存文件
|
||||
save_data()
|
||||
|
||||
logging.info("Task completed.")
|
||||
|
||||
|
||||
# 从 https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586 中抽取 id 的值
|
||||
def extract_id_from_href(href):
|
||||
"""从href中提取id参数"""
|
||||
match = re.search(r'id=([a-f0-9\-]+)', href)
|
||||
return match.group(1) if match else ''
|
||||
|
||||
# 指定url访问
|
||||
def process_one(href):
|
||||
# 初始化 cloudscraper
|
||||
scraper = cloudscraper.create_scraper()
|
||||
# 获取并解析数据
|
||||
movie = {}
|
||||
while True:
|
||||
html = fetch_html(href)
|
||||
if not html:
|
||||
logging.warning(f'fetching {href} error. retrying...')
|
||||
continue # 获取失败,跳过
|
||||
|
||||
movie = parse_movie_details(html, href, 'title')
|
||||
if movie:
|
||||
break
|
||||
else:
|
||||
logging.warning(f'fetching {href} error. retrying...')
|
||||
continue # 获取失败,跳过
|
||||
|
||||
if movie:
|
||||
write_movie_json(href, movie)
|
||||
|
||||
print(f'fetch succeeded. saved result in {movies_dir}')
|
||||
|
||||
# 处理程序被终止时的数据
|
||||
def handle_exit_signal(signal, frame):
|
||||
logging.info("Gracefully exiting... Saving remaining data to Json and CSV.")
|
||||
save_data()
|
||||
sys.exit(0)
|
||||
|
||||
# 全量访问
|
||||
def main():
|
||||
try:
|
||||
# 注册退出信号
|
||||
signal.signal(signal.SIGINT, handle_exit_signal) # Handle Ctrl+C
|
||||
signal.signal(signal.SIGTERM, handle_exit_signal) # Handle kill signal
|
||||
process_movies()
|
||||
finally:
|
||||
# 清理操作,保证在程序正常退出时执行
|
||||
save_data()
|
||||
logging.info("Data processing completed.")
|
||||
|
||||
# 程序入口,读取参数
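# Usage:
#   python movie_detail_fetch.py                -> crawl every entry in ../result/movie_list.json
#   python movie_detail_fetch.py <movie_href>   -> fetch a single movie page and save its JSON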
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) > 1:
|
||||
url = sys.argv[1]
|
||||
process_one(url)
|
||||
else:
|
||||
main()
|
||||
255
iafd/src_json/movie_list_fetch.py
Normal file
@ -0,0 +1,255 @@
|
||||
"""
|
||||
Script Name:
|
||||
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
|
||||
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
|
||||
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
|
||||
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
|
||||
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
|
||||
|
||||
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
|
||||
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
|
||||
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
|
||||
从而获取到一份完整的数据列表。
|
||||
|
||||
Author: [Your Name]
|
||||
Created Date: YYYY-MM-DD
|
||||
Last Modified: YYYY-MM-DD
|
||||
Version: 1.0
|
||||
|
||||
Modification History:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
"""
|
||||
|
||||
import cloudscraper
|
||||
import json
|
||||
import time
|
||||
import csv
|
||||
import argparse
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
import config
|
||||
|
||||
config.setup_logging()
|
||||
|
||||
# 定义基础 URL 和可变参数
|
||||
host_url = "https://www.iafd.com"
|
||||
# 结果路径
|
||||
res_dir = f"{config.global_share_data_dir}/iafd"
|
||||
|
||||
fetch_config = {
|
||||
'dist': {
|
||||
'base_url': f"{host_url}/distrib.rme/distrib=",
|
||||
'list_page_url': f"{host_url}/distrib.asp",
|
||||
'html_table_id': 'distable',
|
||||
'html_select_name': 'Distrib',
|
||||
'output_key_id': 'distributors',
|
||||
'json_file': f'{res_dir}/distributors.json',
|
||||
'csv_file': f'{res_dir}/distributors.csv',
|
||||
},
|
||||
'stu': {
|
||||
'base_url': f"{host_url}/studio.rme/studio=",
|
||||
'list_page_url': f"{host_url}/studio.asp",
|
||||
'html_table_id': 'studio',
|
||||
'html_select_name': 'Studio',
|
||||
'output_key_id': 'studios',
|
||||
'json_file': f'{res_dir}/studios.json',
|
||||
'csv_file': f'{res_dir}/studios.csv',
|
||||
}
|
||||
}
|
||||
|
||||
distr_map = {
|
||||
6812 : 'nubilefilms.com',
|
||||
8563 : 'teenmegaworld network',
|
||||
6779 : 'x-art.com',
|
||||
7133 : 'tushy.com',
|
||||
6496 : 'blacked.com',
|
||||
7758 : 'vixen.com',
|
||||
6791 : 'teamskeet.com',
|
||||
12454: 'vip4k.com',
|
||||
13541: 'wow network',
|
||||
9702 : 'cum4k.com',
|
||||
6778 : 'tiny4k.com',
|
||||
12667: 'anal4k.com',
|
||||
7419 : 'exotic4k.com',
|
||||
13594: 'facials4k.com',
|
||||
13633: 'mom4k.com',
|
||||
12335: 'slim4k.com',
|
||||
16709: 'strippers4k.com',
|
||||
|
||||
}
|
||||
studio_map = {
|
||||
6812 : 'nubilefilms.com',
|
||||
9811 : 'Teen Mega World',
|
||||
6779 : 'x-art.com',
|
||||
7133 : 'tushy.com',
|
||||
6496 : 'blacked.com',
|
||||
7758 : 'vixen.com',
|
||||
6791 : 'teamskeet.com',
|
||||
8052: 'wowgirls.com',
|
||||
9702 : 'cum4k.com',
|
||||
6778 : 'tiny4k.com',
|
||||
12667: 'anal4k.com',
|
||||
7419 : 'exotic4k.com',
|
||||
13594: 'facials4k.com',
|
||||
13633: 'mom4k.com',
|
||||
12335: 'slim4k.com',
|
||||
16709: 'strippers4k.com',
|
||||
|
||||
}
|
||||
|
||||
# 设置 headers 和 scraper
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
}
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
all_data = []
|
||||
|
||||
# 网络请求并解析 HTML
|
||||
def fetch_page(url):
|
||||
try:
|
||||
response = scraper.get(url, headers=headers)
|
||||
response.raise_for_status()
|
||||
return response.text
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to fetch {url}: {e}")
|
||||
return None
|
||||
|
||||
# 解析 HTML 内容,提取需要的数据
|
||||
def parse_page(html, name, config):
|
||||
table_id = config['html_table_id']
|
||||
key_id = config['output_key_id']
|
||||
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
table = soup.find("table", id=table_id)
|
||||
|
||||
if not table:
|
||||
logging.warning(f"Warning: No {table_id} table found in {name}")
|
||||
return None
|
||||
|
||||
# 找到thead并跳过
|
||||
thead = table.find('thead')
|
||||
if thead:
|
||||
thead.decompose() # 去掉thead部分,不需要解析
|
||||
|
||||
# 现在只剩下tbody部分
|
||||
tbody = table.find('tbody')
|
||||
rows = tbody.find_all('tr') if tbody else []
|
||||
|
||||
global all_data
|
||||
for row in rows:
|
||||
cols = row.find_all('td')
|
||||
if len(cols) >= 5:
|
||||
title = cols[0].text.strip()
|
||||
label = cols[1].text.strip()
|
||||
year = cols[2].text.strip()
|
||||
rev = cols[3].text.strip()
|
||||
a_href = cols[0].find('a')
|
||||
href = host_url + a_href['href'] if a_href else ''
|
||||
|
||||
all_data.append({
|
||||
key_id: name,
|
||||
'title': title,
|
||||
'label': label,
|
||||
'year': year,
|
||||
'rev': rev,
|
||||
'href': href
|
||||
})
|
||||
return soup
|
||||
|
||||
# Handle pagination; these list pages have no next page, so always return None
|
||||
def handle_pagination(soup, astro):
|
||||
return None
|
||||
|
||||
# 获取列表页
|
||||
def process_list_page(config):
|
||||
list_page_url=config['list_page_url']
|
||||
select_name = config['html_select_name']
|
||||
list_map = {}
|
||||
|
||||
logging.info(f"Fetching data for {list_page_url} ...")
|
||||
select_element = None
|
||||
while True:
|
||||
html = fetch_page(list_page_url)
|
||||
if html:
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
select_element = soup.find('select', {'name': select_name})
|
||||
if select_element :
|
||||
break
|
||||
else:
|
||||
logging.info(f"wrong html content. retring {list_page_url} ...")
|
||||
else:
|
||||
logging.info(f"wrong html content. retring {list_page_url} ...")
|
||||
|
||||
if not select_element:
|
||||
return None
|
||||
|
||||
options = select_element.find_all('option')
|
||||
for option in options:
|
||||
value = option.get('value') # 获取 value 属性
|
||||
text = option.text.strip() # 获取文本内容
|
||||
list_map[int(value)] = text
|
||||
logging.info(f'fetch {list_page_url} succ. total lines: {len(list_map)}')
|
||||
return list_map
|
||||
|
||||
# 主逻辑函数:循环处理每个种族
|
||||
def process_main_data(list_data, config):
|
||||
base_url = config['base_url']
|
||||
|
||||
for key, name in list_data.items():
|
||||
url = base_url + str(key)
|
||||
next_url = url
|
||||
logging.info(f"Fetching data for {name}, url {url} ...")
|
||||
|
||||
while next_url:
|
||||
html = fetch_page(next_url)
|
||||
if html:
|
||||
soup = parse_page(html, name, config)
|
||||
if soup:
|
||||
next_url = handle_pagination(soup, name)
|
||||
else:
|
||||
logging.info(f"wrong html content. retring {next_url} ...")
|
||||
# 定期保存结果
|
||||
save_data(config)
|
||||
time.sleep(2) # 控制访问频率
|
||||
else:
|
||||
logging.info(f"Retrying {next_url} ...")
|
||||
time.sleep(5) # 等待后再重试
|
||||
|
||||
# 保存到文件
|
||||
def save_data(config):
|
||||
with open(config['json_file'], 'w', encoding='utf-8') as json_file:
|
||||
json.dump(all_data, json_file, indent=4, ensure_ascii=False)
|
||||
|
||||
with open(config['csv_file'], 'w', newline='', encoding='utf-8') as csv_file:
|
||||
writer = csv.DictWriter(csv_file, fieldnames=[config['output_key_id'], 'title', 'label', 'year', 'rev', 'href'])
|
||||
writer.writeheader()
|
||||
writer.writerows(all_data)
|
||||
|
||||
|
||||
# 执行主逻辑
|
||||
if __name__ == '__main__':
|
||||
# 命令行参数处理
|
||||
parser = argparse.ArgumentParser(description='fetch movie list from iafd.com')
|
||||
parser.add_argument('--type', type=str, default='dist', help='fetch by ... (dist , stu)')
|
||||
parser.add_argument('--kind', type=str, default='parts', help='fetch all or parts (parts , all)')
|
||||
args = parser.parse_args()
|
||||
|
||||
config = fetch_config.get(args.type)
|
||||
if not config:
|
||||
logging.warning(f'unknown type: {args.type} {args.kind}')
|
||||
else:
|
||||
list_data = {}
|
||||
if args.kind == 'all':
|
||||
list_data = process_list_page(config)
|
||||
elif args.type == 'dist':
|
||||
list_data = distr_map
|
||||
else:
|
||||
list_data = studio_map
|
||||
|
||||
process_main_data(list_data, config)
|
||||
logging.info("Data fetching and saving completed.")
|
||||
393
iafd/src_json/performers_details.py
Normal file
@ -0,0 +1,393 @@
|
||||
"""
|
||||
Script Name:
|
||||
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
|
||||
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
|
||||
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
|
||||
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
|
||||
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
|
||||
|
||||
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
|
||||
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
|
||||
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
|
||||
从而获取到一份完整的数据列表。
|
||||
|
||||
Author: [Your Name]
|
||||
Created Date: YYYY-MM-DD
|
||||
Last Modified: YYYY-MM-DD
|
||||
Version: 1.0
|
||||
|
||||
Modification History:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
"""
|
||||
|
||||
import cloudscraper
|
||||
import time
|
||||
import json
|
||||
import csv
|
||||
import logging
|
||||
import signal
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from requests.exceptions import RequestException
|
||||
import config
|
||||
|
||||
# 配置日志
|
||||
config.setup_logging()
|
||||
|
||||
# 结果路径
|
||||
res_dir = '../result'
|
||||
res_json_file = f'{res_dir}/detail.json'
|
||||
res_csv_file = f'{res_dir}/detail.csv'
|
||||
input_json_file = f'{res_dir}/merged.json'
|
||||
performers_dir = f'{res_dir}/performers'
|
||||
|
||||
# 存储结果
|
||||
final_data = []
|
||||
|
||||
# 读取 detail.json 中的 数据,以便于断点续传
|
||||
def load_existing_hrefs():
|
||||
existing_hrefs = set()
|
||||
global final_data
|
||||
try:
|
||||
with open(res_json_file, 'r') as file:
|
||||
final_data = json.load(file)
|
||||
for entry in final_data:
|
||||
existing_hrefs.add(entry['href'])
|
||||
except FileNotFoundError:
|
||||
logging.info("detail.json not found, starting fresh.")
|
||||
return existing_hrefs
|
||||
|
||||
# 解析 作品列表,有个人出演,也有导演的
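# Besides collecting the movie rows, this also counts how many titles match each distributor
# keyword in distributor_list (e.g. vixen/blacked/tushy/x-art) so the caller can fill the *_cnt fields.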
|
||||
def parse_credits_table(table, distributor_list):
|
||||
# 找到thead并跳过
|
||||
thead = table.find('thead')
|
||||
if thead:
|
||||
thead.decompose() # 去掉thead部分,不需要解析
|
||||
|
||||
# 现在只剩下tbody部分
|
||||
tbody = table.find('tbody')
|
||||
rows = tbody.find_all('tr') if tbody else []
|
||||
|
||||
movies = []
|
||||
distributor_count = {key: 0 for key in distributor_list} # 初始化每个 distributor 的计数
|
||||
|
||||
# rows = table.find_all('tr', class_='we')
|
||||
for row in rows:
|
||||
cols = row.find_all('td')
|
||||
if len(cols) >= 6:
|
||||
title = cols[0].text.strip()
|
||||
year = cols[1].text.strip()
|
||||
distributor = cols[2].text.strip().lower()
|
||||
notes = cols[3].text.strip()
|
||||
rev = cols[4].text.strip()
|
||||
formats = cols[5].text.strip()
|
||||
|
||||
for key in distributor_list:
|
||||
if key in distributor:
|
||||
distributor_count[key] += 1
|
||||
|
||||
movies.append({
|
||||
'title': title,
|
||||
'year': year,
|
||||
'distributor': distributor,
|
||||
'notes': notes,
|
||||
'rev': rev,
|
||||
'formats': formats
|
||||
})
|
||||
return movies, distributor_count
|
||||
|
||||
|
||||
# 请求网页并提取所需数据
|
||||
def fetch_and_parse_page(url, scraper):
|
||||
try:
|
||||
response = scraper.get(url)
|
||||
if response.status_code != 200:
|
||||
logging.warning(f"Failed to fetch {url}, Status code: {response.status_code}")
|
||||
return None, None
|
||||
|
||||
# 解析 HTML 内容
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
# 提取数据
|
||||
data = {}
|
||||
|
||||
# 定义我们需要的字段名称和HTML中对应的标签
|
||||
fields = {
|
||||
'performer_aka': 'Performer AKA',
|
||||
'birthday': 'Birthday',
|
||||
'astrology': 'Astrology',
|
||||
'birthplace': 'Birthplace',
|
||||
'gender': 'Gender',
|
||||
'years_active': 'Years Active',
|
||||
'ethnicity': 'Ethnicity',
|
||||
'nationality': 'Nationality',
|
||||
'hair_colors': 'Hair Colors',
|
||||
'eye_color': 'Eye Color',
|
||||
'height': 'Height',
|
||||
'weight': 'Weight',
|
||||
'measurements': 'Measurements',
|
||||
'tattoos': 'Tattoos',
|
||||
'piercings': 'Piercings'
|
||||
}
|
||||
reversed_map = {v: k for k, v in fields.items()}
|
||||
|
||||
# 解析表格数据, 获取参演或者导演的列表
|
||||
role_list = ['personal', 'directoral']
|
||||
distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
|
||||
credits_list = {}
|
||||
|
||||
# 使用字典来存储统计
|
||||
distributor_count = {key: 0 for key in distributor_list} # 初始化每个 distributor 的计数
|
||||
for role in role_list:
|
||||
table = soup.find('table', id=role)
|
||||
if table :
|
||||
movies, stat_map = parse_credits_table(table, distributor_list)
|
||||
credits_list[role] = movies
|
||||
# 更新 distributor 统计
|
||||
for distributor in distributor_list:
|
||||
distributor_count[distributor] += stat_map.get(distributor, 0)
|
||||
|
||||
# 统计 movies 数量
|
||||
#movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
|
||||
movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))
|
||||
|
||||
# 如果没有找到
|
||||
if len(credits_list) == 0 :
|
||||
logging.warning(f"movie table empty. url: {url} ")
|
||||
|
||||
# 遍历每个 bioheading, 获取metadata
|
||||
bioheadings = soup.find_all('p', class_='bioheading')
|
||||
for bio in bioheadings:
|
||||
heading = bio.text.strip()
|
||||
biodata = None
|
||||
|
||||
# 如果包含 "Performer",需要特殊处理
|
||||
if 'Performer' in heading:
|
||||
heading = 'Performer AKA'
|
||||
biodata_div = bio.find_next('div', class_='biodata')
|
||||
if biodata_div:
|
||||
div_text = biodata_div.get_text(separator='|').strip()
|
||||
biodata = [b.strip() for b in div_text.split('|') if b.strip()]
|
||||
else:
|
||||
biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''
|
||||
|
||||
# 保存数据
|
||||
if heading in reversed_map:
|
||||
kkey = reversed_map[heading]
|
||||
data[kkey] = biodata
|
||||
|
||||
# 添加统计数据到 data
|
||||
data['movies_cnt'] = movies_cnt
|
||||
data['vixen_cnt'] = distributor_count['vixen']
|
||||
data['blacked_cnt'] = distributor_count['blacked']
|
||||
data['tushy_cnt'] = distributor_count['tushy']
|
||||
data['x_art_cnt'] = distributor_count['x-art']
|
||||
|
||||
return data, credits_list
|
||||
except RequestException as e:
|
||||
logging.error(f"Error fetching {url}: {e}")
|
||||
return None, None
|
||||
|
||||
# 写入 detail.json
|
||||
def write_to_detail_json(data):
|
||||
with open(res_json_file, 'w', encoding='utf-8') as json_file:
|
||||
json.dump(data, json_file, indent=4, ensure_ascii=False)
|
||||
|
||||
# 写入 CSV 文件
|
||||
def write_to_csv(data):
|
||||
try:
|
||||
with open(res_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
|
||||
writer = csv.writer(csvfile, delimiter=',')
|
||||
header = ['person', 'href', 'performer_aka', 'birthday', 'astrology', 'birthplace', 'gender', 'years_active', 'ethnicity',
|
||||
'nationality', 'hair_colors', 'eye_color', 'height', 'weight', 'measurements', 'tattoos', 'piercings',
|
||||
'movies_cnt', 'vixen_cnt', 'blacked_cnt', 'tushy_cnt', 'x_art_cnt']
|
||||
writer.writerow(header)
|
||||
for entry in data:
|
||||
# 确保 performer_aka 始终为列表类型
|
||||
performer_aka = entry.get('performer_aka', [])
|
||||
|
||||
# 如果是 None 或非列表类型,转换为一个空列表
|
||||
if performer_aka is None:
|
||||
performer_aka = []
|
||||
elif not isinstance(performer_aka, list):
|
||||
performer_aka = [performer_aka]
|
||||
|
||||
writer.writerow([
|
||||
entry.get('person', ''),
|
||||
entry.get('href', ''),
|
||||
'|'.join(performer_aka),
|
||||
entry.get('birthday', ''),
|
||||
entry.get('astrology', ''),
|
||||
entry.get('birthplace', ''),
|
||||
entry.get('gender', ''),
|
||||
entry.get('years_active', ''),
|
||||
entry.get('ethnicity', ''),
|
||||
entry.get('nationality', ''),
|
||||
entry.get('hair_colors', ''),
|
||||
entry.get('eye_color', ''),
|
||||
entry.get('height', ''),
|
||||
entry.get('weight', ''),
|
||||
entry.get('measurements', ''),
|
||||
entry.get('tattoos', ''),
|
||||
entry.get('piercings', ''),
|
||||
entry.get('movies_cnt', 0),
|
||||
entry.get('vixen_cnt', 0),
|
||||
entry.get('blacked_cnt', 0),
|
||||
entry.get('tushy_cnt', 0),
|
||||
entry.get('x_art_cnt', 0)
|
||||
])
|
||||
except Exception as e:
|
||||
logging.error(f"Error writing to CSV: {e}")
|
||||
|
||||
def handle_exit_signal(signal, frame):
|
||||
logging.info("Gracefully exiting... Saving remaining data to Json and CSV.")
|
||||
write_to_csv(final_data) # Ensure final data is written when exiting
|
||||
write_to_detail_json(final_data)
|
||||
sys.exit(0)
|
||||
|
||||
# 创建目录
|
||||
def create_directory_for_person(person):
|
||||
# Use the first letter of the person's name, lowercased, as the bucket directory
|
||||
person_dir = person[:1].lower()
|
||||
full_path = os.path.join(performers_dir, person_dir)
|
||||
if not os.path.exists(full_path):
|
||||
os.makedirs(full_path)
|
||||
return full_path
|
||||
|
||||
# 从 https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586 中抽取 id 的值
|
||||
def extract_id_from_href(href):
|
||||
"""从href中提取id参数"""
|
||||
match = re.search(r'id=([a-f0-9\-]+)', href)
|
||||
return match.group(1) if match else ''
|
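# Example, using the URL from the comment above:
#   extract_id_from_href("https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586")
#   -> "21898a3c-1ddd-4793-8d93-375d6db20586"
# An href without an id= parameter returns ''.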
||||
|
||||
# 写入每个 performer 的单独 JSON 文件
|
||||
def write_person_json(person, href, data):
|
||||
# 获取目录
|
||||
person_dir = create_directory_for_person(person)
|
||||
person_id = extract_id_from_href(href)
|
||||
person_filename = f"{person.replace(' ', '-')}({person_id}).json" # 用 - 替换空格
|
||||
full_path = os.path.join(person_dir, person_filename)
|
||||
|
||||
try:
|
||||
with open(full_path, 'w', encoding='utf-8') as json_file:
|
||||
json.dump(data, json_file, indent=4, ensure_ascii=False)
|
||||
except Exception as e:
|
||||
logging.error(f"Error writing file {full_path}: {e}")
|
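# Resulting layout (sketch with a hypothetical name, assuming performers_dir points at e.g. 'performers'):
#   performers/j/Jane-Doe(21898a3c-1ddd-4793-8d93-375d6db20586).json
# i.e. files are bucketed by the first letter of the name and keyed by the IAFD id.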
||||
|
||||
|
||||
# 指定url访问
|
||||
def process_one(href):
|
||||
# 初始化 cloudscraper
|
||||
scraper = cloudscraper.create_scraper()
|
||||
# 获取并解析数据
|
||||
while True:
|
||||
data, movies = fetch_and_parse_page(href, scraper)
|
||||
if data is None:
|
||||
logging.warning(f'Retrying {href} ...')
|
||||
time.sleep(3)
|
||||
else:
|
||||
break
|
||||
|
||||
# 写入 performer 的独立 JSON 文件
|
||||
full_data = {
|
||||
**data,
|
||||
'credits': movies if movies else {}
|
||||
}
|
||||
person_id = extract_id_from_href(href)
|
||||
person_filename = f"{person_id}.json" # 用 - 替换空格
|
||||
|
||||
try:
|
||||
with open(person_filename, 'w', encoding='utf-8') as json_file:
|
||||
json.dump(full_data, json_file, indent=4, ensure_ascii=False)
|
||||
except Exception as e:
|
||||
logging.error(f"Error writing file {person_filename}: {e}")
|
||||
print(f'Fetch succeeded. Saved result to {person_filename}')
|
||||
|
||||
def process_all():
|
||||
# 初始化 cloudscraper
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
# 加载已存在的 href 列表
|
||||
global final_data
|
||||
existing_hrefs = load_existing_hrefs()
|
||||
logging.info(f"load data from {res_json_file}, count: {len(final_data)}")
|
||||
|
||||
# 读取 merged.json
|
||||
with open(input_json_file, 'r') as file:
|
||||
merged_data = json.load(file)
|
||||
|
||||
# 遍历 merged.json 中的数据
|
||||
loop = 0
|
||||
for entry in merged_data:
|
||||
href = entry.get('href')
|
||||
person = entry.get('person')
|
||||
|
||||
if href in existing_hrefs:
|
||||
logging.info(f"Skipping {href} - already processed")
|
||||
continue
|
||||
|
||||
logging.info(f"Processing {href} - {person}")
|
||||
|
||||
# 获取并解析数据
|
||||
while True:
|
||||
data, credits = fetch_and_parse_page(href, scraper)
|
||||
if data is None:
|
||||
logging.warning(f'Retrying {href} - {person} ...')
|
||||
time.sleep(3)
|
||||
else:
|
||||
break
|
||||
|
||||
# 如果数据正确,加入到 final_data
|
||||
final_data.append({
|
||||
'href': href,
|
||||
'person': person,
|
||||
**data
|
||||
})
|
||||
|
||||
# 写入 performer 的独立 JSON 文件
|
||||
full_data = {
|
||||
'href': href,
|
||||
'person': person,
|
||||
**data,
|
||||
'credits': credits if credits else {}
|
||||
}
|
||||
write_person_json(person.strip(), href, full_data)
|
||||
|
||||
# 更新 detail.json 文件
|
||||
loop = loop + 1
|
||||
if loop % 100 == 0:
|
||||
logging.info(f'flush data to json file. now fetched data count: {loop}, total count: {len(final_data)}')
|
||||
write_to_detail_json(final_data)
|
||||
write_to_csv(final_data)
|
||||
|
||||
# 更新已存在的 href
|
||||
existing_hrefs.add(href)
|
||||
|
||||
# 延时,防止请求过快被封锁
|
||||
time.sleep(1)
|
||||
|
||||
# 全量访问
|
||||
def main():
|
||||
try:
|
||||
# 注册退出信号
|
||||
signal.signal(signal.SIGINT, handle_exit_signal) # Handle Ctrl+C
|
||||
signal.signal(signal.SIGTERM, handle_exit_signal) # Handle kill signal
|
||||
process_all()
|
||||
finally:
|
||||
# 清理操作,保证在程序正常退出时执行
|
||||
write_to_csv(final_data) # Write to CSV or other necessary tasks
|
||||
write_to_detail_json(final_data) # Save data to JSON
|
||||
logging.info("Data processing completed.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) > 1:
|
||||
url = sys.argv[1]
|
||||
process_one(url)
|
||||
else:
|
||||
main()
|
||||
140
iafd/src_json/performers_list_astro.py
Normal file
@ -0,0 +1,140 @@
|
||||
"""
|
||||
Script Name:
|
||||
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
|
||||
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
|
||||
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
|
||||
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
|
||||
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
|
||||
|
||||
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
|
||||
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
|
||||
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
|
||||
从而获取到一份完整的数据列表。
|
||||
|
||||
Author: [Your Name]
|
||||
Created Date: YYYY-MM-DD
|
||||
Last Modified: YYYY-MM-DD
|
||||
Version: 1.0
|
||||
|
||||
Modification History:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
"""
|
||||
|
||||
import cloudscraper
|
||||
import json
|
||||
import time
|
||||
import csv
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
import config
|
||||
|
||||
config.setup_logging()
|
||||
|
||||
# 定义基础 URL 和可变参数
|
||||
host_url = "https://www.iafd.com"
|
||||
base_url = f"{host_url}/astrology.rme/sign="
|
||||
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
|
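# Example of a URL built from these pieces (see process_astro_data below):
#   base_url + 'Aries' -> https://www.iafd.com/astrology.rme/sign=Aries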
||||
|
||||
# 设置 headers 和 scraper
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
}
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
# 结果路径
|
||||
res_dir = '../result'
|
||||
|
||||
# Accumulates the astrology results (astro_map)
|
||||
astro_map = []
|
||||
|
||||
# 网络请求并解析 HTML
|
||||
def fetch_page(url):
|
||||
try:
|
||||
response = scraper.get(url, headers=headers)
|
||||
response.raise_for_status()
|
||||
return response.text
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to fetch {url}: {e}")
|
||||
return None
|
||||
|
||||
# 解析 HTML 内容,提取需要的数据
|
||||
def parse_page(html, astro):
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
astro_div = soup.find("div", id="astro")
|
||||
|
||||
if not astro_div:
|
||||
logging.warning(f"Warning: No 'astro' div found in {astro}")
|
||||
return None
|
||||
|
||||
flag = False
|
||||
list_cnt = 0
|
||||
|
||||
birth_date = None
|
||||
for elem in astro_div.find_all(recursive=False):
|
||||
if elem.name == "h3" and "astroday" in elem.get("class", []):
|
||||
birth_date = elem.get_text(strip=True)
|
||||
elif elem.name == "div" and "perficon" in elem.get("class", []):
|
||||
a_tag = elem.find("a")
|
||||
if a_tag:
|
||||
href = host_url + a_tag["href"]
|
||||
name = a_tag.find("span", class_="perfname")
|
||||
if name:
|
||||
astro_map.append({
|
||||
"astrology": astro,
|
||||
"birth_date": birth_date,
|
||||
"person": name.get_text(strip=True),
|
||||
"href": href
|
||||
})
|
||||
flag = True
|
||||
list_cnt = list_cnt +1
|
||||
if flag:
|
||||
logging.info(f"get {list_cnt} persons from this page. total persons: {len(astro_map)}")
|
||||
return soup
|
||||
else:
|
||||
return None
|
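# Sketch of the markup this parser expects (illustration only, inferred from the selectors above):
#   <div id="astro">
#     <h3 class="astroday">January 1</h3>
#     <div class="perficon"><a href="/person.rme/id=..."><span class="perfname">Some Name</span></a></div>
#     ...
#   </div>
# Each perficon entry inherits the most recent astroday heading, so it is recorded as e.g.
#   {"astrology": "Capricorn", "birth_date": "January 1", "person": "Some Name", "href": "https://www.iafd.com/person.rme/id=..."}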
||||
|
||||
# 处理翻页,星座的无需翻页
|
||||
def handle_pagination(soup, astro):
|
||||
return None
|
||||
|
||||
# 主逻辑函数:循环处理每个种族
|
||||
def process_astro_data():
|
||||
for astro in astro_list:
|
||||
url = base_url + astro
|
||||
next_url = url
|
||||
logging.info(f"Fetching data for {astro}, url {url} ...")
|
||||
|
||||
while next_url:
|
||||
html = fetch_page(next_url)
|
||||
if html:
|
||||
soup = parse_page(html, astro)
|
||||
if soup:
|
||||
next_url = handle_pagination(soup, astro)
|
||||
else:
|
||||
logging.info(f"wrong html content. retring {next_url} ...")
|
||||
# 定期保存结果
|
||||
save_data()
|
||||
time.sleep(2) # 控制访问频率
|
||||
else:
|
||||
logging.info(f"Retrying {next_url} ...")
|
||||
time.sleep(5) # 等待后再重试
|
||||
|
||||
# 保存到文件
|
||||
def save_data():
|
||||
with open(f'{res_dir}/astro.json', 'w', encoding='utf-8') as json_file:
|
||||
json.dump(astro_map, json_file, indent=4, ensure_ascii=False)
|
||||
|
||||
with open(f'{res_dir}/astro.csv', 'w', newline='', encoding='utf-8') as csv_file:
|
||||
writer = csv.DictWriter(csv_file, fieldnames=['astrology', 'birth_date', 'person', 'href'])
|
||||
writer.writeheader()
|
||||
writer.writerows(astro_map)
|
||||
|
||||
# 执行主逻辑
|
||||
if __name__ == '__main__':
|
||||
process_astro_data()
|
||||
save_data()
|
||||
logging.info("Data fetching and saving completed.")
|
||||
152
iafd/src_json/performers_list_birth.py
Normal file
@ -0,0 +1,152 @@
|
||||
"""
|
||||
Script Name:
|
||||
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
|
||||
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
|
||||
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
|
||||
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
|
||||
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
|
||||
|
||||
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
|
||||
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
|
||||
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
|
||||
从而获取到一份完整的数据列表。
|
||||
|
||||
Author: [Your Name]
|
||||
Created Date: YYYY-MM-DD
|
||||
Last Modified: YYYY-MM-DD
|
||||
Version: 1.0
|
||||
|
||||
Modification History:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
"""
|
||||
|
||||
import requests
|
||||
import cloudscraper
|
||||
import json
|
||||
import time
|
||||
import csv
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
import config
|
||||
|
||||
config.setup_logging()
|
||||
|
||||
# 创建 cloudscraper 会话
|
||||
# 设置 headers 和 scraper
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
}
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
# 结果路径
|
||||
res_dir = '../result'
|
||||
|
||||
# 存储出生日期的映射
|
||||
birth_map = []
|
||||
|
||||
# 设置基础URL
|
||||
host_url = "https://www.iafd.com"
|
||||
base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
|
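# Example: base_url.format(month=2, day=14)
#   -> https://www.iafd.com/calendar.asp?calmonth=2&calday=14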
||||
|
||||
# 定义获取页面内容的函数
|
||||
def fetch_page(month, day):
|
||||
url = base_url.format(month=month, day=day)
|
||||
retries = 3
|
||||
while retries > 0:
|
||||
try:
|
||||
# 发送请求并获取页面
|
||||
logging.info(f"Fetching URL: {url}")
|
||||
response = scraper.get(url)
|
||||
response.raise_for_status()
|
||||
return response.text
|
||||
except requests.exceptions.RequestException as e:
|
||||
logging.error(f"Request failed: {e}")
|
||||
retries -= 1
|
||||
time.sleep(2) # 等待2秒后重试
|
||||
return None
|
||||
|
||||
# 解析页面内容并更新birth_map
|
||||
def parse_page(html, month, day):
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
|
||||
if not datarows:
|
||||
return None
|
||||
|
||||
flag = False
|
||||
list_cnt = 0
|
||||
rows = datarows[0].find_all('div', class_='col-sm-4')
|
||||
for row in rows:
|
||||
link_tag = row.find('a')
|
||||
person = link_tag.text.strip() if link_tag else ''
|
||||
href = link_tag['href'] if link_tag else ''
|
||||
href = host_url + href
|
||||
|
||||
# 如果 href 已经在 birth_map 中,跳过
|
||||
flag = True
|
||||
if any(entry['href'] == href for entry in birth_map):
|
||||
continue
|
||||
|
||||
# 将数据添加到 birth_map
|
||||
birth_map.append({
|
||||
'month': month,
|
||||
'day': day,
|
||||
'person': person,
|
||||
'href': href
|
||||
})
|
||||
list_cnt = list_cnt +1
|
||||
|
||||
if flag:
|
||||
logging.info(f"get {list_cnt} persons from this page. total persons: {len(birth_map)}")
|
||||
return soup
|
||||
else:
|
||||
return None
|
||||
|
||||
# 循环遍历每个日期
|
||||
def fetch_birthdays():
|
||||
for month in range(1, 13): # 遍历1到12月
|
||||
for day in range(1, 32): # 遍历1到31天
|
||||
logging.info(f"Processing: Month {month}, Day {day}")
|
||||
while True:
|
||||
html = fetch_page(month, day)
|
||||
if html:
|
||||
soup = parse_page(html, month, day)
|
||||
if soup:
|
||||
# 定期保存结果
|
||||
save_data()
|
||||
# 跳出while循环,获取下一个生日的url数据
|
||||
time.sleep(2) # 控制访问频率
|
||||
break
|
||||
else:
|
||||
logging.warning(f"No data. Retrying: Month {month}, Day {day}")
|
||||
time.sleep(3) # 等待后再重试
|
||||
else:
|
||||
logging.warning(f"Network error. Retrying: Month {month}, Day {day}")
|
||||
time.sleep(3) # 等待后再重试
|
||||
|
||||
|
||||
|
||||
# 将birth_map保存到json文件
|
||||
def save_data():
|
||||
with open(f'{res_dir}/birth.json', 'w', encoding='utf-8') as f:
|
||||
json.dump(birth_map, f, ensure_ascii=False, indent=4)
|
||||
|
||||
with open(f'{res_dir}/birth.csv', 'w', newline='', encoding='utf-8') as f:
|
||||
writer = csv.DictWriter(f, fieldnames=['month', 'day', 'person', 'href'])
|
||||
writer.writeheader()
|
||||
for entry in birth_map:
|
||||
writer.writerow(entry)
|
||||
|
||||
# 主函数
|
||||
def main():
|
||||
# 获取数据
|
||||
fetch_birthdays()
|
||||
|
||||
# 保存结果
|
||||
save_data()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
166
iafd/src_json/performers_list_ethnic.py
Normal file
@ -0,0 +1,166 @@
|
||||
"""
|
||||
Script Name:
|
||||
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
|
||||
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
|
||||
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
|
||||
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
|
||||
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
|
||||
|
||||
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
|
||||
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
|
||||
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
|
||||
从而获取到一份完整的数据列表。
|
||||
|
||||
Author: [Your Name]
|
||||
Created Date: YYYY-MM-DD
|
||||
Last Modified: YYYY-MM-DD
|
||||
Version: 1.0
|
||||
|
||||
Modification History:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
"""
|
||||
|
||||
import cloudscraper
|
||||
import json
|
||||
import time
|
||||
import csv
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
import config
|
||||
|
||||
config.setup_logging()
|
||||
|
||||
# 定义基础 URL 和可变参数
|
||||
host_url = "https://www.iafd.com"
|
||||
base_url = f"{host_url}/lookupethnic.rme/ethnic="
|
||||
ethnic_list = ['caucasian', 'black', 'asian', 'latin', 'native american', 'middle eastern', 'mediteranean', 'indian', 'polynesian', 'multi-ethnic', 'ethnic', 'romani', 'eurasian', 'north african', 'south asian']
|
||||
|
||||
# 设置 headers 和 scraper
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
}
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
# 结果路径
|
||||
res_dir = '../result'
|
||||
|
||||
# Accumulates the ethnicity results (ethnic_map)
|
||||
ethnic_map = []
|
||||
|
||||
# 网络请求并解析 HTML
|
||||
def fetch_page(url):
|
||||
try:
|
||||
response = scraper.get(url, headers=headers)
|
||||
response.raise_for_status()
|
||||
return response.text
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to fetch {url}: {e}")
|
||||
return None
|
||||
|
||||
# 解析 HTML 内容,提取需要的数据
|
||||
def parse_page(html, ethnic):
|
||||
# 手动修复 HTML 标签
|
||||
html = html.replace('<br>', '').replace('<a ', '<a target="_blank" ')  # patch up some malformed tags before parsing
|
||||
soup = BeautifulSoup(html, 'lxml') # 使用lxml解析器
|
||||
|
||||
#soup = BeautifulSoup(html, 'html.parser')
|
||||
rows = soup.find_all('div', class_='row headshotrow')
|
||||
flag = False
|
||||
list_cnt = 0
|
||||
|
||||
for row in rows:
|
||||
for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
|
||||
link_tag = col.find('a')
|
||||
img_tag = col.find('div', class_='pictag')
|
||||
flag = True
|
||||
|
||||
if link_tag and img_tag:
|
||||
href = host_url + link_tag['href']
|
||||
person = img_tag.text.strip()
|
||||
|
||||
# 将数据存储到 ethnic_map
|
||||
ethnic_map.append({
|
||||
'ethnic': ethnic,
|
||||
'person': person,
|
||||
'href': href
|
||||
})
|
||||
list_cnt = list_cnt +1
|
||||
if flag:
|
||||
logging.info(f"get {list_cnt} persons from this page. total persons: {len(ethnic_map)}")
|
||||
return soup
|
||||
else:
|
||||
return None
|
||||
|
||||
# 处理翻页
|
||||
def handle_pagination(soup, ethnic):
|
||||
next_page = soup.find('a', rel='next')
|
||||
|
||||
if next_page:
|
||||
next_url = host_url + next_page['href']
|
||||
logging.info(f"Found next page: {next_url}")
|
||||
return next_url
|
||||
else:
|
||||
logging.info(f"All pages fetched for {ethnic}.")
|
||||
return None
|
||||
|
||||
# 处理带空格的种族名
|
||||
def format_ethnic(ethnic):
|
||||
return ethnic.replace(' ', '+')
|
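# Example: format_ethnic('native american') -> 'native+american', so the request URL becomes
#   https://www.iafd.com/lookupethnic.rme/ethnic=native+american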
||||
|
||||
# 主逻辑函数:循环处理每个种族
|
||||
def process_ethnic_data():
|
||||
all_person = len(ethnic_map) # 应该为0
|
||||
all_pages = 0
|
||||
|
||||
for ethnic in ethnic_list:
|
||||
url = base_url + format_ethnic(ethnic)
|
||||
next_url = url
|
||||
cursor = int(all_person / 100)
|
||||
pages = 0
|
||||
logging.info(f"--------Fetching data for {ethnic}, url {url} ...")
|
||||
|
||||
while next_url:
|
||||
html = fetch_page(next_url)
|
||||
if html:
|
||||
soup = parse_page(html, ethnic)
|
||||
if soup:
|
||||
next_url = handle_pagination(soup, ethnic)
|
||||
pages = pages + 1
|
||||
else:
|
||||
logging.info(f"wrong html content. retring {next_url} ...")
|
||||
# 统计,并定期保存结果
|
||||
if len(ethnic_map) / 100 > cursor:
|
||||
cursor = int(len(ethnic_map) / 100)
|
||||
save_data()
|
||||
time.sleep(2) # 控制访问频率
|
||||
else:
|
||||
logging.info(f"Retrying {next_url} ...")
|
||||
time.sleep(5) # 等待后再重试
|
||||
# 统计输出
|
||||
ethnic_person = len(ethnic_map) - all_person
|
||||
all_person = len(ethnic_map)
|
||||
all_pages = all_pages + pages
|
||||
logging.info(f"--------Fetching data for {ethnic} end. total pages: {pages}, total persons: {ethnic_person}, all persons fetched: {all_person}")
|
||||
# 统计最后结果
|
||||
logging.info(f"--------Fetching all data end. total ethnic: {len(ethnic_list)}, total pages: {all_pages}, total persons: {all_person}")
|
||||
|
||||
|
||||
# 保存到文件
|
||||
def save_data():
|
||||
with open(f'{res_dir}/ethnic.json', 'w', encoding='utf-8') as json_file:
|
||||
json.dump(ethnic_map, json_file, indent=4, ensure_ascii=False)
|
||||
|
||||
with open(f'{res_dir}/ethnic.csv', 'w', newline='', encoding='utf-8') as csv_file:
|
||||
writer = csv.DictWriter(csv_file, fieldnames=['ethnic', 'person', 'href'])
|
||||
writer.writeheader()
|
||||
writer.writerows(ethnic_map)
|
||||
|
||||
# 执行主逻辑
|
||||
if __name__ == '__main__':
|
||||
process_ethnic_data()
|
||||
save_data()
|
||||
logging.info("Data fetching and saving completed.")
|
||||
120
iafd/src_json/performers_list_merge.py
Normal file
@ -0,0 +1,120 @@
|
||||
"""
|
||||
Script Name:
|
||||
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
|
||||
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
|
||||
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
|
||||
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
|
||||
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
|
||||
|
||||
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
|
||||
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
|
||||
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
|
||||
从而获取到一份完整的数据列表。
|
||||
|
||||
Author: [Your Name]
|
||||
Created Date: YYYY-MM-DD
|
||||
Last Modified: YYYY-MM-DD
|
||||
Version: 1.0
|
||||
|
||||
Modification History:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
"""
|
||||
|
||||
import json
|
||||
import csv
|
||||
import os
|
||||
import argparse
|
||||
from collections import defaultdict
|
||||
|
||||
# 结果路径
|
||||
res_dir = '../result'
|
||||
|
||||
# 读取文件并返回内容
|
||||
def read_json(file_path):
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
except FileNotFoundError:
|
||||
print(f"文件 {file_path} 未找到.")
|
||||
return []
|
||||
except json.JSONDecodeError:
|
||||
print(f"文件 {file_path} 解析错误.")
|
||||
return []
|
||||
|
||||
# 处理数据,去重并合并 person 字段
|
||||
def process_data(files):
|
||||
href_map = defaultdict(list)
|
||||
|
||||
# 读取并处理每个文件
|
||||
for file in files:
|
||||
data = read_json(file['path'])
|
||||
for entry in data:
|
||||
href = entry.get('href')
|
||||
person = entry.get('person')
|
||||
if href:
|
||||
href_map[href].append(person)
|
||||
|
||||
# 合并相同 href 的 person,连接用 "|"
|
||||
result = []
|
||||
for href, persons in href_map.items():
|
||||
person = '|'.join(set(persons)) # 去重后合并
|
||||
result.append({'href': href, 'person': person})
|
||||
|
||||
return result
|
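# Worked example (hypothetical entries): if astro.json and birth.json both contain
#   {"href": "https://www.iafd.com/person.rme/id=abc", "person": "Jane Doe"}
# and ethnic.json spells the name "Jane D.", the shared href groups all three values and the
# output row becomes
#   {"href": "https://www.iafd.com/person.rme/id=abc", "person": "Jane Doe|Jane D."}
# (the order of the joined names is arbitrary because a set is used for de-duplication).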
||||
|
||||
# 保存结果到JSON文件
|
||||
def save_to_json(data, output_file):
|
||||
with open(output_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(data, f, ensure_ascii=False, indent=4)
|
||||
|
||||
# 保存结果到CSV文件
|
||||
def save_to_csv(data, output_file):
|
||||
with open(output_file, 'w', newline='', encoding='utf-8') as f:
|
||||
writer = csv.DictWriter(f, fieldnames=['href', 'person'])
|
||||
writer.writeheader()
|
||||
writer.writerows(data)
|
||||
|
||||
# 主函数,执行数据处理并保存
|
||||
def main():
|
||||
# 使用 argparse 获取命令行参数
|
||||
parser = argparse.ArgumentParser(description="合并多个 JSON 文件并输出到一个新的 JSON 和 CSV 文件")
|
||||
parser.add_argument('files', nargs='+', choices=['birth', 'astro', 'ethnic'],
|
||||
help="指定需要合并的文件, 至少两个, 最多三个: birth, astro, ethnic")
|
||||
args = parser.parse_args()
|
||||
|
||||
# 确保至少选择两个文件
|
||||
if len(args.files) < 2:
|
||||
print("请至少选择两个文件进行合并。")
|
||||
return
|
||||
|
||||
# 定义需要处理的文件
|
||||
file_map = {
|
||||
'birth': f'{res_dir}/birth.json',
|
||||
'astro': f'{res_dir}/astro.json',
|
||||
'ethnic': f'{res_dir}/ethnic.json'
|
||||
}
|
||||
|
||||
files = [{'path': file_map[file], 'name': file} for file in args.files]
|
||||
|
||||
# 处理数据
|
||||
processed_data = process_data(files)
|
||||
|
||||
# 根据输入的文件名生成 merged 文件名
|
||||
output_json_file = f'{res_dir}/merged_{"_".join(args.files)}.json'
|
||||
output_csv_file = f'{res_dir}/merged_{"_".join(args.files)}.csv'
|
||||
|
||||
# 确保 result 目录存在
|
||||
os.makedirs(f'{res_dir}', exist_ok=True)
|
||||
|
||||
# 输出结果到 JSON 和 CSV 文件
|
||||
save_to_json(processed_data, output_json_file)
|
||||
save_to_csv(processed_data, output_csv_file)
|
||||
|
||||
print(f"数据处理完成,结果已保存到 {output_json_file} 和 {output_csv_file}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
236
iafd/tools/data_merge.py
Normal file
@ -0,0 +1,236 @@
|
||||
"""
|
||||
Script Name:
|
||||
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
|
||||
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
|
||||
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
|
||||
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
|
||||
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
|
||||
|
||||
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
|
||||
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
|
||||
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
|
||||
从而获取到一份完整的数据列表。
|
||||
|
||||
Author: [Your Name]
|
||||
Created Date: YYYY-MM-DD
|
||||
Last Modified: YYYY-MM-DD
|
||||
Version: 1.0
|
||||
|
||||
Modification History:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import csv
|
||||
import logging
|
||||
|
||||
# 配置日志
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# 输入目录和输出文件
|
||||
input_dir = 'data'
|
||||
output_json_file = f'{input_dir}/iafd_merge.json'
|
||||
output_csv_file = f'{input_dir}/iafd_merge.csv'
|
||||
output_person_txt = f'{input_dir}/all_person.txt'
|
||||
|
||||
# 读取iafd_meta.json
|
||||
try:
|
||||
with open(os.path.join(input_dir, 'iafd_meta.json'), 'r', encoding='utf-8') as file:
|
||||
iafd_data = json.load(file)
|
||||
logger.info("Loaded iafd_meta.json")
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading iafd_meta.json: {e}")
|
||||
iafd_data = []
|
||||
|
||||
# 读取stashdb.json
|
||||
try:
|
||||
with open(os.path.join(input_dir, 'stashdb.json'), 'r', encoding='utf-8') as file:
|
||||
stashdb_data = json.load(file)
|
||||
logger.info("Loaded stashdb.json")
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading stashdb.json: {e}")
|
||||
stashdb_data = []
|
||||
|
||||
# 读取javhd_meta.json
|
||||
try:
|
||||
with open(os.path.join(input_dir, 'javhd_meta.json'), 'r', encoding='utf-8') as file:
|
||||
javhd_data = json.load(file)
|
||||
logger.info("Loaded javhd_meta.json")
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading javhd_meta.json: {e}")
|
||||
javhd_data = []
|
||||
|
||||
# 读取thelordofporn_meta.json
|
||||
try:
|
||||
with open(os.path.join(input_dir, 'thelordofporn_meta.json'), 'r', encoding='utf-8') as file:
|
||||
lordporn_data = json.load(file)
|
||||
logger.info("Loaded thelordofporn_meta.json")
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading thelordofporn_meta.json: {e}")
|
||||
lordporn_data = []
|
||||
|
||||
# 构建all_meta_data,去重
|
||||
all_meta_data = set()
|
||||
|
||||
# 从各数据源提取unique的姓名数据
|
||||
for person_entry in iafd_data:
|
||||
all_meta_data.add(person_entry['person'])
|
||||
for stashdb_entry in stashdb_data:
|
||||
all_meta_data.add(stashdb_entry['name'])
|
||||
for javhd_entry in javhd_data:
|
||||
all_meta_data.add(javhd_entry['ja_name'])
|
||||
for lordporn_entry in lordporn_data:
|
||||
all_meta_data.add(lordporn_entry['pornstar'])
|
||||
|
||||
# 合并数据的列表
|
||||
merged_data = []
|
||||
|
||||
# 遍历all_meta_data,按规则合并
|
||||
for person in all_meta_data:
|
||||
# 初始化合并的数据结构体
|
||||
merged_entry = {
|
||||
'person': person
|
||||
}
|
||||
|
||||
# 初始化stashdb_entry,所有字段为空
|
||||
stashdb_entry = {
|
||||
'stashdb_gender': '',
|
||||
'stashdb_birthdate': '',
|
||||
'stashdb_ethnicity': '',
|
||||
'stashdb_country': '',
|
||||
'stashdb_height': '',
|
||||
'stashdb_measurements': '',
|
||||
'stashdb_fake_tits': '',
|
||||
'stashdb_career_length': '',
|
||||
'stashdb_aliases': ''
|
||||
}
|
||||
|
||||
# 初始化javhd_entry,所有字段为空
|
||||
javhd_entry = {
|
||||
'javhd_rank': '',
|
||||
'javhd_height': '',
|
||||
'javhd_weight': '',
|
||||
'javhd_breast_size': '',
|
||||
'javhd_breast_factor': '',
|
||||
'javhd_birth_date': '',
|
||||
'javhd_ethnicity': ''
|
||||
}
|
||||
|
||||
# 初始化lordporn_entry,所有字段为空
|
||||
lordporn_entry = {
|
||||
'lordporn_rating': '',
|
||||
'lordporn_rank': '',
|
||||
'lordporn_career_start': '',
|
||||
'lordporn_measurements': '',
|
||||
'lordporn_born': '',
|
||||
'lordporn_height': '',
|
||||
'lordporn_weight': ''
|
||||
}
|
||||
|
||||
# 初始化in_iafd字段,默认为N
|
||||
in_iafd = 'N'
|
||||
iafd_match = next((item for item in iafd_data if item.get('person') == person), None)
|
||||
if iafd_match:
|
||||
in_iafd = 'Y'
|
||||
|
||||
# 1. 检查是否存在于 stashdb 数据
|
||||
in_stashdb = 'N'
|
||||
stashdb_match = next((item for item in stashdb_data if item.get('name') == person), None)
|
||||
if stashdb_match:
|
||||
in_stashdb = 'Y'
|
||||
# 更新stashdb_entry字段
|
||||
stashdb_entry.update({
|
||||
'stashdb_gender': stashdb_match.get('gender', ''),
|
||||
'stashdb_birthdate': stashdb_match.get('birthdate', ''),
|
||||
'stashdb_ethnicity': stashdb_match.get('ethnicity', ''),
|
||||
'stashdb_country': stashdb_match.get('country', ''),
|
||||
'stashdb_height': stashdb_match.get('height', ''),
|
||||
'stashdb_measurements': stashdb_match.get('measurements', ''),
|
||||
'stashdb_fake_tits': stashdb_match.get('fake_tits', ''),
|
||||
'stashdb_career_length': stashdb_match.get('career_length', ''),
|
||||
'stashdb_aliases': stashdb_match.get('aliases', '')
|
||||
})
|
||||
|
||||
# 2. 检查是否存在于 javhd 数据
|
||||
in_javhd = 'N'
|
||||
javhd_match = next((item for item in javhd_data if item.get('ja_name') == person), None)
|
||||
if javhd_match:
|
||||
in_javhd = 'Y'
|
||||
# 更新javhd_entry字段
|
||||
javhd_entry.update({
|
||||
'javhd_rank': javhd_match.get('rank', ''),
|
||||
'javhd_height': javhd_match.get('height', ''),
|
||||
'javhd_weight': javhd_match.get('weight', ''),
|
||||
'javhd_breast_size': javhd_match.get('breast size', ''),
|
||||
'javhd_breast_factor': javhd_match.get('breast factor', ''),
|
||||
'javhd_birth_date': javhd_match.get('birth date', ''),
|
||||
'javhd_ethnicity': javhd_match.get('ethnicity', '')
|
||||
})
|
||||
|
||||
# 3. 检查是否存在于 thelordofporn 数据
|
||||
in_lordporn = 'N'
|
||||
lordporn_match = next((item for item in lordporn_data if item.get('pornstar') == person), None)
|
||||
if lordporn_match:
|
||||
in_lordporn = 'Y'
|
||||
# 更新lordporn_entry字段
|
||||
lordporn_entry.update({
|
||||
'lordporn_rating': lordporn_match.get('rating', ''),
|
||||
'lordporn_rank': lordporn_match.get('rank', ''),
|
||||
'lordporn_career_start': lordporn_match.get('career_start', ''),
|
||||
'lordporn_measurements': lordporn_match.get('measurements', ''),
|
||||
'lordporn_born': lordporn_match.get('born', ''),
|
||||
'lordporn_height': lordporn_match.get('height', ''),
|
||||
'lordporn_weight': lordporn_match.get('weight', '')
|
||||
})
|
||||
|
||||
# 添加 in_stashdb, in_javhd, in_lordporn 字段,确保都输出
|
||||
merged_entry.update({
|
||||
'in_iafd': in_iafd,
|
||||
'in_stashdb': in_stashdb,
|
||||
'in_javhd': in_javhd,
|
||||
'in_lordporn': in_lordporn
|
||||
})
|
||||
|
||||
# 将stashdb_entry, javhd_entry, lordporn_entry合并到结果中
|
||||
merged_entry.update(stashdb_entry)
|
||||
merged_entry.update(javhd_entry)
|
||||
merged_entry.update(lordporn_entry)
|
||||
|
||||
# 将合并后的条目加入到结果列表
|
||||
merged_data.append(merged_entry)
|
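# Shape of one merged row (sketch): {'person': ..., 'in_iafd': 'Y'/'N', 'in_stashdb': ...,
# 'in_javhd': ..., 'in_lordporn': ..., followed by the stashdb_*, javhd_* and lordporn_* fields}.
# Note that matching is done by exact string equality on the name, so spelling or casing
# differences between sources produce separate rows.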
||||
|
||||
# 写入iafd_merge.json
|
||||
try:
|
||||
with open(output_json_file, 'w', encoding='utf-8') as json_file:
|
||||
json.dump(merged_data, json_file, ensure_ascii=False, indent=4)
|
||||
logger.info(f"Data successfully written to {output_json_file}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error writing {output_json_file}: {e}")
|
||||
|
||||
# 写入iafd_merge.csv
|
||||
try:
|
||||
with open(output_csv_file, 'w', newline='', encoding='utf-8') as csv_file:
|
||||
writer = csv.DictWriter(csv_file, fieldnames=merged_data[0].keys(), delimiter='\t')
|
||||
writer.writeheader()
|
||||
writer.writerows(merged_data)
|
||||
logger.info(f"Data successfully written to {output_csv_file}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error writing {output_csv_file}: {e}")
|
||||
|
||||
|
||||
# 输出 all_meta_data 到 all_person.txt,并按字母顺序排序
|
||||
try:
|
||||
# 排序 all_meta_data
|
||||
all_meta_data_list = sorted(list(all_meta_data)) # 将集合转换为列表并排序
|
||||
all_meta_data_str = ','.join(all_meta_data_list) # 使用逗号连接元素
|
||||
with open(output_person_txt, 'w', encoding='utf-8') as txt_file:
|
||||
txt_file.write(all_meta_data_str)
|
||||
logger.info(f"all_meta_data successfully written to all_person.txt")
|
||||
except Exception as e:
|
||||
logger.error(f"Error writing all_person.txt: {e}")
|
||||
163
iafd/tools/iafd_scrape.py
Normal file
@ -0,0 +1,163 @@
|
||||
"""
|
||||
Script Name:
|
||||
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
|
||||
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
|
||||
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
|
||||
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
|
||||
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
|
||||
|
||||
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
|
||||
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
|
||||
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
|
||||
从而获取到一份完整的数据列表。
|
||||
|
||||
Author: [Your Name]
|
||||
Created Date: YYYY-MM-DD
|
||||
Last Modified: YYYY-MM-DD
|
||||
Version: 1.0
|
||||
|
||||
Modification History:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
import logging
|
||||
from typing import List
|
||||
|
||||
# 设置日志配置
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# 预定义的 scrapers 目录
|
||||
scrapers_dir = "/root/gitlabs/stashapp_CommunityScrapers/scrapers"
|
||||
meta_file = "./data/iafd_meta.json"
|
||||
cursor_file = "./data/iafd_cursor.txt"
|
||||
output_dir = f"{scrapers_dir}/iafd_meta"
|
||||
|
||||
# 重试次数和间隔
|
||||
MAX_RETRIES = 10
|
||||
RETRY_DELAY = 5 # 5秒重试间隔
|
||||
|
||||
|
||||
# 创建输出目录
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
|
||||
def read_processed_hrefs() -> set:
|
||||
"""
|
||||
读取已经处理过的 href
|
||||
"""
|
||||
processed_hrefs = set()
|
||||
if os.path.exists(cursor_file):
|
||||
with open(cursor_file, "r", encoding="utf-8") as f:
|
||||
processed_hrefs = {line.strip().split(",")[1] for line in f if "," in line}
|
||||
return processed_hrefs
|
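# Cursor file format, one "person,href" record per processed performer, e.g. (hypothetical name):
#   Jane Doe,https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
# split(",")[1] recovers the href, which is what process_iafd_meta() checks before re-fetching.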
||||
|
||||
|
||||
def execute_scraper_command(href: str, idv: str) -> bool:
|
||||
"""
|
||||
执行命令抓取数据,成功则返回True,否则返回False。
|
||||
包含重试机制。
|
||||
"""
|
||||
command = f"cd {scrapers_dir}; python3 -m IAFD.IAFD performer {href} > {output_dir}/{idv}.json"
|
||||
attempt = 0
|
||||
while attempt < MAX_RETRIES:
|
||||
try:
|
||||
logger.info(f"执行命令: {command}")
|
||||
subprocess.run(command, shell=True, check=True)
|
||||
return True
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error(f"执行命令失败: {e}. 重试 {attempt + 1}/{MAX_RETRIES}...")
|
||||
time.sleep(RETRY_DELAY)
|
||||
attempt += 1
|
||||
logger.error(f"命令执行失败,已尝试 {MAX_RETRIES} 次: {command}")
|
||||
return False
|
||||
|
||||
|
||||
def validate_json_file(idv: str) -> bool:
|
||||
"""
|
||||
校验 JSON 文件是否有效
|
||||
"""
|
||||
output_file = f"{output_dir}/{idv}.json"
|
||||
try:
|
||||
with open(output_file, "r", encoding="utf-8") as f:
|
||||
content = f.read().strip()
|
||||
json_data = json.loads(content) # 尝试解析 JSON
|
||||
if "name" not in json_data:
|
||||
raise ValueError("缺少 'name' 字段")
|
||||
return True
|
||||
except (json.JSONDecodeError, ValueError) as e:
|
||||
logger.error(f"解析失败,删除无效文件: {output_file}. 错误: {e}")
|
||||
os.remove(output_file)
|
||||
return False
|
||||
|
||||
|
||||
def process_iafd_meta(data: List[dict], processed_hrefs: set) -> None:
|
||||
"""
|
||||
处理 iafd_meta.json 中的数据
|
||||
"""
|
||||
for entry in data:
|
||||
person = entry.get("person")
|
||||
href = entry.get("href")
|
||||
|
||||
if not person or not href:
|
||||
logger.warning(f"跳过无效数据: {entry}")
|
||||
continue
|
||||
|
||||
# 解析 href 提取 id
|
||||
try:
|
||||
idv = href.split("id=")[-1]
|
||||
except IndexError:
|
||||
logger.error(f"无法解析 ID: {href}")
|
||||
continue
|
||||
|
||||
output_file = f"{output_dir}/{idv}.json"
|
||||
|
||||
# 跳过已处理的 href
|
||||
if href in processed_hrefs:
|
||||
logger.info(f"已处理,跳过: {person}, {href}")
|
||||
continue
|
||||
|
||||
# 执行数据抓取
|
||||
if not execute_scraper_command(href, idv):
|
||||
continue
|
||||
|
||||
# 校验 JSON 文件
|
||||
if not validate_json_file(idv):
|
||||
continue
|
||||
|
||||
# 记录已处理数据
|
||||
with open(cursor_file, "a", encoding="utf-8") as f:
|
||||
f.write(f"{person},{href}\n")
|
||||
|
||||
logger.info(f"成功处理: {person} - {href}")
|
||||
|
||||
|
||||
def main():
|
||||
"""
|
||||
主程序执行函数
|
||||
"""
|
||||
# 读取已处理的 href
|
||||
processed_hrefs = read_processed_hrefs()
|
||||
|
||||
# 读取 iafd_meta.json 数据
|
||||
try:
|
||||
with open(meta_file, "r", encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"读取 iafd_meta.json 错误: {e}")
|
||||
return
|
||||
|
||||
# 处理数据
|
||||
process_iafd_meta(data, processed_hrefs)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
90
iafd/tools/stashdb_merge.py
Normal file
@ -0,0 +1,90 @@
|
||||
"""
|
||||
Script Name:
|
||||
Description: 从 https://www.iafd.com 上获取信息。利用cloudscraper绕过cloudflare
|
||||
detail_fetch.py 从本地已经保存的列表数据,逐个拉取详情,并输出到文件。
|
||||
list_fetch_astro.py 按照星座拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_birth.py 按照生日拉取数据,获得演员的信息列表。数据量适中,各详细字段较全
|
||||
list_fetch_ethnic.py 按照人种拉取数据,获得演员的信息列表。数据量大,但详细字段很多无效的
|
||||
list_merge.py 上面三个列表的数据,取交集,得到整体数据。
|
||||
iafd_scrape.py 借助 https://github.com/stashapp/CommunityScrapers 实现的脚本,可以输入演员的 iafd链接,获取兼容 stashapp 格式的数据。(作用不大,因为国籍、照片等字段不匹配)
|
||||
|
||||
html_format.py 负责读取已经保存的html目录, 提取信息,格式化输出。
|
||||
data_merge.py 负责合并数据,它把从 iafd, javhd, thelordofporn 以及搭建 stashapp, 从上面更新到的演员数据(需导出)进行合并;
|
||||
stashdb_merge.py 负责把从stashapp中导出的单个演员的json文件, 批量合并并输出; 通常我们需要把stashapp中导出的批量文件压缩并传输到data/tmp目录,解压后合并
|
||||
从而获取到一份完整的数据列表。
|
||||
|
||||
Author: [Your Name]
|
||||
Created Date: YYYY-MM-DD
|
||||
Last Modified: YYYY-MM-DD
|
||||
Version: 1.0
|
||||
|
||||
Modification History:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
- YYYY-MM-DD [Your Name]:
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import csv
|
||||
import logging
|
||||
|
||||
# 配置日志
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# 输入和输出目录
|
||||
input_dir = 'data/tmp' # 假设metadata目录在当前目录下
|
||||
output_json_file = 'stashdb.json'
|
||||
output_csv_file = 'stashdb.csv'
|
||||
|
||||
# 用于保存所有的条目
|
||||
data_list = []
|
||||
|
||||
# 遍历metadata文件夹,读取所有json文件
|
||||
for filename in os.listdir(input_dir):
|
||||
if filename.endswith('.json'):
|
||||
file_path = os.path.join(input_dir, filename)
|
||||
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8') as file:
|
||||
data = json.load(file)
|
||||
|
||||
# 提取需要的字段
|
||||
person = {
|
||||
'name': data.get('name'),
|
||||
'gender': data.get('gender'),
|
||||
'birthdate': data.get('birthdate'),
|
||||
'ethnicity': data.get('ethnicity'),
|
||||
'country': data.get('country'),
|
||||
'height': data.get('height'),
|
||||
'measurements': data.get('measurements'),
|
||||
'fake_tits': data.get('fake_tits'),
|
||||
'career_length': data.get('career_length'),
|
||||
'aliases': ', '.join(data.get('aliases', [])) # 连接aliases数组元素
|
||||
}
|
||||
|
||||
# 将数据添加到列表中
|
||||
data_list.append(person)
|
||||
logger.info(f"Processed file: {filename}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing file {filename}: {e}")
|
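# Each exported stashapp JSON file contributes one flat record, e.g. (sketch):
#   {'name': ..., 'gender': ..., 'birthdate': ..., 'ethnicity': ..., 'country': ..., 'height': ...,
#    'measurements': ..., 'fake_tits': ..., 'career_length': ..., 'aliases': 'Alias One, Alias Two'}
# where 'aliases' is the exported list joined with ', '.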
||||
|
||||
# 输出到 JSON 文件
|
||||
try:
|
||||
with open(output_json_file, 'w', encoding='utf-8') as json_file:
|
||||
json.dump(data_list, json_file, ensure_ascii=False, indent=4)
|
||||
logger.info(f"Data successfully written to {output_json_file}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error writing JSON file: {e}")
|
||||
|
||||
# 输出到 CSV 文件
|
||||
try:
|
||||
with open(output_csv_file, 'w', newline='', encoding='utf-8') as csv_file:
|
||||
writer = csv.DictWriter(csv_file, fieldnames=data_list[0].keys())
|
||||
writer.writeheader()
|
||||
writer.writerows(data_list)
|
||||
logger.info(f"Data successfully written to {output_csv_file}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error writing CSV file: {e}")
|
||||