modify some scripts.
@@ -0,0 +1,20 @@
{
    "href": "https://www.iafd.com/title.rme/id=0b332f5e-c0b8-4536-a79c-5abd4da9c68a",
    "title": "Barebackin' Men",
    "Minutes": "No Data",
    "Distributor": "1 Distribution",
    "Studio": "1 Distribution",
    "ReleaseDate": "No Data",
    "AddedtoIAFDDate": "Jan 1, 2006",
    "All-Girl": "No",
    "All-Male": "Yes",
    "Compilation": "No",
    "Webscene": "",
    "Director": "No Data",
    "DirectorHref": "",
    "DistributorHref": "https://www.iafd.com/distrib.rme/distrib=10/1%2ddistribution.htm",
    "StudioHref": "https://www.iafd.com/studio.rme/studio=10/1%2ddistribution.htm",
    "Performers": [],
    "SceneBreakdowns": [],
    "AppearsIn": []
}
@@ -0,0 +1,56 @@
|
||||
{
|
||||
"href": "https://www.iafd.com/title.rme/id=2f582dcf-192e-4adf-9d60-447df8f16b9c",
|
||||
"title": "Slim Goodies POV 2",
|
||||
"Minutes": "84",
|
||||
"Distributor": "Exotic Vixen Films",
|
||||
"Studio": "Exotic Vixen Films",
|
||||
"ReleaseDate": "No Data",
|
||||
"AddedtoIAFDDate": "Jan 17, 2024",
|
||||
"All-Girl": "No",
|
||||
"All-Male": "No",
|
||||
"Compilation": "No",
|
||||
"Webscene": "",
|
||||
"Director": "Just Mike Starks",
|
||||
"DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
|
||||
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
|
||||
"StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
|
||||
"Performers": [
|
||||
{
|
||||
"name": "Amica Mea",
|
||||
"href": "https://www.iafd.com/person.rme/id=933ab6e6-ba98-49a7-a9e2-c1a4523ab89c",
|
||||
"tags": [
|
||||
"Amica Mea"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Baby Breezy",
|
||||
"href": "https://www.iafd.com/person.rme/id=b0d3673c-20b4-41eb-bb15-622b2f8e5ed3",
|
||||
"tags": [
|
||||
"Baby Breezy"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Blu Mere",
|
||||
"href": "https://www.iafd.com/person.rme/id=8d41dd04-f188-44c3-ac40-dc64711cf905",
|
||||
"tags": [
|
||||
"Blu Mere"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Just Mike Starks",
|
||||
"href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
|
||||
"tags": [
|
||||
"Just Mike Starks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Mocha Menage",
|
||||
"href": "https://www.iafd.com/person.rme/id=63b084e9-a144-41da-acf9-a913b68416cd",
|
||||
"tags": [
|
||||
"Mocha Menage"
|
||||
]
|
||||
}
|
||||
],
|
||||
"SceneBreakdowns": [],
|
||||
"AppearsIn": []
|
||||
}
|
||||
@@ -0,0 +1,70 @@
|
||||
{
|
||||
"href": "https://www.iafd.com/title.rme/id=9af4e9f4-68ce-47ec-a7d7-fde92862af57",
|
||||
"title": "Atlanta U: College Freaks",
|
||||
"Minutes": "No Data",
|
||||
"Distributor": "Exotic Vixen Films",
|
||||
"Studio": "Exotic Vixen Films",
|
||||
"ReleaseDate": "No Data",
|
||||
"AddedtoIAFDDate": "Sep 19, 2020",
|
||||
"All-Girl": "No",
|
||||
"All-Male": "No",
|
||||
"Compilation": "No",
|
||||
"Webscene": "",
|
||||
"Director": "Just Mike Starks",
|
||||
"DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
|
||||
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
|
||||
"StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
|
||||
"Performers": [
|
||||
{
|
||||
"name": "Aaliyah Ali",
|
||||
"href": "https://www.iafd.com/person.rme/id=7904b6a0-965c-4137-92e2-2ca0cdc4ff38",
|
||||
"tags": [
|
||||
"Aaliyah Ali"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Bones Montana",
|
||||
"href": "https://www.iafd.com/person.rme/id=8c18f9ad-045d-475f-bee8-cab6bed1fad4",
|
||||
"tags": [
|
||||
"Bones Montana"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Cameron Cox",
|
||||
"href": "https://www.iafd.com/person.rme/id=ce96cf3d-e85e-4d46-ad5e-f05881c88a26",
|
||||
"tags": [
|
||||
"Cameron Cox"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Crystal Cooper",
|
||||
"href": "https://www.iafd.com/person.rme/id=fc02ac73-2892-4578-9bf4-eed87afc4980",
|
||||
"tags": [
|
||||
"Crystal Cooper"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Jazmine Adore",
|
||||
"href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4",
|
||||
"tags": [
|
||||
"Jazmine Adore"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Just Mike Starks",
|
||||
"href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
|
||||
"tags": [
|
||||
"Just Mike Starks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Lala Ivey",
|
||||
"href": "https://www.iafd.com/person.rme/id=7ae838ee-764f-4725-b422-b41ffac1e67b",
|
||||
"tags": [
|
||||
"Lala Ivey"
|
||||
]
|
||||
}
|
||||
],
|
||||
"SceneBreakdowns": [],
|
||||
"AppearsIn": []
|
||||
}
|
||||
@@ -0,0 +1,85 @@
|
||||
{
|
||||
"href": "https://www.iafd.com/title.rme/id=ca753243-8e3a-49ac-88aa-357055187e8c",
|
||||
"title": "Slim Goodies POV",
|
||||
"Minutes": "61",
|
||||
"Distributor": "Exotic Vixen Films",
|
||||
"Studio": "Exotic Vixen Films",
|
||||
"ReleaseDate": "No Data",
|
||||
"AddedtoIAFDDate": "Sep 19, 2020",
|
||||
"All-Girl": "No",
|
||||
"All-Male": "No",
|
||||
"Compilation": "No",
|
||||
"Webscene": "",
|
||||
"Director": "Just Mike Starks",
|
||||
"DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
|
||||
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
|
||||
"StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
|
||||
"Performers": [
|
||||
{
|
||||
"name": "Gina Ferrero",
|
||||
"href": "https://www.iafd.com/person.rme/id=54d62a5b-2ce5-40fa-b050-14325a62f4fd",
|
||||
"tags": [
|
||||
"Gina Ferrero"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Imani Reign",
|
||||
"href": "https://www.iafd.com/person.rme/id=8566216f-bcbc-4764-ba50-73405cf79dce",
|
||||
"tags": [
|
||||
"Imani Reign"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Jazmine Adore",
|
||||
"href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4",
|
||||
"tags": [
|
||||
"Jazmine Adore"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Just Mike Starks",
|
||||
"href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
|
||||
"tags": [
|
||||
"Just Mike Starks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Niomie King",
|
||||
"href": "https://www.iafd.com/person.rme/id=dbefea32-7f88-41a9-9de3-e467015d687a",
|
||||
"tags": [
|
||||
"Niomie King"
|
||||
]
|
||||
}
|
||||
],
|
||||
"SceneBreakdowns": [
|
||||
{
|
||||
"scene": "Scene 1",
|
||||
"performers": [
|
||||
"Imani Reign",
|
||||
"Just Mike Starks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"scene": "Scene 2",
|
||||
"performers": [
|
||||
"Jazmine Adore",
|
||||
"Just Mike Starks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"scene": "Scene 3",
|
||||
"performers": [
|
||||
"Gina Ferrero",
|
||||
"Just Mike Starks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"scene": "Scene 4",
|
||||
"performers": [
|
||||
"Niomie King",
|
||||
"Just Mike Starks"
|
||||
]
|
||||
}
|
||||
],
|
||||
"AppearsIn": []
|
||||
}
|
||||
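The result files above are plain JSON written with indent=4, so they are easy to reload for a quick spot-check. A minimal sketch (the path is hypothetical; any file under the result directory works):

import json

with open('../result/movies/c/ca753243-8e3a-49ac-88aa-357055187e8c.json', encoding='utf-8') as f:
    movie = json.load(f)
print(movie['title'], '-', len(movie['Performers']), 'performers,', len(movie['SceneBreakdowns']), 'scenes')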
26 scripts/iafd/src/config.py (Normal file)
@@ -0,0 +1,26 @@
import logging
import os
import inspect
from datetime import datetime

global_share_data_dir = '/root/sharedata'
global_host_data_dir = '/root/hostdir/scripts_data'

# Configure logging for the calling script.
def setup_logging(log_filename=None):
    # If no log_filename is passed, derive one from the name of the calling script.
    if log_filename is None:
        # Name of the script that called setup_logging
        caller_frame = inspect.stack()[1]
        caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]

        # Current date formatted as yyyymmdd
        current_date = datetime.now().strftime('%Y%m%d')
        # Build the log file name, inserting the date before the extension
        log_filename = f'../log/{caller_filename}_{current_date}.log'

    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
                        handlers=[
                            logging.FileHandler(log_filename),
                            logging.StreamHandler()
                        ])
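Note that logging.FileHandler raises FileNotFoundError when the ../log directory does not exist yet, so a caller may want to create it first. A minimal sketch (the relative directory layout is assumed, not part of the script above):

import os
import config

os.makedirs('../log', exist_ok=True)  # make sure the log directory is there
config.setup_logging()                # log file becomes ../log/<caller>_<yyyymmdd>.log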
320 scripts/iafd/src/fetch.py (Normal file)
@@ -0,0 +1,320 @@
|
||||
|
||||
import json
|
||||
import time
|
||||
import csv
|
||||
import argparse
|
||||
import logging
|
||||
from functools import partial
|
||||
import config
|
||||
import sqlite_utils as utils
|
||||
import iafd_scraper as scraper
|
||||
import utils as func
|
||||
|
||||
config.setup_logging()
|
||||
|
||||
debug = True
|
||||
|
||||
# Fetch the performer list by astrological sign; no pagination.
|
||||
def fetch_performers_by_astro(existed_performer_hrefs):
|
||||
performers = []
|
||||
|
||||
for astro in scraper.astro_list:
|
||||
url = scraper.astr_base_url + astro
|
||||
logging.info(f"Fetching data for {astro}, url {url} ...")
|
||||
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_page_astro(soup, astro)
|
||||
if list_data:
|
||||
for row in list_data :
|
||||
if row['href'] not in existed_performer_hrefs:
|
||||
performers.append({
|
||||
'person' : row['person'],
|
||||
'href' : row['href']
|
||||
})
|
||||
else:
|
||||
logging.warning(f'fetch astro error. {url} ...')
|
||||
else:
|
||||
logging.warning(f'fetch astro error. {url} ...')
|
||||
|
||||
# break out early when debugging
|
||||
if debug:
|
||||
break
|
||||
return performers
|
||||
|
||||
|
||||
# Fetch the performer list by birthday; no pagination.
|
||||
def fetch_performers_by_birth(existed_performer_hrefs):
|
||||
performers = []
|
||||
|
||||
for month in range(1, 13): # months 1 through 12
for day in range(1, 32): # days 1 through 31
|
||||
url = scraper.birth_base_url.format(month=month, day=day)
|
||||
logging.info(f"Fetching data for birth, url {url}")
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_page_birth(soup, month, day)
|
||||
if list_data:
|
||||
for row in list_data :
|
||||
if row['href'] not in existed_performer_hrefs:
|
||||
performers.append({
|
||||
'person' : row['person'],
|
||||
'href' : row['href']
|
||||
})
|
||||
else:
|
||||
logging.warning(f'fetch birth list error. {url} ...')
|
||||
else:
|
||||
logging.warning(f'fetch birth list error. {url} ...')
|
||||
|
||||
# return early when debugging
|
||||
if debug:
|
||||
return performers
|
||||
|
||||
return performers
|
||||
|
||||
# Handle ethnicity names that contain spaces.
|
||||
def format_ethnic(ethnic):
|
||||
return ethnic.replace(' ', '+')
|
||||
|
||||
# Fetch the performer list by ethnicity; paginated.
|
||||
def fetch_performers_by_ethnic(existed_performer_hrefs):
|
||||
performers = []
|
||||
|
||||
for ethnic in scraper.ethnic_list:
|
||||
url = scraper.ethnic_url + format_ethnic(ethnic)
|
||||
next_url = url
|
||||
|
||||
while next_url:
logging.info(f"Fetching data for {ethnic}, url {next_url} ...")
soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="row headshotrow", attr_type="class"),
parser="lxml", preprocessor=scraper.preprocess_html)
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_page_ethnic(soup, ethnic)
|
||||
if list_data:
|
||||
for row in list_data :
|
||||
if row['href'] not in existed_performer_hrefs:
|
||||
performers.append({
|
||||
'person' : row['person'],
|
||||
'href' : row['href']
|
||||
})
|
||||
else:
|
||||
logging.warning(f'fetch ethnic error. {url} ...')
|
||||
else:
|
||||
logging.warning(f'fetch ethnic error. {url} ...')
|
||||
|
||||
# return early when debugging
|
||||
if debug:
|
||||
return performers
|
||||
return performers
|
||||
|
||||
|
||||
# Fetch the list of distributors.
|
||||
def fetch_distributors_list(existed_distributors_href):
|
||||
url = scraper.distributors_list_url
|
||||
distributors_list = []
|
||||
|
||||
logging.info(f"Fetching data for distributors list, url {url} ...")
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Distrib", attr_type="name"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Distrib")
|
||||
if list_data:
|
||||
for row in list_data :
|
||||
dis_url = scraper.distributors_base_url + row['href']
|
||||
if dis_url in existed_distributors_href :
|
||||
continue
|
||||
distributors_list.append({
|
||||
'name' : row['name'],
|
||||
'href' : dis_url
|
||||
})
|
||||
else:
|
||||
logging.warning(f'fetch distributors list error. {url} ...')
|
||||
else:
|
||||
logging.warning(f'fetch distributors list error. {url} ...')
|
||||
return distributors_list
|
||||
|
||||
# Fetch the list of studios.
|
||||
def fetch_studios_list(existed_studios_href):
|
||||
url = scraper.studios_list_url
|
||||
studios_list = []
|
||||
|
||||
logging.info(f"Fetching data for studios list, url {url} ...")
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Studio", attr_type="name"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Studio")
|
||||
if list_data:
|
||||
for row in list_data :
|
||||
stu_url = scraper.studios_base_url + row['href']
|
||||
if stu_url in existed_studios_href:
|
||||
continue
|
||||
studios_list.append({
|
||||
'name' : row['name'],
|
||||
'href' : stu_url
|
||||
})
|
||||
else:
|
||||
logging.warning(f'fetch studios list error. {url} ...')
|
||||
else:
|
||||
logging.warning(f'fetch studios list error. {url} ...')
|
||||
return studios_list
|
||||
|
||||
# Check for updates.
|
||||
def check_update():
|
||||
# Load the existing performer list from the database.
|
||||
existed_performer_hrefs = utils.query_performer_hrefs()
|
||||
if not existed_performer_hrefs:
|
||||
logging.warning(f'get existed performers from db error.')
|
||||
return None
|
||||
|
||||
# Collect new performers from the listing pages.
|
||||
new_performers = []
|
||||
#new_performers.extend(fetch_performers_by_astro(existed_performer_hrefs))
|
||||
#new_performers.extend(fetch_performers_by_birth(existed_performer_hrefs))
|
||||
new_performers.extend(fetch_performers_by_ethnic(existed_performer_hrefs))
|
||||
|
||||
# Fetch each performer's details and write them to the database.
|
||||
new_performers = list({item["href"]: item for item in new_performers}.values())
|
||||
logging.info(f'get new performers count: {len(new_performers)} ')
|
||||
for performer in new_performers:
|
||||
url = performer['href']
|
||||
person = performer['person']
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
|
||||
if soup:
|
||||
data, credits = scraper.parse_page_performer(soup)
|
||||
if data:
|
||||
performer_id = utils.insert_or_update_performer({
|
||||
'href': url,
|
||||
'person': person,
|
||||
**data
|
||||
})
|
||||
if performer_id:
|
||||
logging.info(f'insert one person, id: {performer_id}, person: {person}, url: {url}')
|
||||
else:
|
||||
logging.warning(f'insert person: {person} {url} failed.')
|
||||
|
||||
# Also write a local JSON file.
|
||||
func.write_person_json(person, url, {
|
||||
'href': url,
|
||||
'person': person,
|
||||
**data,
|
||||
'credits': credits if credits else {}
|
||||
})
|
||||
else:
|
||||
logging.warning(f'parse_page_performer error. person: {person}, url: {url}')
|
||||
else:
|
||||
logging.warning(f'fetch_page error. person: {person}, url: {url}')
|
||||
# break out early when debugging
|
||||
if debug:
|
||||
break
|
||||
|
||||
# Load the existing distributor list from the database.
|
||||
existed_distributors_href = utils.query_distributor_hrefs()
|
||||
if existed_distributors_href is None:
|
||||
logging.warning(f'get existed distributors from db error.')
|
||||
return
|
||||
new_distributors = fetch_distributors_list(existed_distributors_href)
|
||||
for dist in new_distributors:
|
||||
dist_id = utils.insert_or_update_distributor(dist)
|
||||
if dist_id:
|
||||
logging.info(f"insert one distributor record, id: {dist_id}, name: {dist['name']}, href: {dist['href']}")
|
||||
else:
|
||||
logging.warning(f"insert distributor failed. name: {dist['name']} href: {dist['href']}")
|
||||
|
||||
# Load the existing studio list from the database.
|
||||
existed_studios_href = utils.query_studio_hrefs()
|
||||
if existed_studios_href is None:
|
||||
logging.warning(f'get existed studios from db error.')
|
||||
return
|
||||
new_studios = fetch_studios_list(existed_studios_href)
|
||||
for stu in new_studios:
|
||||
stu_id = utils.insert_or_update_studio(stu)
|
||||
if stu_id:
|
||||
logging.info(f"insert one studio record, id: {stu_id}, name: {stu['name']}, href: {stu['href']}")
|
||||
else:
|
||||
logging.warning(f"insert studio failed. name: {stu['name']}, href: {stu['href']}")
|
||||
|
||||
# Load the existing movie list from the database.
|
||||
existed_movies = utils.query_movie_hrefs()
|
||||
if existed_movies is None:
|
||||
logging.warning(f'load movies from db error')
|
||||
return
|
||||
new_movies = []
|
||||
new_movie_hrefs = []
|
||||
|
||||
# Walk all distributors and collect their movie lists.
|
||||
existed_distributors_href = utils.query_distributor_hrefs(name='vixen')
|
||||
if existed_distributors_href is None:
|
||||
logging.warning(f'get existed distributors from db error.')
|
||||
return
|
||||
for url in existed_distributors_href:
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
|
||||
if list_data:
|
||||
for movie in list_data:
|
||||
if movie['href'] in existed_movies:
|
||||
continue
|
||||
new_movies.append({
|
||||
'title' : movie['title'],
|
||||
'href' : movie['href']
|
||||
})
|
||||
new_movie_hrefs.append(movie['href'])
|
||||
else :
|
||||
logging.warning(f'parse_page_movie error. url: {url}')
|
||||
# break out early when debugging
|
||||
if debug:
|
||||
break
|
||||
logging.info(f'all new movies found for distributors, now total new {len(new_movies)}')
|
||||
|
||||
# Walk all studios and collect their movie lists.
|
||||
existed_studios_href = utils.query_studio_hrefs(name='vixen')
|
||||
if existed_studios_href is None:
|
||||
logging.warning(f'get existed studios from db error.')
|
||||
return
|
||||
for url in existed_studios_href:
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
|
||||
if list_data:
|
||||
for movie in list_data:
|
||||
if movie['href'] in existed_movies or movie['href'] in new_movie_hrefs:
|
||||
continue
|
||||
new_movies.append({
|
||||
'title' : movie['title'],
|
||||
'href' : movie['href']
|
||||
})
|
||||
new_movie_hrefs.append(movie['href'])
|
||||
else :
|
||||
logging.warning(f'parse_page_movie error. url: {url}')
|
||||
# break out early when debugging
|
||||
if debug:
|
||||
break
|
||||
logging.info(f'all new movies found for studios, now total new {len(new_movies)}')
|
||||
|
||||
# Fetch the details of each new movie.
|
||||
new_movies = list({item["href"]: item for item in new_movies}.values())
|
||||
logging.info(f'get merged new movies, count: {len(new_movies)} ')
|
||||
for movie in new_movies:
|
||||
url = movie['href']
|
||||
title = movie['title']
|
||||
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
|
||||
if soup:
|
||||
movie_data = scraper.parse_page_movie(soup, url, title)
|
||||
if movie_data :
|
||||
movie_id = utils.insert_or_update_movie(movie_data)
|
||||
if movie_id:
|
||||
logging.info(f'insert one movie, id: {movie_id}, title: {title} url: {url}')
|
||||
else:
|
||||
logging.warning(f'insert movie {url} failed.')
|
||||
|
||||
# Also write a local JSON file.
|
||||
func.write_movie_json(url, movie_data)
|
||||
else:
|
||||
logging.warning(f'parse_page_movie error. url: {url}')
|
||||
else:
|
||||
logging.warning(f'fetch_page error. url: {url}')
|
||||
# break out early when debugging
|
||||
if debug:
|
||||
break
|
||||
|
||||
logging.info(f'all process completed!')
|
||||
if __name__ == "__main__":
|
||||
check_update()
|
||||
513 scripts/iafd/src/iafd_scraper.py (Normal file)
@@ -0,0 +1,513 @@
|
||||
|
||||
import cloudscraper
|
||||
import time
|
||||
import json
|
||||
import csv
|
||||
import logging
|
||||
import signal
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from requests.exceptions import RequestException
|
||||
from functools import partial
|
||||
import config
|
||||
|
||||
# Base URLs and list parameters.
|
||||
host_url = "https://www.iafd.com"
|
||||
|
||||
astr_base_url = f"{host_url}/astrology.rme/sign="
|
||||
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
|
||||
|
||||
birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
|
||||
|
||||
ethnic_url = f"{host_url}/lookupethnic.rme/ethnic="
|
||||
ethnic_list = ['caucasian', 'black', 'asian', 'latin', 'native american', 'middle eastern', 'mediteranean', 'indian', 'polynesian', 'multi-ethnic', 'ethnic', 'romani', 'eurasian', 'north african', 'south asian']
|
||||
|
||||
distributors_list_url = f'{host_url}/distrib.asp'
|
||||
distributors_base_url = f"{host_url}/distrib.rme/distrib="
|
||||
|
||||
studios_list_url = f"{host_url}/studio.asp"
|
||||
studios_base_url = f"{host_url}/studio.rme/studio="
|
||||
|
||||
# Request headers and the shared cloudscraper session.
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
}
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
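A single cloudscraper session is created at module level and reused by every request below; cloudscraper wraps requests and solves Cloudflare's JavaScript challenge, and reusing one session keeps the clearance cookies between calls. A minimal sketch of a bare request with it (illustrative only; fetch_page below adds retries and validation):

soup = BeautifulSoup(scraper.get(host_url, headers=headers).text, "html.parser")
print(soup.title.get_text(strip=True) if soup.title else "no title")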
# Fetch a page with cloudscraper and validate it; supports different parsers and an optional preprocessor.
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    for attempt in range(max_retries):
        try:
            response = scraper.get(url, headers=headers)
            response.raise_for_status()  # raise on HTTP errors

            # Preprocess the HTML if a preprocessor was supplied
            html_text = preprocessor(response.text) if preprocessor else response.text

            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # run the caller-supplied page check
                return soup

            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, retrying...")

    logging.error(f'Fetching failed after max retries. {url}')
    return None  # still failing after the maximum number of retries
|
||||
|
||||
# Fix up the HTML structure: drop stray tags and patch <a> tags; needed when fetching the ethnicity pages.
def preprocess_html(html):
    return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')

# Generic HTML structure validator.
def generic_validator(soup, tag, identifier, attr_type="id"):
    if attr_type == "id":
        return soup.find(tag, id=identifier) is not None
    elif attr_type == "class":
        return bool(soup.find_all(tag, class_=identifier))
    elif attr_type == "name":
        return bool(soup.find('select', {'name': identifier}))
    return False

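fetch_page accepts any callable that receives the parsed soup and returns truthy when the page looks right; the fetchers bind generic_validator with functools.partial (already imported at the top of this module). A minimal usage sketch, mirroring how fetch.py calls it:

soup = fetch_page(astr_base_url + 'Aries',
                  partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
if soup is None:
    logging.warning("page did not validate after retries")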
# Check that the movie info table exists.
def movie_validator(soup, table_id):
    return soup.find("table", id=table_id) is not None
|
||||
|
||||
# Parse the astrology listing page and extract the performer data.
|
||||
def parse_page_astro(soup, astro):
|
||||
astro_div = soup.find("div", id="astro")
|
||||
if not astro_div:
|
||||
logging.warning(f"Warning: No 'astro' div found in {astro}")
|
||||
return None, None
|
||||
|
||||
flag = False
|
||||
list_cnt = 0
|
||||
list_data = []
|
||||
next_url = None
|
||||
|
||||
birth_date = None
|
||||
for elem in astro_div.find_all(recursive=False):
|
||||
if elem.name == "h3" and "astroday" in elem.get("class", []):
|
||||
birth_date = elem.get_text(strip=True)
|
||||
elif elem.name == "div" and "perficon" in elem.get("class", []):
|
||||
a_tag = elem.find("a")
|
||||
if a_tag:
|
||||
href = host_url + a_tag["href"]
|
||||
name = a_tag.find("span", class_="perfname")
|
||||
if name:
|
||||
list_data.append({
|
||||
"astrology": astro,
|
||||
"birth_date": birth_date,
|
||||
"person": name.get_text(strip=True),
|
||||
"href": href
|
||||
})
|
||||
flag = True
|
||||
list_cnt = list_cnt +1
|
||||
if flag:
|
||||
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
|
||||
return list_data, next_url
|
||||
else:
|
||||
return None, None
|
||||
|
||||
|
||||
# Parse the birthday listing page and collect the performer data.
|
||||
def parse_page_birth(soup, month, day):
|
||||
datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
|
||||
if not datarows:
|
||||
return None, None
|
||||
|
||||
flag = False
|
||||
list_cnt = 0
|
||||
list_data = []
|
||||
next_url = None
|
||||
rows = datarows[0].find_all('div', class_='col-sm-4')
|
||||
for row in rows:
|
||||
link_tag = row.find('a')
|
||||
person = link_tag.text.strip() if link_tag else ''
|
||||
href = link_tag['href'] if link_tag else ''
|
||||
href = host_url + href
|
||||
|
||||
# Skip hrefs that have already been collected.
|
||||
flag = True
|
||||
if any(entry['href'] == href for entry in list_data):
|
||||
continue
|
||||
|
||||
# Add the entry to the result list.
|
||||
list_data.append({
|
||||
'month': month,
|
||||
'day': day,
|
||||
'person': person,
|
||||
'href': href
|
||||
})
|
||||
list_cnt = list_cnt +1
|
||||
|
||||
if flag:
|
||||
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
|
||||
return list_data, next_url
|
||||
else:
|
||||
return None, None
|
||||
|
||||
|
||||
# Parse the ethnicity listing page and extract the performer data.
|
||||
def parse_page_ethnic(soup, ethnic):
|
||||
rows = soup.find_all('div', class_='row headshotrow')
|
||||
flag = False
|
||||
list_data = []
|
||||
next_url = None
|
||||
|
||||
for row in rows:
|
||||
for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
|
||||
link_tag = col.find('a')
|
||||
img_tag = col.find('div', class_='pictag')
|
||||
flag = True
|
||||
|
||||
if link_tag and img_tag:
|
||||
href = host_url + link_tag['href']
|
||||
person = img_tag.text.strip()
|
||||
|
||||
# Store the entry in the result list.
|
||||
list_data.append({
|
||||
'ethnic': ethnic,
|
||||
'person': person,
|
||||
'href': href
|
||||
})
|
||||
if flag:
|
||||
logging.debug(f"get {len(list_data)} persons from this page.")
|
||||
|
||||
next_page = soup.find('a', rel='next')
|
||||
if next_page:
|
||||
next_url = host_url + next_page['href']
|
||||
logging.debug(f"Found next page: {next_url}")
|
||||
return list_data, next_url
|
||||
else:
|
||||
logging.debug(f"All pages fetched for {ethnic}.")
|
||||
return list_data, None
|
||||
else:
|
||||
return None, None
|
||||
|
||||
# Parse the distributor/studio selection list page.
|
||||
def parse_page_dist_stu_list(soup, select_name):
|
||||
list_data = []
|
||||
next_url = None
|
||||
|
||||
select_element = soup.find('select', {'name': select_name})
|
||||
if select_element :
|
||||
options = select_element.find_all('option')
|
||||
for option in options:
|
||||
value = option.get('value')  # the value attribute
text = option.text.strip()  # the display text
|
||||
list_data.append({
|
||||
'name' : text,
|
||||
'href' : str(value)
|
||||
})
|
||||
return list_data, next_url
|
||||
else:
|
||||
return None, None
|
||||
|
||||
# Parse a distributor/studio title table and extract the movie rows.
|
||||
def parse_page_dist_stu(soup, table_id):
|
||||
table = soup.find("table", id=table_id)
|
||||
if not table:
|
||||
logging.warning(f"Warning: No {table_id} table found ")
|
||||
return None, None
|
||||
|
||||
# Find the thead and drop it
thead = table.find('thead')
if thead:
thead.decompose()  # the header row does not need to be parsed
|
||||
|
||||
# Only the tbody is left now
|
||||
tbody = table.find('tbody')
|
||||
rows = tbody.find_all('tr') if tbody else []
|
||||
|
||||
list_data = []
|
||||
next_url = None
|
||||
for row in rows:
|
||||
cols = row.find_all('td')
|
||||
if len(cols) >= 5:
|
||||
title = cols[0].text.strip()
|
||||
label = cols[1].text.strip()
|
||||
year = cols[2].text.strip()
|
||||
rev = cols[3].text.strip()
|
||||
a_href = cols[0].find('a')
|
||||
href = host_url + a_href['href'] if a_href else ''
|
||||
|
||||
list_data.append({
|
||||
'title': title,
|
||||
'label': label,
|
||||
'year': year,
|
||||
'rev': rev,
|
||||
'href': href
|
||||
})
|
||||
return list_data, next_url
|
||||
|
||||
|
||||
# Parse a credits table; there is one for personal appearances and one for directing credits.
|
||||
def parse_credits_table(table, distributor_list):
|
||||
# Find the thead and drop it
thead = table.find('thead')
if thead:
thead.decompose()  # the header row does not need to be parsed
|
||||
|
||||
# Only the tbody is left now
|
||||
tbody = table.find('tbody')
|
||||
rows = tbody.find_all('tr') if tbody else []
|
||||
|
||||
movies = []
|
||||
distributor_count = {key: 0 for key in distributor_list}  # initialise a counter for each distributor
|
||||
|
||||
# rows = table.find_all('tr', class_='we')
|
||||
for row in rows:
|
||||
cols = row.find_all('td')
|
||||
if len(cols) >= 6:
|
||||
title = cols[0].text.strip()
|
||||
year = cols[1].text.strip()
|
||||
distributor = cols[2].text.strip().lower()
|
||||
notes = cols[3].text.strip()
|
||||
rev = cols[4].text.strip()
|
||||
formats = cols[5].text.strip()
|
||||
|
||||
for key in distributor_list:
|
||||
if key in distributor:
|
||||
distributor_count[key] += 1
|
||||
|
||||
movies.append({
|
||||
'title': title,
|
||||
'year': year,
|
||||
'distributor': distributor,
|
||||
'notes': notes,
|
||||
'rev': rev,
|
||||
'formats': formats
|
||||
})
|
||||
return movies, distributor_count
|
||||
|
||||
|
||||
# Parse a performer page and extract the data we need.
|
||||
def parse_page_performer(soup):
|
||||
# Extracted fields
|
||||
data = {}
|
||||
|
||||
# Field names we want, mapped to their labels in the HTML.
|
||||
fields = {
|
||||
'performer_aka': 'Performer AKA',
|
||||
'birthday': 'Birthday',
|
||||
'astrology': 'Astrology',
|
||||
'birthplace': 'Birthplace',
|
||||
'gender': 'Gender',
|
||||
'years_active': 'Years Active',
|
||||
'ethnicity': 'Ethnicity',
|
||||
'nationality': 'Nationality',
|
||||
'hair_colors': 'Hair Colors',
|
||||
'eye_color': 'Eye Color',
|
||||
'height': 'Height',
|
||||
'weight': 'Weight',
|
||||
'measurements': 'Measurements',
|
||||
'tattoos': 'Tattoos',
|
||||
'piercings': 'Piercings'
|
||||
}
|
||||
reversed_map = {v: k for k, v in fields.items()}
|
||||
|
||||
# Parse the credit tables: performer credits and directorial credits.
|
||||
role_list = ['personal', 'directoral']
|
||||
distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
|
||||
credits_list = {}
|
||||
|
||||
# Keep the per-distributor counts in a dict
distributor_count = {key: 0 for key in distributor_list}
|
||||
for role in role_list:
|
||||
table = soup.find('table', id=role)
|
||||
if table :
|
||||
movies, stat_map = parse_credits_table(table, distributor_list)
|
||||
credits_list[role] = movies
|
||||
# Update the distributor counters
|
||||
for distributor in distributor_list:
|
||||
distributor_count[distributor] += stat_map.get(distributor, 0)
|
||||
|
||||
# Count the movies
#movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))

# Nothing found at all
if len(credits_list) == 0 :
logging.warning("movie table empty.")
|
||||
|
||||
# Walk each bioheading and pull out the metadata.
|
||||
bioheadings = soup.find_all('p', class_='bioheading')
|
||||
for bio in bioheadings:
|
||||
heading = bio.text.strip()
|
||||
biodata = None
|
||||
|
||||
# Headings containing "Performer" need special handling.
|
||||
if 'Performer' in heading:
|
||||
heading = 'Performer AKA'
|
||||
biodata_div = bio.find_next('div', class_='biodata')
|
||||
if biodata_div:
|
||||
div_text = biodata_div.get_text(separator='|').strip()
|
||||
biodata = [b.strip() for b in div_text.split('|') if b.strip()]
|
||||
else:
|
||||
biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''
|
||||
|
||||
# Save the value
|
||||
if heading in reversed_map:
|
||||
kkey = reversed_map[heading]
|
||||
data[kkey] = biodata
|
||||
|
||||
# Attach the counters to the result
|
||||
data['movies_cnt'] = movies_cnt
|
||||
data['vixen_cnt'] = distributor_count['vixen']
|
||||
data['blacked_cnt'] = distributor_count['blacked']
|
||||
data['tushy_cnt'] = distributor_count['tushy']
|
||||
data['x_art_cnt'] = distributor_count['x-art']
|
||||
|
||||
return data, credits_list
|
||||
|
||||
|
||||
|
||||
# Parse a movie page and extract the movie information.
def parse_page_movie(soup, href, title):
# Basic movie information
|
||||
movie_data = {}
|
||||
info_div = soup.find("div", class_="col-xs-12 col-sm-3")
|
||||
if info_div:
|
||||
labels = info_div.find_all("p", class_="bioheading")
|
||||
values = info_div.find_all("p", class_="biodata")
|
||||
for label, value in zip(labels, values):
|
||||
key = label.text.strip()
|
||||
val = value.text.strip()
|
||||
if key in ["Distributor", "Studio", "Director"]:
|
||||
link = value.find("a")
|
||||
if link:
|
||||
val = link.text.strip()
|
||||
movie_data[f'{key}Href'] = host_url + link['href']
|
||||
movie_data[key] = val
|
||||
else:
|
||||
return None
|
||||
|
||||
# Cast information
|
||||
performers = []
|
||||
cast_divs = soup.find_all("div", class_="castbox")
|
||||
for cast in cast_divs:
|
||||
performer = {}
|
||||
link = cast.find("a")
|
||||
if link:
|
||||
performer["name"] = link.text.strip()
|
||||
performer["href"] = host_url + link["href"]
|
||||
|
||||
performer["tags"] = [
|
||||
tag.strip() for br in cast.find_all("br")
|
||||
if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
|
||||
]
|
||||
|
||||
#performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
|
||||
performers.append(performer)
|
||||
|
||||
# Scene breakdowns
|
||||
scene_breakdowns = []
|
||||
scene_table = soup.find("div", id="sceneinfo")
|
||||
if scene_table:
|
||||
rows = scene_table.find_all("tr")
|
||||
|
||||
for row in rows:
|
||||
cols = row.find_all("td")
|
||||
if len(cols) >= 2:
|
||||
scene = cols[0].text.strip()  # scene number
performer_info = cols[1]  # performers plus any links
|
||||
|
||||
# Take the full HTML before the first <br> (keeps formatting such as <i> tags)
performer_html = str(performer_info)  # full HTML of the cell
split_html = performer_html.split("<br/>")  # split on <br/>
if split_html:
performers_html = split_html[0].strip()  # keep the part before the <br/>
else:
split_html = performer_html.split("<br>")  # split on <br>
if split_html:
performers_html = split_html[0].strip()  # keep the part before the <br>
else:
performers_html = performer_html.strip()  # no <br> at all, keep everything
|
||||
|
||||
# Reduce to plain text (strip the HTML tags, keep only the text)
|
||||
performers_soup = BeautifulSoup(performers_html, "html.parser")
|
||||
performers_text = performers_soup.get_text()
|
||||
|
||||
# Extract the performers
|
||||
scene_performers = [p.strip() for p in performers_text.split(",")]
|
||||
|
||||
# Try to pick up the webscene and studio links
|
||||
links_data = {}
|
||||
links = performer_info.find_all("a")
|
||||
if links:
|
||||
webscene_title = links[0].text.strip() if len(links)>0 else None
|
||||
webscene = links[0]["href"] if len(links)>0 else None
|
||||
studio = links[1].text.strip() if len(links)>1 else None
|
||||
studio_lnk = links[1]["href"] if len(links)>1 else None
|
||||
links_data = {
|
||||
"title": webscene_title,
|
||||
"webscene": webscene,
|
||||
"studio": studio,
|
||||
"studio_lnk": studio_lnk,
|
||||
}
|
||||
|
||||
scene_data = {
|
||||
"scene": scene,
|
||||
"performers": scene_performers,
|
||||
**links_data,
|
||||
}
|
||||
scene_breakdowns.append(scene_data)
|
||||
|
||||
appears_in = []
|
||||
appears_divs = soup.find("div", id="appearssection")
|
||||
if appears_divs:
|
||||
rows = appears_divs.find_all("li")
|
||||
for row in rows:
|
||||
lnk = row.find("a")
|
||||
if lnk:
|
||||
appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
|
||||
|
||||
|
||||
return {
|
||||
"href": href,
|
||||
"title": title,
|
||||
"Minutes": movie_data.get("Minutes", ""),
|
||||
"Distributor": movie_data.get("Distributor", ""),
|
||||
"Studio": movie_data.get("Studio", ""),
|
||||
"ReleaseDate": movie_data.get("Release Date", ""),
|
||||
"AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
|
||||
"All-Girl": movie_data.get("All-Girl", ""),
|
||||
"All-Male": movie_data.get("All-Male", ""),
|
||||
"Compilation": movie_data.get("Compilation", ""),
|
||||
"Webscene": movie_data.get("Webscene", ""),
|
||||
"Director": movie_data.get("Director", ""),
|
||||
"DirectorHref": movie_data.get("DirectorHref", ""),
|
||||
"DistributorHref": movie_data.get("DistributorHref", ""),
|
||||
"StudioHref": movie_data.get("StudioHref", ""),
|
||||
"Performers": performers,
|
||||
"SceneBreakdowns": scene_breakdowns,
|
||||
"AppearsIn": appears_in,
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
for astro in astro_list:
|
||||
url = astr_base_url + astro
|
||||
next_url = url
|
||||
logging.info(f"Fetching data for {astro}, url {url} ...")
|
||||
|
||||
while True:
|
||||
soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
|
||||
if soup:
|
||||
list_data, next_url = parse_page_astro(soup, astro)
|
||||
if list_data:
|
||||
print(list_data[0] if len(list_data)>0 else 'no data')
|
||||
break
|
||||
else:
|
||||
logging.info(f"Retrying {next_url} ...")
|
||||
time.sleep(5) # wait before retrying
|
||||
|
||||
time.sleep(2) # throttle the request rate
|
||||
459 scripts/iafd/src/sqlite_utils.py (Normal file)
@@ -0,0 +1,459 @@
|
||||
import sqlite3
|
||||
import json
|
||||
import config
|
||||
import utils
|
||||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
# Connect to the SQLite database
DB_PATH = f"{config.global_share_data_dir}/shared.db"  # path to the shared database file
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

# Current time as a formatted string
def get_current_time():
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
# Insert or update a performer.
|
||||
def insert_or_update_performer(data):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO performers (href, name, gender, birthday, astrology, birthplace, years_active, ethnicity, nationality, hair_colors,
|
||||
eye_color, height_str, weight_str, measurements, tattoos, piercings, weight, height, movies_cnt, vixen_cnt,
|
||||
blacked_cnt, tushy_cnt, x_art_cnt, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
name = excluded.name,
|
||||
gender = excluded.gender,
|
||||
birthday = excluded.birthday,
|
||||
astrology = excluded.astrology,
|
||||
birthplace = excluded.birthplace,
|
||||
years_active = excluded.years_active,
|
||||
ethnicity = excluded.ethnicity,
|
||||
nationality = excluded.nationality,
|
||||
hair_colors = excluded.hair_colors,
|
||||
eye_color = excluded.eye_color,
|
||||
height_str = excluded.height_str,
|
||||
weight_str = excluded.weight_str,
|
||||
measurements = excluded.measurements,
|
||||
tattoos = excluded.tattoos,
|
||||
piercings = excluded.piercings,
|
||||
weight = excluded.weight,
|
||||
height = excluded.height,
|
||||
movies_cnt = excluded.movies_cnt,
|
||||
vixen_cnt = excluded.vixen_cnt,
|
||||
blacked_cnt = excluded.blacked_cnt,
|
||||
tushy_cnt = excluded.tushy_cnt,
|
||||
x_art_cnt = excluded.x_art_cnt,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
""", (
|
||||
data["href"], data["person"], data.get("gender"), data.get("birthday"), data.get("astrology"), data.get("birthplace"), data.get("years_active"),
|
||||
data.get("ethnicity"), data.get("nationality"), data.get("hair_colors"), data.get("eye_color"), data.get("height"),
|
||||
data.get("weight"), data.get("measurements"), data.get("tattoos"), data.get("piercings"), utils.parse_weight(data.get('weight')), utils.parse_height(data.get('height')),
|
||||
data.get("movies_cnt", 0), data.get("vixen_cnt", 0), data.get("blacked_cnt", 0), data.get("tushy_cnt", 0), data.get("x_art_cnt", 0)
|
||||
))
|
||||
|
||||
# Look up the performer_id
|
||||
cursor.execute("SELECT id FROM performers WHERE href = ?", (data["href"],))
|
||||
performer_id = cursor.fetchone()[0]
|
||||
|
||||
# Remove the old aliases
|
||||
cursor.execute("DELETE FROM performer_aliases WHERE performer_id = ?", (performer_id,))
|
||||
|
||||
# Insert the new aliases
|
||||
for alias in data.get("performer_aka", []):
|
||||
if alias.lower() != "no known aliases":
|
||||
cursor.execute("INSERT INTO performer_aliases (performer_id, alias) VALUES (?, ?)", (performer_id, alias))
|
||||
|
||||
conn.commit()
|
||||
logging.debug(f"Inserted/updated performer: {data['person']}")
|
||||
return performer_id
|
||||
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"Database error: {e}")
|
||||
return None
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error(f"Unexpected error: {e}")
|
||||
return None
|
||||
|
||||
# Delete a performer by id or href.
def delete_performer(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("DELETE FROM performers WHERE id = ?", (identifier,))
        elif isinstance(identifier, str):
            cursor.execute("DELETE FROM performers WHERE href = ?", (identifier,))
        else:
            logging.warning("Invalid delete argument")
            return
        conn.commit()
        logging.info(f"Deleted performer: {identifier}")

    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"Delete failed: {e}")
|
||||
|
||||
# Query a performer by id, href, or name.
|
||||
def query_performer(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("SELECT * FROM performers WHERE id = ?", (identifier,))
|
||||
elif "http" in identifier:
|
||||
cursor.execute("SELECT * FROM performers WHERE href = ?", (identifier,))
|
||||
else:
|
||||
cursor.execute("SELECT * FROM performers WHERE name LIKE ?", (f"%{identifier}%",))
|
||||
|
||||
performer = cursor.fetchone()
|
||||
if performer:
|
||||
cursor.execute("SELECT alias FROM performer_aliases WHERE performer_id = ?", (performer[0],))
|
||||
aliases = [row[0] for row in cursor.fetchall()]
|
||||
result = dict(zip([desc[0] for desc in cursor.description], performer))
|
||||
result["performer_aka"] = aliases
|
||||
return result
|
||||
else:
|
||||
logging.warning(f"Performer not found: {identifier}")
|
||||
return None
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"Query failed: {e}")
|
||||
return None
|
||||
|
||||
# Query the list of performer hrefs with optional filters.
|
||||
def query_performer_hrefs(**filters):
|
||||
try:
|
||||
sql = "SELECT href FROM performers WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
sql += " AND id = ?"
|
||||
params.append(filters["id"])
|
||||
if "href" in filters:
|
||||
sql += " AND href = ?"
|
||||
params.append(filters["href"])
|
||||
if "name" in filters:
|
||||
sql += " AND name LIKE ?"
|
||||
params.append(f"%{filters['name']}%")
|
||||
|
||||
cursor.execute(sql, params)
|
||||
return [row[0] for row in cursor.fetchall()]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"href query failed: {e}")
|
||||
return None
|
||||
|
||||
|
||||
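query_performer_hrefs and the matching distributor/studio/movie helpers below take optional keyword filters that are AND-ed into the WHERE clause. A small usage sketch (the filter value is illustrative):

all_hrefs = query_performer_hrefs()                  # every performer href in the table
some_hrefs = query_performer_hrefs(name='Kirsten')   # only performers whose name contains 'Kirsten'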
# Insert or update a distributor.
|
||||
def insert_or_update_distributor(data):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO distributors (name, href, updated_at)
|
||||
VALUES (?, ? , datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
name = excluded.name,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
""", (data["name"], data["href"]))
|
||||
conn.commit()
|
||||
|
||||
# Look up the distributor id
|
||||
cursor.execute("SELECT id FROM distributors WHERE href = ?", (data["href"],))
|
||||
dist_id = cursor.fetchone()[0]
|
||||
if dist_id:
|
||||
logging.debug(f"Inserted/updated distributor: {data['name']}")
|
||||
return dist_id
|
||||
else:
|
||||
return None
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"Database error: {e}")
|
||||
return None
|
||||
|
||||
# Delete a distributor by id or name.
def delete_distributor(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("DELETE FROM distributors WHERE id = ?", (identifier,))
        elif isinstance(identifier, str):
            cursor.execute("DELETE FROM distributors WHERE name = ?", (identifier,))
        conn.commit()
        logging.info(f"Deleted distributor: {identifier}")
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"Delete failed: {e}")
|
||||
|
||||
# Query a distributor by id or name.
def query_distributor(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("SELECT * FROM distributors WHERE id = ?", (identifier,))
        else:
            cursor.execute("SELECT * FROM distributors WHERE name LIKE ?", (f"%{identifier}%",))

        distributor = cursor.fetchone()
        if distributor:
            return dict(zip([desc[0] for desc in cursor.description], distributor))
        else:
            logging.warning(f"Distributor not found: {identifier}")
            return None
    except sqlite3.Error as e:
        logging.error(f"Query failed: {e}")
        return None
|
||||
|
||||
# Query the list of distributor hrefs with optional filters.
def query_distributor_hrefs(**filters):
    try:
        sql = "SELECT href FROM distributors WHERE 1=1"
        params = []

        if "id" in filters:
            sql += " AND id = ?"
            params.append(filters["id"])
        if "href" in filters:
            sql += " AND href = ?"
            params.append(filters["href"])
        if "name" in filters:
            sql += " AND name LIKE ?"
            params.append(f"%{filters['name']}%")

        cursor.execute(sql, params)
        return [row[0] for row in cursor.fetchall()]

    except sqlite3.Error as e:
        logging.error(f"href query failed: {e}")
        return None
|
||||
|
||||
# Insert or update a studio.
|
||||
def insert_or_update_studio(data):
|
||||
try:
|
||||
cursor.execute("""
|
||||
INSERT INTO studios (name, href, updated_at)
|
||||
VALUES (?, ?, datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
name = excluded.name,
|
||||
updated_at = datetime('now', 'localtime')
|
||||
""", (data["name"], data["href"]))
|
||||
conn.commit()
|
||||
|
||||
# Look up the studio id
|
||||
cursor.execute("SELECT id FROM studios WHERE href = ?", (data["href"],))
|
||||
stu_id = cursor.fetchone()[0]
|
||||
if stu_id:
|
||||
logging.debug(f"Inserted/updated studio: {data['name']}")
|
||||
return stu_id
|
||||
else:
|
||||
return None
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error(f"Database error: {e}")
|
||||
return None
|
||||
|
||||
# Delete a studio by id or name.
def delete_studio(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("DELETE FROM studios WHERE id = ?", (identifier,))
        elif isinstance(identifier, str):
            cursor.execute("DELETE FROM studios WHERE name = ?", (identifier,))
        conn.commit()
        logging.info(f"Deleted studio: {identifier}")
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"Delete failed: {e}")
|
||||
|
||||
# Query a studio by id or name.
def query_studio(identifier):
    try:
        if isinstance(identifier, int):
            cursor.execute("SELECT * FROM studios WHERE id = ?", (identifier,))
        else:
            cursor.execute("SELECT * FROM studios WHERE name LIKE ?", (f"%{identifier}%",))

        studio = cursor.fetchone()
        if studio:
            return dict(zip([desc[0] for desc in cursor.description], studio))
        else:
            logging.warning(f"Studio not found: {identifier}")
            return None
    except sqlite3.Error as e:
        logging.error(f"Query failed: {e}")
        return None
|
||||
|
||||
# Query the list of studio hrefs with optional filters.
|
||||
def query_studio_hrefs(**filters):
|
||||
try:
|
||||
sql = "SELECT href FROM studios WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
sql += " AND id = ?"
|
||||
params.append(filters["id"])
|
||||
if "href" in filters:
|
||||
sql += " AND href = ?"
|
||||
params.append(filters["href"])
|
||||
if "name" in filters:
|
||||
sql += " AND name LIKE ?"
|
||||
params.append(f"%{filters['name']}%")
|
||||
|
||||
cursor.execute(sql, params)
|
||||
return [row[0] for row in cursor.fetchall()]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"href query failed: {e}")
|
||||
return None
|
||||
|
||||
# Look up an id by href in the given table.
|
||||
def get_id_by_href(table: str, href: str) -> int:
|
||||
cursor.execute(f"SELECT id FROM {table} WHERE href = ?", (href,))
|
||||
row = cursor.fetchone()
|
||||
return row[0] if row else None
|
||||
|
||||
# Insert or update a movie.
|
||||
def insert_or_update_movie(movie_data):
|
||||
try:
|
||||
# Resolve the related IDs
|
||||
distributor_id = get_id_by_href('distributors', movie_data['DistributorHref'])
|
||||
studio_id = get_id_by_href('studios', movie_data['StudioHref'])
|
||||
director_id = get_id_by_href('performers', movie_data['DirectorHref'])
|
||||
|
||||
# Insert or update the movie row
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO movies (title, minutes, distributor_id, studio_id, release_date, added_to_IAFD_date,
|
||||
all_girl, all_male, compilation, webscene, director_id, href, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
|
||||
ON CONFLICT(href) DO UPDATE SET
|
||||
title=excluded.title, minutes=excluded.minutes, distributor_id=excluded.distributor_id,
|
||||
studio_id=excluded.studio_id, release_date=excluded.release_date,
|
||||
added_to_IAFD_date=excluded.added_to_IAFD_date, all_girl=excluded.all_girl,
|
||||
all_male=excluded.all_male, compilation=excluded.compilation, webscene=excluded.webscene,
|
||||
director_id=excluded.director_id, updated_at = datetime('now', 'localtime')
|
||||
""",
|
||||
(movie_data['title'], movie_data['Minutes'], distributor_id, studio_id, movie_data['ReleaseDate'],
|
||||
movie_data['AddedtoIAFDDate'], movie_data['All-Girl'], movie_data['All-Male'],
|
||||
movie_data['Compilation'], movie_data['Webscene'], director_id, movie_data['href'])
|
||||
)
|
||||
conn.commit()
|
||||
logging.info("Movie inserted/updated: %s", movie_data['title'])
|
||||
|
||||
# Look up the id of the movie just inserted
|
||||
cursor.execute("SELECT id FROM movies WHERE href = ?", (movie_data['href'],))
|
||||
movie_id = cursor.fetchone()[0]
|
||||
|
||||
# Populate the performers_movies relation table
|
||||
for performer in movie_data.get('Performers', []):
|
||||
performer_id = get_id_by_href('performers', performer['href'])
|
||||
if performer_id:
|
||||
notes = '|'.join(performer['tags'])
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO performers_movies (performer_id, movie_id, role, notes)
|
||||
VALUES (?, ?, ?, ?)
|
||||
ON CONFLICT(movie_id, performer_id) DO UPDATE SET notes=excluded.notes
|
||||
""",
|
||||
(performer_id, movie_id, "Actor", notes)
|
||||
)
|
||||
logging.debug(f"Performer {performer['href']} linked to movie: {movie_data['title']}")
|
||||
else:
|
||||
logging.warning(f"missing performer, url {performer['href']}, in movie: ({movie_data['title']}) {movie_data['href']}")
|
||||
|
||||
# Populate the movies_appers_in table
|
||||
for appears in movie_data.get("AppearsIn", []):
|
||||
appears_in_id = get_id_by_href('movies', appears['href'])
|
||||
if appears_in_id:
|
||||
cursor.execute("""
|
||||
INSERT INTO movies_appers_in (movie_id, appears_in_id, gradation, notes)
|
||||
VALUES (?, ?, ?, ?)
|
||||
ON CONFLICT(movie_id, appears_in_id) DO NOTHING
|
||||
""", (movie_id, appears_in_id, 1, appears["title"]))
|
||||
else:
|
||||
logging.warning(f"missing AppearsIn movie in movies table, parent_url {appears['href']}, in movie: ({movie_data['title']}) {movie_data['href']}")
|
||||
|
||||
conn.commit()
|
||||
return movie_id
|
||||
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
logging.error("Error inserting movie: %s", e)
|
||||
return None
|
||||
|
||||
# Delete a movie by id or href.
|
||||
def delete_movie(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("DELETE FROM movies WHERE id = ?", (identifier,))
|
||||
elif isinstance(identifier, str):
|
||||
cursor.execute("DELETE FROM movies WHERE href = ?", (identifier,))
|
||||
else:
|
||||
logging.warning("Invalid delete argument")
|
||||
return
|
||||
conn.commit()
|
||||
logging.info(f"Deleted movie with {identifier}")
|
||||
|
||||
except sqlite3.Error as e:
|
||||
conn.rollback()
|
||||
logging.error("Error deleting movie: %s", e)
|
||||
|
||||
# Query a movie by id, href, or title.
|
||||
def query_movies(identifier):
|
||||
try:
|
||||
if isinstance(identifier, int):
|
||||
cursor.execute("SELECT * FROM movies WHERE id = ?", (identifier,))
|
||||
elif "http" in identifier:
|
||||
cursor.execute("SELECT * FROM movies WHERE href = ?", (identifier,))
|
||||
else:
|
||||
cursor.execute("SELECT * FROM movies WHERE title LIKE ?", (f"%{identifier}%",))
|
||||
|
||||
movie = cursor.fetchone()
if movie:
# Build the movie dict before issuing the next query, otherwise cursor.description no longer describes the movies row
result = dict(zip([desc[0] for desc in cursor.description], movie))
cursor.execute("SELECT performer_id FROM performers_movies WHERE movie_id = ?", (movie[0],))
result["performers"] = [row[0] for row in cursor.fetchall()]
return result
|
||||
else:
|
||||
logging.warning(f"find no data: {identifier}")
|
||||
return None
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"Query failed: {e}")
|
||||
return None
|
||||
|
||||
# Query the list of movie hrefs with optional filters.
|
||||
def query_movie_hrefs(**filters):
|
||||
try:
|
||||
sql = "SELECT href FROM movies WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
sql += " AND id = ?"
|
||||
params.append(filters["id"])
|
||||
if "href" in filters:
|
||||
sql += " AND href = ?"
|
||||
params.append(filters["href"])
|
||||
if "title" in filters:
|
||||
sql += " AND title LIKE ?"
|
||||
params.append(f"%{filters['title']}%")
|
||||
|
||||
cursor.execute(sql, params)
|
||||
return [row[0] for row in cursor.fetchall()]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"href query failed: {e}")
|
||||
return []
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
try:
|
||||
with open('../result/detail.json', 'r') as file:
|
||||
performers = json.load(file)
|
||||
for performer in performers:
|
||||
insert_or_update_performer(performer)
|
||||
|
||||
print(query_performer("Kirsten"))
|
||||
#delete_performer("https://www.iafd.com/person.rme/id=ca699282-1b57-4ce7-9bcc-d7799a292e34")
|
||||
print(query_performer_hrefs())
|
||||
except FileNotFoundError:
|
||||
logging.info("detail.json not found, starting fresh.")
|
||||
92 scripts/iafd/src/utils.py (Normal file)
@@ -0,0 +1,92 @@
import re
import os
import json
import time
import csv
import logging

# Parse height and weight strings into numbers.
def parse_height(height_str):
    # NOTE: short-circuited to 0 for now; the parsing below is currently unreachable.
    return 0
    try:
        return int(height_str.split("(")[-1].replace(" cm)", ""))
    except:
        return None

def parse_weight(weight_str):
    # NOTE: short-circuited to 0 for now; the parsing below is currently unreachable.
    return 0
    try:
        return int(weight_str.split(" ")[0])
    except:
        return None

update_dir = '../result'
performers_dir = f'{update_dir}/performers'
movies_dir = f'{update_dir}/movies'

def uniq_performers(new_performers):
    try:
        if not isinstance(new_performers, list):
            raise TypeError(f"new_performers should be a list, but got {type(new_performers)}")

        seen = set()
        unique_performers = []

        for item in new_performers:
            if not item or item['href'] is None:
                raise ValueError(f"Invalid item in new_performers: {item}")

            if item["href"] not in seen:
                seen.add(item["href"])
                unique_performers.append(item)

        return unique_performers

    except Exception as e:
        logging.error(f"Error in uniq_performers: {e}")
        return []  # return an empty list instead of crashing

# Create a sub-directory keyed by the given name.
def create_sub_directory(base_dir, name):
    # Use the first character of the name, lower-cased
    sub_dir = name[:1].lower()
    full_path = os.path.join(base_dir, sub_dir)
    if not os.path.exists(full_path):
        os.makedirs(full_path)
    return full_path

# Extract the id value from a URL such as
# https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
    """Extract the id parameter from an href."""
    match = re.search(r'id=([a-f0-9\-]+)', href)
    return match.group(1) if match else ''
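A quick usage sketch of extract_id_from_href (the href is the example from the comment above):

print(extract_id_from_href('https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586'))
# -> 21898a3c-1ddd-4793-8d93-375d6db20586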
# Write each performer to its own JSON file.
def write_person_json(person, href, data):
    # Work out the target directory
    person_dir = create_sub_directory(performers_dir, person)
    person_id = extract_id_from_href(href)
    person_filename = f"{person.replace(' ', '-')}({person_id}).json"  # replace spaces with -
    full_path = os.path.join(person_dir, person_filename)

    try:
        with open(full_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {full_path}: {e}")


# Write each movie to its own JSON file.
def write_movie_json(href, data):
    # Work out the target directory
    movie_id = extract_id_from_href(href)
    movie_dir = create_sub_directory(movies_dir, movie_id)
    movie_filename = f"{movie_id}.json"
    full_path = os.path.join(movie_dir, movie_filename)

    try:
        with open(full_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=4, ensure_ascii=False)
    except Exception as e:
        logging.error(f"Error writing file {full_path}: {e}")