modify some scripts.

This commit is contained in:
2025-03-03 19:01:41 +08:00
parent 8fd48687fc
commit f1e5abd6b3
10 changed files with 1642 additions and 0 deletions

View File

@@ -12,6 +12,7 @@ scripts/iafd/data/tmp/
scripts/iafd/result/tmp/
scripts/iafd/result/bak/
scripts/iafd/result/performers/
scripts/iafd/result/movies/
scripts/iafd/log/
scripts/thelordofporn/log/
scripts/vixen_group/log/

View File

@@ -0,0 +1,20 @@
{
"href": "https://www.iafd.com/title.rme/id=0b332f5e-c0b8-4536-a79c-5abd4da9c68a",
"title": "Barebackin' Men",
"Minutes": "No Data",
"Distributor": "1 Distribution",
"Studio": "1 Distribution",
"ReleaseDate": "No Data",
"AddedtoIAFDDate": "Jan 1, 2006",
"All-Girl": "No",
"All-Male": "Yes",
"Compilation": "No",
"Webscene": "",
"Director": "No Data",
"DirectorHref": "",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=10/1%2ddistribution.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=10/1%2ddistribution.htm",
"Performers": [],
"SceneBreakdowns": [],
"AppearsIn": []
}

View File

@@ -0,0 +1,56 @@
{
"href": "https://www.iafd.com/title.rme/id=2f582dcf-192e-4adf-9d60-447df8f16b9c",
"title": "Slim Goodies POV 2",
"Minutes": "84",
"Distributor": "Exotic Vixen Films",
"Studio": "Exotic Vixen Films",
"ReleaseDate": "No Data",
"AddedtoIAFDDate": "Jan 17, 2024",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "",
"Director": "Just Mike Starks",
"DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
"Performers": [
{
"name": "Amica Mea",
"href": "https://www.iafd.com/person.rme/id=933ab6e6-ba98-49a7-a9e2-c1a4523ab89c",
"tags": [
"Amica Mea"
]
},
{
"name": "Baby Breezy",
"href": "https://www.iafd.com/person.rme/id=b0d3673c-20b4-41eb-bb15-622b2f8e5ed3",
"tags": [
"Baby Breezy"
]
},
{
"name": "Blu Mere",
"href": "https://www.iafd.com/person.rme/id=8d41dd04-f188-44c3-ac40-dc64711cf905",
"tags": [
"Blu Mere"
]
},
{
"name": "Just Mike Starks",
"href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
"tags": [
"Just Mike Starks"
]
},
{
"name": "Mocha Menage",
"href": "https://www.iafd.com/person.rme/id=63b084e9-a144-41da-acf9-a913b68416cd",
"tags": [
"Mocha Menage"
]
}
],
"SceneBreakdowns": [],
"AppearsIn": []
}

View File

@@ -0,0 +1,70 @@
{
"href": "https://www.iafd.com/title.rme/id=9af4e9f4-68ce-47ec-a7d7-fde92862af57",
"title": "Atlanta U: College Freaks",
"Minutes": "No Data",
"Distributor": "Exotic Vixen Films",
"Studio": "Exotic Vixen Films",
"ReleaseDate": "No Data",
"AddedtoIAFDDate": "Sep 19, 2020",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "",
"Director": "Just Mike Starks",
"DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
"Performers": [
{
"name": "Aaliyah Ali",
"href": "https://www.iafd.com/person.rme/id=7904b6a0-965c-4137-92e2-2ca0cdc4ff38",
"tags": [
"Aaliyah Ali"
]
},
{
"name": "Bones Montana",
"href": "https://www.iafd.com/person.rme/id=8c18f9ad-045d-475f-bee8-cab6bed1fad4",
"tags": [
"Bones Montana"
]
},
{
"name": "Cameron Cox",
"href": "https://www.iafd.com/person.rme/id=ce96cf3d-e85e-4d46-ad5e-f05881c88a26",
"tags": [
"Cameron Cox"
]
},
{
"name": "Crystal Cooper",
"href": "https://www.iafd.com/person.rme/id=fc02ac73-2892-4578-9bf4-eed87afc4980",
"tags": [
"Crystal Cooper"
]
},
{
"name": "Jazmine Adore",
"href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4",
"tags": [
"Jazmine Adore"
]
},
{
"name": "Just Mike Starks",
"href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
"tags": [
"Just Mike Starks"
]
},
{
"name": "Lala Ivey",
"href": "https://www.iafd.com/person.rme/id=7ae838ee-764f-4725-b422-b41ffac1e67b",
"tags": [
"Lala Ivey"
]
}
],
"SceneBreakdowns": [],
"AppearsIn": []
}

View File

@@ -0,0 +1,85 @@
{
"href": "https://www.iafd.com/title.rme/id=ca753243-8e3a-49ac-88aa-357055187e8c",
"title": "Slim Goodies POV",
"Minutes": "61",
"Distributor": "Exotic Vixen Films",
"Studio": "Exotic Vixen Films",
"ReleaseDate": "No Data",
"AddedtoIAFDDate": "Sep 19, 2020",
"All-Girl": "No",
"All-Male": "No",
"Compilation": "No",
"Webscene": "",
"Director": "Just Mike Starks",
"DirectorHref": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
"DistributorHref": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
"StudioHref": "https://www.iafd.com/studio.rme/studio=12229/exotic%2dvixen%2dfilms.htm",
"Performers": [
{
"name": "Gina Ferrero",
"href": "https://www.iafd.com/person.rme/id=54d62a5b-2ce5-40fa-b050-14325a62f4fd",
"tags": [
"Gina Ferrero"
]
},
{
"name": "Imani Reign",
"href": "https://www.iafd.com/person.rme/id=8566216f-bcbc-4764-ba50-73405cf79dce",
"tags": [
"Imani Reign"
]
},
{
"name": "Jazmine Adore",
"href": "https://www.iafd.com/person.rme/id=1809f7bb-5a2a-4a4a-87c1-caed73a88bc4",
"tags": [
"Jazmine Adore"
]
},
{
"name": "Just Mike Starks",
"href": "https://www.iafd.com/person.rme/id=b29f1981-51f7-471a-bda1-c0845fc9b71f",
"tags": [
"Just Mike Starks"
]
},
{
"name": "Niomie King",
"href": "https://www.iafd.com/person.rme/id=dbefea32-7f88-41a9-9de3-e467015d687a",
"tags": [
"Niomie King"
]
}
],
"SceneBreakdowns": [
{
"scene": "Scene 1",
"performers": [
"Imani Reign",
"Just Mike Starks"
]
},
{
"scene": "Scene 2",
"performers": [
"Jazmine Adore",
"Just Mike Starks"
]
},
{
"scene": "Scene 3",
"performers": [
"Gina Ferrero",
"Just Mike Starks"
]
},
{
"scene": "Scene 4",
"performers": [
"Niomie King",
"Just Mike Starks"
]
}
],
"AppearsIn": []
}

View File

@@ -0,0 +1,26 @@
import logging
import os
import inspect
from datetime import datetime
global_share_data_dir = '/root/sharedata'
global_host_data_dir = '/root/hostdir/scripts_data'
# Configure logging
def setup_logging(log_filename=None):
# If no log_filename is passed in, derive it from the calling script's name
if log_filename is None:
# Get the file name of the script that called setup_logging
caller_frame = inspect.stack()[1]
caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
# Current date in yyyymmdd format
current_date = datetime.now().strftime('%Y%m%d')
# Build the log file name, inserting the date before the extension
log_filename = f'../log/{caller_filename}_{current_date}.log'
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
handlers=[
logging.FileHandler(log_filename),
logging.StreamHandler()
])
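A minimal usage sketch of this helper (not part of the commit): fetch.py below calls config.setup_logging() with no argument, so a script named fetch.py run on 2025-03-03 would log to ../log/fetch_20250303.log, assuming the ../log/ directory exists relative to the working directory.

import logging
import config

config.setup_logging()  # or pass an explicit path: config.setup_logging('../log/custom.log')
logging.info("logging initialised")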

320
scripts/iafd/src/fetch.py Normal file
View File

@@ -0,0 +1,320 @@
import json
import time
import csv
import argparse
import logging
from functools import partial
import config
import sqlite_utils as utils
import iafd_scraper as scraper
import utils as func
config.setup_logging()
debug = True
# Fetch the performer list by zodiac sign (no pagination)
def fetch_performers_by_astro(existed_performer_hrefs):
performers = []
for astro in scraper.astro_list:
url = scraper.astr_base_url + astro
logging.info(f"Fetching data for {astro}, url {url} ...")
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_page_astro(soup, astro)
if list_data:
for row in list_data :
if row['href'] not in existed_performer_hrefs:
performers.append({
'person' : row['person'],
'href' : row['href']
})
else:
logging.warning(f'fetch astro error. {url} ...')
else:
logging.warning(f'fetch astro error. {url} ...')
# break early while debugging
if debug:
break
return performers
# Fetch the performer list by birthday (no pagination)
def fetch_performers_by_birth(existed_performer_hrefs):
performers = []
for month in range(1, 13): # months 1 to 12
for day in range(1, 32): # days 1 to 31
url = scraper.birth_base_url.format(month=month, day=day)
logging.info(f"Fetching data for birth, url {url}")
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
if soup:
list_data, next_url = scraper.parse_page_birth(soup, month, day)
if list_data:
for row in list_data :
if row['href'] not in existed_performer_hrefs:
performers.append({
'person' : row['person'],
'href' : row['href']
})
else:
logging.warning(f'fetch birth error. {url} ...')
else:
logging.warning(f'fetch birth error. {url} ...')
# return early while debugging
if debug:
return performers
return performers
# Handle ethnicity names that contain spaces
def format_ethnic(ethnic):
return ethnic.replace(' ', '+')
# Fetch the performer list by ethnicity (with pagination)
def fetch_performers_by_ethnic(existed_performer_hrefs):
performers = []
for ethnic in scraper.ethnic_list:
url = scraper.ethnic_url + format_ethnic(ethnic)
next_url = url
while next_url:
logging.info(f"Fetching data for {ethnic}, url {url} ...")
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="row headshotrow", attr_type="class"),
parser="lxml", preprocessor=scraper.preprocess_html)
if soup:
list_data, next_url = scraper.parse_page_ethnic(soup, ethnic)
if list_data:
for row in list_data :
if row['href'] not in existed_performer_hrefs:
performers.append({
'person' : row['person'],
'href' : row['href']
})
else:
logging.warning(f'fetch ethnic error. {url} ...')
else:
logging.warning(f'fetch ethnic error. {url} ...')
# return early while debugging
if debug:
return performers
return performers
# Fetch the distributors list
def fetch_distributors_list(existed_distributors_href):
url = scraper.distributors_list_url
distributors_list = []
logging.info(f"Fetching data for distributors list, url {url} ...")
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Distrib", attr_type="name"))
if soup:
list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Distrib")
if list_data:
for row in list_data :
dis_url = scraper.distributors_base_url + row['href']
if dis_url in existed_distributors_href :
continue
distributors_list.append({
'name' : row['name'],
'href' : dis_url
})
else:
logging.warning(f'fetch distributors list error. {url} ...')
else:
logging.warning(f'fetch distributors list error. {url} ...')
return distributors_list
# Fetch the studios list
def fetch_studios_list(existed_studios_href):
url = scraper.studios_list_url
studios_list = []
logging.info(f"Fetching data for studios list, url {url} ...")
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Studio", attr_type="name"))
if soup:
list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Studio")
if list_data:
for row in list_data :
stu_url = scraper.studios_base_url + row['href']
if stu_url in existed_studios_href:
continue
studios_list.append({
'name' : row['name'],
'href' : stu_url
})
else:
logging.warning(f'fetch studios list error. {url} ...')
else:
logging.warning(f'fetch studios list error. {url} ...')
return studios_list
# Check for updates
def check_update():
# Load the list of performers already in the database
existed_performer_hrefs = utils.query_performer_hrefs()
if not existed_performer_hrefs:
logging.warning(f'get existed performers from db error.')
return None
# Collect new performers from the list pages
new_performers = []
#new_performers.extend(fetch_performers_by_astro(existed_performer_hrefs))
#new_performers.extend(fetch_performers_by_birth(existed_performer_hrefs))
new_performers.extend(fetch_performers_by_ethnic(existed_performer_hrefs))
# Fetch each new performer's detail page and write it to the DB
new_performers = list({item["href"]: item for item in new_performers}.values())
logging.info(f'get new performers count: {len(new_performers)} ')
for performer in new_performers:
url = performer['href']
person = performer['person']
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
if soup:
data, credits = scraper.parse_page_performer(soup)
if data:
performer_id = utils.insert_or_update_performer({
'href': url,
'person': person,
**data
})
if performer_id:
logging.info(f'insert one person, id: {performer_id}, person: {person}, url: {url}')
else:
logging.warning(f'insert person: {person} {url} failed.')
# Also write a local JSON file
func.write_person_json(person, url, {
'href': url,
'person': person,
**data,
'credits': credits if credits else {}
})
else:
logging.warning(f'parse_page_performer error. person: {person}, url: {url}')
else:
logging.warning(f'fetch_page error. person: {person}, url: {url}')
# break early while debugging
if debug:
break
# Load the distributors list from the database
existed_distributors_href = utils.query_distributor_hrefs()
if existed_distributors_href is None:
logging.warning(f'get existed distributors from db error.')
return
new_distributors = fetch_distributors_list(existed_distributors_href)
for dist in new_distributors:
dist_id = utils.insert_or_update_distributor(dist)
if dist_id:
logging.info(f"insert one distributor record, id: {dist_id}, name: {dist['name']}, href: {dist['href']}")
else:
logging.warning(f"insert into distributor failed. name: {dist['name']} href: {dist['href']}")
# Load the studios list from the database
existed_studios_href = utils.query_studio_hrefs()
if existed_studios_href is None:
logging.warning(f'get existed studios from db error.')
return
new_studios = fetch_studios_list(existed_studios_href)
for stu in new_studios:
stu_id = utils.insert_or_update_studio(stu)
if stu_id:
logging.info(f"insert one studio record, id: {stu_id}, name: {stu['name']}, href: {stu['href']}")
else:
logging.warning(f"insert into studio failed. name: {stu['name']}, href: {stu['href']}")
# Load the movie list from the database
existed_movies = utils.query_movie_hrefs()
if existed_movies is None:
logging.warning(f'load movies from db error')
return
new_movies = []
new_movie_hrefs = []
# Iterate over all distributors and collect their movie lists
existed_distributors_href = utils.query_distributor_hrefs(name='vixen')
if existed_distributors_href is None:
logging.warning(f'get existed distributors from db error.')
return
for url in existed_distributors_href:
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
if list_data:
for movie in list_data:
if movie['href'] in existed_movies:
continue
new_movies.append({
'title' : movie['title'],
'href' : movie['href']
})
new_movie_hrefs.append(movie['href'])
else :
logging.warning(f'parse_page_dist_stu error. url: {url}')
# break early while debugging
if debug:
break
logging.info(f'all new movies found for distributors, now total new {len(new_movies)}')
# Iterate over all studios and collect their movie lists
existed_studios_href = utils.query_studio_hrefs(name='vixen')
if existed_studios_href is None:
logging.warning(f'get existed studios from db error.')
return
for url in existed_studios_href:
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
if list_data:
for movie in list_data:
if movie['href'] in existed_movies or movie['href'] in new_movie_hrefs:
continue
new_movies.append({
'title' : movie['title'],
'href' : movie['href']
})
new_movie_hrefs.append(movie['href'])
else :
logging.warning(f'parse_page_dist_stu error. url: {url}')
# break early while debugging
if debug:
break
logging.info(f'all new movies found for studios, now total new {len(new_movies)}')
# Fetch details for each new movie
new_movies = list({item["href"]: item for item in new_movies}.values())
logging.info(f'get merged new movies, count: {len(new_movies)} ')
for movie in new_movies:
url = movie['href']
title = movie['title']
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
if soup:
movie_data = scraper.parse_page_movie(soup, url, title)
if movie_data :
movie_id = utils.insert_or_update_movie(movie_data)
if movie_id:
logging.info(f'insert one movie, id: {movie_id}, title: {title} url: {url}')
else:
logging.warning(f'insert movie {url} failed.')
# Also write a local JSON file
func.write_movie_json(url, movie_data)
else:
logging.warning(f'parse_page_movie error. url: {url}')
else:
logging.warning(f'fetch_page error. url: {url}')
# break early while debugging
if debug:
break
logging.info('all processing completed!')
if __name__ == "__main__":
check_update()
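A small standalone sketch (not part of the commit) of the dedup idiom check_update() uses twice above: building a dict keyed by "href" keeps one entry per href, and .values() gives the de-duplicated list back.

records = [
    {"href": "https://www.iafd.com/title.rme/id=a", "title": "A"},
    {"href": "https://www.iafd.com/title.rme/id=a", "title": "A (duplicate)"},
    {"href": "https://www.iafd.com/title.rme/id=b", "title": "B"},
]
unique = list({item["href"]: item for item in records}.values())
print(len(unique))  # 2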

View File

@@ -0,0 +1,513 @@
import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
import config
# Base URLs and variable parameters
host_url = "https://www.iafd.com"
astr_base_url = f"{host_url}/astrology.rme/sign="
astro_list = ['Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces']
birth_base_url = "https://www.iafd.com/calendar.asp?calmonth={month}&calday={day}"
ethnic_url = f"{host_url}/lookupethnic.rme/ethnic="
ethnic_list = ['caucasian', 'black', 'asian', 'latin', 'native american', 'middle eastern', 'mediteranean', 'indian', 'polynesian', 'multi-ethnic', 'ethnic', 'romani', 'eurasian', 'north african', 'south asian']
distributors_list_url = f'{host_url}/distrib.asp'
distributors_base_url = f"{host_url}/distrib.rme/distrib="
studios_list_url = f"{host_url}/studio.asp"
studios_base_url = f"{host_url}/studio.rme/studio="
# Request headers and the cloudscraper session
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
# Fetch a page with cloudscraper, validate it, and optionally apply a custom parser and preprocessor
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
for attempt in range(max_retries):
try:
response = scraper.get(url, headers=headers)
response.raise_for_status() # raise on HTTP errors
# Preprocess the HTML if a preprocessor was provided
html_text = preprocessor(response.text) if preprocessor else response.text
soup = BeautifulSoup(html_text, parser)
if validator(soup): # custom page validation
return soup
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
except cloudscraper.exceptions.CloudflareChallengeError as e:
logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
except cloudscraper.exceptions.CloudflareCode1020 as e:
logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
except Exception as e:
logging.error(f"Unexpected error on {url}: {e}, Retrying...")
logging.error(f'Fetching failed after max retries. {url}')
return None # still failing after the maximum number of retries
# Fix up the HTML: drop stray <br> tags and patch <a> tags (needed when fetching the ethnicity pages)
def preprocess_html(html):
return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')
# Generic HTML structure validator
def generic_validator(soup, tag, identifier, attr_type="id"):
if attr_type == "id":
return soup.find(tag, id=identifier) is not None
elif attr_type == "class":
return bool(soup.find_all(tag, class_=identifier))
elif attr_type == "name":
return bool(soup.find('select', {'name': identifier}))
return False
# Check that the movie info table exists
def movie_validator(soup, table_id):
return soup.find("table", id=table_id) is not None
# Parse the HTML content and extract the data we need
def parse_page_astro(soup, astro):
astro_div = soup.find("div", id="astro")
if not astro_div:
logging.warning(f"Warning: No 'astro' div found in {astro}")
return None, None
flag = False
list_cnt = 0
list_data = []
next_url = None
birth_date = None
for elem in astro_div.find_all(recursive=False):
if elem.name == "h3" and "astroday" in elem.get("class", []):
birth_date = elem.get_text(strip=True)
elif elem.name == "div" and "perficon" in elem.get("class", []):
a_tag = elem.find("a")
if a_tag:
href = host_url + a_tag["href"]
name = a_tag.find("span", class_="perfname")
if name:
list_data.append({
"astrology": astro,
"birth_date": birth_date,
"person": name.get_text(strip=True),
"href": href
})
flag = True
list_cnt = list_cnt +1
if flag:
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
return list_data, next_url
else:
return None, None
# Parse the birthday page content and collect entries
def parse_page_birth(soup, month, day):
datarows = soup.find_all('div', class_='col-sm-12 col-lg-9')
if not datarows:
return None, None
flag = False
list_cnt = 0
list_data = []
next_url = None
rows = datarows[0].find_all('div', class_='col-sm-4')
for row in rows:
link_tag = row.find('a')
person = link_tag.text.strip() if link_tag else ''
href = link_tag['href'] if link_tag else ''
href = host_url + href
# Skip if this href is already in the list
flag = True
if any(entry['href'] == href for entry in list_data):
continue
# Add the entry to the list
list_data.append({
'month': month,
'day': day,
'person': person,
'href': href
})
list_cnt = list_cnt +1
if flag:
logging.debug(f"get {list_cnt} persons from this page. total persons: {len(list_data)}")
return list_data, next_url
else:
return None, None
# Parse the HTML content and extract the data we need
def parse_page_ethnic(soup, ethnic):
rows = soup.find_all('div', class_='row headshotrow')
flag = False
list_data = []
next_url = None
for row in rows:
for col in row.find_all('div', class_='col-lg-2 col-md-3 col-sm-4 col-xs-6'):
link_tag = col.find('a')
img_tag = col.find('div', class_='pictag')
flag = True
if link_tag and img_tag:
href = host_url + link_tag['href']
person = img_tag.text.strip()
# Store the entry
list_data.append({
'ethnic': ethnic,
'person': person,
'href': href
})
if flag:
logging.debug(f"get {len(list_data)} persons from this page.")
next_page = soup.find('a', rel='next')
if next_page:
next_url = host_url + next_page['href']
logging.debug(f"Found next page: {next_url}")
return list_data, next_url
else:
logging.debug(f"All pages fetched for {ethnic}.")
return list_data, None
else:
return None, None
# Parse the distributor/studio list page
def parse_page_dist_stu_list(soup, select_name):
list_data = []
next_url = None
select_element = soup.find('select', {'name': select_name})
if select_element :
options = select_element.find_all('option')
for option in options:
value = option.get('value') # the value attribute
text = option.text.strip() # the text content
list_data.append({
'name' : text,
'href' : str(value)
})
return list_data, next_url
else:
return None, None
# Parse the distributor/studio movie table
def parse_page_dist_stu(soup, table_id):
table = soup.find("table", id=table_id)
if not table:
logging.warning(f"Warning: No {table_id} table found ")
return None, None
# Find the thead and drop it
thead = table.find('thead')
if thead:
thead.decompose() # remove the thead; it does not need parsing
# Only the tbody remains now
tbody = table.find('tbody')
rows = tbody.find_all('tr') if tbody else []
list_data = []
next_url = None
for row in rows:
cols = row.find_all('td')
if len(cols) >= 5:
title = cols[0].text.strip()
label = cols[1].text.strip()
year = cols[2].text.strip()
rev = cols[3].text.strip()
a_href = cols[0].find('a')
href = host_url + a_href['href'] if a_href else ''
list_data.append({
'title': title,
'label': label,
'year': year,
'rev': rev,
'href': href
})
return list_data, next_url
# Parse a credits table (covers both performer and directorial credits)
def parse_credits_table(table, distributor_list):
# Find the thead and drop it
thead = table.find('thead')
if thead:
thead.decompose() # remove the thead; it does not need parsing
# Only the tbody remains now
tbody = table.find('tbody')
rows = tbody.find_all('tr') if tbody else []
movies = []
distributor_count = {key: 0 for key in distributor_list} # initialise a counter per distributor
# rows = table.find_all('tr', class_='we')
for row in rows:
cols = row.find_all('td')
if len(cols) >= 6:
title = cols[0].text.strip()
year = cols[1].text.strip()
distributor = cols[2].text.strip().lower()
notes = cols[3].text.strip()
rev = cols[4].text.strip()
formats = cols[5].text.strip()
for key in distributor_list:
if key in distributor:
distributor_count[key] += 1
movies.append({
'title': title,
'year': year,
'distributor': distributor,
'notes': notes,
'rev': rev,
'formats': formats
})
return movies, distributor_count
# Parse a performer page and extract the data we need
def parse_page_performer(soup):
# Extracted fields
data = {}
# The field names we want and their corresponding labels in the HTML
fields = {
'performer_aka': 'Performer AKA',
'birthday': 'Birthday',
'astrology': 'Astrology',
'birthplace': 'Birthplace',
'gender': 'Gender',
'years_active': 'Years Active',
'ethnicity': 'Ethnicity',
'nationality': 'Nationality',
'hair_colors': 'Hair Colors',
'eye_color': 'Eye Color',
'height': 'Height',
'weight': 'Weight',
'measurements': 'Measurements',
'tattoos': 'Tattoos',
'piercings': 'Piercings'
}
reversed_map = {v: k for k, v in fields.items()}
# Parse the credit tables: performer credits and directorial credits
role_list = ['personal', 'directoral']
distributor_list = ['vixen', 'blacked', 'tushy', 'x-art']
credits_list = {}
# Track statistics in a dict
distributor_count = {key: 0 for key in distributor_list} # initialise a counter per distributor
for role in role_list:
table = soup.find('table', id=role)
if table :
movies, stat_map = parse_credits_table(table, distributor_list)
credits_list[role] = movies
# Update the distributor counts
for distributor in distributor_list:
distributor_count[distributor] += stat_map.get(distributor, 0)
# Count the movies
#movies_cnt = sum(len(credits_list[role]) for role in role_list if credits_list[role])
movies_cnt = sum(len(credits_list.get(role, [])) for role in role_list if credits_list.get(role, []))
# Nothing found
if len(credits_list) == 0:
logging.warning("movie credit tables are empty.")
# Walk every bioheading to collect the metadata
bioheadings = soup.find_all('p', class_='bioheading')
for bio in bioheadings:
heading = bio.text.strip()
biodata = None
# Entries containing "Performer" need special handling
if 'Performer' in heading:
heading = 'Performer AKA'
biodata_div = bio.find_next('div', class_='biodata')
if biodata_div:
div_text = biodata_div.get_text(separator='|').strip()
biodata = [b.strip() for b in div_text.split('|') if b.strip()]
else:
biodata = bio.find_next('p', class_='biodata').text.strip() if bio.find_next('p', class_='biodata') else ''
# Store the value
if heading in reversed_map:
kkey = reversed_map[heading]
data[kkey] = biodata
# Add the counters to data
data['movies_cnt'] = movies_cnt
data['vixen_cnt'] = distributor_count['vixen']
data['blacked_cnt'] = distributor_count['blacked']
data['tushy_cnt'] = distributor_count['tushy']
data['x_art_cnt'] = distributor_count['x-art']
return data, credits_list
# Parse a movie page's HTML and extract the movie info
def parse_page_movie(soup, href, title):
# Basic movie info
movie_data = {}
info_div = soup.find("div", class_="col-xs-12 col-sm-3")
if info_div:
labels = info_div.find_all("p", class_="bioheading")
values = info_div.find_all("p", class_="biodata")
for label, value in zip(labels, values):
key = label.text.strip()
val = value.text.strip()
if key in ["Distributor", "Studio", "Director"]:
link = value.find("a")
if link:
val = link.text.strip()
movie_data[f'{key}Href'] = host_url + link['href']
movie_data[key] = val
else:
return None
# Cast information
performers = []
cast_divs = soup.find_all("div", class_="castbox")
for cast in cast_divs:
performer = {}
link = cast.find("a")
if link:
performer["name"] = link.text.strip()
performer["href"] = host_url + link["href"]
performer["tags"] = [
tag.strip() for br in cast.find_all("br")
if (tag := br.next_sibling) and isinstance(tag, str) and tag.strip()
]
#performer["tags"] = [br.next_sibling.strip() for br in cast.find_all("br") if br.next_sibling and (br.next_sibling).strip()]
performers.append(performer)
# Scene breakdowns
scene_breakdowns = []
scene_table = soup.find("div", id="sceneinfo")
if scene_table:
rows = scene_table.find_all("tr")
for row in rows:
cols = row.find_all("td")
if len(cols) >= 2:
scene = cols[0].text.strip() # scene number
performer_info = cols[1] # performers plus link info
# Take the full HTML before the first <br> (keeps <i> tags and other formatting)
performer_html = str(performer_info) # the cell's full HTML
split_html = performer_html.split("<br/>") # split on <br/>
if len(split_html) == 1:
split_html = performer_html.split("<br>") # fall back to splitting on <br>
performers_html = split_html[0].strip() # the part before the first <br>, or the whole cell if there is none
# Strip the HTML tags and keep only the text
performers_soup = BeautifulSoup(performers_html, "html.parser")
performers_text = performers_soup.get_text()
# Extract the performers
scene_performers = [p.strip() for p in performers_text.split(",")]
# Try to pick up the `webscene` and `studio` links
links_data = {}
links = performer_info.find_all("a")
if links:
webscene_title = links[0].text.strip() if len(links)>0 else None
webscene = links[0]["href"] if len(links)>0 else None
studio = links[1].text.strip() if len(links)>1 else None
studio_lnk = links[1]["href"] if len(links)>1 else None
links_data = {
"title": webscene_title,
"webscene": webscene,
"studio": studio,
"studio_lnk": studio_lnk,
}
scene_data = {
"scene": scene,
"performers": scene_performers,
**links_data,
}
scene_breakdowns.append(scene_data)
appears_in = []
appears_divs = soup.find("div", id="appearssection")
if appears_divs:
rows = appears_divs.find_all("li")
for row in rows:
lnk = row.find("a")
if lnk:
appears_in.append({'title': lnk.text.strip(), 'href': host_url + lnk['href']})
return {
"href": href,
"title": title,
"Minutes": movie_data.get("Minutes", ""),
"Distributor": movie_data.get("Distributor", ""),
"Studio": movie_data.get("Studio", ""),
"ReleaseDate": movie_data.get("Release Date", ""),
"AddedtoIAFDDate": movie_data.get("Date Added to IAFD", ""),
"All-Girl": movie_data.get("All-Girl", ""),
"All-Male": movie_data.get("All-Male", ""),
"Compilation": movie_data.get("Compilation", ""),
"Webscene": movie_data.get("Webscene", ""),
"Director": movie_data.get("Director", ""),
"DirectorHref": movie_data.get("DirectorHref", ""),
"DistributorHref": movie_data.get("DistributorHref", ""),
"StudioHref": movie_data.get("StudioHref", ""),
"Performers": performers,
"SceneBreakdowns": scene_breakdowns,
"AppearsIn": appears_in,
}
if __name__ == "__main__":
for astro in astro_list:
url = astr_base_url + astro
next_url = url
logging.info(f"Fetching data for {astro}, url {url} ...")
while True:
soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="astro", attr_type="id"))
if soup:
list_data, next_url = parse_page_astro(soup, astro)
if list_data:
print(list_data[0] if len(list_data)>0 else 'no data')
break
else:
logging.info(f"Retrying {next_url} ...")
time.sleep(5) # wait before retrying
time.sleep(2) # throttle the request rate
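For reference, a minimal sketch (not part of the commit) of how fetch_page and generic_validator compose: fetch.py in this commit binds the validator's arguments with functools.partial and passes the result in, exactly as below.

from functools import partial
import iafd_scraper as scraper

url = scraper.astr_base_url + "Aries"
validator = partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id")
soup = scraper.fetch_page(url, validator)
if soup:
    people, _ = scraper.parse_page_astro(soup, "Aries")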

View File

@@ -0,0 +1,459 @@
import sqlite3
import json
import config
import utils
import logging
from datetime import datetime
# Connect to the SQLite database
DB_PATH = f"{config.global_share_data_dir}/shared.db" # change this to your database file
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# Current timestamp
def get_current_time():
return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# Insert or update a performer
def insert_or_update_performer(data):
try:
cursor.execute("""
INSERT INTO performers (href, name, gender, birthday, astrology, birthplace, years_active, ethnicity, nationality, hair_colors,
eye_color, height_str, weight_str, measurements, tattoos, piercings, weight, height, movies_cnt, vixen_cnt,
blacked_cnt, tushy_cnt, x_art_cnt, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
name = excluded.name,
gender = excluded.gender,
birthday = excluded.birthday,
astrology = excluded.astrology,
birthplace = excluded.birthplace,
years_active = excluded.years_active,
ethnicity = excluded.ethnicity,
nationality = excluded.nationality,
hair_colors = excluded.hair_colors,
eye_color = excluded.eye_color,
height_str = excluded.height_str,
weight_str = excluded.weight_str,
measurements = excluded.measurements,
tattoos = excluded.tattoos,
piercings = excluded.piercings,
weight = excluded.weight,
height = excluded.height,
movies_cnt = excluded.movies_cnt,
vixen_cnt = excluded.vixen_cnt,
blacked_cnt = excluded.blacked_cnt,
tushy_cnt = excluded.tushy_cnt,
x_art_cnt = excluded.x_art_cnt,
updated_at = datetime('now', 'localtime')
""", (
data["href"], data["person"], data.get("gender"), data.get("birthday"), data.get("astrology"), data.get("birthplace"), data.get("years_active"),
data.get("ethnicity"), data.get("nationality"), data.get("hair_colors"), data.get("eye_color"), data.get("height"),
data.get("weight"), data.get("measurements"), data.get("tattoos"), data.get("piercings"), utils.parse_weight(data.get('weight')), utils.parse_height(data.get('height')),
data.get("movies_cnt", 0), data.get("vixen_cnt", 0), data.get("blacked_cnt", 0), data.get("tushy_cnt", 0), data.get("x_art_cnt", 0)
))
# Fetch the performer_id
cursor.execute("SELECT id FROM performers WHERE href = ?", (data["href"],))
performer_id = cursor.fetchone()[0]
# Delete the old aliases
cursor.execute("DELETE FROM performer_aliases WHERE performer_id = ?", (performer_id,))
# Insert the new aliases
for alias in data.get("performer_aka", []):
if alias.lower() != "no known aliases":
cursor.execute("INSERT INTO performer_aliases (performer_id, alias) VALUES (?, ?)", (performer_id, alias))
conn.commit()
logging.debug(f"Inserted/updated performer: {data['person']}")
return performer_id
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Database error: {e}")
return None
except Exception as e:
conn.rollback()
logging.error(f"Unexpected error: {e}")
return None
# Delete a performer by id or href
def delete_performer(identifier):
try:
if isinstance(identifier, int):
cursor.execute("DELETE FROM performers WHERE id = ?", (identifier,))
elif isinstance(identifier, str):
cursor.execute("DELETE FROM performers WHERE href = ?", (identifier,))
else:
logging.warning("Invalid delete parameter")
return
conn.commit()
logging.info(f"Deleted performer: {identifier}")
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Delete failed: {e}")
# Query performer info by id, href, or name
def query_performer(identifier):
try:
if isinstance(identifier, int):
cursor.execute("SELECT * FROM performers WHERE id = ?", (identifier,))
elif "http" in identifier:
cursor.execute("SELECT * FROM performers WHERE href = ?", (identifier,))
else:
cursor.execute("SELECT * FROM performers WHERE name LIKE ?", (f"%{identifier}%",))
performer = cursor.fetchone()
if performer:
cursor.execute("SELECT alias FROM performer_aliases WHERE performer_id = ?", (performer[0],))
aliases = [row[0] for row in cursor.fetchall()]
result = dict(zip([desc[0] for desc in cursor.description], performer))
result["performer_aka"] = aliases
return result
else:
logging.warning(f"Performer not found: {identifier}")
return None
except sqlite3.Error as e:
logging.error(f"Query failed: {e}")
return None
# Query the href list with optional filters
def query_performer_hrefs(**filters):
try:
sql = "SELECT href FROM performers WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "href" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "name" in filters:
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
cursor.execute(sql, params)
return [row[0] for row in cursor.fetchall()]
except sqlite3.Error as e:
logging.error(f"href query failed: {e}")
return None
# Insert or update a distributor
def insert_or_update_distributor(data):
try:
cursor.execute("""
INSERT INTO distributors (name, href, updated_at)
VALUES (?, ? , datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
name = excluded.name,
updated_at = datetime('now', 'localtime')
""", (data["name"], data["href"]))
conn.commit()
# Fetch the distributor id
cursor.execute("SELECT id FROM distributors WHERE href = ?", (data["href"],))
dist_id = cursor.fetchone()[0]
if dist_id:
logging.debug(f"Inserted/updated distributor: {data['name']}")
return dist_id
else:
return None
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Database error: {e}")
return None
# Delete a distributor (by id or name)
def delete_distributor(identifier):
try:
if isinstance(identifier, int):
cursor.execute("DELETE FROM distributors WHERE id = ?", (identifier,))
elif isinstance(identifier, str):
cursor.execute("DELETE FROM distributors WHERE name = ?", (identifier,))
conn.commit()
logging.info(f"Deleted distributor: {identifier}")
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Delete failed: {e}")
# Query a distributor (by id or name)
def query_distributor(identifier):
try:
if isinstance(identifier, int):
cursor.execute("SELECT * FROM distributors WHERE id = ?", (identifier,))
else:
cursor.execute("SELECT * FROM distributors WHERE name LIKE ?", (f"%{identifier}%",))
distributor = cursor.fetchone()
if distributor:
return dict(zip([desc[0] for desc in cursor.description], distributor))
else:
logging.warning(f"Distributor not found: {identifier}")
return None
except sqlite3.Error as e:
logging.error(f"Query failed: {e}")
return None
# Query the href list with optional filters
def query_distributor_hrefs(**filters):
try:
sql = "SELECT href FROM distributors WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "href" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "name" in filters:
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
cursor.execute(sql, params)
return [row[0] for row in cursor.fetchall()]
except sqlite3.Error as e:
logging.error(f"href query failed: {e}")
return None
# Insert or update a studio
def insert_or_update_studio(data):
try:
cursor.execute("""
INSERT INTO studios (name, href, updated_at)
VALUES (?, ?, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
name = excluded.name,
updated_at = datetime('now', 'localtime')
""", (data["name"], data["href"]))
conn.commit()
# Fetch the studio id
cursor.execute("SELECT id FROM studios WHERE href = ?", (data["href"],))
stu_id = cursor.fetchone()[0]
if stu_id:
logging.debug(f"Inserted/updated studio: {data['name']}")
return stu_id
else:
return None
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Database error: {e}")
return None
# Delete a studio (by id or name)
def delete_studio(identifier):
try:
if isinstance(identifier, int):
cursor.execute("DELETE FROM studios WHERE id = ?", (identifier,))
elif isinstance(identifier, str):
cursor.execute("DELETE FROM studios WHERE name = ?", (identifier,))
conn.commit()
logging.info(f"Deleted studio: {identifier}")
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Delete failed: {e}")
# Query a studio (by id or name)
def query_studio(identifier):
try:
if isinstance(identifier, int):
cursor.execute("SELECT * FROM studios WHERE id = ?", (identifier,))
else:
cursor.execute("SELECT * FROM studios WHERE name LIKE ?", (f"%{identifier}%",))
studio = cursor.fetchone()
if studio:
return dict(zip([desc[0] for desc in cursor.description], studio))
else:
logging.warning(f"Studio not found: {identifier}")
return None
except sqlite3.Error as e:
logging.error(f"Query failed: {e}")
return None
# Query the href list with optional filters
def query_studio_hrefs(**filters):
try:
sql = "SELECT href FROM studios WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "href" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "name" in filters:
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
cursor.execute(sql, params)
return [row[0] for row in cursor.fetchall()]
except sqlite3.Error as e:
logging.error(f"href query failed: {e}")
return None
# Look up an id by href in the given table
def get_id_by_href(table: str, href: str) -> int:
cursor.execute(f"SELECT id FROM {table} WHERE href = ?", (href,))
row = cursor.fetchone()
return row[0] if row else None
# Insert or update a movie record
def insert_or_update_movie(movie_data):
try:
# Resolve the related ids
distributor_id = get_id_by_href('distributors', movie_data['DistributorHref'])
studio_id = get_id_by_href('studios', movie_data['StudioHref'])
director_id = get_id_by_href('performers', movie_data['DirectorHref'])
# Insert or update the movie row
cursor.execute(
"""
INSERT INTO movies (title, minutes, distributor_id, studio_id, release_date, added_to_IAFD_date,
all_girl, all_male, compilation, webscene, director_id, href, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
title=excluded.title, minutes=excluded.minutes, distributor_id=excluded.distributor_id,
studio_id=excluded.studio_id, release_date=excluded.release_date,
added_to_IAFD_date=excluded.added_to_IAFD_date, all_girl=excluded.all_girl,
all_male=excluded.all_male, compilation=excluded.compilation, webscene=excluded.webscene,
director_id=excluded.director_id, updated_at = datetime('now', 'localtime')
""",
(movie_data['title'], movie_data['Minutes'], distributor_id, studio_id, movie_data['ReleaseDate'],
movie_data['AddedtoIAFDDate'], movie_data['All-Girl'], movie_data['All-Male'],
movie_data['Compilation'], movie_data['Webscene'], director_id, movie_data['href'])
)
conn.commit()
logging.info("Movie inserted/updated: %s", movie_data['title'])
# Fetch the movie_id that was just inserted
cursor.execute("SELECT id FROM movies WHERE href = ?", (movie_data['href'],))
movie_id = cursor.fetchone()[0]
# Populate the performers_movies relation table
for performer in movie_data.get('Performers', []):
performer_id = get_id_by_href('performers', performer['href'])
if performer_id:
notes = '|'.join(performer['tags'])
cursor.execute(
"""
INSERT INTO performers_movies (performer_id, movie_id, role, notes)
VALUES (?, ?, ?, ?)
ON CONFLICT(movie_id, performer_id) DO UPDATE SET notes=excluded.notes
""",
(performer_id, movie_id, "Actor", notes)
)
logging.debug(f"Performer {performer['href']} linked to movie: {movie_data['title']}")
else:
logging.warning(f"missing performer, url {performer['href']}, in movie: ({movie_data['title']}) {movie_data['href']}")
# Populate the movies_appers_in table
for appears in movie_data.get("AppearsIn", []):
appears_in_id = get_id_by_href('movies', appears['href'])
if appears_in_id:
cursor.execute("""
INSERT INTO movies_appers_in (movie_id, appears_in_id, gradation, notes)
VALUES (?, ?, ?, ?)
ON CONFLICT(movie_id, appears_in_id) DO NOTHING
""", (movie_id, appears_in_id, 1, appears["title"]))
else:
logging.warning(f"missing AppearsIn movie in movies table, parent_url {appears['href']}, in movie: ({movie_data['title']}) {movie_data['href']}")
conn.commit()
return movie_id
except Exception as e:
conn.rollback()
logging.error("Error inserting movie: %s", e)
return None
# Delete a movie
def delete_movie(identifier):
try:
if isinstance(identifier, int):
cursor.execute("DELETE FROM movies WHERE id = ?", (identifier,))
elif isinstance(identifier, str):
cursor.execute("DELETE FROM movies WHERE href = ?", (identifier,))
else:
logging.warning("Invalid delete parameter")
return
conn.commit()
logging.info(f"Deleted movie with {identifier}")
except sqlite3.Error as e:
conn.rollback()
logging.error("Error deleting movie: %s", e)
# Query a movie
def query_movies(identifier):
try:
if isinstance(identifier, int):
cursor.execute("SELECT * FROM movies WHERE id = ?", (identifier,))
elif "http" in identifier:
cursor.execute("SELECT * FROM movies WHERE href = ?", (identifier,))
else:
cursor.execute("SELECT * FROM movies WHERE title LIKE ?", (f"%{identifier}%",))
movie = cursor.fetchone()
if movie:
# capture the movie column names before the relation query overwrites cursor.description
movie_cols = [desc[0] for desc in cursor.description]
cursor.execute("SELECT performer_id FROM performers_movies WHERE movie_id = ?", (movie[0],))
performers = [row[0] for row in cursor.fetchall()]
result = dict(zip(movie_cols, movie))
result["performers"] = performers
return result
else:
logging.warning(f"no matching movie found: {identifier}")
return None
except sqlite3.Error as e:
logging.error(f"Query failed: {e}")
return None
# Query the href list with optional filters
def query_movie_hrefs(**filters):
try:
sql = "SELECT href FROM movies WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "href" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "title" in filters:
sql += " AND title LIKE ?"
params.append(f"%{filters['title']}%")
cursor.execute(sql, params)
return [row[0] for row in cursor.fetchall()]
except sqlite3.Error as e:
logging.error(f"href query failed: {e}")
return []
if __name__ == "__main__":
try:
with open('../result/detail.json', 'r') as file:
performers = json.load(file)
for performer in performers:
insert_or_update_performer(performer)
print(query_performer("Kirsten"))
#delete_performer("https://www.iafd.com/person.rme/id=ca699282-1b57-4ce7-9bcc-d7799a292e34")
print(query_performer_hrefs())
except FileNotFoundError:
logging.info("detail.json not found, starting fresh.")
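A hypothetical usage sketch of the distributor helpers above (the name and href come from the JSON samples earlier in this commit; the call pattern mirrors how fetch.py uses this module):

import sqlite_utils as utils

dist_id = utils.insert_or_update_distributor({
    "name": "Exotic Vixen Films",
    "href": "https://www.iafd.com/distrib.rme/distrib=12229/exotic%2dvixen%2dfilms.htm",
})
print(dist_id, utils.query_distributor_hrefs(name="vixen"))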

92
scripts/iafd/src/utils.py Normal file
View File

@@ -0,0 +1,92 @@
import re
import os
import json
import time
import csv
import logging
# Parse height and weight strings into numbers
def parse_height(height_str):
try:
return int(height_str.split("(")[-1].replace(" cm)", ""))
except:
return None
def parse_weight(weight_str):
try:
return int(weight_str.split(" ")[0])
except:
return None
update_dir = '../result'
performers_dir = f'{update_dir}/performers'
movies_dir = f'{update_dir}/movies'
def uniq_performers(new_performers):
try:
if not isinstance(new_performers, list):
raise TypeError(f"new_performers should be a list, but got {type(new_performers)}")
seen = set()
unique_performers = []
for item in new_performers:
if not item or item['href'] is None:
raise ValueError(f"Invalid item in new_performers: {item}")
if item["href"] not in seen:
seen.add(item["href"])
unique_performers.append(item)
return unique_performers
except Exception as e:
logging.error(f"Error in uniq_performers: {e}")
return [] # return an empty list so the caller does not crash
# Create the sub-directory
def create_sub_directory(base_dir, name):
# Use the first letter of the name, lower-cased, as the sub-directory
sub_dir = name[:1].lower()
full_path = os.path.join(base_dir, sub_dir)
if not os.path.exists(full_path):
os.makedirs(full_path)
return full_path
# Extract the id value from e.g. https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586
def extract_id_from_href(href):
"""Extract the id parameter from an href."""
match = re.search(r'id=([a-f0-9\-]+)', href)
return match.group(1) if match else ''
# Write each performer to its own JSON file
def write_person_json(person, href, data):
# Resolve the output directory
person_dir = create_sub_directory(performers_dir, person)
person_id = extract_id_from_href(href)
person_filename = f"{person.replace(' ', '-')}({person_id}).json" # spaces replaced with -
full_path = os.path.join(person_dir, person_filename)
try:
with open(full_path, 'w', encoding='utf-8') as json_file:
json.dump(data, json_file, indent=4, ensure_ascii=False)
except Exception as e:
logging.error(f"Error writing file {full_path}: {e}")
# Write each movie to its own JSON file
def write_movie_json(href, data):
# Resolve the output directory
movie_id = extract_id_from_href(href)
person_dir = create_sub_directory(movies_dir, movie_id)
person_filename = f"{movie_id}.json"
full_path = os.path.join(person_dir, person_filename)
try:
with open(full_path, 'w', encoding='utf-8') as json_file:
json.dump(data, json_file, indent=4, ensure_ascii=False)
except Exception as e:
logging.error(f"Error writing file {full_path}: {e}")
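A hypothetical usage sketch of these helpers (the href is the example already cited in the comment above extract_id_from_href; "Jane Doe" is a made-up name for illustration, and output paths assume the ../result layout defined at the top of this file):

import utils

href = "https://www.iafd.com/person.rme/id=21898a3c-1ddd-4793-8d93-375d6db20586"
print(utils.extract_id_from_href(href))  # 21898a3c-1ddd-4793-8d93-375d6db20586
# writes Jane-Doe(<id>).json under ../result/performers/j/
utils.write_person_json("Jane Doe", href, {"person": "Jane Doe", "href": href})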