modify some scripts.

This commit is contained in:
2025-03-06 16:55:28 +08:00
parent 977338a281
commit 6cebf3f8ac
9 changed files with 1718 additions and 287 deletions


@@ -0,0 +1,26 @@
import logging
import os
import inspect
from datetime import datetime
global_share_data_dir = '/root/sharedata'
global_host_data_dir = '/root/hostdir/scripts_data'
# Configure logging
def setup_logging(log_filename=None):
# If no log_filename is given, use the calling script's name as the log file name
if log_filename is None:
# Get the file name of the script that called setup_logging
caller_frame = inspect.stack()[1]
caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
# Current date in yyyymmdd format
current_date = datetime.now().strftime('%Y%m%d')
# Build the log file name, inserting the date before the extension
log_filename = f'../log/{caller_filename}_{current_date}.log'
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
handlers=[
logging.FileHandler(log_filename),
logging.StreamHandler()
])
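A minimal usage sketch (illustrative, not part of this commit) of how a sibling script such as fetch.py initializes logging through this module; it assumes the ../log directory already exists, since logging.FileHandler does not create missing directories.

import logging
import config

config.setup_logging()  # writes to ../log/<caller_script>_<yyyymmdd>.log and to stdout
logging.info('script started')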

scripts/javdb/src/fetch.py Normal file

@@ -0,0 +1,271 @@
import json
import time
import csv
import argparse
import logging
from functools import partial
import config
import sqlite_utils as db_tools
import scraper
import utils
config.setup_logging()
debug = False
force = False
# Fetch the actor list
def fetch_actor_list():
next_url = scraper.actors_uncensored_base_url
while next_url:
logging.info(f'fetching page {next_url}')
soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="actors", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_actors_uncensored(soup, next_url)
if list_data :
# Write to the database
for row in list_data:
actor_id = db_tools.insert_actor_index(name=row['name'], href=row['href'] if row['href'] else '')
if actor_id:
logging.debug(f'insert performer index to db. performer_id:{actor_id}, name: {row["name"]}, href:{row["href"]}')
else:
logging.warning(f'insert performer index failed. name: {row["name"]}, href:{row["href"]}')
else:
logging.warning(f'fetch actor error. {next_url} ...')
# Fetch the makers list
def fetch_makers_list():
next_url = scraper.makers_uncensored_base_url
while next_url:
logging.info(f'fetching page {next_url}')
soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="makers", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_makers_uncensored(soup, next_url)
if list_data :
# Write to the database
for row in list_data:
maker_id = db_tools.insert_or_update_makers(row)
if maker_id:
logging.debug(f'insert maker to db. maker_id:{maker_id}, name: {row["name"]}, href:{row["href"]}')
else:
logging.warning(f'insert maker failed. name: {row["name"]}, href:{row["href"]}')
else:
logging.warning(f'fetch maker error. {next_url} ...')
# Fetch the series list
def fetch_series_list():
next_url = scraper.series_uncensored_base_url
while next_url:
logging.info(f'fetching page {next_url}')
soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="series", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_series_uncensored(soup, next_url)
if list_data :
# Write to the database
for row in list_data:
series_id = db_tools.insert_or_update_series(row)
if series_id:
logging.debug(f'insert series to db. series_id:{series_id}, name: {row["name"]}, href:{row["href"]}')
else:
logging.warning(f'insert series failed. name: {row["name"]}, href:{row["href"]}')
else:
logging.warning(f'fetch series error. {next_url} ...')
# Fetch movie info from each maker's listing pages
def fetch_movies_by_maker():
url_list = db_tools.query_maker_hrefs()
if debug:
url_list = db_tools.query_maker_hrefs(name='muramura')
for url in url_list:
next_url = url
while next_url:
logging.info(f"Fetching data for maker url {next_url} ...")
soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
if soup:
list_data, next_url = scraper.parse_maker_detail(soup, next_url)
if list_data:
for movie in list_data:
tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'])
if tmp_id:
logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
else:
logging.warning(f'insert movie index failed. title: {movie["title"]}, href: {movie["href"]}')
else:
logging.warning(f'parse_maker_detail error. url: {next_url}')
# Break early in debug mode
if debug:
return True
# Fetch movie info from each series' listing pages
def fetch_movies_by_series():
url_list = db_tools.query_series_hrefs()
if debug:
url_list = db_tools.query_series_hrefs(name='10musume')
for url in url_list:
next_url = url
while next_url:
logging.info(f"Fetching data for series url {next_url} ...")
soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
if soup:
list_data, next_url = scraper.parse_series_detail(soup, next_url)
if list_data:
for movie in list_data:
tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'])
if tmp_id:
logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
else:
logging.warning(f'insert movie index failed. title: {movie["title"]}, href: {movie["href"]}')
else:
logging.warning(f'parse_series_detail error. url: {next_url}')
# Break early in debug mode
if debug:
return True
# Fetch full details for actors
def fetch_performers_detail():
performers_list = []
while True:
# Fetch a batch at a time instead of loading everything at once
performers_list = db_tools.query_actors(is_full_data=0, limit=10)
if len(performers_list) < 1:
logging.info('all performers fetched.')
break
for performer in performers_list:
url = performer['href']
person = performer['name']
next_url = url
all_movies = []
while next_url:
logging.info(f"Fetching data for actor ({person}), url {next_url} ...")
soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
if soup:
data, next_url = scraper.parse_actor_detail(soup, next_url)
if data:
all_movies.extend(data)
else:
logging.warning(f'fetch_page error. person: ({person}), url: {url}')
# All of this performer's movies fetched; insert the data
performer_id = db_tools.insert_or_update_actor({
'href': url,
'name': person,
'pic' : '',
'alias' : [],
'credits':all_movies
})
if performer_id:
logging.info(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
else:
logging.warning(f'insert person: ({person}) {url} failed.')
# Break early in debug mode
if debug:
return True
# Fetch full details for movies
def fetch_movies_detail():
movies_list = []
while True:
movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=10)
if len(movies_list) < 1:
logging.info(f'all movies fetched.')
break
for movie in movies_list:
url = movie['href']
title = movie['title']
logging.info(f"Fetching data for movie ({title}), url {url} ...")
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
if soup:
movie_data = scraper.parse_movie_detail(soup, url, title)
if movie_data :
movie_id = db_tools.insert_or_update_movie(movie_data)
if movie_id:
logging.info(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
else:
logging.warning(f'insert movie {url} failed.')
else:
logging.warning(f'parse_movie_detail error. url: {url}')
else:
logging.warning(f'fetch_page error. url: {url}')
# Break early in debug mode
if debug:
return True
# Check for and fetch updates
def check_update():
# Start a task log entry
task_id = db_tools.insert_task_log()
if task_id is None:
logging.warning(f'insert task log error.')
return None
if False:
# Refresh the actor list
db_tools.update_task_log(task_id, task_status='fetching actor list')
fetch_actor_list()
# Refresh the makers list
db_tools.update_task_log(task_id, task_status='fetching maker list')
fetch_makers_list()
# Refresh the series list
db_tools.update_task_log(task_id, task_status='fetching series list')
fetch_series_list()
# Refresh the movie lists
db_tools.update_task_log(task_id, task_status='fetching movie list by maker')
fetch_movies_by_maker()
db_tools.update_task_log(task_id, task_status='fetching movie list by series')
fetch_movies_by_series()
# Update actor details
db_tools.update_task_log(task_id, task_status='fetching performers')
fetch_performers_detail()
# Update movie details
db_tools.update_task_log(task_id, task_status='fetching movies')
fetch_movies_detail()
logging.info('all processes completed!')
db_tools.finalize_task_log(task_id)
# TODO:
# 1,
# Process local data
def load_data():
return True
# Main entry point
def main(task, args_debug, args_force):
global debug
debug = args_debug
if debug:
logging.info('Debug mode enabled.')
global force
force = args_force
if force:
logging.info('force update for all data.')
if task == 'fetch':
check_update()
elif task == 'load':
load_data()
else:
print('unknown command. see --help.')
if __name__ == "__main__":
# Command-line argument parsing
parser = argparse.ArgumentParser(description='fetch javdb data.')
parser.add_argument('--task', type=str, default='fetch', help='fetch from javdb.com or load from local data ... (fetch , load)')
parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
args = parser.parse_args()
main(args.task, args.debug, args.force)
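For orientation, a distilled sketch of the pagination loop that fetch_actor_list, fetch_makers_list and fetch_series_list all share; crawl_paginated is a hypothetical helper name used only for illustration, not part of this commit.

import logging
from functools import partial
import scraper
import sqlite_utils as db_tools

def crawl_paginated(start_url, validator, parse, handle_row):
    # Follow the "next page" links returned by the parser until they run out.
    next_url = start_url
    while next_url:
        logging.info(f'fetching page {next_url}')
        soup = scraper.fetch_page(next_url, validator)
        if not soup:
            logging.warning(f'fetch failed, stopping at {next_url}')
            break
        rows, next_url = parse(soup, next_url)
        for row in rows or []:
            handle_row(row)

# The actor-list crawl above is then equivalent to:
# crawl_paginated(scraper.actors_uncensored_base_url,
#                 partial(scraper.generic_validator, tag="div", identifier="actors", attr_type="id"),
#                 scraper.parse_actors_uncensored,
#                 lambda row: db_tools.insert_actor_index(name=row['name'], href=row['href'] or ''))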


@@ -0,0 +1,454 @@
import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
import config
# Base URLs and variable parameters
host_url = "https://www.javdb.com"
actors_uncensored_base_url = f'{host_url}/actors/uncensored'
series_uncensored_base_url = f'{host_url}/series/uncensored'
makers_uncensored_base_url = f'{host_url}/makers/uncensored'
# Set up request headers and the cloudscraper session
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()
# Fetch a page with cloudscraper, validate it, and optionally apply a custom parser and preprocessor
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
for attempt in range(max_retries):
try:
if 'javdb.com' not in url.lower():
logging.error(f'wrong url format: {url}')
return None
response = scraper.get(url, headers=headers)
response.raise_for_status() # Raise on HTTP errors
# Preprocess the HTML if a preprocessor is provided
html_text = preprocessor(response.text) if preprocessor else response.text
soup = BeautifulSoup(html_text, parser)
if validator(soup): # Run the custom page check
return soup
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
except cloudscraper.exceptions.CloudflareChallengeError as e:
logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
except cloudscraper.exceptions.CloudflareCode1020 as e:
logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
except Exception as e:
logging.error(f"Unexpected error on {url}: {e}, Retrying...")
logging.error(f'Fetching failed after max retries. {url}')
return None # Still failing after the maximum number of retries
# Clean up the HTML: strip <br> tags and adjust <a> tags (needed when scraping ethnicity data)
def preprocess_html(html):
return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')
# Generic HTML structure validator
def generic_validator(soup, tag, identifier, attr_type="id"):
if attr_type == "id":
return soup.find(tag, id=identifier) is not None
elif attr_type == "class":
return bool(soup.find_all(tag, class_=identifier))
elif attr_type == "name":
return bool(soup.find('select', {'name': identifier}))
return False
# Extract the page number from a link
def url_page_num(href):
if href is None:
return None
match = re.search(r'page=(\d+)', href)
if match:
next_page_number = int(match.group(1))
return next_page_number
else:
return None
# Parse the HTML and extract the actor list
def parse_actors_uncensored(soup, href):
div_actors = soup.find("div", id='actors')
if not div_actors:
logging.warning("No actors div found")
return None, None
# Parse the entries
rows = div_actors.find_all('div', class_='box actor-box')
list_data = []
next_url = None
for row in rows:
# Actor detail link
actor_link = row.find('a')['href']
# Actor name
actor_name = row.find('strong').text.strip()
# Avatar image URL
avatar_url = row.find('img', class_='avatar')['src']
# Aliases from the title attribute
alias_list = row.find('a')['title'].split(", ")
list_data.append({
'name' : actor_name,
'href' : host_url + actor_link if actor_link else '',
'pic' : avatar_url,
'alias': alias_list
})
# Look for the "next page" button
next_page_element = soup.find('a', class_='pagination-next')
if next_page_element:
next_page_url = next_page_element['href']
next_page_number = url_page_num(next_page_url)
current_page_number = url_page_num(href)
if current_page_number is None:
current_page_number = 0
if next_page_number and next_page_number > current_page_number :
next_url = host_url + next_page_url
return list_data, next_url
# Parse the HTML and extract an actor's movie list
def parse_actor_detail(soup, href):
div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
if not div_movies:
logging.warning("No movies div found")
return None, None
# Parse the entries
rows = div_movies.find_all('div', class_='item')
list_data = []
next_url = None
for row in rows:
link = row.find('a', class_='box')['href']
serial_number = row.find('strong').text.strip()
title = row.find('div', class_='video-title').text.strip()
release_date = row.find('div', class_='meta').text.strip()
list_data.append({
'href' : host_url + link if link else '',
'serial_number' : serial_number,
'title' : title,
'release_date': release_date
})
# Look for the "next page" button
next_page_element = soup.find('a', class_='pagination-next')
if next_page_element:
next_page_url = next_page_element['href']
next_page_number = url_page_num(next_page_url)
current_page_number = url_page_num(href)
logging.debug(f'current_page: {current_page_number}, next page_num: {next_page_number}')
if current_page_number is None:
current_page_number = 0
if next_page_number and next_page_number > current_page_number :
next_url = host_url + next_page_url
return list_data, next_url
# Parse the HTML and extract movie details
def parse_movie_detail(soup, href, title):
div_video = soup.find("div", class_='video-meta-panel')
if not div_video:
logging.warning("No video-meta-panel div found")
return None
# Cover image
cover_img = soup.select_one('.column-video-cover a')
cover_url = cover_img['href'] if cover_img else None
# Serial number
serial = soup.select_one('.panel-block:first-child .value')
serial_number = serial.text.strip() if serial else None
# Release date
date = soup.select_one('.panel-block:nth-of-type(2) .value')
release_date = date.text.strip() if date else None
# Duration
duration = soup.select_one('.panel-block:nth-of-type(3) .value')
video_duration = duration.text.strip() if duration else None
# Maker
maker = soup.select_one('.panel-block:nth-of-type(4) .value a')
maker_name = maker.text.strip() if maker else None
maker_link = maker['href'] if maker else None
# Series
series = soup.select_one('.panel-block:nth-of-type(5) .value a')
series_name = series.text.strip() if series else None
series_link = series['href'] if series else None
# Actors (name + link)
actors = [{'name': actor.text.strip(), 'href': host_url + actor['href']} for actor in soup.select('.panel-block:nth-of-type(8) .value a')]
return {
'href' : href,
'title' : title,
'cover_url': cover_url,
'serial_number': serial_number,
'release_date': release_date,
'duration': video_duration,
'maker_name': maker_name,
'maker_link': host_url + maker_link if maker_link else '',
'series_name': series_name,
'series_link': host_url + series_link if series_link else '',
'actors': actors
}
# Parse the HTML and extract the series list
def parse_series_uncensored(soup, href):
div_series = soup.find("div", id='series')
if not div_series:
logging.warning("No series div found")
return None, None
# Parse the entries
rows = div_series.find_all('a', class_='box')
list_data = []
next_url = None
for row in rows:
name = row.find('strong').text.strip()
link = row['href']
div_movies = row.find('span')
movies = 0
if div_movies:
match = re.search(r'\((\d+)\)', div_movies.text.strip())
if match:
movies = int(match.group(1))
list_data.append({
'name' : name,
'href' : host_url + link if link else '',
'movies' : movies
})
# Look for the "next page" button
next_page_element = soup.find('a', class_='pagination-next')
if next_page_element:
next_page_url = next_page_element['href']
next_page_number = url_page_num(next_page_url)
current_page_number = url_page_num(href)
if current_page_number is None:
current_page_number = 0
if next_page_number and next_page_number > current_page_number :
next_url = host_url + next_page_url
return list_data, next_url
# Parse the HTML and extract a series' movie list
def parse_series_detail(soup, href):
div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
if not div_movies:
logging.warning("No movies div found")
return None, None
# Parse the entries
rows = div_movies.find_all('div', class_='item')
list_data = []
next_url = None
for row in rows:
link = row.find('a', class_='box')['href']
serial_number = row.find('strong').text.strip()
title = row.find('div', class_='video-title').text.strip()
release_date = row.find('div', class_='meta').text.strip()
list_data.append({
'href' : host_url + link if link else '',
'serial_number' : serial_number,
'title' : title,
'release_date': release_date
})
# Look for the "next page" button
next_page_element = soup.find('a', class_='pagination-next')
if next_page_element:
next_page_url = next_page_element['href']
next_page_number = url_page_num(next_page_url)
current_page_number = url_page_num(href)
if current_page_number is None:
current_page_number = 0
if next_page_number and next_page_number > current_page_number :
next_url = host_url + next_page_url
return list_data, next_url
# Parse the HTML and extract the makers list
def parse_makers_uncensored(soup, href):
div_makers = soup.find("div", id='makers')
if not div_makers:
logging.warning("No makers div found")
return None, None
# Parse the entries
rows = div_makers.find_all('a', class_='box')
list_data = []
next_url = None
for row in rows:
name = row.find('strong').text.strip()
link = row['href']
div_movies = row.find('span')
movies = 0
if div_movies:
match = re.search(r'\((\d+)\)', div_movies.text.strip())
if match:
movies = int(match.group(1))
list_data.append({
'name' : name,
'href' : host_url + link if link else '',
'movies' : movies
})
# Look for the "next page" button
next_page_element = soup.find('a', class_='pagination-next')
if next_page_element:
next_page_url = next_page_element['href']
next_page_number = url_page_num(next_page_url)
current_page_number = url_page_num(href)
if current_page_number is None:
current_page_number = 0
if next_page_number and next_page_number > current_page_number :
next_url = host_url + next_page_url
return list_data, next_url
# Parse the HTML and extract a maker's movie list
def parse_maker_detail(soup, href):
div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
if not div_movies:
logging.warning("No movies div found")
return None, None
# Parse the entries
rows = div_movies.find_all('div', class_='item')
list_data = []
next_url = None
for row in rows:
link = row.find('a', class_='box')['href']
serial_number = row.find('strong').text.strip()
title = row.find('div', class_='video-title').text.strip()
release_date = row.find('div', class_='meta').text.strip()
list_data.append({
'href' : host_url + link if link else '',
'serial_number' : serial_number,
'title' : title,
'release_date': release_date
})
# Look for the "next page" button
next_page_element = soup.find('a', class_='pagination-next')
if next_page_element:
next_page_url = next_page_element['href']
next_page_number = url_page_num(next_page_url)
current_page_number = url_page_num(href)
if current_page_number is None:
current_page_number = 0
if next_page_number and next_page_number > current_page_number :
next_url = host_url + next_page_url
return list_data, next_url
###### Test code below ######
def test_actors_list():
next_url = actors_uncensored_base_url
while next_url:
print(f'fetching page {next_url}')
soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="actors", attr_type="id"))
if soup:
list_data, next_url = parse_actors_uncensored(soup, next_url)
if list_data :
print(list_data)
else:
print('get wrong page.')
if next_url:
print(next_url)
break
def test_actor():
next_url = 'https://javdb.com/actors/mdRn'
all_data = []
while next_url:
print(f'fetching page {next_url}')
soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
if soup:
list_data, next_url = parse_actor_detail(soup, next_url)
if list_data :
all_data.extend(list_data)
else:
print('get wrong page.')
print(all_data)
def test_movie_detail():
movie_url = 'https://javdb.com/v/gB2Q7'
while True:
soup = fetch_page(movie_url, partial(generic_validator, tag="div", identifier="video-detail", attr_type="class"))
if soup:
detail = parse_movie_detail(soup, movie_url, 'RED193 無碼 レッドホットフェティッシュコレクション 中出し120連発 4 : 波多野結衣, 愛乃なみ, 夢実あくび, 他多数')
if detail:
print(detail)
break
def test_series_list():
next_url = 'https://javdb.com/series/uncensored'
all_data = []
while next_url:
print(f'fetching page {next_url}')
soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="series", attr_type="id"))
if soup:
list_data, next_url = parse_series_uncensored(soup, next_url)
if list_data :
all_data.extend(list_data)
else:
print('get wrong page.')
break
print(all_data)
def test_series_detail():
next_url = 'https://javdb.com/series/39za'
all_data = []
while next_url:
print(f'fetching page {next_url}')
soup = fetch_page(next_url, partial(generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
if soup:
list_data, next_url = parse_series_detail(soup, next_url)
if list_data :
all_data.extend(list_data)
else:
print('get wrong page.')
print(all_data)
if __name__ == "__main__":
#test_actors_list()
#test_actor()
test_movie_detail()
#test_series_list()
#test_series_detail()
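A short offline sketch of how generic_validator and url_page_num behave (assumes this file is importable as scraper; the HTML snippet is invented purely for illustration):

from bs4 import BeautifulSoup
import scraper

html = '<div id="actors"><div class="box actor-box"><a href="/actors/x" title="A, B"><strong>A</strong></a></div></div>'
soup = BeautifulSoup(html, 'html.parser')
print(scraper.generic_validator(soup, tag="div", identifier="actors", attr_type="id"))  # True
print(scraper.url_page_num('/actors/uncensored?page=3'))  # 3
print(scraper.url_page_num('/actors/uncensored'))         # None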


@@ -0,0 +1,551 @@
import sqlite3
import json
import config
import logging
from datetime import datetime
# Connect to the SQLite database
DB_PATH = f"{config.global_share_data_dir}/shared.db" # Replace with your database file
conn = sqlite3.connect(DB_PATH, check_same_thread=False)
cursor = conn.cursor()
# """从指定表中通过 href 查找 id"""
def get_id_by_href(table: str, href: str) -> int:
if href is None:
return None
cursor.execute(f"SELECT id FROM {table} WHERE href = ?", (href,))
row = cursor.fetchone()
return row[0] if row else None
# Insert an actor index entry (from list-page data)
def insert_actor_index(name, href):
try:
cursor.execute("""
INSERT OR IGNORE INTO javdb_actors (href, name) VALUES (?, ?)
""", (
href, name
))
conn.commit()
performer_id = get_id_by_href('javdb_actors', href)
if performer_id:
logging.debug(f'insert one actor index, id: {performer_id}, name: {name}, href: {href}')
return performer_id
except sqlite3.Error as e:
conn.rollback()
logging.error(f"Database error: {e}")
return None
except Exception as e:
conn.rollback()
logging.error(f"Unexpected error: {e}")
return None
# """插入电影索引,来自于列表数据"""
def insert_movie_index(title, href):
try:
# 插入或更新电影信息
cursor.execute("""
INSERT OR IGNORE INTO javdb_movies (title, href) VALUES (?, ?)
""",
(title, href)
)
conn.commit()
movie_id = get_id_by_href('javdb_movies', href)
if movie_id:
logging.debug(f'insert one movie index, id: {movie_id}, title: {title}, href: {href}')
return movie_id
except Exception as e:
conn.rollback()
logging.error("Error inserting movie: %s", e)
return None
# Insert an actor/movie relation
def insert_actor_movie(performer_id, movie_id, tags=''):
try:
cursor.execute("""
INSERT INTO javdb_actors_movies (actor_id, movie_id, tags)
VALUES (?, ?, ?)
ON CONFLICT(actor_id, movie_id) DO UPDATE SET tags=excluded.tags
""",
(performer_id, movie_id, tags)
)
conn.commit()
#logging.debug(f'insert one performer_movie, performer_id: {performer_id}, movie_id: {movie_id}')
return performer_id
except Exception as e:
conn.rollback()
logging.error("Error inserting movie: %s", e)
return None
# Insert or update full actor data
def insert_or_update_actor(actor):
try:
cursor.execute('''
INSERT INTO javdb_actors (name, href, pic, is_full_data, updated_at)
VALUES (?, ?, ?, 1, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET name=excluded.name, pic=excluded.pic, is_full_data=1, updated_at=datetime('now', 'localtime')
''', (actor['name'], actor['href'], actor['pic']))
cursor.execute('SELECT id FROM javdb_actors WHERE href = ?', (actor['href'],))
conn.commit()
actor_id = get_id_by_href('javdb_actors', actor['href'])
if actor_id is None:
logging.warning(f'insert data error. name: {actor["name"]}, href: {actor["href"]}')
return None
logging.debug(f'insert one actor, id: {actor_id}, name: {actor["name"]}, href: {actor["href"]}')
# Insert aliases
for alias in actor.get("alias") or []:
cursor.execute('''
INSERT OR IGNORE INTO javdb_actors_alias (actor_id, alias, updated_at)
VALUES (?, ?, datetime('now', 'localtime'))
''', (actor_id, alias))
conn.commit()
# Insert the movie credits
for movie in actor.get("credits") or []:
movie_id = get_id_by_href('javdb_movies', movie['href'])
# Movie not in the index yet; insert it first
if movie_id is None:
movie_id = insert_movie_index(movie['title'], movie['href'])
if movie_id:
tmp_id = insert_actor_movie(actor_id, movie_id)
if tmp_id :
logging.debug(f'insert one performer_movie, performer_id: {actor_id}, movie_id: {movie_id}')
else:
logging.warning(f'insert performer_movie failed. performer_id: {actor_id}, movie href: {movie["href"]}')
return actor_id
except Exception as e:
logging.error(f"Failed to insert/update actor {actor['name']}: {e}")
conn.rollback()
# Delete an actor by href
def delete_actor_by_href(href):
try:
cursor.execute('DELETE FROM javdb_actors WHERE href = ?', (href,))
conn.commit()
logging.info(f"成功删除演员: {href}")
except Exception as e:
logging.error(f"删除演员 {href} 失败: {e}")
conn.rollback()
# 查询
def query_actors(**filters):
try:
sql = "SELECT href, name FROM javdb_actors WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "href" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "name" in filters:
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
if "is_full_data" in filters:
sql += " AND is_full_data = ?"
params.append(filters["is_full_data"])
if 'limit' in filters:
sql += " limit ?"
params.append(filters["limit"])
cursor.execute(sql, params)
#return [row[0].lower() for row in cursor.fetchall()] # return lowercase
return [{'href': row[0], 'name': row[1]} for row in cursor.fetchall()]
except sqlite3.Error as e:
logging.error(f"查询 href 失败: {e}")
return None
# 插入或更新发行商 """
def insert_or_update_makers(data):
try:
cursor.execute("""
INSERT INTO javdb_makers (name, href, updated_at)
VALUES (?, ? , datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
name = excluded.name,
updated_at = datetime('now', 'localtime')
""", (data["name"], data["href"]))
conn.commit()
# Get the maker id
cursor.execute("SELECT id FROM javdb_makers WHERE href = ?", (data["href"],))
dist_id = cursor.fetchone()[0]
if dist_id:
logging.debug(f"Inserted/updated maker: {data['name']}")
return dist_id
else:
return None
except sqlite3.Error as e:
conn.rollback()
logging.error(f"数据库错误: {e}")
return None
# 删除发行商(按 id 或 name """
def delete_maker(identifier):
try:
if isinstance(identifier, int):
cursor.execute("DELETE FROM javdb_makers WHERE id = ?", (identifier,))
elif isinstance(identifier, str):
cursor.execute("DELETE FROM javdb_makers WHERE name = ?", (identifier,))
conn.commit()
logging.info(f"成功删除发行商: {identifier}")
except sqlite3.Error as e:
conn.rollback()
logging.error(f"删除失败: {e}")
# 查询发行商(按 id 或 name """
def query_maker(identifier):
try:
if isinstance(identifier, int):
cursor.execute("SELECT * FROM javdb_makers WHERE id = ?", (identifier,))
else:
cursor.execute("SELECT * FROM javdb_makers WHERE name LIKE ?", (f"%{identifier}%",))
distributor = cursor.fetchone()
if distributor:
return dict(zip([desc[0] for desc in cursor.description], distributor))
else:
logging.warning(f"未找到发行商: {identifier}")
return None
except sqlite3.Error as e:
logging.error(f"查询失败: {e}")
return None
# 按条件查询 href 列表
def query_maker_hrefs(**filters):
try:
sql = "SELECT href FROM javdb_makers WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "url" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "name" in filters:
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
cursor.execute(sql, params)
return [row[0] for row in cursor.fetchall()] # Return the list of hrefs
except sqlite3.Error as e:
logging.error(f"Query failed: {e}")
return None
# Insert or update a series
def insert_or_update_series(data):
try:
cursor.execute("""
INSERT INTO javdb_series (name, href, updated_at)
VALUES (?, ?, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
name = excluded.name,
updated_at = datetime('now', 'localtime')
""", (data["name"], data["href"]))
conn.commit()
# Get the series id
cursor.execute("SELECT id FROM javdb_series WHERE href = ?", (data["href"],))
stu_id = cursor.fetchone()[0]
if stu_id:
logging.debug(f"Inserted/updated series: {data['name']}")
return stu_id
else:
return None
except sqlite3.Error as e:
conn.rollback()
logging.error(f"数据库错误: {e}")
return None
# """ 删除制作公司(按 id 或 name """
def delete_series(identifier):
try:
if isinstance(identifier, int):
cursor.execute("DELETE FROM javdb_series WHERE id = ?", (identifier,))
elif isinstance(identifier, str):
cursor.execute("DELETE FROM javdb_series WHERE name = ?", (identifier,))
conn.commit()
logging.info(f"成功删除制作公司: {identifier}")
except sqlite3.Error as e:
conn.rollback()
logging.error(f"删除失败: {e}")
# """ 查询制作公司(按 id 或 name """
def query_series(identifier):
try:
if isinstance(identifier, int):
cursor.execute("SELECT * FROM javdb_series WHERE id = ?", (identifier,))
else:
cursor.execute("SELECT * FROM javdb_series WHERE name LIKE ?", (f"%{identifier}%",))
studio = cursor.fetchone()
if studio:
return dict(zip([desc[0] for desc in cursor.description], studio))
else:
logging.warning(f"未找到制作公司: {identifier}")
return None
except sqlite3.Error as e:
logging.error(f"查询失败: {e}")
return None
# 按条件查询 href 列表
def query_series_hrefs(**filters):
try:
sql = "SELECT href FROM javdb_series WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "href" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "name" in filters:
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
cursor.execute(sql, params)
return [row[0] for row in cursor.fetchall()] # Return the list of hrefs
except sqlite3.Error as e:
logging.error(f"Query failed: {e}")
return None
# Insert or update full movie data
def insert_or_update_movie(movie):
try:
# Look up related IDs
makers_id = get_id_by_href('javdb_makers', movie['maker_link'])
series_id = get_id_by_href('javdb_series', movie['series_link'])
cursor.execute("""
INSERT INTO javdb_movies (href, title, cover_url, serial_number, release_date, duration,
maker_id, series_id, is_full_data, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, 1, datetime('now', 'localtime'))
ON CONFLICT(href) DO UPDATE SET
title=excluded.title,
cover_url=excluded.cover_url,
serial_number=excluded.serial_number,
release_date=excluded.release_date,
duration=excluded.duration,
maker_id=excluded.maker_id,
series_id=excluded.series_id,
is_full_data=1,
updated_at=datetime('now', 'localtime')
""", (movie['href'], movie['title'], movie['cover_url'], movie['serial_number'],
movie['release_date'], movie['duration'], makers_id, series_id))
conn.commit()
# Get the inserted movie_id
movie_id = get_id_by_href('javdb_movies', movie['href'])
if movie_id is None:
return None
logging.debug(f'insert one movie, id: {movie_id}, title: {movie["title"]}, href: {movie["href"]}')
# Insert into the actors/movies relation table
for performer in movie.get('actors', []):
performer_id = get_id_by_href('javdb_actors', performer['href'])
# If the actor does not exist yet, insert an index entry first
if performer_id is None:
performer_id = insert_actor_index(performer['name'], performer['href'])
if performer_id:
tmp_id = insert_actor_movie(performer_id, movie_id)
if tmp_id:
logging.debug(f"insert one perfomer_movie. perfomer_id: {performer_id}, movie_id:{movie_id}")
else:
logging.debug(f'insert perfomer_movie failed. perfomer_id: {performer_id}, movie_id:{movie_id}')
else:
logging.warning(f'insert perfomer failed. name: {performer['name']}, href: {performer['href']}')
return movie_id
except Exception as e:
conn.rollback()
logging.error("Error inserting movie: %s", e)
return None
# 删除电影数据"""
def delete_movie(identifier):
try:
if isinstance(identifier, int):
cursor.execute("DELETE FROM javdb_movies WHERE id = ?", (identifier,))
elif isinstance(identifier, str):
cursor.execute("DELETE FROM javdb_movies WHERE href = ?", (identifier,))
else:
logging.warning("无效的删除参数")
return
conn.commit()
logging.info(f"Deleted movie with {identifier}")
except sqlite3.Error as e:
conn.rollback()
logging.error("Error deleting movie: %s", e)
# 查找电影数据"""
def query_movies(identifier):
try:
if isinstance(identifier, int):
cursor.execute("SELECT * FROM javdb_movies WHERE id = ?", (identifier,))
elif "http" in identifier:
cursor.execute("SELECT * FROM javdb_movies WHERE href = ?", (identifier,))
else:
cursor.execute("SELECT * FROM javdb_movies WHERE title LIKE ?", (f"%{identifier}%",))
movie = cursor.fetchone()
if movie:
# Build the movie record before reusing the cursor for the relation query
result = dict(zip([desc[0] for desc in cursor.description], movie))
cursor.execute("SELECT actor_id FROM javdb_actors_movies WHERE movie_id = ?", (movie[0],))
performers = [row[0] for row in cursor.fetchall()]
result["performers"] = performers
return result
else:
logging.warning(f"find no data: {identifier}")
return None
except sqlite3.Error as e:
logging.error(f"查询失败: {e}")
return None
# 按条件查询 href 列表
def query_movie_hrefs(**filters):
try:
sql = "SELECT href, title FROM javdb_movies WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "href" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "title" in filters:
sql += " AND title LIKE ?"
params.append(f"%{filters['title']}%")
if "is_full_data" in filters:
sql += " AND is_full_data = ?"
params.append(filters["is_full_data"])
if 'limit' in filters:
sql += " limit ?"
params.append(filters["limit"])
cursor.execute(sql, params)
#return [row[0].lower() for row in cursor.fetchall()] # hrefs in lowercase
return [{'href': row[0], 'title': row[1]} for row in cursor.fetchall()]
except sqlite3.Error as e:
logging.error(f"Query failed: {e}")
return []
# Insert a task log entry
def insert_task_log():
try:
cursor.execute("""
INSERT INTO javdb_task_log (task_status) VALUES ('Start')
""")
conn.commit()
return cursor.lastrowid # The inserted task_id
except sqlite3.Error as e:
logging.error(f"Failed to insert task: {e}")
return None
# Update arbitrary task log fields
def update_task_log_inner(task_id, **kwargs):
try:
fields = ", ".join(f"{key} = ?" for key in kwargs.keys())
params = list(kwargs.values()) + [task_id]
sql = f"UPDATE javdb_task_log SET {fields}, updated_at = datetime('now', 'localtime') WHERE task_id = ?"
cursor.execute(sql, params)
conn.commit()
except sqlite3.Error as e:
logging.error(f"更新任务 {task_id} 失败: {e}")
# 更新任务日志的字段
def update_task_log(task_id, task_status):
try:
# Get current row counts from the actors, movies, makers, and series tables
cursor.execute("SELECT COUNT(*) FROM javdb_actors where is_full_data=1")
full_data_actors = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM javdb_actors")
total_actors = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM javdb_movies where is_full_data=1")
full_data_movies = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM javdb_movies")
total_movies = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM javdb_makers")
total_makers = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM javdb_series")
total_series = cursor.fetchone()[0]
# Update task_log
update_task_log_inner(task_id,
full_data_actors=full_data_actors,
total_actors=total_actors,
full_data_movies=full_data_movies,
total_movies=total_movies,
total_makers=total_makers,
total_series=total_series,
task_status=task_status)
except sqlite3.Error as e:
logging.error(f"更新任务 {task_id} 失败: {e}")
# 任务结束,更新字段
def finalize_task_log(task_id):
try:
# Update task_log
update_task_log(task_id, task_status="Success")
except sqlite3.Error as e:
logging.error(f"任务 {task_id} 结束失败: {e}")
# 测试代码
if __name__ == "__main__":
sample_data = [
{
'name': '上原亜衣',
'href': 'https://www.javdb.com/actors/MkAX',
'pic': 'https://c0.jdbstatic.com/avatars/mk/MkAX.jpg',
'alias': ['上原亜衣', '下原舞', '早瀬クリスタル', '阿蘇山百式屏風奉行']
},
{
'name': '大橋未久',
'href': 'https://www.javdb.com/actors/21Jp',
'pic': 'https://c0.jdbstatic.com/avatars/21/21Jp.jpg',
'alias': ['大橋未久']
},
]
for actor in sample_data:
insert_or_update_actor(actor)
print(query_actors("name LIKE '%未久%'"))
#delete_actor_by_href('https://www.javdb.com/actors/MkAX')
print(query_actors())
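A usage sketch of the batch-query helpers, mirroring how fetch.py consumes them (assumes shared.db and its javdb_* tables already exist under config.global_share_data_dir):

import sqlite_utils as db_tools

# One batch of actors that still lack full data; fetch.py loops over such
# batches and marks each row is_full_data=1 via insert_or_update_actor().
for actor in db_tools.query_actors(is_full_data=0, limit=10):
    print(actor['name'], actor['href'])

# Movie index rows work the same way.
for movie in db_tools.query_movie_hrefs(is_full_data=0, limit=10):
    print(movie['title'], movie['href'])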
