modify scripts
This commit is contained in:
593
src/crawling/craw.py
Normal file
593
src/crawling/craw.py
Normal file
@ -0,0 +1,593 @@
|
|||||||
|
import json
import logging
import re
import sys
from functools import partial
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

import src.utils.utils as utils
|
||||||
|
|
||||||
|
# Sentinel status codes returned by GenericCrawler.fetch_page to callers.
http_code_404 = 404       # HTTP Not Found: page does not exist, caller may skip it
http_code_redirect = 401  # redirected to a verification/login page (reuses the 401 code)
http_code_url = 601       # malformed URL; custom value deliberately outside the HTTP range
|
||||||
|
|
||||||
|
# 通用的爬取类,主要实现了底层的网络交互封装
|
||||||
|
# Generic crawling base class: wraps the low-level network interaction
# (cloudscraper or plain requests), the retry loop, redirect detection and
# page validation. Site-specific crawlers subclass it and override the hooks.
class GenericCrawler:

    def __init__(self, use_cloudscraper=None, headers=None, cookies=None, max_retries=3, html_parser='html.parser'):
        """Configure the crawler.

        Args:
            use_cloudscraper: force cloudscraper on/off; None picks it
                automatically on Python >= 3.8 (cloudscraper's supported range).
            headers: HTTP headers dict; a desktop Chrome User-Agent by default.
            cookies: cookie dict sent with every request (empty by default).
            max_retries: attempts per fetch_page call before giving up.
            html_parser: parser name handed to BeautifulSoup.
        """
        if use_cloudscraper is None:
            use_cloudscraper = sys.version_info >= (3, 8)
        self.use_cloudscraper = use_cloudscraper
        self.headers = headers or {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36 Edg/137.0.0.0'
        }
        self.cookies = cookies or {}
        self.scraper = None  # lazily created in _initialize_scraper
        self.max_retries = max_retries
        self.parser = html_parser
        # cloudscraper is deliberately NOT imported here; see _initialize_scraper.

    def _initialize_scraper(self):
        """Lazily create the HTTP client, avoiding an unnecessary cloudscraper import."""
        if self.scraper is not None:
            return

        if self.use_cloudscraper:
            try:
                # Deferred import: only pay for cloudscraper when it is used.
                import cloudscraper
                self.scraper = cloudscraper.create_scraper()
                logging.info("Using cloudscraper for requests")
            except ImportError:
                logging.warning("cloudscraper not installed. Falling back to requests.")
                self.use_cloudscraper = False
                self.scraper = requests.Session()
        else:
            self.scraper = requests.Session()
            logging.info("Using requests for HTTP operations")

    def fetch_page(self, url, validator, max_retries=None):
        """Fetch *url* and return ``(soup, status_code)``.

        Args:
            url: absolute URL to fetch; rejected up-front if malformed.
            validator: callable(soup) -> bool confirming the page has the
                expected structure (e.g. built via generic_validator).
            max_retries: optional per-call override of self.max_retries
                (backward-compatible addition: callers such as
                test_actor_list pass it, which the old signature rejected).

        Returns:
            (soup, status) on success; (None, http_code_url) for a bad URL;
            (None, http_code_404) for a missing page; (None, http_code_redirect)
            when redirected to a verify page; (None, None) after exhausting
            all retries.
        """
        # Initialize the client on first use.
        self._initialize_scraper()

        retries = self.max_retries if max_retries is None else max_retries
        for attempt in range(retries):
            try:
                if not utils.is_valid_url(url):
                    logging.error(f'wrong url format: {url}')
                    return None, http_code_url

                response = self.scraper.get(url, headers=self.headers, cookies=self.cookies)

                # Handle HTTP status codes: 404 is returned directly so the
                # caller can skip the page instead of retrying.
                if response.status_code == http_code_404:
                    logging.debug(f"Page not found (404): {url}")
                    return None, http_code_404

                response.raise_for_status()  # raise on other HTTP errors

                # A redirect may mean we landed on a verification/login page.
                if response.history:
                    logging.debug(f"Page redirected on {url}. Checking if it's a verify page.")
                    soup = BeautifulSoup(response.text, self.parser)
                    if self.check_redirect(soup):
                        logging.warning(f"Page redirected to verify page on {url}.")
                        return None, http_code_redirect

                # Preprocess the raw HTML (identity unless a subclass overrides).
                html_text = self.preprocessor(response.text)

                soup = BeautifulSoup(html_text, self.parser)
                if validator(soup):  # caller-supplied structural check
                    return soup, response.status_code

                logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
            except Exception as e:
                logging.error(f"Unexpected error on {url}: {e}, Retrying...")

        logging.error(f'Fetching failed after max retries. {url}')
        return None, None  # still failing after the maximum number of retries

    def preprocessor(self, html):
        """Hook: preprocess raw HTML (e.g. repair broken tags). Identity by default."""
        return html

    def check_redirect(self, soup):
        """Hook: return True when a redirected page is a verify/login page.

        The default accepts every redirect target (returns False); subclasses
        override this to detect their site's verification page.
        """
        return False

    @staticmethod
    def generic_validator(soup, tag, identifier, attr_type="id"):
        """Generic HTML structure validator.

        Checks that *soup* contains *tag* with the given *identifier*, matched
        by ``id``, ``class`` or (for <select> elements) ``name`` depending on
        *attr_type*. Unknown attr_type values fail validation.
        """
        if attr_type == "id":
            return soup.find(tag, id=identifier) is not None
        elif attr_type == "class":
            return bool(soup.find_all(tag, class_=identifier))
        elif attr_type == "name":
            return bool(soup.find('select', {'name': identifier}))
        return False
|
||||||
|
|
||||||
|
# javbus.com 网页爬取类
|
||||||
|
# Crawler for javbus.com: site-specific headers/cookies plus parsers for the
# actress index, actor/movie/series/maker detail pages.
#
# Fixes over the previous revision:
#   * every parser referenced an undefined module global `host_url`
#     (NameError at runtime) -> now uses self.host_url;
#   * pretty_print_json printed the undefined name `formatted` for non-list
#     input (NameError);
#   * test_actor_list passed an unsupported max_retries kwarg and looped
#     forever when fetch_page failed (the `break` was unreachable);
#   * the series/makers parsers clobbered their `href` argument inside the
#     loop, corrupting the current-page-number pagination check;
#   * five identical movie-grid parsers and two identical box-list parsers
#     are consolidated into private helpers;
#   * url_page_num was a stub returning None, silently disabling pagination.
class JavbusCrawler(GenericCrawler):

    def __init__(self, use_cloudscraper=None):
        """Configure javbus-specific request headers, cookies and host URL."""
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Sec-Fetch-Site": "none",
            "Accept-Encoding": "gzip, deflate, br",
            "Sec-Fetch-Mode": "navigate",
            "Host": "www.javbus.com",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15",
            "Accept-Language": "zh-CN,zh-Hans;q=0.9",
            "Sec-Fetch-Dest": "document",
            "Connection": "keep-alive",
        }

        cookies = {
            'PHPSESSID': 'l9m4ugaaao1hgvl3micr22u3o6',
            'existmag': 'all',
            'age': 'verified'
        }
        super().__init__(use_cloudscraper, headers=headers, cookies=cookies)
        self.host_url = "https://www.javbus.com"

    # ---- shared private helpers ------------------------------------------

    def _next_page_url(self, soup, href):
        """Resolve the 'next page' link (a.pagination-next) to an absolute URL.

        Returns None when there is no further page, or when the link would
        point backwards / to the same page (guard against wrap-around).
        """
        next_page_element = soup.find('a', class_='pagination-next')
        if not next_page_element:
            return None
        next_page_url = next_page_element['href']
        next_page_number = self.url_page_num(next_page_url)
        current_page_number = self.url_page_num(href)
        logging.debug(f'current_page: {current_page_number}, next page_num: {next_page_number}')
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            return self.host_url + next_page_url
        return None

    def _parse_movie_items(self, soup, href):
        """Parse a movie grid page (div class 'movie-list h cols-*').

        Returns ([{'href', 'serial_number', 'title', 'release_date'}, ...],
        next_page_url). Shared by the series/maker/publisher/uncensored pages,
        which all render the same grid.
        """
        # The trailing vcols-N varies per page type, so match on the prefix.
        div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-'))
        if not div_movies:
            logging.warning("Warning: No movies div found ")
            return [], None

        list_data = []
        for row in div_movies.find_all('div', class_='item'):
            link = row.find('a', class_='box')['href']
            list_data.append({
                'href': self.host_url + link if link else '',
                'serial_number': row.find('strong').text.strip(),
                'title': row.find('div', class_='video-title').text.strip(),
                'release_date': row.find('div', class_='meta').text.strip()
            })

        return list_data, self._next_page_url(soup, href)

    def _parse_box_items(self, soup, href, container_id):
        """Parse an index page of 'a.box' entries (series/makers lists).

        Looks up the container div by *container_id* and returns
        ([{'name', 'href', 'movies'}, ...], next_page_url) — or (None, None)
        when the container is missing. The movie count is taken from the
        '(N)' suffix in the entry's <span>, defaulting to 0.
        """
        container = soup.find("div", id=container_id)
        if not container:
            logging.warning(f"Warning: No {container_id} div found ")
            return None, None

        list_data = []
        for row in container.find_all('a', class_='box'):
            # Use a local name for the entry link: the old code rebound the
            # `href` parameter here, breaking the pagination check below.
            link = row['href']
            movies = 0
            span = row.find('span')
            if span:
                match = re.search(r'\((\d+)\)', span.text.strip())
                if match:
                    movies = int(match.group(1))

            list_data.append({
                'name': row.find('strong').text.strip(),
                'href': self.host_url + link if link else '',
                'movies': movies
            })

        return list_data, self._next_page_url(soup, href)

    # ---- page parsers -----------------------------------------------------

    def parse_actors_list(self, soup, href):
        """Parse an actress index page (div#waterfall).

        Returns ([{'name', 'href', 'pic'}, ...], next_page_url), or
        (None, None) when the expected container is missing.
        """
        div_actors = soup.find("div", id='waterfall')
        if not div_actors:
            logging.warning("Warning: No actors div found ")
            return None, None

        list_data = []
        next_url = None
        for row in div_actors.find_all('div', class_='item'):
            list_data.append({
                'name': row.find('span').text.strip(),      # actress name
                'href': urljoin(self.host_url, row.find('a')['href']),  # detail link
                'pic': row.find('img')['src']               # avatar image URL
            })

        # This page type paginates via an a#next element, not pagination-next.
        div_link = soup.find("div", class_='text-center hidden-xs')
        if div_link:
            next_page_element = soup.find('a', id='next')
            if next_page_element:
                next_url = urljoin(href, next_page_element['href'])

        return list_data, next_url

    def parse_actor_detail(self, soup, href):
        """Parse an actor detail page into ({'pic', 'alias', 'movies'}, next_url).

        Returns (None, None) when the page lacks the expected meta section or
        movie grid.
        """
        # Aliases, when present, are the first 'section-meta' span.
        alias_list = []
        div_meta = soup.find('span', class_='actor-section-name')
        if not div_meta:
            logging.warning(f'warning: no meta data found in page {href}')
            return None, None
        alias_div = soup.find('div', class_='column section-title')
        if alias_div:
            meta_list = alias_div.find_all('span', class_='section-meta')
            if len(meta_list) > 1:
                alias_list = meta_list[0].text.strip().split(", ")

        # Avatar image.
        pic = ''
        avatar = soup.find("div", class_="column actor-avatar")
        if avatar:
            # NOTE(review): parse_avatar_image is not defined in this module —
            # confirm it is supplied by a subclass or mixin before relying on it.
            pic = self.parse_avatar_image(avatar)

        # Movie grid: the trailing vcols-N varies, so match the class prefix.
        div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-'))
        if not div_movies:
            logging.warning("Warning: No movies div found ")
            return None, None

        list_data = []
        for row in div_movies.find_all('div', class_='item'):
            link = row.find('a', class_='box')['href']
            list_data.append({
                'href': self.host_url + link if link else '',
                'serial_number': row.find('strong').text.strip(),
                'title': row.find('div', class_='video-title').text.strip(),
                'release_date': row.find('div', class_='meta').text.strip()
            })

        next_url = self._next_page_url(soup, href)

        actor = {
            'pic': pic,
            'alias': alias_list,
            'movies': list_data
        }
        return actor, next_url

    def parse_movie_one(self, soup, keys):
        """Return the text of the 'value' span following a <strong> label in *keys*."""
        key_strong = soup.find('strong', string=lambda text: text in keys)
        if key_strong:
            key_span = key_strong.find_next_sibling('span', class_='value')
            if key_span:
                return key_span.text.strip()
        return None

    def parse_movie_val_href(self, soup, keys):
        """Return (text, absolute_link) for a labelled value; link is None when
        the value has no anchor; (None, None) when the label is absent."""
        key_strong = soup.find('strong', string=lambda text: text in keys)
        if key_strong:
            key_span = key_strong.find_next_sibling('span', class_='value')
            if key_span:
                a_tag = key_span.find('a')
                if a_tag:
                    return a_tag.text.strip(), self.host_url + a_tag.get('href')
                return key_span.text.strip(), None
        return None, None

    def parse_movie_arr(self, soup, keys):
        """Return [{'name', 'href'}, ...] for a labelled multi-value field
        (tags, actors); empty list when the label is absent."""
        key_strong = soup.find('strong', string=lambda text: text in keys)
        if key_strong:
            key_span = key_strong.find_next_sibling('span', class_='value')
            if key_span:
                return [
                    {
                        'name': a_tag.text.strip(),
                        'href': self.host_url + a_tag.get('href')
                    }
                    for a_tag in key_span.find_all('a')
                ]
        return []

    def parse_movie_detail(self, soup, href, title):
        """Parse a movie detail page into a flat dict of metadata.

        Keys include href/title/cover_url, serial number, dates, maker/series/
        publisher (name + link), tags and actors. Returns (None, None) when the
        meta panel is missing (kept for backward compatibility with callers
        that unpack two values on failure).
        """
        div_video = soup.find("div", class_='video-meta-panel')
        if not div_video:
            logging.warning("Warning: No movies div found ")
            return None, None

        result = {}
        result['href'] = href
        result['title'] = title

        # Cover image (the anchor wraps the full-size image).
        cover_img = soup.select_one('.column-video-cover a')
        result['cover_url'] = cover_img['href'] if cover_img else None

        # Serial number and simple labelled values (labels appear in Chinese
        # or English depending on site language).
        result['serial_number'] = self.parse_movie_one(soup, ['番號:', 'ID:'])
        result['release_date'] = self.parse_movie_one(soup, ['日期:', 'Released Date:'])
        result['duration'] = self.parse_movie_one(soup, ['時長:', 'Duration:'])

        # Maker, series and publisher: value plus detail-page link.
        result['maker_name'], result['maker_link'] = self.parse_movie_val_href(soup, ['片商:', 'Maker:'])
        result['series_name'], result['series_link'] = self.parse_movie_val_href(soup, ['系列:', 'Series:'])
        result['pub_name'], result['pub_link'] = self.parse_movie_val_href(soup, ['發行:', 'Publisher:'])

        # Multi-valued fields: tags and actors.
        result['tags'] = self.parse_movie_arr(soup, ['類別:', 'Tags:'])
        result['actors'] = self.parse_movie_arr(soup, ['演員:', 'Actor(s):'])

        return result

    def parse_series_uncensored(self, soup, href):
        """Parse the uncensored series index (div#series) into (entries, next_url)."""
        return self._parse_box_items(soup, href, 'series')

    def parse_series_detail(self, soup, href):
        """Parse a series detail page's movie grid into (movies, next_url)."""
        return self._parse_movie_items(soup, href)

    def parse_makers_uncensored(self, soup, href):
        """Parse the uncensored makers index (div#makers) into (entries, next_url)."""
        return self._parse_box_items(soup, href, 'makers')

    def parse_maker_detail(self, soup, href):
        """Parse a maker detail page's movie grid into (movies, next_url)."""
        return self._parse_movie_items(soup, href)

    def parse_publisher_detail(self, soup, href):
        """Parse a publisher detail page's movie grid into (movies, next_url)."""
        return self._parse_movie_items(soup, href)

    def parse_uncensored(self, soup, href):
        """Parse the uncensored movie listing into (movies, next_url)."""
        return self._parse_movie_items(soup, href)

    @staticmethod
    def pretty_print_json(data, n=10, indent=4, sort_keys=False):
        """Pretty-print the first *n* elements of a list; longer lists are
        truncated with a '... (K more elements)' marker. Non-list data is
        printed whole (the previous revision referenced an undefined name
        here and raised NameError).

        Args:
            data: the data to print (normally a list).
            n: number of leading elements to show.
            indent: JSON indentation width.
            sort_keys: whether to sort object keys.
        """
        try:
            if not isinstance(data, list):
                print(json.dumps(data, indent=indent, ensure_ascii=False, sort_keys=sort_keys))
                return

            # Slice (never mutate) the caller's list.
            first_n_elements = data[:n]
            if len(data) > n:
                result = first_n_elements + ["... ({} more elements)".format(len(data) - n)]
            else:
                result = first_n_elements

            print(json.dumps(result, indent=indent, ensure_ascii=False, sort_keys=sort_keys))
        except TypeError as e:
            print(f"错误:无法格式化数据。详情:{e}")
        except Exception as e:
            print(f"打印时发生意外错误:{e}")

    def test_actor_list(self, url='https://www.javbus.com/uncensored/actresses/1'):
        """Manual smoke test: crawl the actress index from *url*, printing the
        aggregated results page by page. Performs live network I/O.

        Fixed here: the old version passed max_retries=1 (not accepted by
        fetch_page) and never reached its `break` when a fetch failed,
        retrying the same URL forever.
        """
        next_url = url
        all_data = []
        while next_url:
            print(f'fetching page {next_url}')
            soup, status_code = self.fetch_page(
                next_url,
                partial(self.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
            if soup:
                list_data, next_url = self.parse_actors_list(soup, next_url)
                if list_data:
                    all_data.extend(list_data)
                    self.pretty_print_json(all_data)
                else:
                    print('get wrong page.')
                if next_url:
                    print(f"\n\nnext url: {next_url}")
            else:
                # Stop instead of spinning on a page that keeps failing.
                print(f"wrong request. url: {next_url}, status_code: {status_code}")
                break

    def url_page_num(self, url):
        """Extract the page number from a javbus-style URL.

        Recognizes a `page=N` query parameter or a trailing numeric path
        segment (e.g. /uncensored/actresses/2). Returns None when no page
        number is present — the previous stub always returned None, which
        silently disabled pagination in every list parser.
        """
        if not url:
            return None
        match = re.search(r'[?&]page=(\d+)', url)
        if match:
            return int(match.group(1))
        match = re.search(r'/(\d+)/?$', url.split('?', 1)[0])
        if match:
            return int(match.group(1))
        return None
|
||||||
@ -1,71 +0,0 @@
|
|||||||
import logging
|
|
||||||
import cloudscraper
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
import src.utils.utils as utils
|
|
||||||
|
|
||||||
# 设置 headers 和 scraper
|
|
||||||
headers = {
|
|
||||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36 Edg/137.0.0.0'
|
|
||||||
}
|
|
||||||
# 定义 cookie
|
|
||||||
cookies = {
|
|
||||||
}
|
|
||||||
scraper = cloudscraper.create_scraper()
|
|
||||||
|
|
||||||
http_code_404 = 404
|
|
||||||
http_code_login = 401
|
|
||||||
http_code_local = 99
|
|
||||||
logging.getLogger().setLevel(logging.DEBUG)
|
|
||||||
#使用 CloudScraper 进行网络请求,并执行页面验证,支持不同解析器和预处理
|
|
||||||
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None, headers=headers, cookies=cookies):
|
|
||||||
for attempt in range(max_retries):
|
|
||||||
try:
|
|
||||||
if not utils.is_valid_url(url):
|
|
||||||
logging.error(f'wrong url format: {url}')
|
|
||||||
return None, None
|
|
||||||
|
|
||||||
response = scraper.get(url, headers=headers, cookies=cookies)
|
|
||||||
|
|
||||||
# 处理 HTTP 状态码
|
|
||||||
if response.status_code == 404:
|
|
||||||
logging.debug(f"Page not found (404): {url}")
|
|
||||||
return None, http_code_404 # 直接返回 404,调用方可以跳过
|
|
||||||
|
|
||||||
response.raise_for_status() # 处理 HTTP 错误
|
|
||||||
|
|
||||||
# 检查是否发生跳转,比如到登录页面
|
|
||||||
if response.history:
|
|
||||||
logging.debug(f"Page redirected on {url}. Checking if it's a login page.")
|
|
||||||
soup = BeautifulSoup(response.text, parser)
|
|
||||||
# 判断是否为登录页面,
|
|
||||||
if soup.find('div', id='ageVerify'):
|
|
||||||
logging.warning(f"Page redirected to login page on {url}.")
|
|
||||||
return None, http_code_login
|
|
||||||
|
|
||||||
# 预处理 HTML(如果提供了 preprocessor)
|
|
||||||
html_text = preprocessor(response.text) if preprocessor else response.text
|
|
||||||
|
|
||||||
soup = BeautifulSoup(html_text, parser)
|
|
||||||
if validator(soup): # 进行自定义页面检查
|
|
||||||
return soup, response.status_code
|
|
||||||
|
|
||||||
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
|
|
||||||
except cloudscraper.exceptions.CloudflareChallengeError as e:
|
|
||||||
logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retring...")
|
|
||||||
except cloudscraper.exceptions.CloudflareCode1020 as e:
|
|
||||||
logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retring...")
|
|
||||||
except Exception as e:
|
|
||||||
logging.error(f"Unexpected error on {url}: {e}, Retring...")
|
|
||||||
|
|
||||||
logging.error(f'Fetching failed after max retries. {url}')
|
|
||||||
return None, None # 达到最大重试次数仍然失败
|
|
||||||
|
|
||||||
# 通用的 HTML 结构验证器
|
|
||||||
def generic_validator(soup, tag, identifier, attr_type="id"):
|
|
||||||
if attr_type == "id":
|
|
||||||
return soup.find(tag, id=identifier) is not None
|
|
||||||
elif attr_type == "class":
|
|
||||||
return bool(soup.find_all(tag, class_=identifier))
|
|
||||||
elif attr_type == "name":
|
|
||||||
return bool(soup.find('select', {'name': identifier}))
|
|
||||||
return False
|
|
||||||
@ -1,515 +0,0 @@
|
|||||||
import cloudscraper
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
import json
|
|
||||||
from functools import partial
|
|
||||||
from urllib.parse import urljoin
|
|
||||||
import src.config.config as config
|
|
||||||
import src.utils.utils as utils
|
|
||||||
import src.crawling.craw_common as scraper
|
|
||||||
|
|
||||||
# 定义基础 URL 和可变参数
|
|
||||||
host_url = "https://www.javbus.com"
|
|
||||||
|
|
||||||
headers = {
|
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
||||||
"Sec-Fetch-Site": "none",
|
|
||||||
"Accept-Encoding": "gzip, deflate, br",
|
|
||||||
"Sec-Fetch-Mode": "navigate",
|
|
||||||
"Host": "www.javbus.com",
|
|
||||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15",
|
|
||||||
"Accept-Language": "zh-CN,zh-Hans;q=0.9",
|
|
||||||
"Sec-Fetch-Dest": "document",
|
|
||||||
"Connection": "keep-alive",
|
|
||||||
}
|
|
||||||
|
|
||||||
cookies = {
|
|
||||||
'PHPSESSID': 'l9m4ugaaao1hgvl3micr22u3o6',
|
|
||||||
'existmag': 'all',
|
|
||||||
'age': 'verified'
|
|
||||||
}
|
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
|
||||||
def parse_actors_list(soup, href):
|
|
||||||
div_actors = soup.find("div", id='waterfall')
|
|
||||||
if not div_actors:
|
|
||||||
logging.warning(f"Warning: No actors div found ")
|
|
||||||
return None, None
|
|
||||||
|
|
||||||
# 解析元素
|
|
||||||
rows = div_actors.find_all('div', class_='item')
|
|
||||||
|
|
||||||
list_data = []
|
|
||||||
next_url = None
|
|
||||||
for row in rows:
|
|
||||||
# 获取演员详情链接
|
|
||||||
actor_link = row.find('a')['href']
|
|
||||||
# 获取演员名字
|
|
||||||
actor_name = row.find('span').text.strip()
|
|
||||||
# 获取头像图片链接
|
|
||||||
avatar_url = row.find('img')['src']
|
|
||||||
|
|
||||||
list_data.append({
|
|
||||||
'name' : actor_name,
|
|
||||||
'href' : urljoin(host_url, actor_link),
|
|
||||||
'pic' : avatar_url
|
|
||||||
})
|
|
||||||
|
|
||||||
# 查找 "下一页" 按钮
|
|
||||||
div_link = soup.find("div", class_='text-center hidden-xs')
|
|
||||||
if div_link:
|
|
||||||
next_page_element = soup.find('a', id='next')
|
|
||||||
if next_page_element:
|
|
||||||
next_page_url = next_page_element['href']
|
|
||||||
next_url = urljoin(href, next_page_url)
|
|
||||||
|
|
||||||
return list_data, next_url
|
|
||||||
|
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
|
||||||
def parse_actor_detail(soup, href):
|
|
||||||
# 先找一下别名
|
|
||||||
alias_list = []
|
|
||||||
|
|
||||||
div_meta = soup.find('span', class_='actor-section-name')
|
|
||||||
if not div_meta:
|
|
||||||
logging.warning(f'warning: no meta data found in page {href}')
|
|
||||||
return None, None
|
|
||||||
alias_div = soup.find('div', class_='column section-title')
|
|
||||||
|
|
||||||
if alias_div:
|
|
||||||
meta_list = alias_div.find_all('span', class_='section-meta')
|
|
||||||
if len(meta_list) > 1:
|
|
||||||
alias_list = meta_list[0].text.strip().split(", ")
|
|
||||||
|
|
||||||
# 头像
|
|
||||||
pic = ''
|
|
||||||
avatar = soup.find("div", class_="column actor-avatar")
|
|
||||||
if avatar:
|
|
||||||
pic = parse_avatar_image(avatar)
|
|
||||||
|
|
||||||
# 返回数据
|
|
||||||
actor = {}
|
|
||||||
|
|
||||||
# 使用正则表达式查找 class 包含 'movie-list h cols-4' 的 div 元素
|
|
||||||
div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-'))
|
|
||||||
#div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
|
|
||||||
#div_movies = soup.find("div", class_='movie-list h cols-4 vcols-8')
|
|
||||||
if not div_movies:
|
|
||||||
logging.warning(f"Warning: No movies div found ")
|
|
||||||
return None, None
|
|
||||||
|
|
||||||
# 解析元素
|
|
||||||
rows = div_movies.find_all('div', class_='item')
|
|
||||||
|
|
||||||
list_data = []
|
|
||||||
next_url = None
|
|
||||||
for row in rows:
|
|
||||||
link = row.find('a', class_='box')['href']
|
|
||||||
serial_number = row.find('strong').text.strip()
|
|
||||||
title = row.find('div', class_='video-title').text.strip()
|
|
||||||
release_date = row.find('div', class_='meta').text.strip()
|
|
||||||
list_data.append({
|
|
||||||
'href' : host_url + link if link else '',
|
|
||||||
'serial_number' : serial_number,
|
|
||||||
'title' : title,
|
|
||||||
'release_date': release_date
|
|
||||||
})
|
|
||||||
|
|
||||||
# 查找 "下一页" 按钮
|
|
||||||
next_page_element = soup.find('a', class_='pagination-next')
|
|
||||||
if next_page_element:
|
|
||||||
next_page_url = next_page_element['href']
|
|
||||||
next_page_number = url_page_num(next_page_url)
|
|
||||||
current_page_number = url_page_num(href)
|
|
||||||
logging.debug(f'current_page: {current_page_number}, next page_num: {next_page_number}')
|
|
||||||
if current_page_number is None:
|
|
||||||
current_page_number = 0
|
|
||||||
if next_page_number and next_page_number > current_page_number :
|
|
||||||
next_url = host_url + next_page_url
|
|
||||||
|
|
||||||
actor = {
|
|
||||||
'pic' : pic,
|
|
||||||
'alias' : alias_list,
|
|
||||||
'movies' : list_data
|
|
||||||
}
|
|
||||||
|
|
||||||
return actor, next_url
|
|
||||||
|
|
||||||
|
|
||||||
# 解析单个元素
|
|
||||||
def parse_movie_one(soup, keys):
|
|
||||||
key_strong = soup.find('strong', string=lambda text: text in keys)
|
|
||||||
if key_strong:
|
|
||||||
key_span = key_strong.find_next_sibling('span', class_='value')
|
|
||||||
if key_span:
|
|
||||||
return key_span.text.strip()
|
|
||||||
return None
|
|
||||||
|
|
||||||
# 解析值和链接
|
|
||||||
# 解析值和链接
def parse_movie_val_href(soup, keys):
    """Return (text, absolute_link) for the labelled value.

    The link is taken from an <a> inside the value span when present
    (prefixed with host_url); otherwise the plain text is returned with
    link None. Returns (None, None) when the label or value is missing.
    """
    label = soup.find('strong', string=lambda text: text in keys)
    if label is not None:
        value_span = label.find_next_sibling('span', class_='value')
        if value_span is not None:
            anchor = value_span.find('a')
            if anchor is None:
                return value_span.text.strip(), None
            return anchor.text.strip(), host_url + anchor.get('href')
    return None, None
|
|
||||||
|
|
||||||
# 解析多个值和链接
|
|
||||||
# 解析多个值和链接
def parse_movie_arr(soup, keys):
    """Return a list of {'name', 'href'} dicts for every <a> inside the
    labelled value span; [] when the label or value span is missing."""
    label = soup.find('strong', string=lambda text: text in keys)
    if label is None:
        return []
    value_span = label.find_next_sibling('span', class_='value')
    if value_span is None:
        return []
    return [
        {'name': anchor.text.strip(), 'href': host_url + anchor.get('href')}
        for anchor in value_span.find_all('a')
    ]
|
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
|
||||||
# 解析 HTML 内容,提取需要的数据
def parse_movie_detail(soup, href, title):
    """Extract the metadata of a single movie detail page.

    Returns a dict of movie fields on success, or (None, None) when the
    meta panel is missing (NOTE: the success path returns a single dict,
    matching the original behavior).
    """
    panel = soup.find("div", class_='video-meta-panel')
    if panel is None:
        logging.warning(f"Warning: No movies div found ")
        return None, None

    # 获取封面图片
    cover_anchor = soup.select_one('.column-video-cover a')

    # 获取maker,系列
    maker_name, maker_link = parse_movie_val_href(soup, ['片商:', 'Maker:'])
    series_name, series_link = parse_movie_val_href(soup, ['系列:', 'Series:'])
    pub_name, pub_link = parse_movie_val_href(soup, ['發行:', 'Publisher:'])

    # Key order mirrors the original insertion order.
    return {
        'href': href,
        'title': title,
        'cover_url': cover_anchor['href'] if cover_anchor else None,
        'serial_number': parse_movie_one(soup, ['番號:', 'ID:']),
        'release_date': parse_movie_one(soup, ['日期:', 'Released Date:']),
        'duration': parse_movie_one(soup, ['時長:', 'Duration:']),
        'maker_name': maker_name,
        'maker_link': maker_link,
        'series_name': series_name,
        'series_link': series_link,
        'pub_name': pub_name,
        'pub_link': pub_link,
        'tags': parse_movie_arr(soup, ['類別:', 'Tags:']),
        'actors': parse_movie_arr(soup, ['演員:', 'Actor(s):']),
    }
|
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
|
||||||
# 解析 HTML 内容,提取需要的数据
def parse_series_uncensored(soup, href):
    """Parse one page of the uncensored series index.

    Args:
        soup: BeautifulSoup document of the listing page.
        href: URL of the page being parsed (compared against the
              "next page" link to decide whether to paginate forward).

    Returns:
        (list_data, next_url): list_data is a list of
        {'name', 'href', 'movies'} dicts; next_url is the absolute URL of
        the next page or None. Returns (None, None) when the series
        container is missing.
    """
    div_series = soup.find("div", id='series')
    if not div_series:
        logging.warning(f"Warning: No div_series div found ")
        return None, None

    # 解析元素
    rows = div_series.find_all('a', class_='box')

    list_data = []
    next_url = None
    for row in rows:
        name = row.find('strong').text.strip()
        # Bug fix: this was assigned to `href`, shadowing the function
        # parameter, so the pagination check below compared against the
        # last row's link instead of the current page URL.
        row_href = row['href']
        div_movies = row.find('span')
        movies = 0
        if div_movies:
            # movie count is rendered like "(123)"
            match = re.search(r'\((\d+)\)', div_movies.text.strip())
            if match:
                movies = int(match.group(1))

        list_data.append({
            'name' : name,
            'href' : host_url + row_href if row_href else '',
            'movies' : movies
        })

    # 查找 "下一页" 按钮 — only follow it when it points strictly forward,
    # which guards against pagination loops.
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url
|
|
||||||
|
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
|
||||||
# 解析 HTML 内容,提取需要的数据
def parse_series_detail(soup, href):
    """Parse one page of a series' movie listing.

    Returns (movies, next_url); movies is [] when the listing container
    is absent, next_url is None when there is no forward page.
    """
    container = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
    if container is None:
        logging.warning(f"Warning: No movies div found ")
        return [], None

    def _entry(item):
        # One movie card -> dict of its fields.
        box_link = item.find('a', class_='box')['href']
        return {
            'href': host_url + box_link if box_link else '',
            'serial_number': item.find('strong').text.strip(),
            'title': item.find('div', class_='video-title').text.strip(),
            'release_date': item.find('div', class_='meta').text.strip(),
        }

    movies = [_entry(item) for item in container.find_all('div', class_='item')]

    # Follow the "next page" button only when it points strictly forward.
    follow_url = None
    pager = soup.find('a', class_='pagination-next')
    if pager:
        candidate = pager['href']
        upcoming = url_page_num(candidate)
        current = url_page_num(href)
        if current is None:
            current = 0
        if upcoming and upcoming > current:
            follow_url = host_url + candidate

    return movies, follow_url
|
|
||||||
|
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
|
||||||
# 解析 HTML 内容,提取需要的数据
def parse_makers_uncensored(soup, href):
    """Parse one page of the uncensored makers index.

    Args:
        soup: BeautifulSoup document of the listing page.
        href: URL of the page being parsed (compared against the
              "next page" link to decide whether to paginate forward).

    Returns:
        (list_data, next_url): list_data is a list of
        {'name', 'href', 'movies'} dicts; next_url is the absolute URL of
        the next page or None. Returns (None, None) when the makers
        container is missing.
    """
    div_series = soup.find("div", id='makers')
    if not div_series:
        logging.warning(f"Warning: No makers div found ")
        return None, None

    # 解析元素
    rows = div_series.find_all('a', class_='box')

    list_data = []
    next_url = None
    for row in rows:
        name = row.find('strong').text.strip()
        # Bug fix: this was assigned to `href`, shadowing the function
        # parameter, so the pagination check below compared against the
        # last row's link instead of the current page URL.
        row_href = row['href']
        div_movies = row.find('span')
        movies = 0
        if div_movies:
            # movie count is rendered like "(123)"
            match = re.search(r'\((\d+)\)', div_movies.text.strip())
            if match:
                movies = int(match.group(1))

        list_data.append({
            'name' : name,
            'href' : host_url + row_href if row_href else '',
            'movies' : movies
        })

    # 查找 "下一页" 按钮 — only follow it when it points strictly forward,
    # which guards against pagination loops.
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url
|
|
||||||
|
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
|
||||||
# 解析 HTML 内容,提取需要的数据
def parse_maker_detail(soup, href):
    """Parse one page of a maker's movie listing.

    Returns (movies, next_url); movies is [] when the listing container
    is absent, next_url is None when there is no forward page.
    """
    listing = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
    if not listing:
        logging.warning(f"Warning: No movies div found ")
        return [], None

    movies = []
    for item in listing.find_all('div', class_='item'):
        rel_link = item.find('a', class_='box')['href']
        movies.append({
            'href': host_url + rel_link if rel_link else '',
            'serial_number': item.find('strong').text.strip(),
            'title': item.find('div', class_='video-title').text.strip(),
            'release_date': item.find('div', class_='meta').text.strip(),
        })

    # Follow the "next page" button only when it points strictly forward.
    next_url = None
    pager = soup.find('a', class_='pagination-next')
    if pager:
        target = pager['href']
        target_num = url_page_num(target)
        page_num = url_page_num(href)
        if page_num is None:
            page_num = 0
        if target_num and target_num > page_num:
            next_url = host_url + target

    return movies, next_url
|
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
|
||||||
# 解析 HTML 内容,提取需要的数据
def parse_publisher_detail(soup, href):
    """Parse one page of a publisher's movie listing (same card layout
    as the maker/series listings).

    Returns (entries, successor); entries is [] when the grid is absent,
    successor is the next page URL or None.
    """
    grid = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
    if grid is None:
        logging.warning(f"Warning: No movies div found ")
        return [], None

    entries = []
    for cell in grid.find_all('div', class_='item'):
        rel = cell.find('a', class_='box')['href']
        record = {
            'href': (host_url + rel) if rel else '',
            'serial_number': cell.find('strong').text.strip(),
            'title': cell.find('div', class_='video-title').text.strip(),
            'release_date': cell.find('div', class_='meta').text.strip(),
        }
        entries.append(record)

    # Only advance when the "next page" link points strictly forward.
    successor = None
    pager = soup.find('a', class_='pagination-next')
    if pager is not None:
        nxt_href = pager['href']
        nxt_num = url_page_num(nxt_href)
        cur_num = url_page_num(href)
        cur_num = 0 if cur_num is None else cur_num
        if nxt_num and nxt_num > cur_num:
            successor = host_url + nxt_href

    return entries, successor
|
|
||||||
|
|
||||||
|
|
||||||
# 解析 HTML 内容,提取需要的数据
|
|
||||||
# 解析 HTML 内容,提取需要的数据
def parse_uncensored(soup, href):
    """Parse one page of the uncensored movie index.

    Returns (page_items, forward); page_items is [] when the movie grid
    is absent, forward is the next page URL or None.
    """
    movie_div = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
    if not movie_div:
        logging.warning(f"Warning: No movies div found ")
        return [], None

    def _record(node):
        # One movie card -> dict of its fields.
        path = node.find('a', class_='box')['href']
        return {
            'href': host_url + path if path else '',
            'serial_number': node.find('strong').text.strip(),
            'title': node.find('div', class_='video-title').text.strip(),
            'release_date': node.find('div', class_='meta').text.strip(),
        }

    page_items = list(map(_record, movie_div.find_all('div', class_='item')))

    # Only advance when the "next page" link points strictly forward.
    forward = None
    pager = soup.find('a', class_='pagination-next')
    if pager:
        pager_href = pager['href']
        pager_page = url_page_num(pager_href)
        this_page = url_page_num(href)
        if this_page is None:
            this_page = 0
        if pager_page and pager_page > this_page:
            forward = host_url + pager_href

    return page_items, forward
|
|
||||||
|
|
||||||
|
|
||||||
def pretty_print_json(data, n=10, indent=4, sort_keys=False):
    """Pretty-print the first *n* elements of a list; the rest are
    summarized as a "... (k more elements)" marker.

    Args:
        data: the value to print (usually a list; other types are
              printed whole).
        n: number of list elements to show.
        indent: indentation width passed to json.dumps.
        sort_keys: whether json.dumps sorts dict keys.
    """
    try:
        # Non-list data is printed whole.
        # Bug fix: this branch previously printed `formatted` before it was
        # ever assigned, raising NameError (swallowed by the broad except),
        # so non-list data was never printed.
        if not isinstance(data, list):
            print(json.dumps(data, indent=indent, ensure_ascii=False, sort_keys=sort_keys))
            return

        # Take the first n elements without mutating the caller's list.
        shown = data[:n]

        # If the list is longer than n, append a "..." marker.
        if len(data) > n:
            shown = shown + ["... ({} more elements)".format(len(data) - n)]

        print(json.dumps(shown, indent=indent, ensure_ascii=False, sort_keys=sort_keys))

    except TypeError as e:
        print(f"错误:无法格式化数据。详情:{e}")
    except Exception as e:
        print(f"打印时发生意外错误:{e}")
|
|
||||||
|
|
||||||
def test_actor_list(url='https://www.javbus.com/uncensored/actresses/1'):
    """Manual smoke test: crawl the actress listing starting at *url*,
    accumulating parsed rows and printing them after each page."""
    pending = url
    collected = []
    while pending:
        print(f'fetching page {pending}')
        soup, status_code = scraper.fetch_page(
            pending,
            partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"),
            max_retries=1, headers=headers, cookies=cookies)
        if not soup:
            # Fetch failed: report and stop crawling.
            print(f"wrong request. url: {pending}, status_code: {status_code}")
            break
        page_rows, pending = parse_actors_list(soup, pending)
        if page_rows:
            collected.extend(page_rows)
            pretty_print_json(collected)
        else:
            print('get wrong page.')
        if pending:
            print(f"\n\nnext url: {pending}")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Manual smoke tests — uncomment the scenario to exercise.
    #test_actors_list()
    #test_actor()
    #test_movie_detail()
    #test_series_list()
    #test_series_detail()
    # Verbose logging so page-number decisions are visible while crawling.
    logging.getLogger().setLevel(logging.DEBUG)
    test_actor_list()
    test_actor_list('https://www.javbus.com/en/actresses')
|
|
||||||
|
|
||||||
@ -1,121 +0,0 @@
|
|||||||
import sqlite3
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
from datetime import datetime
|
|
||||||
import src.config.config as config
|
|
||||||
|
|
||||||
# 连接 SQLite 数据库
|
|
||||||
# SQLite database location.
DB_PATH = f"{config.global_share_data_dir}/sqlite/shared.db"  # replace with your database file

# Detect SQLite builds older than 3.24.0, which lack
# "INSERT ... ON CONFLICT DO UPDATE" (upsert) support; the insert/update
# helpers below switch to an exception-based fallback in that case.
lower_sqlite_version = False
sqlite_version = sqlite3.sqlite_version_info
if sqlite_version < (3, 24, 0):
    lower_sqlite_version = True
|
|
||||||
|
|
||||||
# 获取表的列名和默认值
|
|
||||||
# 获取表的列名和默认值
def get_table_columns_and_defaults(cursor, tbl_name):
    """Return a {column_name: default_value} mapping for *tbl_name* via
    PRAGMA table_info, or None when the pragma fails."""
    try:
        cursor.execute(f"PRAGMA table_info({tbl_name})")
        # table_info rows: (cid, name, type, notnull, dflt_value, pk)
        return {row[1]: row[4] for row in cursor.fetchall()}
    except sqlite3.Error as e:
        logging.error(f"Error getting table columns: {e}")
        return None
|
|
||||||
|
|
||||||
# 检查并处理数据
|
|
||||||
# 检查并处理数据
def check_and_process_data(cursor, data, tbl_name):
    """Filter *data* down to columns that exist in *tbl_name*, skipping
    DB-managed columns and stamping updated_at.

    Returns the filtered dict, or None when the table schema could not
    be read.
    """
    column_info = get_table_columns_and_defaults(cursor=cursor, tbl_name=tbl_name)
    if column_info is None:
        return None

    processed = {}
    for column in column_info:
        if column in ('id', 'created_at'):
            # auto-increment PK / creation timestamp: left to table defaults
            continue
        if column == 'updated_at':
            # stamp with "now"; an explicit value in *data* still wins below
            processed[column] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        if column in data:
            processed[column] = data[column]

    return processed
|
|
||||||
|
|
||||||
|
|
||||||
# 插入或更新数据
|
|
||||||
# 插入或更新数据
def insert_or_update_common(cursor, conn, data, tbl_name, uniq_key='url'):
    """Upsert *data* into *tbl_name* keyed on *uniq_key*.

    Uses "INSERT ... ON CONFLICT DO UPDATE"; on SQLite < 3.24.0 (no
    upsert support) it delegates to insert_or_update_common_lower.
    Returns the row's id, or None on failure.
    """
    if lower_sqlite_version:
        return insert_or_update_common_lower(cursor, conn, data, tbl_name, uniq_key)

    try:
        # Keep only columns that exist in the table (also stamps updated_at).
        processed_data = check_and_process_data(cursor, data, tbl_name)
        if processed_data is None:
            return None

        columns = ', '.join(processed_data.keys())
        values = list(processed_data.values())
        placeholders = ', '.join(['?' for _ in values])
        # On conflict, overwrite every column except the unique key itself.
        update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key])

        # NOTE: table/column names are interpolated into the SQL; they must
        # come from trusted code, only the values are parameterized.
        sql = f'''
            INSERT INTO {tbl_name} ({columns})
            VALUES ({placeholders})
            ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}
        '''
        cursor.execute(sql, values)
        conn.commit()

        # Fetch the id of the row that was inserted or updated.
        cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
        report_id = cursor.fetchone()[0]
        return report_id
    except sqlite3.Error as e:
        logging.error(f"Error inserting or updating data: {e}")
        return None
|
|
||||||
|
|
||||||
# 插入或更新数据
|
|
||||||
# 插入或更新数据
def insert_or_update_common_lower(cursor, conn, data, tbl_name, uniq_key='url'):
    """Upsert fallback for SQLite < 3.24.0 (no ON CONFLICT ... DO UPDATE):
    try a plain INSERT first; when it hits a unique-key IntegrityError,
    fall back to an UPDATE of the existing row.

    Returns the row's id, or None on failure.
    """
    try:
        # Keep only columns that exist in the table (also stamps updated_at).
        processed_data = check_and_process_data(cursor, data, tbl_name)
        if processed_data is None:
            return None

        columns = ', '.join(processed_data.keys())
        values = list(processed_data.values())
        placeholders = ', '.join(['?' for _ in values])

        # First attempt: plain insert.
        try:
            sql = f'''
                INSERT INTO {tbl_name} ({columns})
                VALUES ({placeholders})
            '''
            cursor.execute(sql, values)
            conn.commit()
        except sqlite3.IntegrityError:  # unique-key conflict -> update instead
            update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key])
            update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key]
            # WHERE value comes from the caller-supplied data, matching the
            # unique key used by the INSERT.
            update_values.append(data[uniq_key])
            sql = f"UPDATE {tbl_name} SET {update_clause} WHERE {uniq_key} = ?"
            cursor.execute(sql, update_values)
            conn.commit()

        # Fetch the id of the row that was inserted or updated.
        cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
        report_id = cursor.fetchone()[0]
        return report_id
    except sqlite3.Error as e:
        logging.error(f"Error inserting or updating data: {e}")
        return None
|
|
||||||
|
|
||||||
|
|
||||||
# 测试代码
|
|
||||||
# Ad-hoc test entry point: dump the schema of one table.
if __name__ == "__main__":
    conn = sqlite3.connect(DB_PATH, check_same_thread=False)
    cursor = conn.cursor()

    tbl_name_actors = 'javhd_models'
    print(get_table_columns_and_defaults(cursor, tbl_name_actors))
|
|
||||||
File diff suppressed because it is too large
Load Diff
204
src/db_utils/sqlite_db.py
Normal file
204
src/db_utils/sqlite_db.py
Normal file
@ -0,0 +1,204 @@
|
|||||||
|
import sqlite3
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
import src.config.config as config
|
||||||
|
|
||||||
|
# Default location of the shared SQLite database file.
default_dbpath = f"{config.global_share_data_dir}/sqlite/shared.db"
|
||||||
|
|
||||||
|
# 数据库基类,封装了通用的操作。
|
||||||
|
# 数据库基类,封装了通用的操作。
class DatabaseHandler:
    """Base SQLite handler: owns a connection/cursor and provides generic
    schema-aware insert-or-update helpers for subclasses."""

    def __init__(self, db_path=None):
        """Open (and if needed create the directory for) the database at
        *db_path*, falling back to the module default path."""
        # Use the caller-supplied path or the default one.
        self.DB_PATH = db_path or default_dbpath

        # Ensure the parent directory exists (only for explicit paths).
        if db_path and not os.path.exists(os.path.dirname(db_path)):
            os.makedirs(os.path.dirname(db_path))

        self.conn = sqlite3.connect(self.DB_PATH, check_same_thread=False)
        self.cursor = self.conn.cursor()

        # SQLite < 3.24.0 lacks "INSERT ... ON CONFLICT DO UPDATE";
        # remember that so the upsert helper can use the fallback path.
        self.lower_sqlite_version = False
        sqlite_version = sqlite3.sqlite_version_info
        if sqlite_version < (3, 24, 0):
            self.lower_sqlite_version = True

    def get_table_columns_and_defaults(self, tbl_name):
        """Return {column_name: default_value} for *tbl_name* via
        PRAGMA table_info, or None when the pragma fails."""
        try:
            self.cursor.execute(f"PRAGMA table_info({tbl_name})")
            columns = self.cursor.fetchall()
            column_info = {}
            for col in columns:
                # table_info rows: (cid, name, type, notnull, dflt_value, pk)
                col_name = col[1]
                default_value = col[4]
                column_info[col_name] = default_value
            return column_info
        except sqlite3.Error as e:
            logging.error(f"Error getting table columns: {e}")
            return None

    def check_and_process_data(self, data, tbl_name):
        """Filter *data* down to columns that exist in *tbl_name*, skipping
        DB-managed columns and stamping updated_at. Returns the filtered
        dict, or None when the schema could not be read."""
        column_info = self.get_table_columns_and_defaults(tbl_name)
        if column_info is None:
            return None
        processed_data = {}
        for col, default in column_info.items():
            if col == 'id' or col == 'created_at':  # auto-increment PK / creation timestamp: table defaults apply
                continue
            if col == 'updated_at':  # stamp with "now"; an explicit value in *data* still wins below
                processed_data[col] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            if col in data:
                processed_data[col] = data[col]

        return processed_data

    def insert_or_update_common(self, data, tbl_name, uniq_key='url'):
        """Upsert *data* into *tbl_name* keyed on *uniq_key*; returns the
        row id or None. Delegates to the fallback on old SQLite."""
        if self.lower_sqlite_version:
            return self.insert_or_update_common_lower(data, tbl_name, uniq_key)

        try:
            processed_data = self.check_and_process_data(data, tbl_name)
            if processed_data is None:
                return None

            columns = ', '.join(processed_data.keys())
            values = list(processed_data.values())
            placeholders = ', '.join(['?' for _ in values])
            # On conflict, overwrite every column except the unique key.
            update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key])

            sql = f'''
                INSERT INTO {tbl_name} ({columns})
                VALUES ({placeholders})
                ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}
            '''
            self.cursor.execute(sql, values)
            self.conn.commit()

            # Fetch the id of the row that was inserted or updated.
            self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
            report_id = self.cursor.fetchone()[0]
            return report_id
        except sqlite3.Error as e:
            logging.error(f"Error inserting or updating data: {e}")
            return None

    def insert_or_update_common_lower(self, data, tbl_name, uniq_key='url'):
        """Upsert fallback for SQLite < 3.24.0: plain INSERT first, then
        UPDATE on a unique-key IntegrityError. Returns the row id or None."""
        try:
            processed_data = self.check_and_process_data(data, tbl_name)
            if processed_data is None:
                return None

            columns = ', '.join(processed_data.keys())
            values = list(processed_data.values())
            placeholders = ', '.join(['?' for _ in values])

            # First attempt: plain insert.
            try:
                sql = f'''
                    INSERT INTO {tbl_name} ({columns})
                    VALUES ({placeholders})
                '''
                self.cursor.execute(sql, values)
                self.conn.commit()
            except sqlite3.IntegrityError:  # unique-key conflict -> update instead
                update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key])
                update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key]
                update_values.append(data[uniq_key])
                sql = f"UPDATE {tbl_name} SET {update_clause} WHERE {uniq_key} = ?"
                self.cursor.execute(sql, update_values)
                self.conn.commit()

            # Fetch the id of the row that was inserted or updated.
            self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
            report_id = self.cursor.fetchone()[0]
            return report_id
        except sqlite3.Error as e:
            logging.error(f"Error inserting or updating data: {e}")
            return None

    def insert_task_log(self):
        # Placeholder — always returns 1; TODO implement task logging.
        return 1

    def update_task_log(self, task_id, task_status):
        # Placeholder — always returns 1; TODO implement task logging.
        return 1

    def finalize_task_log(self, task_id):
        # Placeholder — always returns 1; TODO implement task logging.
        return 1

    def close(self):
        """Release the cursor and the underlying connection."""
        self.cursor.close()
        self.conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
# javbus 类
|
||||||
|
# javbus 类
class JavbusDBHandler(DatabaseHandler):
    """DatabaseHandler specialized for the javbus_actors table."""

    def __init__(self, db_path=None):
        super().__init__(db_path)
        # Table holding the crawled actor index/details.
        self.tbl_name_actors = 'javbus_actors'

    def insert_actor_index(self, data, uncensored=0, from_actor_list=0, from_movie_list=0):
        """Upsert an actor index row (keyed on 'href'); returns the row id
        or None. The origin flags are only written when truthy, so an
        existing flag is never cleared by a later upsert."""
        data['uncensored'] = uncensored
        if from_actor_list:
            data['from_actor_list'] = from_actor_list
        if from_movie_list:
            data['from_movie_list'] = from_movie_list
        try:
            return self.insert_or_update_common(data, self.tbl_name_actors, uniq_key='href')
        except sqlite3.Error as e:
            logging.error(f"Error inserting or updating data: {e}")
            return None

    def update_actor_detail(self, data, is_full_data=1):
        """Upsert an actor's full detail row (keyed on 'href'), marking it
        with the given is_full_data state; returns the row id or None."""
        try:
            data['is_full_data'] = is_full_data
            return self.insert_or_update_common(data, self.tbl_name_actors, uniq_key='href')
        except sqlite3.Error as e:
            logging.error(f"Error inserting or updating data: {e}")
            return None

    def query_actors(self, **filters):
        """Query actors with optional filters (id, url, en_name,
        is_full_data, start_id, is_full_data_in/_not_in, order_by, limit);
        returns a list of {'url', 'name'} dicts, or None on error."""
        try:
            # NOTE(review): the SELECT reads columns `url` and `en_name`,
            # but the filters below compare `href` and `name` — verify the
            # actual javbus_actors schema; this looks inconsistent with the
            # 'href' unique key used by the insert methods.
            sql = f"SELECT url, en_name as name FROM {self.tbl_name_actors} WHERE 1=1"
            params = []

            # Filter name -> SQL fragment; one bound parameter each.
            conditions = {
                "id": " AND id = ?",
                "url": " AND href = ?",
                "en_name": " AND name LIKE ?",
                "is_full_data": " AND is_full_data = ?",
                "start_id": " AND id > ?",
            }

            for key, condition in conditions.items():
                if key in filters:
                    sql += condition
                    if key == "en_name":
                        # substring match
                        params.append(f"%{filters[key]}%")
                    else:
                        params.append(filters[key])

            # Set-membership filters on is_full_data.
            for key in ["is_full_data_in", "is_full_data_not_in"]:
                if key in filters:
                    values = filters[key]
                    if values:
                        placeholders = ", ".join(["?"] * len(values))
                        operator = "IN" if key == "is_full_data_in" else "NOT IN"
                        sql += f" AND is_full_data {operator} ({placeholders})"
                        params.extend(values)

            if "order_by" in filters:
                # ORDER BY takes a column name directly; it cannot be a bound
                # parameter (it would be treated as a string literal), so the
                # caller-supplied value is interpolated — must be trusted.
                sql += f" ORDER BY {filters['order_by']} "

            if 'limit' in filters:
                sql += " LIMIT ?"
                params.append(filters["limit"])

            self.cursor.execute(sql, params)
            return [{'url': row[0], 'name': row[1]} for row in self.cursor.fetchall()]
        except sqlite3.Error as e:
            logging.error(f"查询 href 失败: {e}")
            return None
|
||||||
|
|
||||||
@ -9,12 +9,13 @@ from functools import partial
|
|||||||
from urllib.parse import urljoin, urlparse
|
from urllib.parse import urljoin, urlparse
|
||||||
import src.config.config as config
|
import src.config.config as config
|
||||||
import src.logger.logger as logger
|
import src.logger.logger as logger
|
||||||
import src.db_utils.db_javbus as db_tools
|
import src.db_utils.sqlite_db as sqlite_db
|
||||||
import src.crawling.craw_common as scraper_base
|
import src.crawling.craw as craw
|
||||||
import src.crawling.craw_javbus as scraper
|
|
||||||
import src.utils.utils as utils
|
import src.utils.utils as utils
|
||||||
|
|
||||||
logger.setup_logging()
|
logger.setup_logging()
|
||||||
|
db_tools = sqlite_db.JavbusDBHandler()
|
||||||
|
scraper = craw.JavbusCrawler()
|
||||||
|
|
||||||
debug = False
|
debug = False
|
||||||
skip_local = False
|
skip_local = False
|
||||||
@ -34,7 +35,7 @@ def fetch_actor_list_lang(lang="en", uncensored=None):
|
|||||||
num = 1
|
num = 1
|
||||||
while current_url:
|
while current_url:
|
||||||
logging.info(f"fetching url {current_url}")
|
logging.info(f"fetching url {current_url}")
|
||||||
soup, status_code = scraper_base.fetch_page(current_url, partial(scraper_base.generic_validator, tag="div", identifier="waterfall", attr_type="id"), headers=scraper.headers, cookies=scraper.cookies)
|
soup, status_code = scraper.fetch_page(current_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
|
||||||
if soup:
|
if soup:
|
||||||
list_data, current_url = scraper.parse_actors_list(soup, current_url)
|
list_data, current_url = scraper.parse_actors_list(soup, current_url)
|
||||||
if list_data :
|
if list_data :
|
||||||
@ -50,9 +51,12 @@ def fetch_actor_list_lang(lang="en", uncensored=None):
|
|||||||
else:
|
else:
|
||||||
logging.warning(f'fetch actor error. {current_url} ...')
|
logging.warning(f'fetch actor error. {current_url} ...')
|
||||||
|
|
||||||
elif status_code and status_code == 404:
|
elif status_code :
|
||||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {current_url}')
|
logging.warning(f'fetch page error. httpcode: {status_code}, url: {current_url}')
|
||||||
break
|
break
|
||||||
|
else: # 达到失败上限,加上休眠继续重试
|
||||||
|
time.sleep(5)
|
||||||
|
|
||||||
time.sleep(0.3)
|
time.sleep(0.3)
|
||||||
|
|
||||||
# 调试break
|
# 调试break
|
||||||
@ -62,36 +66,13 @@ def fetch_actor_list_lang(lang="en", uncensored=None):
|
|||||||
# 获取演员列表
|
# 获取演员列表
|
||||||
def fetch_actor_list():
|
def fetch_actor_list():
|
||||||
#for lang in ["en", "ja", "zh"]:
|
#for lang in ["en", "ja", "zh"]:
|
||||||
for lang in ['ja']:
|
for lang in ['en']:
|
||||||
fetch_actor_list_lang(lang=lang, uncensored=1)
|
fetch_actor_list_lang(lang=lang, uncensored=1)
|
||||||
|
|
||||||
#for lang in ["en", "ja", "zh"]:
|
#for lang in ["en", "ja", "zh"]:
|
||||||
for lang in ['ja']:
|
for lang in ['en']:
|
||||||
fetch_actor_list_lang(lang=lang)
|
fetch_actor_list_lang(lang=lang)
|
||||||
|
|
||||||
|
|
||||||
# 获取演员列表
|
|
||||||
def fetch_actor_list2():
|
|
||||||
next_url = scraper.actors_uncensored_base_url
|
|
||||||
while next_url:
|
|
||||||
logging.info(f'fetching page {next_url}')
|
|
||||||
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="actors", attr_type="id"))
|
|
||||||
if soup:
|
|
||||||
list_data, next_url = scraper.parse_actors_uncensored(soup, next_url)
|
|
||||||
if list_data :
|
|
||||||
# 写入数据库
|
|
||||||
for row in list_data:
|
|
||||||
actor_id = db_tools.insert_actor_index(name=row['name'], href=row.get('href', ''), from_actor_list=1)
|
|
||||||
if actor_id:
|
|
||||||
logging.debug(f'insert performer index to db. performer_id:{actor_id}, name: {row['name']}, href:{row['href']}')
|
|
||||||
else:
|
|
||||||
logging.warning(f'insert performer index failed. name: {row['name']}, href:{row['href']}')
|
|
||||||
else:
|
|
||||||
logging.warning(f'fetch actor error. {next_url} ...')
|
|
||||||
elif status_code and status_code == 404:
|
|
||||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
|
|
||||||
break
|
|
||||||
|
|
||||||
# 获取makers列表
|
# 获取makers列表
|
||||||
def fetch_makers_list():
|
def fetch_makers_list():
|
||||||
next_url = scraper.makers_uncensored_base_url
|
next_url = scraper.makers_uncensored_base_url
|
||||||
|
|||||||
Reference in New Issue
Block a user