# stock/scripts/javdb/src/scraper.py

import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
import config

# Base URL and the uncensored list endpoints
host_url = "https://www.javdb.com"
actors_uncensored_base_url = f'{host_url}/actors/uncensored'
series_uncensored_base_url = f'{host_url}/series/uncensored'
makers_uncensored_base_url = f'{host_url}/makers/uncensored'

# Request headers and the CloudScraper session
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
scraper = cloudscraper.create_scraper()

# Fetch a page with CloudScraper and validate it; supports alternative parsers
# and optional HTML preprocessing before parsing.
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    for attempt in range(max_retries):
        try:
            if 'javdb.com' not in url.lower():
                logging.error(f'wrong url format: {url}')
                return None, None
            response = scraper.get(url, headers=headers)
            # Handle HTTP status codes
            if response.status_code == 404:
                logging.warning(f"Page not found (404): {url}")
                return None, 404  # Return 404 directly so the caller can skip this page
            response.raise_for_status()  # Raise on other HTTP errors
            # Preprocess the HTML if a preprocessor was supplied
            html_text = preprocessor(response.text) if preprocessor else response.text
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # Run the custom page check
                return soup, response.status_code
            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, retrying...")
        if attempt < max_retries - 1:
            time.sleep(2 * (attempt + 1))  # Back off briefly before retrying
    logging.error(f'Fetching failed after max retries. {url}')
    return None, None  # Still failing after reaching the retry limit
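# Example usage (mirrors the test functions at the bottom of this file):
# fetch the uncensored-actors list page and verify that it contains
# <div id="actors"> before parsing.
#   soup, status = fetch_page(actors_uncensored_base_url,
#                             partial(generic_validator, tag="div", identifier="actors", attr_type="id"))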

# Repair the HTML structure: strip stray <br> tags and force <a> tags to open
# in a new tab; needed when scraping the ethnicity field on actor pages.
def preprocess_html(html):
    return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')
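# For example, preprocess_html('<br><a href="/x">') returns '<a target="_blank" href="/x">'.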

# Generic HTML structure validator: confirm the expected element is present
def generic_validator(soup, tag, identifier, attr_type="id"):
    if attr_type == "id":
        return soup.find(tag, id=identifier) is not None
    elif attr_type == "class":
        return bool(soup.find_all(tag, class_=identifier))
    elif attr_type == "name":
        # Note: this branch checks for a <select> element regardless of `tag`
        return bool(soup.find('select', {'name': identifier}))
    return False

# Extract the page number from a pagination link
def url_page_num(href):
    if href is None:
        return None
    match = re.search(r'page=(\d+)', href)
    if match:
        return int(match.group(1))
    return None
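# For example, url_page_num('/actors/uncensored?page=3') returns 3, while
# url_page_num('/actors/uncensored') returns None.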

# <span class="avatar" style="background-image: url(https://c0.jdbstatic.com/avatars/md/mdRn.jpg)"></span>
def parse_avatar_image(soup):
    try:
        span = soup.find("span", class_="avatar")
        if not span:
            return ""  # No <span> element found
        style = span.get("style", "")
        match = re.search(r'url\(["\']?(.*?)["\']?\)', style)
        return match.group(1) if match else ""  # URL on success, else an empty string
    except Exception:
        return ""  # Swallow any parsing exception and return an empty string

# Parse the uncensored-actors list page and extract the actor entries
def parse_actors_uncensored(soup, href):
    div_actors = soup.find("div", id='actors')
    if not div_actors:
        logging.warning("No actors div found")
        return None, None
    # Parse the actor boxes
    rows = div_actors.find_all('div', class_='box actor-box')
    list_data = []
    next_url = None
    for row in rows:
        # Link to the actor's detail page
        actor_link = row.find('a')['href']
        # Actor name
        actor_name = row.find('strong').text.strip()
        # Avatar image URL
        avatar_url = row.find('img', class_='avatar')['src']
        # Aliases from the title attribute
        alias_list = row.find('a')['title'].split(", ")
        list_data.append({
            'name' : actor_name,
            'href' : host_url + actor_link if actor_link else '',
            'pic'  : avatar_url,
            'alias': alias_list
        })
    # Look for the "next page" button
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url
    return list_data, next_url
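
# The "next page" logic above is duplicated in every parser below. A small
# helper along these lines could replace it (a sketch only; the existing
# parsers are not wired to use it):
def find_next_page_url(soup, current_href):
    element = soup.find('a', class_='pagination-next')
    if not element:
        return None
    candidate = element['href']
    next_num = url_page_num(candidate)
    current_num = url_page_num(current_href) or 0  # treat a missing page param as page 0
    if next_num and next_num > current_num:
        return host_url + candidate
    return None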

# Parse an actor detail page: aliases, avatar, and the movie list
def parse_actor_detail(soup, href):
    # Collect aliases first
    alias_list = []
    div_meta = soup.find('span', class_='actor-section-name')
    if not div_meta:
        logging.warning(f'no meta data found in page {href}')
        return None, None
    alias_div = soup.find('div', class_='column section-title')
    if alias_div:
        meta_list = alias_div.find_all('span', class_='section-meta')
        if len(meta_list) > 1:
            alias_list = meta_list[0].text.strip().split(", ")
    # Avatar
    pic = ''
    avatar = soup.find("div", class_="column actor-avatar")
    if avatar:
        pic = parse_avatar_image(avatar)
    # Movie list
    div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
    if not div_movies:
        logging.warning("No movies div found")
        return None, None
    rows = div_movies.find_all('div', class_='item')
    list_data = []
    next_url = None
    for row in rows:
        link = row.find('a', class_='box')['href']
        serial_number = row.find('strong').text.strip()
        title = row.find('div', class_='video-title').text.strip()
        release_date = row.find('div', class_='meta').text.strip()
        list_data.append({
            'href' : host_url + link if link else '',
            'serial_number' : serial_number,
            'title' : title,
            'release_date': release_date
        })
    # Look for the "next page" button
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        logging.debug(f'current_page: {current_page_number}, next page_num: {next_page_number}')
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url
    actor = {
        'pic'    : pic,
        'alias'  : alias_list,
        'movies' : list_data
    }
    return actor, next_url

# Parse a movie detail page and extract its metadata
def parse_movie_detail(soup, href, title):
    div_video = soup.find("div", class_='video-meta-panel')
    if not div_video:
        logging.warning("No video meta panel found")
        return None
    # Cover image
    cover_img = soup.select_one('.column-video-cover a')
    cover_url = cover_img['href'] if cover_img else None
    # Serial number
    serial = soup.select_one('.panel-block:first-child .value')
    serial_number = serial.text.strip() if serial else None
    # Release date
    date = soup.select_one('.panel-block:nth-of-type(2) .value')
    release_date = date.text.strip() if date else None
    # Duration
    duration = soup.select_one('.panel-block:nth-of-type(3) .value')
    video_duration = duration.text.strip() if duration else None
    # Maker
    maker = soup.select_one('.panel-block:nth-of-type(4) .value a')
    maker_name = maker.text.strip() if maker else None
    maker_link = maker['href'] if maker else None
    # Series
    series = soup.select_one('.panel-block:nth-of-type(5) .value a')
    series_name = series.text.strip() if series else None
    series_link = series['href'] if series else None
    # Actors (name + link)
    actors = [{'name': actor.text.strip(), 'href': host_url + actor['href']}
              for actor in soup.select('.panel-block:nth-of-type(8) .value a')]
    return {
        'href' : href,
        'title' : title,
        'cover_url': cover_url,
        'serial_number': serial_number,
        'release_date': release_date,
        'duration': video_duration,
        'maker_name': maker_name,
        'maker_link': host_url + maker_link if maker_link else '',
        'series_name': series_name,
        'series_link': host_url + series_link if series_link else '',
        'actors': actors
    }

# Parse the uncensored-series list page and extract the series entries
def parse_series_uncensored(soup, href):
    div_series = soup.find("div", id='series')
    if not div_series:
        logging.warning("No series div found")
        return None, None
    # Parse the series boxes
    rows = div_series.find_all('a', class_='box')
    list_data = []
    next_url = None
    for row in rows:
        name = row.find('strong').text.strip()
        link = row['href']  # local name so the page-URL parameter `href` is not shadowed
        div_movies = row.find('span')
        movies = 0
        if div_movies:
            match = re.search(r'\((\d+)\)', div_movies.text.strip())
            if match:
                movies = int(match.group(1))
        list_data.append({
            'name'   : name,
            'href'   : host_url + link if link else '',
            'movies' : movies
        })
    # Look for the "next page" button
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url
    return list_data, next_url

# Parse a series detail page and extract its movie list
def parse_series_detail(soup, href):
    div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
    if not div_movies:
        logging.warning("No movies div found")
        return [], None
    # Parse the movie items
    rows = div_movies.find_all('div', class_='item')
    list_data = []
    next_url = None
    for row in rows:
        link = row.find('a', class_='box')['href']
        serial_number = row.find('strong').text.strip()
        title = row.find('div', class_='video-title').text.strip()
        release_date = row.find('div', class_='meta').text.strip()
        list_data.append({
            'href' : host_url + link if link else '',
            'serial_number' : serial_number,
            'title' : title,
            'release_date': release_date
        })
    # Look for the "next page" button
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url
    return list_data, next_url

# Parse the uncensored-makers list page and extract the maker entries
def parse_makers_uncensored(soup, href):
    div_makers = soup.find("div", id='makers')
    if not div_makers:
        logging.warning("No makers div found")
        return None, None
    # Parse the maker boxes
    rows = div_makers.find_all('a', class_='box')
    list_data = []
    next_url = None
    for row in rows:
        name = row.find('strong').text.strip()
        link = row['href']  # local name so the page-URL parameter `href` is not shadowed
        div_movies = row.find('span')
        movies = 0
        if div_movies:
            match = re.search(r'\((\d+)\)', div_movies.text.strip())
            if match:
                movies = int(match.group(1))
        list_data.append({
            'name'   : name,
            'href'   : host_url + link if link else '',
            'movies' : movies
        })
    # Look for the "next page" button
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url
    return list_data, next_url

# Parse a maker detail page and extract its movie list
def parse_maker_detail(soup, href):
    div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
    if not div_movies:
        logging.warning("No movies div found")
        return [], None
    # Parse the movie items
    rows = div_movies.find_all('div', class_='item')
    list_data = []
    next_url = None
    for row in rows:
        link = row.find('a', class_='box')['href']
        serial_number = row.find('strong').text.strip()
        title = row.find('div', class_='video-title').text.strip()
        release_date = row.find('div', class_='meta').text.strip()
        list_data.append({
            'href' : host_url + link if link else '',
            'serial_number' : serial_number,
            'title' : title,
            'release_date': release_date
        })
    # Look for the "next page" button
    next_page_element = soup.find('a', class_='pagination-next')
    if next_page_element:
        next_page_url = next_page_element['href']
        next_page_number = url_page_num(next_page_url)
        current_page_number = url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url
    return list_data, next_url

###### Test code below ######
def test_actors_list():
    next_url = actors_uncensored_base_url
    while next_url:
        print(f'fetching page {next_url}')
        soup, status = fetch_page(next_url, partial(generic_validator, tag="div", identifier="actors", attr_type="id"))
        if soup:
            list_data, next_url = parse_actors_uncensored(soup, next_url)
            if list_data:
                print(list_data)
        else:
            print('get wrong page.')
            if next_url:
                print(next_url)
            break

def test_actor():
    next_url = 'https://javdb.com/actors/mdRn'
    all_data = []
    while next_url:
        print(f'fetching page {next_url}')
        soup, status = fetch_page(next_url, partial(generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
        if soup:
            actor, next_url = parse_actor_detail(soup, next_url)
            if actor:
                all_data.extend(actor['movies'])  # collect the movie entries from each page
        else:
            print('get wrong page.')
            break
    print(all_data)

def test_movie_detail():
    # fetch_page already retries internally, so a single call suffices here
    movie_url = 'https://javdb.com/v/gB2Q7'
    soup, status = fetch_page(movie_url, partial(generic_validator, tag="div", identifier="video-detail", attr_type="class"))
    if soup:
        detail = parse_movie_detail(soup, movie_url, 'RED193 無碼 レッドホットフェティッシュコレクション 中出し120連発 4 : 波多野結衣, 愛乃なみ, 夢実あくび, 他多数')
        if detail:
            print(detail)

def test_series_list():
    next_url = 'https://javdb.com/series/uncensored'
    all_data = []
    while next_url:
        print(f'fetching page {next_url}')
        soup, status = fetch_page(next_url, partial(generic_validator, tag="div", identifier="series", attr_type="id"))
        if soup:
            list_data, next_url = parse_series_uncensored(soup, next_url)
            if list_data:
                all_data.extend(list_data)
        else:
            print('get wrong page.')
            break
    print(all_data)

def test_series_detail():
    next_url = 'https://javdb.com/series/39za'
    all_data = []
    while next_url:
        print(f'fetching page {next_url}')
        soup, status = fetch_page(next_url, partial(generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
        if soup:
            list_data, next_url = parse_series_detail(soup, next_url)
            if list_data:
                all_data.extend(list_data)
        else:
            print('get wrong page.')
            break
    print(all_data)
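
# Untested sketches mirroring test_series_list/test_series_detail above for the
# two maker parsers, which have no test coverage; the maker slug in
# test_maker_detail is a hypothetical placeholder, not a real page.
def test_makers_list():
    next_url = makers_uncensored_base_url
    all_data = []
    while next_url:
        print(f'fetching page {next_url}')
        soup, status = fetch_page(next_url, partial(generic_validator, tag="div", identifier="makers", attr_type="id"))
        if soup:
            list_data, next_url = parse_makers_uncensored(soup, next_url)
            if list_data:
                all_data.extend(list_data)
        else:
            print('get wrong page.')
            break
    print(all_data)

def test_maker_detail():
    next_url = f'{host_url}/makers/XXXX'  # hypothetical maker slug; substitute a real one
    all_data = []
    while next_url:
        print(f'fetching page {next_url}')
        soup, status = fetch_page(next_url, partial(generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
        if soup:
            list_data, next_url = parse_maker_detail(soup, next_url)
            if list_data:
                all_data.extend(list_data)
        else:
            print('get wrong page.')
            break
    print(all_data)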

if __name__ == "__main__":
    #test_actors_list()
    #test_actor()
    test_movie_detail()
    #test_series_list()
    #test_series_detail()