This repository has been archived on 2026-01-07. You can view files and clone it, but cannot push or open issues or pull requests.
Files
resources/javhd/src/scraper.py
2025-06-03 10:20:03 +08:00

285 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
from urllib.parse import urljoin, urlparse
import config
import utils
# Base URL and module-level tunables
host_url = "https://javhd.com"
lang_prefix = ["ja", "en", "zh"]
http_code_404 = 404    # page missing; callers may skip the URL
http_code_login = 401  # redirected to the login page
http_code_local = 99   # sentinel (<100): page was served from the local cache
save_raw_html = False    # when True, fetch_page writes raw HTML via utils
load_from_local = False  # when True, fetch_page tries the local cache first

# Headers for JSON POST requests (model-list pagination endpoint).
# Fix: the original literal repeated 'content-type', 'user-agent' and
# 'x-requested-with' with identical values; duplicates removed (Python keeps
# the last value anyway, so the resulting dict is unchanged).
# NOTE(review): the hard-coded session cookie will expire — presumably it must
# be refreshed from a live browser session; confirm before relying on it.
POST_HEADERS = {
    "accept": "application/json, text/plain, */*",
    "content-type": "application/json",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0",
    "x-requested-with": "XMLHttpRequest",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    'cookie': 'adult-warning-popup=disabled; st_d=%7B%7D; feid=c18cd2f2cf5c034d120e5975558acc8c; xfeid=3b040b0aecba9d3df41f21732480d947; _ym_uid=1739069925634817268; _ym_d=1739069925; atas_uid=; _clck=1cd9xpy%7C2%7Cftb%7C0%7C1866; _ym_isad=2; nats=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_cookie=https%253A%252F%252Fcn.pornhub.com%252F; nats_unique=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_sess=480e7410e649efce6003c3add587a579; nats_landing=No%2BLanding%2BPage%2BURL; JAVSESSID=n42hnvj3ecr0r6tadusladpk3h; user_lang=zh; locale=ja; utm=%7B%22ads_type%22%3A%22%22%7D; sid=3679b28ec523df85ec4e7739e32f2008; _ym_visorc=w; feid_sa=62; sid_sa=2',
    'origin': 'https://javhd.com',
    'priority': 'u=1, i',
    'referer': 'https://javhd.com/ja/model',
    'sec-ch-ua': '"Not A(Brand";v="8", "Chromium";v="132", "Microsoft Edge";v="132"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
}
POST_DATA = {}  # empty dict: POST requests carry no body
# Headers for plain GET page requests.
HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    'cookie': 'adult-warning-popup=disabled; st_d=%7B%7D; feid=c18cd2f2cf5c034d120e5975558acc8c; xfeid=3b040b0aecba9d3df41f21732480d947; _ym_uid=1739069925634817268; _ym_d=1739069925; atas_uid=; _clck=1cd9xpy%7C2%7Cftb%7C0%7C1866; _ym_isad=2; nats=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_cookie=https%253A%252F%252Fcn.pornhub.com%252F; nats_unique=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_sess=480e7410e649efce6003c3add587a579; nats_landing=No%2BLanding%2BPage%2BURL; JAVSESSID=n42hnvj3ecr0r6tadusladpk3h; user_lang=zh; locale=ja; utm=%7B%22ads_type%22%3A%22%22%7D; sid=3679b28ec523df85ec4e7739e32f2008; _ym_visorc=w; feid_sa=62; sid_sa=2',
    'origin': 'https://javhd.com',
    'priority': 'u=1, i',
    'referer': 'https://javhd.com/ja/model',
    'sec-ch-ua': '"Not A(Brand";v="8", "Chromium";v="132", "Microsoft Edge";v="132"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0',
}
scraper = cloudscraper.create_scraper()
# Issue a POST request and return the decoded JSON payload.
def fetch_post_page(url, retries=3):
    """POST to *url* with the module-level POST_HEADERS/POST_DATA and return
    the parsed JSON body, retrying on failure.

    Returns None when all *retries* attempts fail. Fixes: log typo
    ("Retring" -> "Retrying") and the 2-second back-off now applies after
    every failed attempt, not only after generic exceptions, so Cloudflare
    challenges are not retried instantly.
    """
    for attempt in range(retries):
        try:
            response = scraper.post(url=url, headers=POST_HEADERS, json=POST_DATA, timeout=10)
            response.raise_for_status()
            return response.json()
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"[错误] 请求失败 {url}: {e}, 重试 {attempt + 1}/{retries}")
        time.sleep(2)  # back off before the next attempt
    return None
# Fetch a page through CloudScraper, run a custom validator, and support
# pluggable parsers and HTML preprocessing.
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    """GET *url* and return ``(soup, status)``.

    *validator* is called with the parsed soup and must return truthy for the
    page to be accepted. *preprocessor*, when given, transforms the raw HTML
    text before parsing. Returns:
      (soup, http_code_local)  - served from the local cache (load_from_local)
      (None, http_code_404)    - page not found; caller may skip
      (None, http_code_login)  - redirected to the login page
      (soup, status_code)      - successful, validated fetch
      (None, None)             - bad URL or all retries exhausted
    Fix: log typo "Retring" -> "Retrying".
    """
    if load_from_local:  # optionally serve the page from the local raw-HTML cache
        html = utils.read_raw_html(url)
        if html:
            # Preprocess the HTML if a preprocessor was supplied.
            html_text = preprocessor(html) if preprocessor else html
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # custom page check
                logging.debug(f"read from local. href: {url}")
                return soup, http_code_local  # sentinel < 100 marks a local hit
    for attempt in range(max_retries):
        try:
            if 'javhd.com' not in url.lower():
                logging.error(f'wrong url format: {url}')
                return None, None
            response = scraper.get(url, headers=HEADERS)
            # Handle HTTP status codes.
            if response.status_code == 404:
                logging.debug(f"Page not found (404): {url}")
                return None, http_code_404  # caller can skip this URL
            response.raise_for_status()  # raise on other HTTP errors
            # Detect redirects, e.g. to the login page.
            if response.history:
                logging.debug(f"Page redirected on {url}. Checking if it's a login page.")
                soup = BeautifulSoup(response.text, parser)
                # The login page carries this nav panel.
                if soup.find('nav', class_='panel form-panel'):
                    logging.debug(f"Page redirected to login page on {url}.")
                    return None, http_code_login
            if save_raw_html:
                utils.write_raw_html(url, response.text)
            # Preprocess the HTML if a preprocessor was supplied.
            html_text = preprocessor(response.text) if preprocessor else response.text
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # custom page check
                return soup, response.status_code
            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, Retrying...")
    logging.error(f'Fetching failed after max retries. {url}')
    return None, None  # still failing after max_retries
# Repair HTML structure: strip stray <br> tags and patch <a> tags; needed
# when extracting the ethnicity field.
def preprocess_html(html):
    """Return *html* with every ``<br>`` removed and every anchor rewritten
    to open in a new tab (``target="_blank"``)."""
    substitutions = (
        ('<br>', ''),
        ('<a ', '<a target="_blank" '),
    )
    for needle, replacement in substitutions:
        html = html.replace(needle, replacement)
    return html
# Generic HTML structure validator.
def generic_validator(soup, tag, identifier, attr_type="id"):
    """Return True when *soup* contains the expected element.

    attr_type selects the lookup: "id" finds *tag* by id, "class" finds any
    *tag* with that class, "name" looks for a <select> with that name
    (NOTE: the "name" branch has always ignored *tag* and hard-codes
    'select'; preserved as-is). Unknown attr_type returns False.
    """
    checks = {
        "id": lambda: soup.find(tag, id=identifier) is not None,
        "class": lambda: bool(soup.find_all(tag, class_=identifier)),
        "name": lambda: bool(soup.find('select', {'name': identifier})),
    }
    check = checks.get(attr_type)
    return check() if check else False
# Parse one page of the model-list JSON response.
def parse_list_json(data, num, lang='en', page_size=36):
    """Extract model entries from the list API's JSON *data*.

    Args:
        data: decoded JSON dict; entries are scraped from its "template" HTML.
        num: 1-based page number, used to compute the global rank.
        lang: language tag used for the ``{lang}_name`` key.
        page_size: items per page (site default 36; now parameterized).

    Returns a list of dicts: {"rank", "url", "pic", f"{lang}_name"}.

    Fixes: the guard now also requires link-content (previously a missing
    attribute crashed with AttributeError on ``.group(1)``); the loop no
    longer shadows the *data* parameter; log typo "countent" corrected.
    """
    template = data.get("template", "")
    thumb_components = re.findall(r'<thumb-component[^>]*>', template)
    list_data = []
    for idx, thumb in enumerate(thumb_components, start=1):
        rank = (num - 1) * page_size + idx
        link_content = re.search(r'link-content="(.*?)"', thumb)
        url_thumb = re.search(r'url-thumb="(.*?)"', thumb)
        title = re.search(r'title="(.*?)"', thumb)
        if not link_content or not url_thumb or not title:
            logging.warning(f"no content for rank:{rank} title:{title} url:{url_thumb} {thumb}")
            continue
        entry = {"rank": rank, "url": link_content.group(1), "pic": url_thumb.group(1)}
        entry[f"{lang}_name"] = title.group(1)
        list_data.append(entry)
    return list_data
def process_paragraph(paragraph):
    """Serialize *paragraph* back to HTML, re-parse it, and return the
    stripped plain text of the node."""
    # Work from the full HTML markup rather than paragraph.get_text() directly.
    raw_html = str(paragraph)
    reparsed = BeautifulSoup(raw_html, 'html.parser')
    return reparsed.get_text().strip()
# Parse an actor-detail page and extract the fields we store.
def parse_actor_detail(soup, href):
    """Extract actor attributes from a model-detail page.

    Returns ``(record, None)`` where *record* maps database field names to
    text values (missing fields stay ""), plus the page ``url``. Returns
    ``(None, None)`` when the info__features section is absent. The second
    element is always None (reserved slot used by callers as a next-URL).
    """
    # Page titles mapped to database column names.
    FIELD_MAPPING = {
        "Height": "height",
        "Weight": "weight",
        "Breast size": "breast_size",
        "Breast factor": "breast_factor",
        "Hair color": "hair_color",
        "Eye color": "eye_color",
        "Birth date": "birth_date",
        "Ethnicity": "ethnicity",
        "Birth place": "birth_place"
    }
    info_section = soup.find("div", class_="info__features")
    if info_section is None:
        logging.warning(f"未找到 info__features 区块: {href}")
        return None, None
    # Seed every database field with an empty string so absent rows still
    # produce a complete record.
    record = dict.fromkeys(FIELD_MAPPING.values(), "")
    record['url'] = href
    for item in info_section.find_all("li", class_="content-desc__list-item"):
        title_tag = item.find("strong", class_="content-desc__list-title")
        value_tag = item.find("span", class_="content-desc__list-text")
        if not (title_tag and value_tag):
            continue
        # Translate the on-page title into our column name; skip unknown rows.
        db_field = FIELD_MAPPING.get(process_paragraph(title_tag))
        if db_field:
            record[db_field] = process_paragraph(value_tag)
    return record, None
###### Manual test code below ######
def test_actor_list():
    """Smoke test: crawl the model-list endpoint and print parsed entries.

    Hits the live site. The unconditional ``break`` at the bottom of the loop
    deliberately stops after the first page; the page counter added here keeps
    ranks correct if that limiter is ever removed (previously every page was
    parsed with num=1, repeating ranks 1..36).
    """
    s_url = "/ja/model"
    current_url = urljoin(host_url, s_url)
    page = 1  # fix: track the page number instead of hard-coding 1
    while current_url:
        print(f"[信息] 正在抓取 {current_url}")
        data = fetch_post_page(current_url)
        if not data:
            print(f"[错误] 无法获取数据 {current_url}")
            break
        # Sanity-check the JSON structure before parsing.
        if not all(key in data for key in ["status", "results_count", "pagination_params", "template"]):
            print(f"[错误] 数据结构异常: {data}")
            break
        all_data = parse_list_json(data, page)
        print(all_data)
        # Advance to the next page, if any.
        next_path = data.get("pagination_params", {}).get("next")
        if next_path:
            current_url = urljoin(host_url, next_path)
            page += 1
            print(f"next page: {current_url}")
        else:
            print("[信息] 已抓取所有页面。")
            break
        break  # deliberate limiter: only fetch the first page while testing
def test_actor():
    """Smoke test: fetch one actor-detail page and print the parsed record.

    Hits the live site. Fix: on a failed/invalid fetch the loop previously
    printed 'get wrong page.' without changing next_url and spun forever;
    it now breaks out.
    """
    next_url = 'https://javhd.com/en/model/Yui-Hatano'
    all_data = []
    validator = partial(generic_validator, tag="div", identifier="info__features", attr_type="class")
    while next_url:
        print(f'fetching page {next_url}')
        soup, http_status_code = fetch_page(next_url, validator)
        if soup:
            # parse_actor_detail returns (record, next_url) — next_url is
            # currently always None, which ends the loop after one page.
            list_data, next_url = parse_actor_detail(soup, next_url)
            if list_data:
                all_data.append(list_data)
        else:
            print('get wrong page.')
            break  # fix: previously looped forever on a persistently bad page
    print(all_data)
if __name__ == "__main__":
    # Manual smoke tests: both hit the live site and print their results.
    test_actor_list()
    test_actor()