This repository has been archived on 2026-01-07. You can view files and clone it, but cannot push or open issues or pull requests.
Files
resources/thelordofporn/src/scraper.py
2025-06-03 15:36:44 +08:00

276 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
from urllib.parse import urljoin, urlparse
import config
import utils
# Base URLs and tunable module-level settings
host_url = "https://thelordofporn.com/"
pornstar_url = "https://thelordofporn.com/pornstars/"
lang_prefix = ["ja", "en", "zh"]
# Sentinel status codes returned by fetch_page
http_code_404 = 404
http_code_login = 401
http_code_local = 99  # < 100 marks "served from local cache", never a real HTTP status
save_raw_html = False  # when True, persist fetched HTML to disk via utils.write_raw_html
load_from_local = False  # when True, try the on-disk cache before hitting the network
# Impersonate a real browser
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
"Referer": "https://thelordofporn.com/",
}
# Create a CloudScraper session to bypass Cloudflare
scraper = cloudscraper.create_scraper(
browser={"browser": "chrome", "platform": "windows", "mobile": False}
)
# Fetch a page with CloudScraper, validate it, with optional preprocessing and parser choice.
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    """Fetch *url*, parse it with BeautifulSoup, and validate the result.

    Returns:
        (soup, status) where status is the HTTP status code on success,
        http_code_local when served from the on-disk cache,
        http_code_404 for a missing page,
        http_code_login when redirected to the login page,
        or (None, None) after exhausting ``max_retries`` or for a bad URL.
    """
    if load_from_local:  # try the locally cached copy first
        html = utils.read_raw_html(url)
        if html:
            # Preprocess the HTML if a preprocessor was provided.
            html_text = preprocessor(html) if preprocessor else html
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # run the custom page check
                logging.debug(f"read from local. href: {url}")
                return soup, http_code_local  # code < 100 signals a local-cache hit
    # The URL never changes across retries, so sanity-check it once up front.
    if 'thelordofporn.com' not in url.lower():
        logging.error(f'wrong url format: {url}')
        return None, None
    for attempt in range(max_retries):
        try:
            response = scraper.get(url, headers=HEADERS)
            # Handle HTTP status codes.
            if response.status_code == 404:
                logging.debug(f"Page not found (404): {url}")
                return None, http_code_404  # caller can simply skip this entry
            response.raise_for_status()  # raise on other HTTP errors
            # A redirect may mean we were bounced to the login page.
            if response.history:
                logging.debug(f"Page redirected on {url}. Checking if it's a login page.")
                soup = BeautifulSoup(response.text, parser)
                # Detect the login page by its form panel.
                if soup.find('nav', class_='panel form-panel'):
                    logging.debug(f"Page redirected to login page on {url}.")
                    return None, http_code_login
            if save_raw_html:
                utils.write_raw_html(url, response.text)
            # Preprocess the HTML if a preprocessor was provided.
            html_text = preprocessor(response.text) if preprocessor else response.text
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # run the custom page check
                return soup, response.status_code
            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, Retrying...")
        # Exponential backoff before retrying so we don't hammer the server.
        if attempt < max_retries - 1:
            time.sleep(2 ** attempt)
    logging.error(f'Fetching failed after max retries. {url}')
    return None, None  # still failing after the maximum number of retries
# Normalize raw HTML before parsing: drop <br> tags and force <a> tags to
# open in a new tab (needed when extracting ethnicity).
def preprocess_html(html):
    """Return *html* with every '<br>' removed and target="_blank" injected into '<a ' tags."""
    without_breaks = html.replace('<br>', '')
    return without_breaks.replace('<a ', '<a target="_blank" ')
# Generic HTML structure validator.
def generic_validator(soup, tag, identifier, attr_type="id"):
    """Return True when *soup* contains *tag* matched by *identifier*.

    attr_type chooses the matching strategy: "id" (id attribute), "class"
    (CSS class), or "name" (name attribute).  Any other value returns False.
    """
    if attr_type == "id":
        return soup.find(tag, id=identifier) is not None
    if attr_type == "class":
        return bool(soup.find_all(tag, class_=identifier))
    if attr_type == "name":
        # Fixed: this branch previously hard-coded 'select' and ignored the
        # tag argument, unlike the other branches.
        return soup.find(tag, attrs={"name": identifier}) is not None
    return False
# Parse one list page.
def parse_actor_list(soup, href):
    """Extract performer rows from a list-page soup.

    Args:
        soup: parsed list page.
        href: URL of the list page itself (used only for error logging).
    Returns:
        (actress_list, next_page_url); next_page_url is None on the last
        page, and (None, None) is returned if any article fails to parse.
    """
    actress_list = []
    next_page_url = None
    articles = soup.find_all("article", class_="loop-item")
    for article in articles:
        try:
            # Performer title and detail-page link.
            title_tag = article.find("h3", class_="loop-item__title")
            # Look up the <a> once; the original re-ran find("a") four times.
            link = title_tag.find("a") if title_tag else None
            title = link.text.strip() if link else "N/A"
            detail_href = link["href"] if link else None  # renamed: no longer shadows the page-URL parameter
            # Rating.
            rating_tag = article.find("div", class_="loop-item__rating")
            rating = rating_tag.text.strip() if rating_tag else "N/A"
            # Rank and votes.
            rank_votes_div = article.find("div", class_="loop-item__rank")
            meta_tags = rank_votes_div.find_all("span") if rank_votes_div else []
            # Safely extract rank and votes.
            rank = None
            votes = None
            if len(meta_tags) >= 1:
                rank_b = meta_tags[0].find("b")
                rank = rank_b.text.strip() if rank_b else "N/A"
            if len(meta_tags) >= 2:
                votes_b = meta_tags[1].find("b")
                votes = votes_b.text.strip() if votes_b else "N/A"
            # Collect the row.
            actress_list.append({
                "pornstar": title,
                "rating": utils.parse_numeric(rating) if rating != "N/A" else None,
                "rank": utils.parse_numeric(rank) if rank is not None else None,
                "votes": utils.parse_numeric(votes) if votes is not None else None,
                "href": detail_href
            })
        except Exception as e:
            # href is the list page being parsed; the original shadowed it with
            # the article link, so this log could show the wrong URL.
            logging.error(f"parse list failed: {e}, url: {href}")
            return None, None
    # Find the next-page link.
    next_page_tag = soup.select_one(".nav-links .next.page-numbers")
    if next_page_tag:
        next_page_url = urljoin(host_url, next_page_tag["href"])
        logging.debug(f"next page: {next_page_url}")
    else:
        logging.debug("find all pages.")
    return actress_list, next_page_url
# Parse the HTML of a detail page and extract the data we need.
def parse_actor_detail(soup, href):
    """Extract one performer's record from a detail-page soup.

    Returns (record_dict, None); the second slot mirrors parse_actor_list's
    (data, next_url) shape so callers can unpack both the same way.
    """
    def _b_text(el):
        # Text of the <b> child of *el*, or "" when it's missing (the
        # original raised AttributeError on a missing <b>).
        b = el.find("b") if el else None
        return b.text.strip() if b else ""

    # Basic info.
    entry_header = soup.find("header", class_="entry-header")
    name_el = entry_header.find("h1", class_="entry-title") if entry_header else None
    name = name_el.text.strip() if name_el else ""
    date_modified_el = soup.find("time", itemprop="dateModified")
    date_modified = date_modified_el.get("content", "").strip() if date_modified_el else ""
    # Rank / rating metadata.
    global_rank = ""
    weekly_rank = ""
    last_month_rating = ""
    current_rating = ""
    total_votes = ""
    # Guard: the original crashed with AttributeError when entry_header was None.
    rank_items = entry_header.find_all("div", class_="porn-star-rank__item") if entry_header else []
    for div in rank_items:
        text = div.text.strip()
        if "Global Rank" in text:
            global_rank = _b_text(div)
        elif "Weekly Rank" in text:
            weekly_rank = _b_text(div)
    for item in soup.find_all("div", class_="specifications__item--horizontal"):
        text = item.text.strip()
        if "Last Month" in text:
            last_month_rating = _b_text(item)
        elif "Rating Av." in text:
            current_rating = _b_text(item)
        elif "Total of" in text:
            total_votes = _b_text(item)
    # Detailed attributes: each grid row holds two label/value cells.
    attributes = {}
    for row in soup.find_all("div", class_="specifications-grid-row"):
        items = row.find_all("div", class_="specifications-grid-item")
        if len(items) == 2:
            for cell in items:
                label_el = cell.find("h5")
                value_el = cell.find("span")
                if label_el and value_el:  # skip malformed cells instead of crashing
                    attributes[label_el.text.strip()] = value_el.text.strip()
    # Derived fields: birth info, height, weight, aliases.
    birth_info = utils.parse_birth_info(attributes.get("Born", ""))
    height_info = utils.parse_height(attributes.get("Height", ""))
    weight_info = utils.parse_weight(attributes.get("Weight", ""))
    alias_list = utils.clean_alias(attributes.get("Name", ""))
    return {
        'name': name,
        'href': href,
        "alias": alias_list,
        "career_start": attributes.get("Career start", ""),
        "measurements": attributes.get("Measurements", ""),
        "born": attributes.get("Born", ""),
        "height": attributes.get("Height", ""),
        "weight": attributes.get("Weight", ""),
        "date_modified": date_modified,
        "global_rank": utils.parse_numeric(global_rank),
        "weekly_rank": utils.parse_numeric(weekly_rank),
        "last_month_rating": utils.parse_numeric(last_month_rating),
        "current_rating": utils.parse_numeric(current_rating),
        "total_votes": utils.parse_numeric(total_votes),
        **birth_info,
        **height_info,
        **weight_info,
    }, None
###### Test code below ######
def test_actor_list():
    """Smoke test: crawl every list page and print all collected rows."""
    all_data = []
    next_url = pornstar_url
    check_main = partial(generic_validator, tag="main", identifier="content", attr_type="id")
    while next_url:
        print(f'fetching page {next_url}')
        soup, http_status_code = fetch_page(next_url, check_main)
        if not soup:
            print('get wrong page.')
            if next_url:
                print(f"next url: {next_url}")
            break
        list_data, next_url = parse_actor_list(soup, next_url)
        if list_data:
            all_data.extend(list_data)
    print(all_data)
def test_actor():
    """Smoke test: fetch one detail page and print the parsed record."""
    check_main = partial(generic_validator, tag="main", identifier="content", attr_type="id")
    next_url = 'https://thelordofporn.com/pornstars/eva-elfie/'
    while next_url:
        print(f'fetching page {next_url}')
        soup, http_status_code = fetch_page(next_url, check_main)
        if not soup:
            print('get wrong page.')
            break
        data, next_url = parse_actor_detail(soup, next_url)
        if data:
            print(data)
# Script entry point: run both smoke tests when executed directly.
if __name__ == "__main__":
    test_actor_list()
    test_actor()