resources/scripts/thelordofporn/list_fetch.py
"""
Script Name:
Description: 从 thelordofporn.com 上获取女优列表,并逐个获取女优详细信息。
由于网站使用了cloudflare, 无法直接爬取,使用 cloudscraper 绕过限制。
list_fetch.py 从网站上获取列表, 并以json的形式把结果输出到本地文件, 同时生成csv文件;
actress_fetch.py 则把上一步获取到的列表,读取详情页面,合并进来一些详细信息。
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import os
import time
import json
import csv
import random
from urllib.parse import urljoin

import cloudscraper
from bs4 import BeautifulSoup
DIR_RES = './result'
ACTRESSES_JSON = f"{DIR_RES}/actresses.json"
ACTRESSES_CSV = f"{DIR_RES}/actresses.csv"
# Target listing URL
BASE_URL = "https://thelordofporn.com/pornstars/"
# Present browser-like request headers
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Referer": "https://thelordofporn.com/",
}
# Collected actress records
actress_list = []
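# For reference, each entry appended below has this shape (the values here are
# only illustrative, not real data):
#   {"pornstar": "<name>", "rating": "9.1", "rank": "12", "votes": "3456",
#    "href": "https://thelordofporn.com/pornstars/<slug>/"}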
# Create a CloudScraper session to get past Cloudflare
scraper = cloudscraper.create_scraper(
    browser={"browser": "chrome", "platform": "windows", "mobile": False}
)
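# Note: per the cloudscraper documentation, the object returned by create_scraper()
# behaves like a requests.Session, so scraper.get() below accepts the same headers/
# timeout arguments as requests.get() while solving Cloudflare's JS challenge first.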

# Scrape one listing page; follows "next page" links recursively
def scrape_page(url):
    print(f"[INFO] Fetching: {url}")

    # Retry automatically when the request fails
    for attempt in range(3):
        try:
            response = scraper.get(url, headers=HEADERS, timeout=10)
            response.raise_for_status()  # raise on non-2xx status codes

            # Check that a complete page was returned
            soup = BeautifulSoup(response.text, "html.parser")
            main_tag = soup.find("main", class_="site-content")
            if main_tag:
                break  # page content looks valid, continue with parsing
            else:
                print(f"[WARNING] Server returned an incomplete page, retrying ({attempt+1}/3)")
                time.sleep(random.uniform(2, 5))  # wait 2-5 seconds before retrying
        except Exception as e:
            print(f"[ERROR] Request failed ({attempt+1}/3): {e}")
            time.sleep(random.uniform(2, 5))  # wait 2-5 seconds before retrying
    else:
        # This for-else branch runs only if the loop never hit break,
        # i.e. all three attempts failed.
        print("[ERROR] Still failing after repeated attempts, skipping this page")
        return None
    # Parse the actress entries on this page
    articles = soup.find_all("article", class_="loop-item")
    for article in articles:
        try:
            # Name and link to the detail page
            title_tag = article.find("h3", class_="loop-item__title").find("a")
            title = title_tag.text.strip()
            href = title_tag["href"]

            # Rating
            rating_tag = article.find("div", class_="loop-item__rating")
            rating = rating_tag.text.strip() if rating_tag else "N/A"

            # Rank and votes
            meta_tags = article.find("div", class_="loop-item__rank").find_all("span")
            rank = meta_tags[0].find("b").text.strip() if meta_tags else "N/A"
            votes = meta_tags[1].find("b").text.strip() if len(meta_tags) > 1 else "N/A"

            # Store the record
            actress_list.append({
                "pornstar": title,
                "rating": rating,
                "rank": rank,
                "votes": votes,
                "href": href
            })
            print(f"-----[INFO] Got actress: {title} (Rank: {rank}, Votes: {votes}, Rating: {rating})-----")
        except Exception as e:
            print(f"[ERROR] Failed to parse actress entry: {e}")

    # Look for the next-page link (pagination is handled by recursion)
    next_page_tag = soup.select_one(".nav-links .next.page-numbers")
    if next_page_tag:
        next_page_url = urljoin(BASE_URL, next_page_tag["href"])
        print(f"[INFO] Found next page: {next_page_url}")
        time.sleep(random.uniform(1, 3))  # wait 1-3 seconds to avoid getting blocked
        scrape_page(next_page_url)
    else:
        print("[INFO] All pages fetched, crawl finished")

# Save the collected data
def save_data():
    os.makedirs(DIR_RES, exist_ok=True)  # make sure the output directory exists

    # Save as JSON
    with open(ACTRESSES_JSON, "w", encoding="utf-8") as json_file:
        json.dump(actress_list, json_file, ensure_ascii=False, indent=4)
    print(f"[INFO] Data saved to {ACTRESSES_JSON}")

    # Save as CSV
    with open(ACTRESSES_CSV, "w", encoding="utf-8", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=["pornstar", "rating", "rank", "votes", "href"])
        writer.writeheader()
        writer.writerows(actress_list)
    print(f"[INFO] Data saved to {ACTRESSES_CSV}")

if __name__ == '__main__':
    scrape_page(BASE_URL)
    save_data()
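
# How the follow-up step might consume this output (actress_fetch.py is a separate
# script; the loop below is only an illustrative sketch of reading the list back,
# not that script's actual code):
#   with open(ACTRESSES_JSON, encoding="utf-8") as f:
#       actresses = json.load(f)
#   for entry in actresses:
#       detail_url = entry["href"]  # fetch and parse the detail page, then merge fields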