"""
Script Name:
Description: Fetch the list of actresses from thelordofporn.com and then fetch
    each actress's details one by one.
    The site sits behind Cloudflare and cannot be crawled directly, so
    cloudscraper is used to get around that restriction.
    list_fetch.py fetches the list from the site, writes the result to a local
    file as JSON and also generates a CSV file;
    actress_fetch.py then takes the list produced by the previous step, reads
    each detail page and merges in additional details.

Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0

Modification History:
    - YYYY-MM-DD [Your Name]:
    - YYYY-MM-DD [Your Name]:
    - YYYY-MM-DD [Your Name]:
"""

import time
import json
import csv
import os
import random
import cloudscraper
from bs4 import BeautifulSoup
from urllib.parse import urljoin

import config

DIR_RES = config.global_host_data_dir
ACTRESSES_JSON = f"{DIR_RES}/actresses.json"
ACTRESSES_CSV = f"{DIR_RES}/actresses.csv"

# Target URL (first page of the listing).
BASE_URL = "https://thelordofporn.com/pornstars/"

# Request headers that make the client look like a regular browser.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Referer": "https://thelordofporn.com/",
}

# Accumulates one dict per scraped actress; written out by save_data().
actress_list = []

# CloudScraper session used to get past Cloudflare.
scraper = cloudscraper.create_scraper(
    browser={"browser": "chrome", "platform": "windows", "mobile": False}
)

# Number of times a single page is requested before it is given up on.
_MAX_ATTEMPTS = 3

# Placeholder stored when a field cannot be found in the markup.
_MISSING = "N/A"

# Column order of the CSV export (same keys as the records in actress_list).
_CSV_FIELDS = ["pornstar", "rating", "rank", "votes", "href"]


def _fetch_soup(url):
    """Download *url* and return its parsed HTML, retrying on failure.

    A response only counts as valid when it contains the
    ``<main class="site-content">`` element; otherwise it is treated as an
    incomplete page and requested again.

    Args:
        url: Absolute URL of the listing page to download.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when every attempt
        failed.
    """
    for attempt in range(1, _MAX_ATTEMPTS + 1):
        try:
            response = scraper.get(url, headers=HEADERS, timeout=10)
            response.raise_for_status()  # Reject HTTP error status codes.
            soup = BeautifulSoup(response.text, "html.parser")
        except Exception as e:
            # Deliberately broad: any failure of the request is treated as a
            # retryable, best-effort error rather than a crash.
            print(f"[ERROR] 访问失败 ({attempt}/{_MAX_ATTEMPTS}): {e}")
        else:
            if soup.find("main", class_="site-content") is not None:
                return soup
            print(f"[WARNING] 服务器返回的页面不完整,尝试重新获取 ({attempt}/{_MAX_ATTEMPTS})")

        # Back off for 2-5 seconds, but only if another attempt follows.
        if attempt < _MAX_ATTEMPTS:
            time.sleep(random.uniform(2, 5))

    print("[ERROR] 多次尝试后仍然失败,跳过该页面")
    return None


def _bold_text(span):
    """Return the stripped text of the ``<b>`` inside *span*, or "N/A"."""
    bold = span.find("b")
    return bold.text.strip() if bold is not None else _MISSING


def _parse_article(article):
    """Extract one actress record from a ``<article class="loop-item">`` tag.

    Rating, rank and votes fall back to "N/A" when their markup is absent.
    The title link is mandatory.

    Args:
        article: The ``article`` tag of a single listing entry.

    Returns:
        A dict with the keys ``pornstar``, ``rating``, ``rank``, ``votes`` and
        ``href``.

    Raises:
        AttributeError, TypeError, KeyError: If the title heading, its link or
            the link's ``href`` attribute is missing.
    """
    title_tag = article.find("h3", class_="loop-item__title").find("a")
    title = title_tag.text.strip()
    href = title_tag["href"]

    rating_tag = article.find("div", class_="loop-item__rating")
    rating = rating_tag.text.strip() if rating_tag is not None else _MISSING

    # Rank is read from the first <span> of the rank block, votes from the
    # second.
    rank_div = article.find("div", class_="loop-item__rank")
    spans = rank_div.find_all("span") if rank_div is not None else []
    rank = _bold_text(spans[0]) if spans else _MISSING
    votes = _bold_text(spans[1]) if len(spans) > 1 else _MISSING

    return {
        "pornstar": title,
        "rating": rating,
        "rank": rank,
        "votes": votes,
        "href": href,
    }


def scrape_page(url):
    """Scrape the listing page at *url* and every following page.

    Each parsed actress is appended to the module-level ``actress_list``.
    Pagination is followed iteratively (not recursively), so the number of
    pages is not limited by the interpreter's recursion depth. Scraping stops
    when a page has no usable "next" link or when a page cannot be fetched.

    Args:
        url: Absolute URL of the first listing page to scrape.

    Returns:
        None. Results are collected in ``actress_list``.
    """
    while True:
        print(f"[INFO] 正在抓取: {url}")

        soup = _fetch_soup(url)
        if soup is None:
            return None

        # Parse every actress entry on the page; a malformed entry is logged
        # and skipped without aborting the rest of the page.
        for article in soup.find_all("article", class_="loop-item"):
            try:
                record = _parse_article(article)
            except (AttributeError, KeyError, TypeError, IndexError) as e:
                print(f"[ERROR] 解析演员信息失败: {e}")
                continue

            actress_list.append(record)
            print(
                f"-----[INFO] 获取演员: {record['pornstar']} "
                f"(Rank: {record['rank']}, Votes: {record['votes']}, "
                f"Rating: {record['rating']})-----"
            )

        # Look for the link to the next page.
        next_page_tag = soup.select_one(".nav-links .next.page-numbers")
        next_href = next_page_tag.get("href") if next_page_tag is not None else None
        if not next_href:
            print("[INFO] 已抓取所有页面,爬取结束")
            return None

        url = urljoin(BASE_URL, next_href)
        print(f"[INFO] 发现下一页: {url}")
        time.sleep(random.uniform(1, 3))  # Pause 1-3 seconds to avoid a ban.


def save_data():
    """Write ``actress_list`` to ``ACTRESSES_JSON`` and ``ACTRESSES_CSV``.

    The output directory ``DIR_RES`` is created if it does not exist yet.
    Both files are overwritten.
    """
    # Make sure the output directory exists.
    os.makedirs(DIR_RES, exist_ok=True)

    # Save the data as JSON.
    with open(ACTRESSES_JSON, "w", encoding="utf-8") as json_file:
        json.dump(actress_list, json_file, ensure_ascii=False, indent=4)
    print(f"[INFO] 数据已保存到 {ACTRESSES_JSON}")

    # Save the data as CSV.
    with open(ACTRESSES_CSV, "w", encoding="utf-8", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=_CSV_FIELDS)
        writer.writeheader()
        writer.writerows(actress_list)
    print(f"[INFO] 数据已保存到 {ACTRESSES_CSV}")


if __name__ == '__main__':
    scrape_page(BASE_URL)
    save_data()