resources/scripts/thelordofporn/list_fetch.py
"""
Script Name:
Description: 从 thelordofporn.com 上获取女优列表,并逐个获取女优详细信息。
由于网站使用了cloudflare, 无法直接爬取,使用 cloudscraper 绕过限制。
list_fetch.py 从网站上获取列表, 并以json的形式把结果输出到本地文件, 同时生成csv文件;
actress_fetch.py 则把上一步获取到的列表,读取详情页面,合并进来一些详细信息。
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import os
import time
import json
import csv
import random
from urllib.parse import urljoin

import cloudscraper
from bs4 import BeautifulSoup
DIR_RES = './result'
ACTRESSES_JSON = f"{DIR_RES}/actresses.json"
ACTRESSES_CSV = f"{DIR_RES}/actresses.csv"
# Target listing URL
BASE_URL = "https://thelordofporn.com/pornstars/"
# Present browser-like request headers
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Referer": "https://thelordofporn.com/",
}
# Collected actress records
actress_list = []
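# For reference, each entry appended below has this shape (the values here are
# only illustrative, not real data):
#   {"pornstar": "<name>", "rating": "9.1", "rank": "12", "votes": "3456",
#    "href": "https://thelordofporn.com/pornstars/<slug>/"}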
# Create a CloudScraper session to get past Cloudflare
scraper = cloudscraper.create_scraper(
    browser={"browser": "chrome", "platform": "windows", "mobile": False}
)
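# Note: per the cloudscraper documentation, the object returned by create_scraper()
# behaves like a requests.Session, so scraper.get() below accepts the same headers/
# timeout arguments as requests.get() while solving Cloudflare's JS challenge first.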

# Scrape one listing page; follows "next page" links recursively
def scrape_page(url):
    print(f"[INFO] Fetching: {url}")

    # Retry automatically when the request fails
    for attempt in range(3):
        try:
            response = scraper.get(url, headers=HEADERS, timeout=10)
            response.raise_for_status()  # raise on non-2xx status codes

            # Check that a complete page was returned
            soup = BeautifulSoup(response.text, "html.parser")
            main_tag = soup.find("main", class_="site-content")
            if main_tag:
                break  # page content looks valid, continue with parsing
            else:
                print(f"[WARNING] Server returned an incomplete page, retrying ({attempt+1}/3)")
                time.sleep(random.uniform(2, 5))  # wait 2-5 seconds before retrying
        except Exception as e:
            print(f"[ERROR] Request failed ({attempt+1}/3): {e}")
            time.sleep(random.uniform(2, 5))  # wait 2-5 seconds before retrying
    else:
        # This for-else branch runs only if the loop never hit break,
        # i.e. all three attempts failed.
        print("[ERROR] Still failing after repeated attempts, skipping this page")
        return None
    # Parse the actress entries on this page
    articles = soup.find_all("article", class_="loop-item")
    for article in articles:
        try:
            # Name and link to the detail page
            title_tag = article.find("h3", class_="loop-item__title").find("a")
            title = title_tag.text.strip()
            href = title_tag["href"]

            # Rating
            rating_tag = article.find("div", class_="loop-item__rating")
            rating = rating_tag.text.strip() if rating_tag else "N/A"

            # Rank and votes
            meta_tags = article.find("div", class_="loop-item__rank").find_all("span")
            rank = meta_tags[0].find("b").text.strip() if meta_tags else "N/A"
            votes = meta_tags[1].find("b").text.strip() if len(meta_tags) > 1 else "N/A"

            # Store the record
            actress_list.append({
                "pornstar": title,
                "rating": rating,
                "rank": rank,
                "votes": votes,
                "href": href
            })
            print(f"-----[INFO] Got actress: {title} (Rank: {rank}, Votes: {votes}, Rating: {rating})-----")
        except Exception as e:
            print(f"[ERROR] Failed to parse actress entry: {e}")

    # Look for the next-page link (pagination is handled by recursion)
    next_page_tag = soup.select_one(".nav-links .next.page-numbers")
    if next_page_tag:
        next_page_url = urljoin(BASE_URL, next_page_tag["href"])
        print(f"[INFO] Found next page: {next_page_url}")
        time.sleep(random.uniform(1, 3))  # wait 1-3 seconds to avoid getting blocked
        scrape_page(next_page_url)
    else:
        print("[INFO] All pages fetched, crawl finished")

# Save the collected data
def save_data():
    os.makedirs(DIR_RES, exist_ok=True)  # make sure the output directory exists

    # Save as JSON
    with open(ACTRESSES_JSON, "w", encoding="utf-8") as json_file:
        json.dump(actress_list, json_file, ensure_ascii=False, indent=4)
    print(f"[INFO] Data saved to {ACTRESSES_JSON}")

    # Save as CSV
    with open(ACTRESSES_CSV, "w", encoding="utf-8", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=["pornstar", "rating", "rank", "votes", "href"])
        writer.writeheader()
        writer.writerows(actress_list)
    print(f"[INFO] Data saved to {ACTRESSES_CSV}")

if __name__ == '__main__':
    scrape_page(BASE_URL)
    save_data()
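
# How the follow-up step might consume this output (actress_fetch.py is a separate
# script; the loop below is only an illustrative sketch of reading the list back,
# not that script's actual code):
#   with open(ACTRESSES_JSON, encoding="utf-8") as f:
#       actresses = json.load(f)
#   for entry in actresses:
#       detail_url = entry["href"]  # fetch and parse the detail page, then merge fields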