"""
|
|
Script Name:
|
|
Description: 从 thelordofporn.com 上获取女优列表,并逐个获取女优详细信息。
|
|
由于网站使用了cloudflare, 无法直接爬取,使用 cloudscraper 绕过限制。
|
|
list_fetch.py 从网站上获取列表, 并以json的形式把结果输出到本地文件, 同时生成csv文件;
|
|
actress_fetch.py 则把上一步获取到的列表,读取详情页面,合并进来一些详细信息。
|
|
|
|
Author: [Your Name]
|
|
Created Date: YYYY-MM-DD
|
|
Last Modified: YYYY-MM-DD
|
|
Version: 1.0
|
|
|
|
Modification History:
|
|
- YYYY-MM-DD [Your Name]:
|
|
- YYYY-MM-DD [Your Name]:
|
|
- YYYY-MM-DD [Your Name]:
|
|
"""
|
|
|
|
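# Typical usage (script names taken from the docstring above; the invocation
# itself is an assumption, not taken from the repository):
#
#   python list_fetch.py      # writes actresses.json and actresses.csv
#   python actress_fetch.py   # enriches each record from its detail page
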
import time
import json
import csv
import os
import random

import cloudscraper
from bs4 import BeautifulSoup
from urllib.parse import urljoin

import config

DIR_RES = config.global_host_data_dir
ACTRESSES_JSON = f"{DIR_RES}/actresses.json"
ACTRESSES_CSV = f"{DIR_RES}/actresses.csv"

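# `config` is a local module that lives in the project, not in this file. The
# only attribute used here is `global_host_data_dir`, the directory the output
# files are written to. A minimal config.py (hypothetical sketch) would be:
#
#   # config.py
#   global_host_data_dir = "./data"
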
# Target listing URL
BASE_URL = "https://thelordofporn.com/pornstars/"

# Pretend to be a real browser
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Referer": "https://thelordofporn.com/",
}

# Collected records
actress_list = []

# Create a CloudScraper session to get past Cloudflare
scraper = cloudscraper.create_scraper(
    browser={"browser": "chrome", "platform": "windows", "mobile": False}
)

# Scrape one listing page (follows pagination)
def scrape_page(url):
    print(f"[INFO] Fetching: {url}")

    # Retry automatically when the request fails
    for attempt in range(3):
        try:
            response = scraper.get(url, headers=HEADERS, timeout=10)
            response.raise_for_status()  # check the HTTP status code
            # Check that the server returned a usable page
            soup = BeautifulSoup(response.text, "html.parser")
            main_tag = soup.find("main", class_="site-content")

            if main_tag:
                break  # page looks complete, go on to parsing
            else:
                print(f"[WARNING] Server returned an incomplete page, retrying ({attempt+1}/3)")
                time.sleep(random.uniform(2, 5))  # sleep 2-5 seconds before retrying
        except Exception as e:
            print(f"[ERROR] Request failed ({attempt+1}/3): {e}")
            time.sleep(random.uniform(2, 5))  # sleep 2-5 seconds before retrying
    else:
        print("[ERROR] Still failing after repeated attempts, skipping this page")
        return None

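    # Each listing entry is assumed to look roughly like the sketch below; the
    # structure is inferred from the selectors used in this loop, not from a
    # saved copy of the page:
    #
    #   <article class="loop-item">
    #     <h3 class="loop-item__title"><a href="...">Name</a></h3>
    #     <div class="loop-item__rating">9.1</div>
    #     <div class="loop-item__rank"><span>Rank <b>12</b></span><span>Votes <b>345</b></span></div>
    #   </article>
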
    # Parse the performer entries
    articles = soup.find_all("article", class_="loop-item")
    for article in articles:
        try:
            # Name and detail-page link
            title_tag = article.find("h3", class_="loop-item__title").find("a")
            title = title_tag.text.strip()
            href = title_tag["href"]

            # Rating
            rating_tag = article.find("div", class_="loop-item__rating")
            rating = rating_tag.text.strip() if rating_tag else "N/A"

            # Rank and votes
            meta_tags = article.find("div", class_="loop-item__rank").find_all("span")
            rank = meta_tags[0].find("b").text.strip() if meta_tags else "N/A"
            votes = meta_tags[1].find("b").text.strip() if len(meta_tags) > 1 else "N/A"

            # Append to the result list
            actress_list.append({
                "pornstar": title,
                "rating": rating,
                "rank": rank,
                "votes": votes,
                "href": href
            })
            print(f"-----[INFO] Got performer: {title} (Rank: {rank}, Votes: {votes}, Rating: {rating})-----")

        except Exception as e:
            print(f"[ERROR] Failed to parse a performer entry: {e}")

    # Look for the next-page link
    next_page_tag = soup.select_one(".nav-links .next.page-numbers")
    if next_page_tag:
        next_page_url = urljoin(BASE_URL, next_page_tag["href"])
        print(f"[INFO] Found next page: {next_page_url}")
        time.sleep(random.uniform(1, 3))  # sleep 1-3 seconds to avoid getting blocked
        scrape_page(next_page_url)
    else:
        print("[INFO] All pages scraped, crawl finished")

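# Note: scrape_page() follows pagination by calling itself once per "next"
# link, so the recursion depth equals the number of listing pages, which is
# fine for a listing of this size. An iterative variant (sketch only, not what
# this script does) would hoist the next-page lookup into a loop:
#
#   url = BASE_URL
#   while url:
#       url = scrape_page(url)  # would require scrape_page to return the next URL
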
# Persist the collected data
def save_data():
    # Make sure the output directory exists
    os.makedirs(DIR_RES, exist_ok=True)

    # Save as JSON
    with open(ACTRESSES_JSON, "w", encoding="utf-8") as json_file:
        json.dump(actress_list, json_file, ensure_ascii=False, indent=4)
    print(f"[INFO] Data saved to {ACTRESSES_JSON}")

    # Save as CSV
    with open(ACTRESSES_CSV, "w", encoding="utf-8", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=["pornstar", "rating", "rank", "votes", "href"])
        writer.writeheader()
        writer.writerows(actress_list)
    print(f"[INFO] Data saved to {ACTRESSES_CSV}")

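# save_data() writes one flat record per performer, e.g.
#   {"pornstar": "...", "rating": "9.1", "rank": "12", "votes": "345", "href": "https://..."}
# (the values above are made up). Per the docstring, actress_fetch.py is
# expected to consume that file roughly along these lines (hypothetical sketch,
# not part of this script):
#
#   with open(ACTRESSES_JSON, encoding="utf-8") as f:
#       for row in json.load(f):
#           detail_html = scraper.get(row["href"], headers=HEADERS, timeout=10).text
#           # ...parse the detail page and merge the extra fields into `row`...
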
if __name__ == '__main__':
    scrape_page(BASE_URL)
    save_data()