This repository has been archived on 2026-01-07. You can view files and clone it, but cannot push or open issues or pull requests.
Files
resources/thelordofporn/src/scraper.py
2025-06-03 15:36:44 +08:00

276 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
from urllib.parse import urljoin, urlparse
import config
import utils
# Base URLs and tunable module-level settings
host_url = "https://thelordofporn.com/"
pornstar_url = "https://thelordofporn.com/pornstars/"
lang_prefix = ["ja", "en", "zh"]
# Sentinel status codes returned by fetch_page
http_code_404 = 404
http_code_login = 401
http_code_local = 99  # < 100 marks "served from local cache", never a real HTTP status
save_raw_html = False  # when True, persist fetched HTML to disk via utils.write_raw_html
load_from_local = False  # when True, try the on-disk cache before hitting the network
# Impersonate a real browser
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
"Referer": "https://thelordofporn.com/",
}
# Create a CloudScraper session to bypass Cloudflare
scraper = cloudscraper.create_scraper(
browser={"browser": "chrome", "platform": "windows", "mobile": False}
)
# Fetch a page with CloudScraper, validate it, with optional preprocessing and parser choice.
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    """Fetch *url*, parse it with BeautifulSoup, and validate the result.

    Returns:
        (soup, status) where status is the HTTP status code on success,
        http_code_local when served from the on-disk cache,
        http_code_404 for a missing page,
        http_code_login when redirected to the login page,
        or (None, None) after exhausting ``max_retries`` or for a bad URL.
    """
    if load_from_local:  # try the locally cached copy first
        html = utils.read_raw_html(url)
        if html:
            # Preprocess the HTML if a preprocessor was provided.
            html_text = preprocessor(html) if preprocessor else html
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # run the custom page check
                logging.debug(f"read from local. href: {url}")
                return soup, http_code_local  # code < 100 signals a local-cache hit
    # The URL never changes across retries, so sanity-check it once up front.
    if 'thelordofporn.com' not in url.lower():
        logging.error(f'wrong url format: {url}')
        return None, None
    for attempt in range(max_retries):
        try:
            response = scraper.get(url, headers=HEADERS)
            # Handle HTTP status codes.
            if response.status_code == 404:
                logging.debug(f"Page not found (404): {url}")
                return None, http_code_404  # caller can simply skip this entry
            response.raise_for_status()  # raise on other HTTP errors
            # A redirect may mean we were bounced to the login page.
            if response.history:
                logging.debug(f"Page redirected on {url}. Checking if it's a login page.")
                soup = BeautifulSoup(response.text, parser)
                # Detect the login page by its form panel.
                if soup.find('nav', class_='panel form-panel'):
                    logging.debug(f"Page redirected to login page on {url}.")
                    return None, http_code_login
            if save_raw_html:
                utils.write_raw_html(url, response.text)
            # Preprocess the HTML if a preprocessor was provided.
            html_text = preprocessor(response.text) if preprocessor else response.text
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # run the custom page check
                return soup, response.status_code
            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, Retrying...")
        # Exponential backoff before retrying so we don't hammer the server.
        if attempt < max_retries - 1:
            time.sleep(2 ** attempt)
    logging.error(f'Fetching failed after max retries. {url}')
    return None, None  # still failing after the maximum number of retries
# Normalize raw HTML before parsing: drop <br> tags and force <a> tags to
# open in a new tab (needed when extracting ethnicity).
def preprocess_html(html):
    """Return *html* with every '<br>' removed and target="_blank" injected into '<a ' tags."""
    without_breaks = html.replace('<br>', '')
    return without_breaks.replace('<a ', '<a target="_blank" ')
# Generic HTML structure validator.
def generic_validator(soup, tag, identifier, attr_type="id"):
    """Return True when *soup* contains *tag* matched by *identifier*.

    attr_type chooses the matching strategy: "id" (id attribute), "class"
    (CSS class), or "name" (name attribute).  Any other value returns False.
    """
    if attr_type == "id":
        return soup.find(tag, id=identifier) is not None
    if attr_type == "class":
        return bool(soup.find_all(tag, class_=identifier))
    if attr_type == "name":
        # Fixed: this branch previously hard-coded 'select' and ignored the
        # tag argument, unlike the other branches.
        return soup.find(tag, attrs={"name": identifier}) is not None
    return False
# Parse one list page.
def parse_actor_list(soup, href):
    """Extract performer rows from a list-page soup.

    Args:
        soup: parsed list page.
        href: URL of the list page itself (used only for error logging).
    Returns:
        (actress_list, next_page_url); next_page_url is None on the last
        page, and (None, None) is returned if any article fails to parse.
    """
    actress_list = []
    next_page_url = None
    articles = soup.find_all("article", class_="loop-item")
    for article in articles:
        try:
            # Performer title and detail-page link.
            title_tag = article.find("h3", class_="loop-item__title")
            # Look up the <a> once; the original re-ran find("a") four times.
            link = title_tag.find("a") if title_tag else None
            title = link.text.strip() if link else "N/A"
            detail_href = link["href"] if link else None  # renamed: no longer shadows the page-URL parameter
            # Rating.
            rating_tag = article.find("div", class_="loop-item__rating")
            rating = rating_tag.text.strip() if rating_tag else "N/A"
            # Rank and votes.
            rank_votes_div = article.find("div", class_="loop-item__rank")
            meta_tags = rank_votes_div.find_all("span") if rank_votes_div else []
            # Safely extract rank and votes.
            rank = None
            votes = None
            if len(meta_tags) >= 1:
                rank_b = meta_tags[0].find("b")
                rank = rank_b.text.strip() if rank_b else "N/A"
            if len(meta_tags) >= 2:
                votes_b = meta_tags[1].find("b")
                votes = votes_b.text.strip() if votes_b else "N/A"
            # Collect the row.
            actress_list.append({
                "pornstar": title,
                "rating": utils.parse_numeric(rating) if rating != "N/A" else None,
                "rank": utils.parse_numeric(rank) if rank is not None else None,
                "votes": utils.parse_numeric(votes) if votes is not None else None,
                "href": detail_href
            })
        except Exception as e:
            # href is the list page being parsed; the original shadowed it with
            # the article link, so this log could show the wrong URL.
            logging.error(f"parse list failed: {e}, url: {href}")
            return None, None
    # Find the next-page link.
    next_page_tag = soup.select_one(".nav-links .next.page-numbers")
    if next_page_tag:
        next_page_url = urljoin(host_url, next_page_tag["href"])
        logging.debug(f"next page: {next_page_url}")
    else:
        logging.debug("find all pages.")
    return actress_list, next_page_url
# Parse the HTML of a detail page and extract the data we need.
def parse_actor_detail(soup, href):
    """Extract one performer's record from a detail-page soup.

    Returns (record_dict, None); the second slot mirrors parse_actor_list's
    (data, next_url) shape so callers can unpack both the same way.
    """
    def _b_text(el):
        # Text of the <b> child of *el*, or "" when it's missing (the
        # original raised AttributeError on a missing <b>).
        b = el.find("b") if el else None
        return b.text.strip() if b else ""

    # Basic info.
    entry_header = soup.find("header", class_="entry-header")
    name_el = entry_header.find("h1", class_="entry-title") if entry_header else None
    name = name_el.text.strip() if name_el else ""
    date_modified_el = soup.find("time", itemprop="dateModified")
    date_modified = date_modified_el.get("content", "").strip() if date_modified_el else ""
    # Rank / rating metadata.
    global_rank = ""
    weekly_rank = ""
    last_month_rating = ""
    current_rating = ""
    total_votes = ""
    # Guard: the original crashed with AttributeError when entry_header was None.
    rank_items = entry_header.find_all("div", class_="porn-star-rank__item") if entry_header else []
    for div in rank_items:
        text = div.text.strip()
        if "Global Rank" in text:
            global_rank = _b_text(div)
        elif "Weekly Rank" in text:
            weekly_rank = _b_text(div)
    for item in soup.find_all("div", class_="specifications__item--horizontal"):
        text = item.text.strip()
        if "Last Month" in text:
            last_month_rating = _b_text(item)
        elif "Rating Av." in text:
            current_rating = _b_text(item)
        elif "Total of" in text:
            total_votes = _b_text(item)
    # Detailed attributes: each grid row holds two label/value cells.
    attributes = {}
    for row in soup.find_all("div", class_="specifications-grid-row"):
        items = row.find_all("div", class_="specifications-grid-item")
        if len(items) == 2:
            for cell in items:
                label_el = cell.find("h5")
                value_el = cell.find("span")
                if label_el and value_el:  # skip malformed cells instead of crashing
                    attributes[label_el.text.strip()] = value_el.text.strip()
    # Derived fields: birth info, height, weight, aliases.
    birth_info = utils.parse_birth_info(attributes.get("Born", ""))
    height_info = utils.parse_height(attributes.get("Height", ""))
    weight_info = utils.parse_weight(attributes.get("Weight", ""))
    alias_list = utils.clean_alias(attributes.get("Name", ""))
    return {
        'name': name,
        'href': href,
        "alias": alias_list,
        "career_start": attributes.get("Career start", ""),
        "measurements": attributes.get("Measurements", ""),
        "born": attributes.get("Born", ""),
        "height": attributes.get("Height", ""),
        "weight": attributes.get("Weight", ""),
        "date_modified": date_modified,
        "global_rank": utils.parse_numeric(global_rank),
        "weekly_rank": utils.parse_numeric(weekly_rank),
        "last_month_rating": utils.parse_numeric(last_month_rating),
        "current_rating": utils.parse_numeric(current_rating),
        "total_votes": utils.parse_numeric(total_votes),
        **birth_info,
        **height_info,
        **weight_info,
    }, None
###### Test code below ######
def test_actor_list():
    """Smoke test: crawl every list page and print all collected rows."""
    all_data = []
    next_url = pornstar_url
    check_main = partial(generic_validator, tag="main", identifier="content", attr_type="id")
    while next_url:
        print(f'fetching page {next_url}')
        soup, http_status_code = fetch_page(next_url, check_main)
        if not soup:
            print('get wrong page.')
            if next_url:
                print(f"next url: {next_url}")
            break
        list_data, next_url = parse_actor_list(soup, next_url)
        if list_data:
            all_data.extend(list_data)
    print(all_data)
def test_actor():
    """Smoke test: fetch one detail page and print the parsed record."""
    check_main = partial(generic_validator, tag="main", identifier="content", attr_type="id")
    next_url = 'https://thelordofporn.com/pornstars/eva-elfie/'
    while next_url:
        print(f'fetching page {next_url}')
        soup, http_status_code = fetch_page(next_url, check_main)
        if not soup:
            print('get wrong page.')
            break
        data, next_url = parse_actor_detail(soup, next_url)
        if data:
            print(data)
# Script entry point: run both smoke tests when executed directly.
if __name__ == "__main__":
    test_actor_list()
    test_actor()