import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
from urllib.parse import urljoin, urlparse
import config
import utils

# Base URLs and tunable parameters
host_url = "https://thelordofporn.com/"
pornstar_url = "https://thelordofporn.com/pornstars/"
lang_prefix = ["ja", "en", "zh"]

http_code_404 = 404
http_code_login = 401
http_code_local = 99

save_raw_html = False
load_from_local = False

# Impersonate a real browser
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Referer": "https://thelordofporn.com/",
}

# Create a CloudScraper instance to get past Cloudflare
scraper = cloudscraper.create_scraper(
    browser={"browser": "chrome", "platform": "windows", "mobile": False}
)

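# Note (an aside, not exercised below): create_scraper also accepts a delay
# argument to wait out the Cloudflare IUAM challenge, e.g.
#   scraper = cloudscraper.create_scraper(delay=10, browser={...})
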
# Fetch a page with CloudScraper and validate it; supports custom parsers
# and an optional HTML preprocessor.
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    if load_from_local:  # Serve the page from the local cache instead of the network
        html = utils.read_raw_html(url)
        if html:
            # Preprocess the HTML if a preprocessor was provided
            html_text = preprocessor(html) if preprocessor else html

            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # Run the caller-supplied structural check
                logging.debug(f"read from local. href: {url}")
                return soup, http_code_local  # A code below 100 marks a local cache hit

    for attempt in range(max_retries):
        try:
            if 'thelordofporn.com' not in url.lower():
                logging.error(f'wrong url format: {url}')
                return None, None

            response = scraper.get(url, headers=HEADERS)

            # Handle HTTP status codes
            if response.status_code == 404:
                logging.debug(f"Page not found (404): {url}")
                return None, http_code_404  # Return 404 directly so the caller can skip

            response.raise_for_status()  # Raise on other HTTP errors

            # Detect redirects, e.g. to a login page
            if response.history:
                logging.debug(f"Page redirected on {url}. Checking if it's a login page.")
                soup = BeautifulSoup(response.text, parser)
                # Check whether we landed on the login page
                if soup.find('nav', class_='panel form-panel'):
                    logging.debug(f"Page redirected to login page on {url}.")
                    return None, http_code_login

            if save_raw_html:
                utils.write_raw_html(url, response.text)

            # Preprocess the HTML if a preprocessor was provided
            html_text = preprocessor(response.text) if preprocessor else response.text

            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # Run the caller-supplied structural check
                return soup, response.status_code

            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, retrying...")

        time.sleep(2 ** attempt)  # Exponential backoff between retries

    logging.error(f'Fetching failed after max retries. {url}')
    return None, None  # Still failing after the maximum number of retries

# Repair the HTML structure: drop stray <br> tags and patch <a> tags.
# Needed when extracting the ethnicity field.
def preprocess_html(html):
    return html.replace('<br>', '').replace('<a ', '<a target="_blank" ')

# Generic HTML structure validator
def generic_validator(soup, tag, identifier, attr_type="id"):
    if attr_type == "id":
        return soup.find(tag, id=identifier) is not None
    elif attr_type == "class":
        return bool(soup.find_all(tag, class_=identifier))
    elif attr_type == "name":
        return bool(soup.find('select', {'name': identifier}))
    return False
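
# Usage sketch: how fetch_page, generic_validator and preprocess_html fit
# together (mirrors the test helpers at the bottom of this file; the name
# _example_fetch is illustrative, not part of the original flow).
def _example_fetch(url):
    validator = partial(generic_validator, tag="main", identifier="content", attr_type="id")
    soup, status = fetch_page(url, validator, preprocessor=preprocess_html)
    if soup is None or status == http_code_404:
        return None
    return soup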

# Parse a list page
def parse_actor_list(soup, page_url):
    # Extract performer entries
    actress_list = []
    next_page_url = None

    articles = soup.find_all("article", class_="loop-item")
    for article in articles:
        try:
            # Performer name and detail link
            title_tag = article.find("h3", class_="loop-item__title")
            title = title_tag.find("a").text.strip() if title_tag and title_tag.find("a") else "N/A"
            href = title_tag.find("a")["href"] if title_tag and title_tag.find("a") else None

            # Rating
            rating_tag = article.find("div", class_="loop-item__rating")
            rating = rating_tag.text.strip() if rating_tag else "N/A"

            # Rank and votes
            rank_votes_div = article.find("div", class_="loop-item__rank")
            meta_tags = rank_votes_div.find_all("span") if rank_votes_div else []

            # Read rank and votes defensively
            rank = None
            votes = None
            if len(meta_tags) >= 1:
                rank_b = meta_tags[0].find("b")
                rank = rank_b.text.strip() if rank_b else "N/A"
            if len(meta_tags) >= 2:
                votes_b = meta_tags[1].find("b")
                votes = votes_b.text.strip() if votes_b else "N/A"

            # Collect the entry
            actress_list.append({
                "pornstar": title,
                "rating": utils.parse_numeric(rating) if rating != "N/A" else None,
                "rank": utils.parse_numeric(rank) if rank is not None else None,
                "votes": utils.parse_numeric(votes) if votes is not None else None,
                "href": href
            })

        except Exception as e:
            logging.error(f"parse list failed: {e}, url: {page_url}")
            return None, None

    # Find the link to the next page
    next_page_tag = soup.select_one(".nav-links .next.page-numbers")
    if next_page_tag:
        next_page_url = urljoin(host_url, next_page_tag["href"])
        logging.debug(f"next page: {next_page_url}")
    else:
        logging.debug("no more pages.")

    return actress_list, next_page_url
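
# Sketch: csv is imported above but unused in this section; a minimal way to
# persist the list rows produced by parse_actor_list (the actress_list.csv
# file name is an assumption, not from the original code).
def save_list_csv(rows, path="actress_list.csv"):
    if not rows:
        return
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=rows[0].keys())
        writer.writeheader()
        writer.writerows(rows)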

# Parse the HTML content and extract the fields we need
def parse_actor_detail(soup, href):
    # Basic information
    entry_header = soup.find("header", class_="entry-header")
    name_el = entry_header.find("h1", class_="entry-title") if entry_header else None
    name = name_el.text.strip() if name_el else ""

    date_modified_el = soup.find("time", itemprop="dateModified")
    if date_modified_el:
        date_modified = date_modified_el.get("content", "").strip()
    else:
        date_modified = ""

    # Metadata
    global_rank = ""
    weekly_rank = ""
    last_month_rating = ""
    current_rating = ""
    total_votes = ""

    for div in (entry_header.find_all("div", class_="porn-star-rank__item") if entry_header else []):
        text = div.text.strip()
        if "Global Rank" in text:
            global_rank = div.find("b").text.strip()
        elif "Weekly Rank" in text:
            weekly_rank = div.find("b").text.strip()

    for item in soup.find_all("div", class_="specifications__item--horizontal"):
        text = item.text.strip()
        if "Last Month" in text:
            last_month_rating = item.find("b").text.strip()
        elif "Rating Av." in text:
            current_rating = item.find("b").text.strip()
        elif "Total of" in text:
            total_votes = item.find("b").text.strip()

    # Detailed attributes
    attributes = {}
    for row in soup.find_all("div", class_="specifications-grid-row"):
        items = row.find_all("div", class_="specifications-grid-item")
        if len(items) == 2:
            label = items[0].find("h5").text.strip()
            value = items[0].find("span").text.strip()
            attributes[label] = value

            label2 = items[1].find("h5").text.strip()
            value2 = items[1].find("span").text.strip()
            attributes[label2] = value2

    # Parse birth info, height, weight, and aliases
    birth_info = utils.parse_birth_info(attributes.get("Born", ""))
    height_info = utils.parse_height(attributes.get("Height", ""))
    weight_info = utils.parse_weight(attributes.get("Weight", ""))
    alias_list = utils.clean_alias(attributes.get("Name", ""))

    return {
        'name': name,
        'href': href,
        "alias": alias_list,
        "career_start": attributes.get("Career start", ""),
        "measurements": attributes.get("Measurements", ""),
        "born": attributes.get("Born", ""),
        "height": attributes.get("Height", ""),
        "weight": attributes.get("Weight", ""),
        "date_modified": date_modified,
        "global_rank": utils.parse_numeric(global_rank),
        "weekly_rank": utils.parse_numeric(weekly_rank),
        "last_month_rating": utils.parse_numeric(last_month_rating),
        "current_rating": utils.parse_numeric(current_rating),
        "total_votes": utils.parse_numeric(total_votes),
        **birth_info,
        **height_info,
        **weight_info,
    }, None
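
# Sketch: json is imported above but unused in this section; one way to dump
# a detail record from parse_actor_detail for inspection (the file name is an
# assumption, not from the original code).
def save_detail_json(detail, path="actor_detail.json"):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(detail, f, ensure_ascii=False, indent=2)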

###### Test code below ######
def test_actor_list():
    next_url = pornstar_url
    all_data = []
    while next_url:
        print(f'fetching page {next_url}')
        soup, http_status_code = fetch_page(next_url, partial(generic_validator, tag="main", identifier="content", attr_type="id"))
        if soup:
            list_data, next_url = parse_actor_list(soup, next_url)
            if list_data:
                all_data.extend(list_data)
        else:
            print('got an invalid page.')
            if next_url:
                print(f"next url: {next_url}")
            break
    print(all_data)

def test_actor():
    next_url = 'https://thelordofporn.com/pornstars/eva-elfie/'
    while next_url:
        print(f'fetching page {next_url}')
        soup, http_status_code = fetch_page(next_url, partial(generic_validator, tag="main", identifier="content", attr_type="id"))
        if soup:
            data, next_url = parse_actor_detail(soup, next_url)
            if data:
                print(data)
        else:
            print('got an invalid page.')
            break

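# Sketch: signal and sys are imported above but unused here; a minimal SIGINT
# handler so a long crawl can be stopped cleanly with Ctrl-C (assumption: no
# partial state needs flushing on exit).
def _handle_sigint(signum, frame):
    logging.info("Interrupted by user, exiting.")
    sys.exit(0)

signal.signal(signal.SIGINT, _handle_sigint)
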
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)  # Surface the debug logs used above
    test_actor_list()
    test_actor()