stock/scripts/thelordofporn/top_scenes.py

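"""Scrape the Top-10 list pages on thelordofporn.com.

Walks the paginated "top scenes/movies" and "top pornstars" category
archives and writes each entry's title and link to a CSV file under
``result/``.
"""
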
import csv
import logging
import os
import sys
import time

import cloudscraper
import requests
from bs4 import BeautifulSoup

import config  # local logging configuration

# Logging
config.setup_logging()
httpx_logger = logging.getLogger("httpx")
httpx_logger.setLevel(logging.DEBUG)
# Base URLs and output-file configuration
base_url = 'https://thelordofporn.com/'
list_url_scenes = 'https://thelordofporn.com/category/top-10/porn-scenes-movies/'
list_url_pornstars = 'https://thelordofporn.com/category/pornstars-top-10/'
res_dir = 'result'
top_scenes_file = f'{res_dir}/top_scenes_list.csv'
top_pornstars_file = f'{res_dir}/top_pornstars_list.csv'

# Request headers and cookies that mimic a real browser
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}
COOKIES = {
    "cf_clearance": "your_clearance_token_here"  # update to match Cloudflare's current clearance
}
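# NOTE: cf_clearance is a cookie Cloudflare issues after a successful browser
# challenge; it can usually be copied from the browser's developer tools and
# must be refreshed once it expires.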
# Fetch a page with retries; returns the HTML text, or None once retries are exhausted
def get_page_content(url, max_retries=100, sleep_time=5, default_timeout=10):
    # cloudscraper transparently solves Cloudflare's JavaScript challenges
    scraper = cloudscraper.create_scraper(
        browser={"browser": "chrome", "platform": "windows", "mobile": False}
    )
    retries = 0
    while retries < max_retries:
        try:
            response = scraper.get(url, headers=HEADERS, cookies=COOKIES, timeout=default_timeout)
            # Accept the response only if the real page body came back (the main
            # content wrapper is present), i.e. we got past any challenge page.
            if response.status_code == 200 and "content-area content-area--full-width" in response.text:
                return response.text
            retries += 1
            logging.info(f"Unexpected response from {url} (status {response.status_code}). Retrying {retries}/{max_retries}...")
        except requests.RequestException as e:
            retries += 1
            logging.info(f"Warn fetching page {url}: {e}. Retrying {retries}/{max_retries}...")
        time.sleep(sleep_time)  # back off before the next attempt
    logging.error(f"Failed to fetch page {url} after {max_retries} retries.")
    return None
# Generic scraper: walk a paginated category archive and collect title/href pairs
def scrape_list(start_url, article_class, output_file):
    current_url = start_url
    all_data = []
    while current_url:
        try:
            logging.info(f"Fetching URL: {current_url}")
            content = get_page_content(current_url)
            if content is None:
                logging.error(f"Giving up on {current_url}: no content retrieved.")
                break
            # Parse the page content
            soup = BeautifulSoup(content, "html.parser")
            articles = soup.find_all("article", class_=article_class)
            if not articles:
                logging.warning(f"No articles found on page: {current_url}")
            # Extract the title and href from each article tag
            for article in articles:
                try:
                    a_tag = article.find("a", class_="loop-item__image")
                    title = a_tag.get("title", "").strip()
                    href = a_tag.get("href", "").strip()
                    if title and href:
                        all_data.append({'title': title, 'href': href})
                        logging.info(f"Extracted: {title} -> {href}")
                    else:
                        logging.warning("Missing title or href in an article.")
                except Exception as e:
                    logging.error(f"Error parsing article: {e}")
            # Follow the next-page link, if any
            next_page = soup.find("a", class_="next page-numbers")
            if next_page:
                current_url = next_page.get("href", "").strip()
            else:
                current_url = None
                logging.info("No more pages to fetch.")
            # Pause between pages to avoid getting banned by the target site
            time.sleep(2)
        except requests.exceptions.RequestException as e:
            logging.error(f"Network error while fetching {current_url}: {e}")
            break
        except Exception as e:
            logging.error(f"Unexpected error: {e}")
            break
    # Save the results to a CSV file
    os.makedirs(res_dir, exist_ok=True)
    csv_headers = ["title", "href"]
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
        writer.writeheader()
        writer.writerows(all_data)
    logging.info(f"Data successfully saved to {output_file}.")

# Scrape the top scenes and movies list
def get_scenes(base_url, output_file=top_scenes_file):
    scrape_list(base_url, "loop-item loop-item--top loop-item--ca_prod_movies__scen", output_file)

# Scrape the top pornstars list
def get_pornstars(base_url, output_file=top_pornstars_file):
    scrape_list(base_url, "loop-item loop-item--top loop-item--ca_prod_pornstars", output_file)
def main():
    if len(sys.argv) < 2:
        print("Usage: python script.py <cmd>")
        print("cmd: scenes, pornstars")
        sys.exit(1)
    cmd = sys.argv[1]
    if cmd == "scenes":
        get_scenes(list_url_scenes)        # scrape the top scenes/movies list
    elif cmd == "pornstars":
        get_pornstars(list_url_pornstars)  # scrape the top pornstars list
    else:
        print(f"Unknown command: {cmd}")

if __name__ == '__main__':
    main()
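
# Example usage (run from the repo root; CSVs are written under result/):
#   python stock/scripts/thelordofporn/top_scenes.py scenes
#   python stock/scripts/thelordofporn/top_scenes.py pornstars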