import os
import sys
import time
import csv
import logging

import requests
from bs4 import BeautifulSoup
import cloudscraper

import config

# Logging setup
config.setup_logging()
httpx_logger = logging.getLogger("httpx")
httpx_logger.setLevel(logging.DEBUG)

# Base URLs and output files
base_url = 'https://thelordofporn.com/'
list_url_scenes = 'https://thelordofporn.com/category/top-10/porn-scenes-movies/'
list_url_pornstars = 'https://thelordofporn.com/category/pornstars-top-10/'

res_dir = 'result'
top_scenes_file = f'{res_dir}/top_scenes_list.csv'
top_pornstars_file = f'{res_dir}/top_pornstars_list.csv'
os.makedirs(res_dir, exist_ok=True)  # Make sure the output directory exists

# Request headers and cookies (mimic a real browser)
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}
COOKIES = {
    "cf_clearance": "your_clearance_token_here"  # Must be refreshed when Cloudflare re-challenges
}


# Fetch a page with a retry loop; returns the HTML text, or None once
# max_retries attempts have failed.
def get_page_content(url, max_retries=100, sleep_time=5, default_timeout=10):
    scraper = cloudscraper.create_scraper(
        browser={"browser": "chrome", "platform": "windows", "mobile": False}
    )
    retries = 0
    while retries < max_retries:
        try:
            response = scraper.get(url, headers=HEADERS, cookies=COOKIES, timeout=default_timeout)
            # Accept the page only when the expected content marker is present;
            # this filters out Cloudflare challenge pages that return 200.
            if response.status_code == 200 and "content-area content-area--full-width" in response.text:
                return response.text
            logging.info(f"Unexpected response for {url} (status {response.status_code}).")
        except requests.RequestException as e:
            logging.info(f"Warn fetching page {url}: {e}.")
        retries += 1
        logging.info(f"Retrying {retries}/{max_retries}...")
        time.sleep(sleep_time)  # Back off before the next attempt
    logging.error(f"Failed to fetch page {url} after {max_retries} retries.")
    return None
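# Usage sketch for get_page_content (illustrative only, not part of the
# original flow; the cf_clearance cookie above must be valid for the request
# to get past Cloudflare):
#
#   html = get_page_content(list_url_scenes)
#   if html is not None:
#       soup = BeautifulSoup(html, "html.parser")
#       print(soup.title.string)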
# Crawl a paginated category listing and collect (title, href) pairs from
# every matching <article> element. Shared by get_scenes and get_pornstars,
# which differ only in the article CSS class and the output file.
def scrape_list(start_url, article_class, output_file):
    current_url = start_url
    all_data = []

    while current_url:
        try:
            logging.info(f"Fetching URL: {current_url}")
            content = get_page_content(current_url)
            if content is None:
                logging.error(f"Giving up on {current_url}: no content retrieved.")
                break

            # Parse the page content
            soup = BeautifulSoup(content, "html.parser")
            articles = soup.find_all("article", class_=article_class)
            if not articles:
                logging.warning(f"No articles found on page: {current_url}")

            # Extract href and title from each article tag
            for article in articles:
                try:
                    a_tag = article.find("a", class_="loop-item__image")
                    if a_tag is None:
                        logging.warning("No link element found in an article.")
                        continue
                    title = a_tag.get("title", "").strip()
                    href = a_tag.get("href", "").strip()
                    if title and href:
                        all_data.append({'title': title, 'href': href})
                        logging.info(f"Extracted: {title} -> {href}")
                    else:
                        logging.warning("Missing title or href in an article.")
                except Exception as e:
                    logging.error(f"Error parsing article: {e}")

            # Follow the "next page" link, if any
            next_page = soup.find("a", class_="next page-numbers")
            if next_page:
                current_url = next_page.get("href", "").strip()
            else:
                current_url = None
                logging.info("No more pages to fetch.")

            time.sleep(2)  # Throttle requests to avoid getting banned

        except requests.exceptions.RequestException as e:
            logging.error(f"Network error while fetching {current_url}: {e}")
            break
        except Exception as e:
            logging.error(f"Unexpected error: {e}")
            break

    # Save the results to a CSV file
    csv_headers = ["title", "href"]
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
        writer.writeheader()
        writer.writerows(all_data)
    logging.info(f"Data successfully saved to {output_file}.")


# Fetch the top scenes and movies listing
def get_scenes(start_url, output_file=top_scenes_file):
    scrape_list(start_url, "loop-item loop-item--top loop-item--ca_prod_movies__scen", output_file)


# Fetch the top pornstars listing
def get_pornstars(start_url, output_file=top_pornstars_file):
    scrape_list(start_url, "loop-item loop-item--top loop-item--ca_prod_pornstars", output_file)


def main():
    if len(sys.argv) < 2:
        print("Usage: python script.py <cmd>")
        print("cmd: scenes, pornstars")
        sys.exit(1)

    cmd = sys.argv[1]
    if cmd == "scenes":
        get_scenes(list_url_scenes)
    elif cmd == "pornstars":
        get_pornstars(list_url_pornstars)
    else:
        print(f"Unknown command: {cmd}")


if __name__ == '__main__':
    main()
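# Invocation sketch (the filename scraper.py is illustrative, not from the
# source):
#
#   python scraper.py scenes      -> writes result/top_scenes_list.csv
#   python scraper.py pornstars   -> writes result/top_pornstars_list.csv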