modify scripts
225 thelordofporn/actress_fetch.py Normal file
@@ -0,0 +1,225 @@
"""
Script Name: actress_fetch.py
Description: Fetch the actress list from thelordofporn.com and then fetch the
    detail page for each actress.
    The site sits behind Cloudflare and cannot be scraped directly, so
    cloudscraper is used to get past the protection.
    list_fetch.py scrapes the listing pages and writes the result to a local
    JSON file plus a CSV file;
    actress_fetch.py takes that list, reads each detail page, and merges the
    extra details into the records.

Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0

Modification History:
    - YYYY-MM-DD [Your Name]:
    - YYYY-MM-DD [Your Name]:
    - YYYY-MM-DD [Your Name]:
"""

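# Each record read back from actresses.json is expected to look like the rows
# written by list_fetch.py (field names taken from its CSV writer; the values
# below are made up purely for illustration):
#   {
#       "pornstar": "Example Name",
#       "rating": "8.5",
#       "rank": "12",
#       "votes": "3456",
#       "href": "https://thelordofporn.com/pornstars/example-name"
#   }
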
import json
import csv
import os
import re
import time
import random
import cloudscraper
from bs4 import BeautifulSoup
import config


# File paths
DIR_RES = config.global_host_data_dir
ACTRESSES_FILE = f"{DIR_RES}/actresses.json"
DETAILS_JSON_FILE = f"{DIR_RES}/thelordofporn_pornstars.json"
DETAILS_CSV_FILE = f"{DIR_RES}/thelordofporn_pornstars.csv"

# Request headers and cookies (mimic a real browser)
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}
COOKIES = {
    "cf_clearance": "your_clearance_token_here"  # must be refreshed whenever Cloudflare re-validates
}


# Parse birth date and place
def parse_birth_info(text):
    match = re.match(r"(.+?) (\d{1,2}), (\d{4}) in (.+)", text)
    if match:
        return {
            "birth_date": f"{match.group(1)} {match.group(2)}, {match.group(3)}",
            "birth_year": match.group(3),
            "birth_place": match.group(4),
        }
    return {"birth_date": text, "birth_year": "", "birth_place": ""}


# Parse height
def parse_height(text):
    match = re.match(r"(\d+)\s*ft\s*(\d*)\s*in\s*\((\d+)\s*cm\)", text)
    if match:
        height_ft = f"{match.group(1)}'{match.group(2)}\""
        return {"height_ft": height_ft.strip(), "height_cm": match.group(3)}
    return {"height_ft": text, "height_cm": ""}


# Parse weight
def parse_weight(text):
    match = re.match(r"(\d+)\s*lbs\s*\((\d+)\s*kg\)", text)
    if match:
        return {"weight_lbs": match.group(1), "weight_kg": match.group(2)}
    return {"weight_lbs": text, "weight_kg": ""}
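
# Illustrative inputs and outputs for the parsers above (made-up strings in the
# format the regexes expect; real pages may differ):
#   parse_birth_info("April 23, 1995 in Prague, Czech Republic")
#       -> {"birth_date": "April 23, 1995", "birth_year": "1995",
#           "birth_place": "Prague, Czech Republic"}
#   parse_height("5 ft 7 in (170 cm)")
#       -> {"height_ft": "5'7\"", "height_cm": "170"}
#   parse_weight("121 lbs (55 kg)")
#       -> {"weight_lbs": "121", "weight_kg": "55"}
# Anything that does not match falls through with the raw text preserved and
# the numeric fields left empty.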


# Parse the detail page
def parse_page(actress, html):
    soup = BeautifulSoup(html, "html.parser")

    # Make sure the page has the expected structure
    if not soup.find("main", {"id": "content", "class": "site-content"}):
        return None

    # Basic information
    entry_header = soup.find("header", class_="entry-header")
    name_el = entry_header.find("h1", class_="entry-title") if entry_header else None
    name = name_el.text.strip() if name_el else ""

    date_modified_el = soup.find("time", itemprop="dateModified")
    if date_modified_el:
        date_modified = date_modified_el.get("content", "").strip()
    else:
        date_modified = ""

    # Extract metadata
    global_rank = ""
    weekly_rank = ""
    last_month_rating = ""
    current_rating = ""
    total_votes = ""

    for div in (entry_header.find_all("div", class_="porn-star-rank__item") if entry_header else []):
        text = div.text.strip()
        if "Global Rank" in text:
            global_rank = div.find("b").text.strip()
        elif "Weekly Rank" in text:
            weekly_rank = div.find("b").text.strip()

    for item in soup.find_all("div", class_="specifications__item--horizontal"):
        text = item.text.strip()
        if "Last Month" in text:
            last_month_rating = item.find("b").text.strip()
        elif "Rating Av." in text:
            current_rating = item.find("b").text.strip()
        elif "Total of" in text:
            total_votes = item.find("b").text.strip()

    # Parse the detailed attributes
    attributes = {}
    for row in soup.find_all("div", class_="specifications-grid-row"):
        items = row.find_all("div", class_="specifications-grid-item")
        if len(items) == 2:
            label = items[0].find("h5").text.strip()
            value = items[0].find("span").text.strip()
            attributes[label] = value

            label2 = items[1].find("h5").text.strip()
            value2 = items[1].find("span").text.strip()
            attributes[label2] = value2
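
    # The attribute loop above assumes rows shaped roughly like the following
    # (inferred from the selectors used here; illustrative only):
    #   <div class="specifications-grid-row">
    #     <div class="specifications-grid-item"><h5>Born</h5><span>April 23, 1995 in ...</span></div>
    #     <div class="specifications-grid-item"><h5>Height</h5><span>5 ft 7 in (170 cm)</span></div>
    #   </div>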

    # Parse birth info, height, weight, etc.
    birth_info = parse_birth_info(attributes.get("Born", ""))
    height_info = parse_height(attributes.get("Height", ""))
    weight_info = parse_weight(attributes.get("Weight", ""))

    return {
        "pornstar": actress['pornstar'],
        "rating": actress['rating'],
        "rank": actress['rank'],
        "votes": actress['votes'],
        "href": actress['href'],
        'name': name,
        "alias": attributes.get("Name", ""),
        "career_start": attributes.get("Career start", ""),
        "measurements": attributes.get("Measurements", ""),
        "born": attributes.get("Born", ""),
        "height": attributes.get("Height", ""),
        "weight": attributes.get("Weight", ""),
        "date_modified": date_modified,
        "global_rank": global_rank,
        "weekly_rank": weekly_rank,
        "last_month_rating": last_month_rating,
        "current_rating": current_rating,
        "total_votes": total_votes,
        **birth_info,
        **height_info,
        **weight_info,
    }


# Load already-processed data
def load_existing_data():
    if os.path.exists(DETAILS_JSON_FILE):
        with open(DETAILS_JSON_FILE, "r", encoding="utf-8") as f:
            return {item["pornstar"]: item for item in json.load(f)}
    return {}


# Fetch a page
def fetch_page(url):
    scraper = cloudscraper.create_scraper()
    for _ in range(500):  # retry up to 500 times
        try:
            response = scraper.get(url, headers=HEADERS, cookies=COOKIES, timeout=10)
            if response.status_code == 200 and "specifications-grid-row" in response.text:
                return response.text
        except Exception as e:
            print(f"Request to {url} failed, error: {e}")
        time.sleep(random.uniform(2, 5))  # random delay
    return None


# Process the data and save it
def process_data():
    with open(ACTRESSES_FILE, "r", encoding="utf-8") as f:
        actresses = json.load(f)

    existing_data = load_existing_data()
    updated_data = list(existing_data.values())

    for actress in actresses:
        name, url = actress["pornstar"], actress["href"]

        if name in existing_data:
            print(f"Skipping already processed: {name}")
            continue

        print(f"Processing: {name} - {url}")
        html = fetch_page(url)
        if not html:
            print(f"Could not fetch page: {url}")
            continue

        details = parse_page(actress, html)
        if details:
            updated_data.append(details)
            existing_data[name] = details

    with open(DETAILS_JSON_FILE, "w", encoding="utf-8") as jsonfile:
        json.dump(updated_data, jsonfile, indent=4, ensure_ascii=False)


# Generate the CSV from the JSON
def json_to_csv():
    if not os.path.exists(DETAILS_JSON_FILE):
        print("No JSON file, skipping CSV generation")
        return

    with open(DETAILS_JSON_FILE, "r", encoding="utf-8") as jsonfile:
        data = json.load(jsonfile)

    fieldnames = data[0].keys()
    with open(DETAILS_CSV_FILE, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)


if __name__ == '__main__':
    # Make sure the output directory exists
    os.makedirs(DIR_RES, exist_ok=True)

    process_data()
    json_to_csv()
    print("Data processing finished!")
27 thelordofporn/config.py Normal file
@@ -0,0 +1,27 @@
import logging
import os
import inspect
from datetime import datetime

# Directory mapped to the host machine
home_dir = os.path.expanduser("~")
global_host_data_dir = f'{home_dir}/hostdir/scripts_data/thelordofporn'


# Logging configuration
def setup_logging(log_filename=None):
    # If no log_filename is passed in, use the calling script's name as the log file name
    if log_filename is None:
        # Get the file name of the script that called setup_logging
        caller_frame = inspect.stack()[1]
        caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]

        # Current date, formatted as yyyymmdd
        current_date = datetime.now().strftime('%Y%m%d')
        # Build the log file name, placing the date before the extension
        log_filename = f'./log/{caller_filename}_{current_date}.log'

    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
                        handlers=[
                            logging.FileHandler(log_filename),
                            logging.StreamHandler()
                        ])
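
# Usage sketch (assumes a ./log directory already exists relative to the working
# directory, since logging.FileHandler does not create parent directories):
#   import config
#   config.setup_logging()     # e.g. ./log/top_scenes_20240101.log (date shown is illustrative)
#   logging.info("started")    # written to both the log file and the console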
138 thelordofporn/list_fetch.py Normal file
@@ -0,0 +1,138 @@
"""
Script Name: list_fetch.py
Description: Fetch the actress list from thelordofporn.com and then fetch the
    detail page for each actress.
    The site sits behind Cloudflare and cannot be scraped directly, so
    cloudscraper is used to get past the protection.
    list_fetch.py scrapes the listing pages and writes the result to a local
    JSON file plus a CSV file;
    actress_fetch.py takes that list, reads each detail page, and merges the
    extra details into the records.

Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0

Modification History:
    - YYYY-MM-DD [Your Name]:
    - YYYY-MM-DD [Your Name]:
    - YYYY-MM-DD [Your Name]:
"""

import time
import json
import csv
import os
import random
import cloudscraper
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import config

DIR_RES = config.global_host_data_dir
ACTRESSES_JSON = f"{DIR_RES}/actresses.json"
ACTRESSES_CSV = f"{DIR_RES}/actresses.csv"

# Target URL
BASE_URL = "https://thelordofporn.com/pornstars/"

# Pretend to be a real browser
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Referer": "https://thelordofporn.com/",
}

# Collected records
actress_list = []

# Create a CloudScraper instance to get past Cloudflare
scraper = cloudscraper.create_scraper(
    browser={"browser": "chrome", "platform": "windows", "mobile": False}
)


# Scrape a listing page (follows pagination)
def scrape_page(url):
    print(f"[INFO] Fetching: {url}")

    # Retry automatically when the request fails
    for attempt in range(3):
        try:
            response = scraper.get(url, headers=HEADERS, timeout=10)
            response.raise_for_status()  # check the HTTP status code
            # Check that a valid page came back
            soup = BeautifulSoup(response.text, "html.parser")
            main_tag = soup.find("main", class_="site-content")

            if main_tag:
                break  # page looks right, go on to parsing
            else:
                print(f"[WARNING] Server returned an incomplete page, retrying ({attempt+1}/3)")
                time.sleep(random.uniform(2, 5))  # sleep 2-5 seconds before retrying
        except Exception as e:
            print(f"[ERROR] Request failed ({attempt+1}/3): {e}")
            time.sleep(random.uniform(2, 5))  # sleep 2-5 seconds before retrying
    else:
        print("[ERROR] Still failing after several attempts, skipping this page")
        return None

    #soup = BeautifulSoup(response.text, "html.parser")

    # Parse the actress entries
    articles = soup.find_all("article", class_="loop-item")
    for article in articles:
        try:
            # Actress name and detail link
            title_tag = article.find("h3", class_="loop-item__title").find("a")
            title = title_tag.text.strip()
            href = title_tag["href"]

            # Rating
            rating_tag = article.find("div", class_="loop-item__rating")
            rating = rating_tag.text.strip() if rating_tag else "N/A"

            # Rank and votes
            meta_tags = article.find("div", class_="loop-item__rank").find_all("span")
            rank = meta_tags[0].find("b").text.strip() if meta_tags else "N/A"
            votes = meta_tags[1].find("b").text.strip() if len(meta_tags) > 1 else "N/A"

            # Append to the list
            actress_list.append({
                "pornstar": title,
                "rating": rating,
                "rank": rank,
                "votes": votes,
                "href": href
            })
            print(f"-----[INFO] Got actress: {title} (Rank: {rank}, Votes: {votes}, Rating: {rating})-----")

        except Exception as e:
            print(f"[ERROR] Failed to parse an actress entry: {e}")

    # Look for the next-page link
    next_page_tag = soup.select_one(".nav-links .next.page-numbers")
    if next_page_tag:
        next_page_url = urljoin(BASE_URL, next_page_tag["href"])
        print(f"[INFO] Found next page: {next_page_url}")
        time.sleep(random.uniform(1, 3))  # sleep 1-3 seconds to avoid getting blocked
        scrape_page(next_page_url)
    else:
        print("[INFO] All pages scraped, done")


# Save the data
def save_data():
    # Make sure the output directory exists
    os.makedirs(DIR_RES, exist_ok=True)

    # Save as JSON
    with open(ACTRESSES_JSON, "w", encoding="utf-8") as json_file:
        json.dump(actress_list, json_file, ensure_ascii=False, indent=4)
    print(f"[INFO] Data saved to {ACTRESSES_JSON}")

    # Save as CSV
    with open(ACTRESSES_CSV, "w", encoding="utf-8", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=["pornstar", "rating", "rank", "votes", "href"])
        writer.writeheader()
        writer.writerows(actress_list)
    print(f"[INFO] Data saved to {ACTRESSES_CSV}")


if __name__ == '__main__':
    scrape_page(BASE_URL)
    save_data()
166 thelordofporn/tools.py Normal file
@@ -0,0 +1,166 @@
import sqlite3
import json
import re
import logging
from datetime import datetime


def setup_logging():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


db_path = "/root/sharedata/shared.db"


def connect_db(db_name=db_path):
    return sqlite3.connect(db_name)


def create_tables(conn):
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS thelordofporn_actress (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            pornstar TEXT,
            rating REAL,
            rank INTEGER,
            votes INTEGER,
            href TEXT UNIQUE,
            career_start TEXT,
            measurements TEXT,
            born TEXT,
            height TEXT,
            weight TEXT,
            date_modified TEXT,
            global_rank INTEGER,
            weekly_rank INTEGER,
            last_month_rating REAL,
            current_rating REAL,
            total_votes INTEGER,
            birth_date TEXT,
            birth_year TEXT,
            birth_place TEXT,
            height_ft TEXT,
            height_cm TEXT,
            weight_lbs TEXT,
            weight_kg TEXT,
            created_at TEXT DEFAULT (datetime('now', 'localtime')),
            updated_at TEXT DEFAULT (datetime('now', 'localtime'))
        );
    ''')
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS thelordofporn_alias (
            actress_id INTEGER NOT NULL,
            alias TEXT NOT NULL,
            FOREIGN KEY (actress_id) REFERENCES thelordofporn_actress(id) ON DELETE CASCADE,
            PRIMARY KEY(`actress_id`, `alias`)
        );
    ''')
    conn.commit()


def load_json(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        logging.error(f"Failed to load JSON file: {e}")
        return []


def clean_alias(alias):
    alias = re.sub(r'\(Age \d+\)', '', alias)  # strip "(Age XX)"
    return [name.strip() for name in alias.split(',') if name.strip()]


def parse_numeric(value):
    try:
        return float(value)
    except (ValueError, TypeError):
        return 0  # default to 0
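
# Illustrative calls (made-up inputs in the format these helpers expect):
#   clean_alias("Jane Doe (Age 28), JD")  -> ["Jane Doe", "JD"]
#   parse_numeric("8.5")                  -> 8.5
#   parse_numeric("N/A")                  -> 0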


def insert_actress(conn, actress):
    cursor = conn.cursor()

    # Insert into the thelordofporn_actress table
    cursor.execute('''
        INSERT INTO thelordofporn_actress (
            pornstar, rating, rank, votes, href, career_start, measurements, born,
            height, weight, date_modified, global_rank, weekly_rank,
            last_month_rating, current_rating, total_votes,
            birth_date, birth_year, birth_place, height_ft, height_cm,
            weight_lbs, weight_kg, updated_at
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
        ON CONFLICT(href) DO UPDATE SET
            rating=excluded.rating,
            rank=excluded.rank,
            votes=excluded.votes,
            career_start=excluded.career_start,
            measurements=excluded.measurements,
            born=excluded.born,
            height=excluded.height,
            weight=excluded.weight,
            date_modified=excluded.date_modified,
            global_rank=excluded.global_rank,
            weekly_rank=excluded.weekly_rank,
            last_month_rating=excluded.last_month_rating,
            current_rating=excluded.current_rating,
            total_votes=excluded.total_votes,
            birth_date=excluded.birth_date,
            birth_year=excluded.birth_year,
            birth_place=excluded.birth_place,
            height_ft=excluded.height_ft,
            height_cm=excluded.height_cm,
            weight_lbs=excluded.weight_lbs,
            weight_kg=excluded.weight_kg,
            updated_at=datetime('now', 'localtime');
    ''', (
        actress.get('pornstar', ''),
        parse_numeric(actress.get('rating', 0)),
        parse_numeric(actress.get('rank', 0)),
        parse_numeric(actress.get('votes', 0)),
        actress.get('href', ''),
        actress.get('career_start', ''),
        actress.get('measurements', ''),
        actress.get('born', ''),
        actress.get('height', ''),
        actress.get('weight', ''),
        actress.get('date_modified', ''),
        parse_numeric(actress.get('global_rank', 0)),
        parse_numeric(actress.get('weekly_rank', 0)),
        parse_numeric(actress.get('last_month_rating', 0)),
        parse_numeric(actress.get('current_rating', 0)),
        parse_numeric(actress.get('total_votes', 0)),
        actress.get('birth_date', ''),
        str(actress.get('birth_year', '')),
        actress.get('birth_place', ''),
        actress.get('height_ft', ''),
        str(actress.get('height_cm', '')),
        str(actress.get('weight_lbs', '')),
        str(actress.get('weight_kg', ''))
    ))

    actress_id = cursor.lastrowid if cursor.lastrowid else cursor.execute("SELECT id FROM thelordofporn_actress WHERE href = ?", (actress.get('href', ''),)).fetchone()[0]

    # Insert into the thelordofporn_alias table
    if 'alias' in actress:
        aliases = clean_alias(actress['alias'])
        cursor.execute("DELETE FROM thelordofporn_alias WHERE actress_id = ?", (actress_id,))
        for alias in aliases:
            cursor.execute("INSERT INTO thelordofporn_alias (actress_id, alias) VALUES (?, ?) ON CONFLICT(actress_id, alias) DO NOTHING ", (actress_id, alias))

    conn.commit()


def main():
    setup_logging()
    conn = connect_db()
    #create_tables(conn)
    actresses = load_json("./result/actress_detail.json")

    if actresses:
        for actress in actresses:
            try:
                insert_actress(conn, actress)
                logging.info(f"Inserted/Updated: {actress.get('pornstar', 'Unknown')}")
            except Exception as e:
                logging.error(f"Error inserting actress: {e}")
    else:
        logging.warning("No data to insert.")

    conn.close()


if __name__ == "__main__":
    main()
205 thelordofporn/top_scenes.py Normal file
@@ -0,0 +1,205 @@
import requests
from bs4 import BeautifulSoup
import os
import sys
import random
import time
import re
import logging
import csv
from datetime import datetime
from datetime import date
import config  # logging configuration
import cloudscraper

# Logging
config.setup_logging()
httpx_logger = logging.getLogger("httpx")
httpx_logger.setLevel(logging.DEBUG)

# Base URLs and output files
base_url = 'https://thelordofporn.com/'
list_url_scenes = 'https://thelordofporn.com/category/top-10/porn-scenes-movies/'
list_url_pornstars = 'https://thelordofporn.com/category/pornstars-top-10/'
curr_novel_pages = 0

res_dir = 'result'

top_scenes_file = f'{res_dir}/top_scenes_list.csv'
top_pornstars_file = f'{res_dir}/top_pornstars_list.csv'

# Request headers and cookies (mimic a real browser)
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}
COOKIES = {
    "cf_clearance": "your_clearance_token_here"  # must be refreshed whenever Cloudflare re-validates
}


# Fetch a page, with a retry mechanism
def get_page_content(url, max_retries=100, sleep_time=5, default_timeout=10):
    scraper = cloudscraper.create_scraper(
        browser={"browser": "chrome", "platform": "windows", "mobile": False}
    )

    retries = 0
    while retries < max_retries:
        try:
            response = scraper.get(url, headers=HEADERS, cookies=COOKIES, timeout=default_timeout)
            if response.status_code == 200 and "content-area content-area--full-width" in response.text:
                return response.text  # request succeeded, return the content
            retries += 1  # an incomplete page or bad status also counts as a failed attempt, so the loop terminates
        except requests.RequestException as e:
            retries += 1
            logging.info(f"Warn fetching page {url}: {e}. Retrying {retries}/{max_retries}...")
            if retries >= max_retries:
                logging.error(f"Failed to fetch page {url} after {max_retries} retries.")
                return None
        time.sleep(sleep_time)  # sleep for the configured time, then retry


# Fetch top scenes and movies
def get_scenes(base_url, output_file=top_scenes_file):
    # Initialise
    current_url = base_url
    all_data = []

    while current_url:
        try:
            logging.info(f"Fetching URL: {current_url}")
            # Make the request
            content = get_page_content(current_url)
            if not content:
                logging.error(f"Giving up on {current_url}: no usable content returned.")
                break

            # Parse the page
            soup = BeautifulSoup(content, "html.parser")
            articles = soup.find_all("article", class_="loop-item loop-item--top loop-item--ca_prod_movies__scen")

            if not articles:
                logging.warning(f"No articles found on page: {current_url}")

            # Parse each article tag
            for article in articles:
                try:
                    # Get href and title
                    a_tag = article.find("a", class_="loop-item__image")
                    title = a_tag.get("title", "").strip()
                    href = a_tag.get("href", "").strip()

                    if title and href:
                        all_data.append({
                            'title': title,
                            'href': href
                        })
                        logging.info(f"Extracted: {title} -> {href}")
                    else:
                        logging.warning("Missing title or href in an article.")
                except Exception as e:
                    logging.error(f"Error parsing article: {e}")

            # Find the next-page link
            next_page = soup.find("a", class_="next page-numbers")
            if next_page:
                current_url = next_page.get("href", "").strip()
            else:
                current_url = None
                logging.info("No more pages to fetch.")

            # Wait a little so the target site does not block us
            time.sleep(2)

        except requests.exceptions.RequestException as e:
            logging.error(f"Network error while fetching {current_url}: {e}")
            break
        except Exception as e:
            logging.error(f"Unexpected error: {e}")
            break

    # Save the results to a file
    csv_headers = ["title", "href"]
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
        writer.writeheader()
        writer.writerows(all_data)
    logging.info(f"Data successfully saved to {output_file}.")


# Fetch top pornstars
def get_pornstars(base_url, output_file=top_pornstars_file):
    # Initialise
    current_url = base_url
    all_data = []

    while current_url:
        try:
            logging.info(f"Fetching URL: {current_url}")
            # Make the request
            content = get_page_content(current_url)
            if not content:
                logging.error(f"Giving up on {current_url}: no usable content returned.")
                break

            # Parse the page
            soup = BeautifulSoup(content, "html.parser")
            articles = soup.find_all("article", class_="loop-item loop-item--top loop-item--ca_prod_pornstars")

            if not articles:
                logging.warning(f"No articles found on page: {current_url}")

            # Parse each article tag
            for article in articles:
                try:
                    # Get href and title
                    a_tag = article.find("a", class_="loop-item__image")
                    title = a_tag.get("title", "").strip()
                    href = a_tag.get("href", "").strip()

                    if title and href:
                        all_data.append({
                            'title': title,
                            'href': href
                        })
                        logging.info(f"Extracted: {title} -> {href}")
                    else:
                        logging.warning("Missing title or href in an article.")
                except Exception as e:
                    logging.error(f"Error parsing article: {e}")

            # Find the next-page link
            next_page = soup.find("a", class_="next page-numbers")
            if next_page:
                current_url = next_page.get("href", "").strip()
            else:
                current_url = None
                logging.info("No more pages to fetch.")

            # Wait a little so the target site does not block us
            time.sleep(2)

        except requests.exceptions.RequestException as e:
            logging.error(f"Network error while fetching {current_url}: {e}")
            break
        except Exception as e:
            logging.error(f"Unexpected error: {e}")
            break

    # Save the results to a file
    csv_headers = ["title", "href"]
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
        writer.writeheader()
        writer.writerows(all_data)
    logging.info(f"Data successfully saved to {output_file}.")


def main():
    if len(sys.argv) < 2:
        print("Usage: python script.py <cmd>")
        print("cmd: scenes, pornstars")
        sys.exit(1)

    cmd = sys.argv[1]

    if cmd == "scenes":
        get_scenes(list_url_scenes)  # listing scraper implemented above
    elif cmd == "pornstars":
        get_pornstars(list_url_pornstars)  # detail scraper implemented above
    else:
        print(f"Unknown command: {cmd}")


if __name__ == '__main__':
    main()
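
# Example invocations (illustrative; assumes a 'result' directory already exists,
# since the CSV writers above do not create it):
#   python top_scenes.py scenes      -> writes result/top_scenes_list.csv
#   python top_scenes.py pornstars   -> writes result/top_pornstars_list.csv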