modify scripts
225 thelordofporn/actress_fetch.py Normal file
@@ -0,0 +1,225 @@
"""
Script Name: actress_fetch.py
Description: Fetch the actress list from thelordofporn.com and then fetch the
    detail page for each actress.
    The site sits behind Cloudflare and cannot be scraped directly, so
    cloudscraper is used to get past the protection.
    list_fetch.py scrapes the listing pages and writes the result to a local
    JSON file plus a CSV file;
    actress_fetch.py takes that list, reads each detail page, and merges the
    extra details into the records.

Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0

Modification History:
    - YYYY-MM-DD [Your Name]:
    - YYYY-MM-DD [Your Name]:
    - YYYY-MM-DD [Your Name]:
"""

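# Each record read back from actresses.json is expected to look like the rows
# written by list_fetch.py (field names taken from its CSV writer; the values
# below are made up purely for illustration):
#   {
#       "pornstar": "Example Name",
#       "rating": "8.5",
#       "rank": "12",
#       "votes": "3456",
#       "href": "https://thelordofporn.com/pornstars/example-name"
#   }
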
import json
import csv
import os
import re
import time
import random
import cloudscraper
from bs4 import BeautifulSoup
import config


# File paths
DIR_RES = config.global_host_data_dir
ACTRESSES_FILE = f"{DIR_RES}/actresses.json"
DETAILS_JSON_FILE = f"{DIR_RES}/thelordofporn_pornstars.json"
DETAILS_CSV_FILE = f"{DIR_RES}/thelordofporn_pornstars.csv"

# Request headers and cookies (mimic a real browser)
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}
COOKIES = {
    "cf_clearance": "your_clearance_token_here"  # must be refreshed whenever Cloudflare re-validates
}


# Parse birth date and place
def parse_birth_info(text):
    match = re.match(r"(.+?) (\d{1,2}), (\d{4}) in (.+)", text)
    if match:
        return {
            "birth_date": f"{match.group(1)} {match.group(2)}, {match.group(3)}",
            "birth_year": match.group(3),
            "birth_place": match.group(4),
        }
    return {"birth_date": text, "birth_year": "", "birth_place": ""}


# Parse height
def parse_height(text):
    match = re.match(r"(\d+)\s*ft\s*(\d*)\s*in\s*\((\d+)\s*cm\)", text)
    if match:
        height_ft = f"{match.group(1)}'{match.group(2)}\""
        return {"height_ft": height_ft.strip(), "height_cm": match.group(3)}
    return {"height_ft": text, "height_cm": ""}


# Parse weight
def parse_weight(text):
    match = re.match(r"(\d+)\s*lbs\s*\((\d+)\s*kg\)", text)
    if match:
        return {"weight_lbs": match.group(1), "weight_kg": match.group(2)}
    return {"weight_lbs": text, "weight_kg": ""}
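
# Illustrative inputs and outputs for the parsers above (made-up strings in the
# format the regexes expect; real pages may differ):
#   parse_birth_info("April 23, 1995 in Prague, Czech Republic")
#       -> {"birth_date": "April 23, 1995", "birth_year": "1995",
#           "birth_place": "Prague, Czech Republic"}
#   parse_height("5 ft 7 in (170 cm)")
#       -> {"height_ft": "5'7\"", "height_cm": "170"}
#   parse_weight("121 lbs (55 kg)")
#       -> {"weight_lbs": "121", "weight_kg": "55"}
# Anything that does not match falls through with the raw text preserved and
# the numeric fields left empty.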


# Parse the detail page
def parse_page(actress, html):
    soup = BeautifulSoup(html, "html.parser")

    # Make sure the page has the expected structure
    if not soup.find("main", {"id": "content", "class": "site-content"}):
        return None

    # Basic information
    entry_header = soup.find("header", class_="entry-header")
    name_el = entry_header.find("h1", class_="entry-title") if entry_header else None
    name = name_el.text.strip() if name_el else ""

    date_modified_el = soup.find("time", itemprop="dateModified")
    if date_modified_el:
        date_modified = date_modified_el.get("content", "").strip()
    else:
        date_modified = ""

    # Extract metadata
    global_rank = ""
    weekly_rank = ""
    last_month_rating = ""
    current_rating = ""
    total_votes = ""

    for div in (entry_header.find_all("div", class_="porn-star-rank__item") if entry_header else []):
        text = div.text.strip()
        if "Global Rank" in text:
            global_rank = div.find("b").text.strip()
        elif "Weekly Rank" in text:
            weekly_rank = div.find("b").text.strip()

    for item in soup.find_all("div", class_="specifications__item--horizontal"):
        text = item.text.strip()
        if "Last Month" in text:
            last_month_rating = item.find("b").text.strip()
        elif "Rating Av." in text:
            current_rating = item.find("b").text.strip()
        elif "Total of" in text:
            total_votes = item.find("b").text.strip()

    # Parse the detailed attributes
    attributes = {}
    for row in soup.find_all("div", class_="specifications-grid-row"):
        items = row.find_all("div", class_="specifications-grid-item")
        if len(items) == 2:
            label = items[0].find("h5").text.strip()
            value = items[0].find("span").text.strip()
            attributes[label] = value

            label2 = items[1].find("h5").text.strip()
            value2 = items[1].find("span").text.strip()
            attributes[label2] = value2
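
    # The attribute loop above assumes rows shaped roughly like the following
    # (inferred from the selectors used here; illustrative only):
    #   <div class="specifications-grid-row">
    #     <div class="specifications-grid-item"><h5>Born</h5><span>April 23, 1995 in ...</span></div>
    #     <div class="specifications-grid-item"><h5>Height</h5><span>5 ft 7 in (170 cm)</span></div>
    #   </div>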

    # Parse birth info, height, weight, etc.
    birth_info = parse_birth_info(attributes.get("Born", ""))
    height_info = parse_height(attributes.get("Height", ""))
    weight_info = parse_weight(attributes.get("Weight", ""))

    return {
        "pornstar": actress['pornstar'],
        "rating": actress['rating'],
        "rank": actress['rank'],
        "votes": actress['votes'],
        "href": actress['href'],
        'name': name,
        "alias": attributes.get("Name", ""),
        "career_start": attributes.get("Career start", ""),
        "measurements": attributes.get("Measurements", ""),
        "born": attributes.get("Born", ""),
        "height": attributes.get("Height", ""),
        "weight": attributes.get("Weight", ""),
        "date_modified": date_modified,
        "global_rank": global_rank,
        "weekly_rank": weekly_rank,
        "last_month_rating": last_month_rating,
        "current_rating": current_rating,
        "total_votes": total_votes,
        **birth_info,
        **height_info,
        **weight_info,
    }


# Load already-processed data
def load_existing_data():
    if os.path.exists(DETAILS_JSON_FILE):
        with open(DETAILS_JSON_FILE, "r", encoding="utf-8") as f:
            return {item["pornstar"]: item for item in json.load(f)}
    return {}


# Fetch a page
def fetch_page(url):
    scraper = cloudscraper.create_scraper()
    for _ in range(500):  # retry up to 500 times
        try:
            response = scraper.get(url, headers=HEADERS, cookies=COOKIES, timeout=10)
            if response.status_code == 200 and "specifications-grid-row" in response.text:
                return response.text
        except Exception as e:
            print(f"Request to {url} failed, error: {e}")
        time.sleep(random.uniform(2, 5))  # random delay
    return None


# Process the data and save it
def process_data():
    with open(ACTRESSES_FILE, "r", encoding="utf-8") as f:
        actresses = json.load(f)

    existing_data = load_existing_data()
    updated_data = list(existing_data.values())

    for actress in actresses:
        name, url = actress["pornstar"], actress["href"]

        if name in existing_data:
            print(f"Skipping already processed: {name}")
            continue

        print(f"Processing: {name} - {url}")
        html = fetch_page(url)
        if not html:
            print(f"Could not fetch page: {url}")
            continue

        details = parse_page(actress, html)
        if details:
            updated_data.append(details)
            existing_data[name] = details

    with open(DETAILS_JSON_FILE, "w", encoding="utf-8") as jsonfile:
        json.dump(updated_data, jsonfile, indent=4, ensure_ascii=False)


# Generate the CSV from the JSON
def json_to_csv():
    if not os.path.exists(DETAILS_JSON_FILE):
        print("No JSON file, skipping CSV generation")
        return

    with open(DETAILS_JSON_FILE, "r", encoding="utf-8") as jsonfile:
        data = json.load(jsonfile)

    fieldnames = data[0].keys()
    with open(DETAILS_CSV_FILE, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)


if __name__ == '__main__':
    # Make sure the output directory exists
    os.makedirs(DIR_RES, exist_ok=True)

    process_data()
    json_to_csv()
    print("Data processing finished!")
27 thelordofporn/config.py Normal file
@@ -0,0 +1,27 @@
import logging
import os
import inspect
from datetime import datetime

# Directory mapped to the host machine
home_dir = os.path.expanduser("~")
global_host_data_dir = f'{home_dir}/hostdir/scripts_data/thelordofporn'


# Logging configuration
def setup_logging(log_filename=None):
    # If no log_filename is passed in, use the calling script's name as the log file name
    if log_filename is None:
        # Get the file name of the script that called setup_logging
        caller_frame = inspect.stack()[1]
        caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]

        # Current date, formatted as yyyymmdd
        current_date = datetime.now().strftime('%Y%m%d')
        # Build the log file name, placing the date before the extension
        log_filename = f'./log/{caller_filename}_{current_date}.log'

    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
                        handlers=[
                            logging.FileHandler(log_filename),
                            logging.StreamHandler()
                        ])
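
# Usage sketch (assumes a ./log directory already exists relative to the working
# directory, since logging.FileHandler does not create parent directories):
#   import config
#   config.setup_logging()     # e.g. ./log/top_scenes_20240101.log (date shown is illustrative)
#   logging.info("started")    # written to both the log file and the console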
138 thelordofporn/list_fetch.py Normal file
@@ -0,0 +1,138 @@
"""
Script Name: list_fetch.py
Description: Fetch the actress list from thelordofporn.com and then fetch the
    detail page for each actress.
    The site sits behind Cloudflare and cannot be scraped directly, so
    cloudscraper is used to get past the protection.
    list_fetch.py scrapes the listing pages and writes the result to a local
    JSON file plus a CSV file;
    actress_fetch.py takes that list, reads each detail page, and merges the
    extra details into the records.

Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0

Modification History:
    - YYYY-MM-DD [Your Name]:
    - YYYY-MM-DD [Your Name]:
    - YYYY-MM-DD [Your Name]:
"""

import time
import json
import csv
import os
import random
import cloudscraper
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import config

DIR_RES = config.global_host_data_dir
ACTRESSES_JSON = f"{DIR_RES}/actresses.json"
ACTRESSES_CSV = f"{DIR_RES}/actresses.csv"

# Target URL
BASE_URL = "https://thelordofporn.com/pornstars/"

# Pretend to be a real browser
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Referer": "https://thelordofporn.com/",
}

# Collected records
actress_list = []

# Create a CloudScraper instance to get past Cloudflare
scraper = cloudscraper.create_scraper(
    browser={"browser": "chrome", "platform": "windows", "mobile": False}
)


# Scrape a listing page (follows pagination)
def scrape_page(url):
    print(f"[INFO] Fetching: {url}")

    # Retry automatically when the request fails
    for attempt in range(3):
        try:
            response = scraper.get(url, headers=HEADERS, timeout=10)
            response.raise_for_status()  # check the HTTP status code
            # Check that a valid page came back
            soup = BeautifulSoup(response.text, "html.parser")
            main_tag = soup.find("main", class_="site-content")

            if main_tag:
                break  # page looks right, go on to parsing
            else:
                print(f"[WARNING] Server returned an incomplete page, retrying ({attempt+1}/3)")
                time.sleep(random.uniform(2, 5))  # sleep 2-5 seconds before retrying
        except Exception as e:
            print(f"[ERROR] Request failed ({attempt+1}/3): {e}")
            time.sleep(random.uniform(2, 5))  # sleep 2-5 seconds before retrying
    else:
        print("[ERROR] Still failing after several attempts, skipping this page")
        return None

    #soup = BeautifulSoup(response.text, "html.parser")

    # Parse the actress entries
    articles = soup.find_all("article", class_="loop-item")
    for article in articles:
        try:
            # Actress name and detail link
            title_tag = article.find("h3", class_="loop-item__title").find("a")
            title = title_tag.text.strip()
            href = title_tag["href"]

            # Rating
            rating_tag = article.find("div", class_="loop-item__rating")
            rating = rating_tag.text.strip() if rating_tag else "N/A"

            # Rank and votes
            meta_tags = article.find("div", class_="loop-item__rank").find_all("span")
            rank = meta_tags[0].find("b").text.strip() if meta_tags else "N/A"
            votes = meta_tags[1].find("b").text.strip() if len(meta_tags) > 1 else "N/A"

            # Append to the list
            actress_list.append({
                "pornstar": title,
                "rating": rating,
                "rank": rank,
                "votes": votes,
                "href": href
            })
            print(f"-----[INFO] Got actress: {title} (Rank: {rank}, Votes: {votes}, Rating: {rating})-----")

        except Exception as e:
            print(f"[ERROR] Failed to parse an actress entry: {e}")

    # Look for the next-page link
    next_page_tag = soup.select_one(".nav-links .next.page-numbers")
    if next_page_tag:
        next_page_url = urljoin(BASE_URL, next_page_tag["href"])
        print(f"[INFO] Found next page: {next_page_url}")
        time.sleep(random.uniform(1, 3))  # sleep 1-3 seconds to avoid getting blocked
        scrape_page(next_page_url)
    else:
        print("[INFO] All pages scraped, done")


# Save the data
def save_data():
    # Make sure the output directory exists
    os.makedirs(DIR_RES, exist_ok=True)

    # Save as JSON
    with open(ACTRESSES_JSON, "w", encoding="utf-8") as json_file:
        json.dump(actress_list, json_file, ensure_ascii=False, indent=4)
    print(f"[INFO] Data saved to {ACTRESSES_JSON}")

    # Save as CSV
    with open(ACTRESSES_CSV, "w", encoding="utf-8", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=["pornstar", "rating", "rank", "votes", "href"])
        writer.writeheader()
        writer.writerows(actress_list)
    print(f"[INFO] Data saved to {ACTRESSES_CSV}")


if __name__ == '__main__':
    scrape_page(BASE_URL)
    save_data()
166 thelordofporn/tools.py Normal file
@@ -0,0 +1,166 @@
import sqlite3
import json
import re
import logging
from datetime import datetime


def setup_logging():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


db_path = "/root/sharedata/shared.db"


def connect_db(db_name=db_path):
    return sqlite3.connect(db_name)


def create_tables(conn):
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS thelordofporn_actress (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            pornstar TEXT,
            rating REAL,
            rank INTEGER,
            votes INTEGER,
            href TEXT UNIQUE,
            career_start TEXT,
            measurements TEXT,
            born TEXT,
            height TEXT,
            weight TEXT,
            date_modified TEXT,
            global_rank INTEGER,
            weekly_rank INTEGER,
            last_month_rating REAL,
            current_rating REAL,
            total_votes INTEGER,
            birth_date TEXT,
            birth_year TEXT,
            birth_place TEXT,
            height_ft TEXT,
            height_cm TEXT,
            weight_lbs TEXT,
            weight_kg TEXT,
            created_at TEXT DEFAULT (datetime('now', 'localtime')),
            updated_at TEXT DEFAULT (datetime('now', 'localtime'))
        );
    ''')
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS thelordofporn_alias (
            actress_id INTEGER NOT NULL,
            alias TEXT NOT NULL,
            FOREIGN KEY (actress_id) REFERENCES thelordofporn_actress(id) ON DELETE CASCADE,
            PRIMARY KEY(`actress_id`, `alias`)
        );
    ''')
    conn.commit()


def load_json(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        logging.error(f"Failed to load JSON file: {e}")
        return []


def clean_alias(alias):
    alias = re.sub(r'\(Age \d+\)', '', alias)  # strip "(Age XX)"
    return [name.strip() for name in alias.split(',') if name.strip()]


def parse_numeric(value):
    try:
        return float(value)
    except (ValueError, TypeError):
        return 0  # default to 0
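
# Illustrative calls (made-up inputs in the format these helpers expect):
#   clean_alias("Jane Doe (Age 28), JD")  -> ["Jane Doe", "JD"]
#   parse_numeric("8.5")                  -> 8.5
#   parse_numeric("N/A")                  -> 0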


def insert_actress(conn, actress):
    cursor = conn.cursor()

    # Insert into the thelordofporn_actress table
    cursor.execute('''
        INSERT INTO thelordofporn_actress (
            pornstar, rating, rank, votes, href, career_start, measurements, born,
            height, weight, date_modified, global_rank, weekly_rank,
            last_month_rating, current_rating, total_votes,
            birth_date, birth_year, birth_place, height_ft, height_cm,
            weight_lbs, weight_kg, updated_at
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
        ON CONFLICT(href) DO UPDATE SET
            rating=excluded.rating,
            rank=excluded.rank,
            votes=excluded.votes,
            career_start=excluded.career_start,
            measurements=excluded.measurements,
            born=excluded.born,
            height=excluded.height,
            weight=excluded.weight,
            date_modified=excluded.date_modified,
            global_rank=excluded.global_rank,
            weekly_rank=excluded.weekly_rank,
            last_month_rating=excluded.last_month_rating,
            current_rating=excluded.current_rating,
            total_votes=excluded.total_votes,
            birth_date=excluded.birth_date,
            birth_year=excluded.birth_year,
            birth_place=excluded.birth_place,
            height_ft=excluded.height_ft,
            height_cm=excluded.height_cm,
            weight_lbs=excluded.weight_lbs,
            weight_kg=excluded.weight_kg,
            updated_at=datetime('now', 'localtime');
    ''', (
        actress.get('pornstar', ''),
        parse_numeric(actress.get('rating', 0)),
        parse_numeric(actress.get('rank', 0)),
        parse_numeric(actress.get('votes', 0)),
        actress.get('href', ''),
        actress.get('career_start', ''),
        actress.get('measurements', ''),
        actress.get('born', ''),
        actress.get('height', ''),
        actress.get('weight', ''),
        actress.get('date_modified', ''),
        parse_numeric(actress.get('global_rank', 0)),
        parse_numeric(actress.get('weekly_rank', 0)),
        parse_numeric(actress.get('last_month_rating', 0)),
        parse_numeric(actress.get('current_rating', 0)),
        parse_numeric(actress.get('total_votes', 0)),
        actress.get('birth_date', ''),
        str(actress.get('birth_year', '')),
        actress.get('birth_place', ''),
        actress.get('height_ft', ''),
        str(actress.get('height_cm', '')),
        str(actress.get('weight_lbs', '')),
        str(actress.get('weight_kg', ''))
    ))

    actress_id = cursor.lastrowid if cursor.lastrowid else cursor.execute("SELECT id FROM thelordofporn_actress WHERE href = ?", (actress.get('href', ''),)).fetchone()[0]

    # Insert into the thelordofporn_alias table
    if 'alias' in actress:
        aliases = clean_alias(actress['alias'])
        cursor.execute("DELETE FROM thelordofporn_alias WHERE actress_id = ?", (actress_id,))
        for alias in aliases:
            cursor.execute("INSERT INTO thelordofporn_alias (actress_id, alias) VALUES (?, ?) ON CONFLICT(actress_id, alias) DO NOTHING ", (actress_id, alias))

    conn.commit()


def main():
    setup_logging()
    conn = connect_db()
    #create_tables(conn)
    actresses = load_json("./result/actress_detail.json")

    if actresses:
        for actress in actresses:
            try:
                insert_actress(conn, actress)
                logging.info(f"Inserted/Updated: {actress.get('pornstar', 'Unknown')}")
            except Exception as e:
                logging.error(f"Error inserting actress: {e}")
    else:
        logging.warning("No data to insert.")

    conn.close()


if __name__ == "__main__":
    main()
205 thelordofporn/top_scenes.py Normal file
@@ -0,0 +1,205 @@
import requests
from bs4 import BeautifulSoup
import os
import sys
import random
import time
import re
import logging
import csv
from datetime import datetime
from datetime import date
import config  # logging configuration
import cloudscraper

# Logging
config.setup_logging()
httpx_logger = logging.getLogger("httpx")
httpx_logger.setLevel(logging.DEBUG)

# Base URLs and output files
base_url = 'https://thelordofporn.com/'
list_url_scenes = 'https://thelordofporn.com/category/top-10/porn-scenes-movies/'
list_url_pornstars = 'https://thelordofporn.com/category/pornstars-top-10/'
curr_novel_pages = 0

res_dir = 'result'

top_scenes_file = f'{res_dir}/top_scenes_list.csv'
top_pornstars_file = f'{res_dir}/top_pornstars_list.csv'

# Request headers and cookies (mimic a real browser)
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}
COOKIES = {
    "cf_clearance": "your_clearance_token_here"  # must be refreshed whenever Cloudflare re-validates
}


# Fetch a page, with a retry mechanism
def get_page_content(url, max_retries=100, sleep_time=5, default_timeout=10):
    scraper = cloudscraper.create_scraper(
        browser={"browser": "chrome", "platform": "windows", "mobile": False}
    )

    retries = 0
    while retries < max_retries:
        try:
            response = scraper.get(url, headers=HEADERS, cookies=COOKIES, timeout=default_timeout)
            if response.status_code == 200 and "content-area content-area--full-width" in response.text:
                return response.text  # request succeeded, return the content
            retries += 1  # an incomplete page or bad status also counts as a failed attempt, so the loop terminates
        except requests.RequestException as e:
            retries += 1
            logging.info(f"Warn fetching page {url}: {e}. Retrying {retries}/{max_retries}...")
            if retries >= max_retries:
                logging.error(f"Failed to fetch page {url} after {max_retries} retries.")
                return None
        time.sleep(sleep_time)  # sleep for the configured time, then retry


# Fetch top scenes and movies
def get_scenes(base_url, output_file=top_scenes_file):
    # Initialise
    current_url = base_url
    all_data = []

    while current_url:
        try:
            logging.info(f"Fetching URL: {current_url}")
            # Make the request
            content = get_page_content(current_url)
            if not content:
                logging.error(f"Giving up on {current_url}: no usable content returned.")
                break

            # Parse the page
            soup = BeautifulSoup(content, "html.parser")
            articles = soup.find_all("article", class_="loop-item loop-item--top loop-item--ca_prod_movies__scen")

            if not articles:
                logging.warning(f"No articles found on page: {current_url}")

            # Parse each article tag
            for article in articles:
                try:
                    # Get href and title
                    a_tag = article.find("a", class_="loop-item__image")
                    title = a_tag.get("title", "").strip()
                    href = a_tag.get("href", "").strip()

                    if title and href:
                        all_data.append({
                            'title': title,
                            'href': href
                        })
                        logging.info(f"Extracted: {title} -> {href}")
                    else:
                        logging.warning("Missing title or href in an article.")
                except Exception as e:
                    logging.error(f"Error parsing article: {e}")

            # Find the next-page link
            next_page = soup.find("a", class_="next page-numbers")
            if next_page:
                current_url = next_page.get("href", "").strip()
            else:
                current_url = None
                logging.info("No more pages to fetch.")

            # Wait a little so the target site does not block us
            time.sleep(2)

        except requests.exceptions.RequestException as e:
            logging.error(f"Network error while fetching {current_url}: {e}")
            break
        except Exception as e:
            logging.error(f"Unexpected error: {e}")
            break

    # Save the results to a file
    csv_headers = ["title", "href"]
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
        writer.writeheader()
        writer.writerows(all_data)
    logging.info(f"Data successfully saved to {output_file}.")


# Fetch top pornstars
def get_pornstars(base_url, output_file=top_pornstars_file):
    # Initialise
    current_url = base_url
    all_data = []

    while current_url:
        try:
            logging.info(f"Fetching URL: {current_url}")
            # Make the request
            content = get_page_content(current_url)
            if not content:
                logging.error(f"Giving up on {current_url}: no usable content returned.")
                break

            # Parse the page
            soup = BeautifulSoup(content, "html.parser")
            articles = soup.find_all("article", class_="loop-item loop-item--top loop-item--ca_prod_pornstars")

            if not articles:
                logging.warning(f"No articles found on page: {current_url}")

            # Parse each article tag
            for article in articles:
                try:
                    # Get href and title
                    a_tag = article.find("a", class_="loop-item__image")
                    title = a_tag.get("title", "").strip()
                    href = a_tag.get("href", "").strip()

                    if title and href:
                        all_data.append({
                            'title': title,
                            'href': href
                        })
                        logging.info(f"Extracted: {title} -> {href}")
                    else:
                        logging.warning("Missing title or href in an article.")
                except Exception as e:
                    logging.error(f"Error parsing article: {e}")

            # Find the next-page link
            next_page = soup.find("a", class_="next page-numbers")
            if next_page:
                current_url = next_page.get("href", "").strip()
            else:
                current_url = None
                logging.info("No more pages to fetch.")

            # Wait a little so the target site does not block us
            time.sleep(2)

        except requests.exceptions.RequestException as e:
            logging.error(f"Network error while fetching {current_url}: {e}")
            break
        except Exception as e:
            logging.error(f"Unexpected error: {e}")
            break

    # Save the results to a file
    csv_headers = ["title", "href"]
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
        writer.writeheader()
        writer.writerows(all_data)
    logging.info(f"Data successfully saved to {output_file}.")


def main():
    if len(sys.argv) < 2:
        print("Usage: python script.py <cmd>")
        print("cmd: scenes, pornstars")
        sys.exit(1)

    cmd = sys.argv[1]

    if cmd == "scenes":
        get_scenes(list_url_scenes)  # listing scraper implemented above
    elif cmd == "pornstars":
        get_pornstars(list_url_pornstars)  # detail scraper implemented above
    else:
        print(f"Unknown command: {cmd}")


if __name__ == '__main__':
    main()
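
# Example invocations (illustrative; assumes a 'result' directory already exists,
# since the CSV writers above do not create it):
#   python top_scenes.py scenes      -> writes result/top_scenes_list.csv
#   python top_scenes.py pornstars   -> writes result/top_pornstars_list.csv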