"""
|
|
Script Name:
|
|
Description: 从 javhd.com 上获取女优列表,并逐个获取女优详细信息。
|
|
list_fetch.py 从网站上获取列表, 并以json的形式把结果输出到本地文件, 支持ja,zh,en三种语言可选(一般情况下可以三种全部拉取一遍);
|
|
list_format.py 则把这些文件读取出来,合并,形成完整的列表, 主要是把三种语言的女优名字拼到一起, 使用处理后的链接地址+图片地址作为判断同一个人的依据;
|
|
model_fetch.py 则把上一步获取到的列表,读取详情页面,合并进来一些详细信息。
|
|
注意: Header部分是从浏览器中抓取的, 时间久了可能要替换。
|
|
|
|
Author: [Your Name]
|
|
Created Date: YYYY-MM-DD
|
|
Last Modified: YYYY-MM-DD
|
|
Version: 1.0
|
|
|
|
Modification History:
|
|
- YYYY-MM-DD [Your Name]:
|
|
- YYYY-MM-DD [Your Name]:
|
|
- YYYY-MM-DD [Your Name]:
|
|
"""

import json
import csv
import requests
import time
import logging
import os

from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
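
# A minimal sketch of the identity key described in the module docstring: two
# entries are treated as the same person when this key matches. It is not used
# by this script (the merge happens in list_format.py); the "url"/"img" field
# names and the exact normalization here are illustrative assumptions.
def _identity_key(model):
    """Hypothetical dedup key: profile URL slug plus portrait file name."""
    slug = model.get("url", "").rstrip("/").rsplit("/", 1)[-1]   # keep only the last URL segment
    image = model.get("img", "").rsplit("/", 1)[-1]              # keep only the image file name
    return (slug, image)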

# Target file paths
INPUT_FILE = "result/models.json"
OUTPUT_JSON = "result/javhd_models.json"
OUTPUT_CSV = "result/javhd_models.csv"

HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "cookie": "adult-warning-popup=disabled; st_d=%7B%7D; feid=c18cd2f2cf5c034d120e5975558acc8c; xfeid=3b040b0aecba9d3df41f21732480d947; _ym_uid=1739069925634817268; _ym_d=1739069925; atas_uid=; _clck=1cd9xpy%7C2%7Cftb%7C0%7C1866; _ym_isad=2; nats=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_cookie=https%253A%252F%252Fcn.pornhub.com%252F; nats_unique=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_sess=480e7410e649efce6003c3add587a579; nats_landing=No%2BLanding%2BPage%2BURL; JAVSESSID=n42hnvj3ecr0r6tadusladpk3h; user_lang=zh; locale=ja; utm=%7B%22ads_type%22%3A%22%22%7D; sid=3679b28ec523df85ec4e7739e32f2008; _ym_visorc=w; feid_sa=62; sid_sa=2",
    "origin": "https://javhd.com",
    "priority": "u=1, i",
    "referer": "https://javhd.com/ja/model",
    "sec-ch-ua": '"Not A(Brand";v="8", "Chromium";v="132", "Microsoft Edge";v="132"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"macOS"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0",
}
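
# Quick sanity check for the captured session above before a long run (a
# sketch; https://javhd.com/ja/model is the listing page the referer header
# points at):
#
#   resp = requests.get("https://javhd.com/ja/model", headers=HEADERS, timeout=10)
#   resp.raise_for_status()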

# Fields to extract from the detail page
FIELDS = ["Height", "Weight", "Breast size", "Breast factor", "Hair color",
          "Eye color", "Birth date", "Ethnicity", "Birth place"]


def fetch_data(url, retries=3):
    """Fetch the given URL, retrying on failure."""
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            logging.warning(f"Request failed for {url}: {e}, retry {attempt + 1}/{retries}")
            time.sleep(2)
    return None
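
# fetch_data returns a requests.Response on success and None once every retry
# has failed, so callers must check for None before using the result, e.g.:
#
#   response = fetch_data("https://javhd.com/ja/model", retries=5)
#   if response is not None:
#       print(response.status_code)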


def process_paragraph(paragraph):
    """Return the plain text of a tag, flattening any nested markup."""
    # Take the full HTML structure rather than calling get_text() directly
    paragraph_html = str(paragraph)

    # Re-parse the HTML with BeautifulSoup and extract the cleaned-up text
    soup = BeautifulSoup(paragraph_html, "html.parser")
    cleaned_text = soup.get_text().strip()

    return cleaned_text
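
# Example: nested tags are flattened, so
#   process_paragraph(BeautifulSoup("<strong>Height<i>*</i></strong>", "html.parser").strong)
# returns "Height*".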


# Load already-processed data so that interrupted runs can resume
def load_existing_data():
    if os.path.exists(OUTPUT_JSON):
        try:
            with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
                detailed_models = json.load(f)
                existing_names = {model["en_name"] for model in detailed_models}
        except Exception as e:
            logging.error(f"Cannot read {OUTPUT_JSON}: {e}")
            detailed_models = []
            existing_names = set()
    else:
        detailed_models = []
        existing_names = set()
    return detailed_models, existing_names
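
# Resume example: if result/javhd_models.json already contains
#   [{"en_name": "A", ...}, {"en_name": "B", ...}]
# then load_existing_data() returns (that list, {"A", "B"}), and process_data()
# below skips any model whose en_name is already in the set.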


def process_data():
    # Read the source JSON data
    try:
        with open(INPUT_FILE, "r", encoding="utf-8") as f:
            models = json.load(f)
    except Exception as e:
        logging.error(f"Cannot read {INPUT_FILE}: {e}")
        return

    detailed_models, existing_names = load_existing_data()

    # Walk every object in models.json
    for model in models:
        en_name = model.get("en_name", "")
        ja_name = model.get("ja_name", "")
        url = model.get("url", "")

        if not url or en_name in existing_names:
            logging.info(f"Skipping {en_name}: already processed or no valid URL")
            continue

        logging.info(f"Processing: {en_name} - {ja_name} - {url}")

        try:
            response = fetch_data(url, retries=100)

            if response is None:
                # There is no response body to log here; give up on the whole
                # run (everything processed so far is already saved to disk)
                logging.warning(f"Request failed: {url}")
                break

            soup = BeautifulSoup(response.text, "html.parser")
            info_section = soup.find("div", class_="info__features")

            if not info_section:
                logging.warning(f"info__features block not found: {url}")
                continue
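
            # The loop below assumes detail-page markup roughly like this
            # (reconstructed from the selectors used; the live page may differ):
            #
            #   <div class="info__features">
            #     <li class="content-desc__list-item">
            #       <strong class="content-desc__list-title">Height</strong>
            #       <span class="content-desc__list-text">160 cm</span>
            #     </li>
            #     ...
            #   </div>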
            extracted_data = {field: "" for field in FIELDS}
            for li in info_section.find_all("li", class_="content-desc__list-item"):
                title_tag = li.find("strong", class_="content-desc__list-title")
                value_tag = li.find("span", class_="content-desc__list-text")
                if title_tag and value_tag:
                    title = process_paragraph(title_tag)
                    value = process_paragraph(value_tag)
                    if title in extracted_data:
                        extracted_data[title] = value

            model.update(extracted_data)
            detailed_models.append(model)

            # Rewrite the whole JSON file after every model so that progress
            # survives interruptions
            with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
                json.dump(detailed_models, f, ensure_ascii=False, indent=4)

            logging.info(f"Saved: {en_name}")

            time.sleep(3)  # Small delay to avoid sending requests too fast

        except Exception as e:
            logging.error(f"Failed to process {en_name}: {e}")


# Generate the CSV from the JSON
def json_to_csv():
    if not os.path.exists(OUTPUT_JSON):
        logging.info("No JSON file found, skipping CSV generation")
        return

    with open(OUTPUT_JSON, "r", encoding="utf-8") as jsonfile:
        data = json.load(jsonfile)

    if not data:
        logging.info("JSON file is empty, skipping CSV generation")
        return

    # Use the union of all keys (in first-seen order) so that records written
    # before a field existed do not make DictWriter raise on extra keys
    fieldnames = []
    for row in data:
        for key in row:
            if key not in fieldnames:
                fieldnames.append(key)

    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)


if __name__ == "__main__":
    process_data()
    json_to_csv()
    print("Processing complete!")