"""
Script Name:
Description: Fetch the list of models (actresses) from javhd.com, then fetch each model's details one by one.
    list_fetch.py fetches the list from the website and writes the results to local files as JSON; the language is selectable among ja, zh and en (normally all three are fetched in turn);
    list_format.py reads those files back and merges them into one complete list, mainly joining the model names in the three languages; the processed link URL plus the image URL is used to decide whether two entries are the same person;
    model_fetch.py takes the list produced by the previous step, reads each detail page and merges in extra details.
    Note: the headers were captured from a browser and may need replacing after a while.

Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0

Modification History:
    - YYYY-MM-DD [Your Name]:
    - YYYY-MM-DD [Your Name]:
    - YYYY-MM-DD [Your Name]:
"""

import json
import os
import sys
import time
from typing import Any, Optional
from urllib.parse import urljoin, urlparse

import requests

# Site root; every listing path is resolved against it.
BASE_URL = "https://javhd.com"

# Listing path for one language. The other available listings are
# "/ja/model" and "/zh/model".
START_URL = "/en/model"

# Request headers captured from a browser session; they may need to be
# refreshed once they go stale.
# NOTE(review): the cookie embeds session identifiers and the referer is
# pinned to the /ja/ listing -- consider loading both from configuration
# instead of committing them to source control.
HEADERS = {
    "accept": "application/json, text/plain, */*",
    "content-type": "application/json",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0",
    "x-requested-with": "XMLHttpRequest",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "cookie": "adult-warning-popup=disabled; st_d=%7B%7D; feid=c18cd2f2cf5c034d120e5975558acc8c; xfeid=3b040b0aecba9d3df41f21732480d947; _ym_uid=1739069925634817268; _ym_d=1739069925; atas_uid=; _clck=1cd9xpy%7C2%7Cftb%7C0%7C1866; _ym_isad=2; nats=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_cookie=https%253A%252F%252Fcn.pornhub.com%252F; nats_unique=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_sess=480e7410e649efce6003c3add587a579; nats_landing=No%2BLanding%2BPage%2BURL; JAVSESSID=n42hnvj3ecr0r6tadusladpk3h; user_lang=zh; locale=ja; utm=%7B%22ads_type%22%3A%22%22%7D; sid=3679b28ec523df85ec4e7739e32f2008; _ym_visorc=w; feid_sa=62; sid_sa=2",
    "origin": "https://javhd.com",
    "priority": "u=1, i",
    "referer": "https://javhd.com/ja/model",
    "sec-ch-ua": '"Not A(Brand";v="8", "Chromium";v="132", "Microsoft Edge";v="132"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"macOS"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
}

# Empty dict: the POST request carries no data.
POST_DATA = {}

def sanitize_filename(url_path: str) -> str:
    """Convert a URL path into a flat ``.json`` file name.

    Leading and trailing slashes are removed and every remaining ``/`` is
    replaced by ``_``, so ``"/en/model"`` becomes ``"en_model.json"``.

    Args:
        url_path: Path component of a URL.

    Returns:
        The derived file name, always ending in ``.json``.
    """
    trimmed = url_path.strip("/")
    return trimmed.replace("/", "_") + ".json"

def fetch_data(url: str, retries: int = 3) -> Optional[Any]:
    """POST to *url* and return the decoded JSON body, retrying on failure.

    The request is sent with the module-level ``HEADERS`` and ``POST_DATA``
    and a 10-second timeout. A failed attempt is reported on stdout and
    followed by a 2-second pause before the next one.

    Args:
        url: Absolute URL to request.
        retries: Maximum number of attempts.

    Returns:
        The decoded JSON payload, or ``None`` when every attempt failed
        (or when ``retries`` is zero or negative).
    """
    for attempt in range(retries):
        try:
            response = requests.post(url, headers=HEADERS, json=POST_DATA, timeout=10)
            print(response)
            response.raise_for_status()
            return response.json()
        # ValueError covers a body that is not valid JSON: older requests
        # releases raise a decode error that is not a RequestException.
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"[错误] 请求失败 {url}: {e}, 重试 {attempt + 1}/{retries}")
            # No point in waiting once the last attempt has failed.
            if attempt + 1 < retries:
                time.sleep(2)
    return None

def save_data(url: str, data: Any) -> None:
    """Write *data* as pretty-printed JSON to a file named after *url*.

    The file name is derived from the path component of *url* via
    ``sanitize_filename`` and the file is created in the current working
    directory, replacing any existing file of the same name.

    Args:
        url: URL the data was fetched from.
        data: JSON-serialisable payload to store.
    """
    # NOTE(review): only the path is used, so two URLs that differ solely
    # in their query string map to the same file -- confirm that the
    # pagination links always differ in the path.
    filename = sanitize_filename(urlparse(url).path)
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"[成功] 数据已保存到 {filename}")

def main(s_url: str) -> None:
    """Crawl the paginated listing starting at *s_url* and save every page.

    Each page is fetched with ``fetch_data`` and stored with ``save_data``;
    the crawl then follows ``pagination_params["next"]``. It stops when a
    page has no next link, a request fails, the payload lacks the expected
    keys, or a page URL comes up a second time.

    Args:
        s_url: Path of the first page, resolved against ``BASE_URL``
            (for example ``"/en/model"``).
    """
    required_keys = ("status", "results_count", "pagination_params", "template")
    visited = set()
    current_url = urljoin(BASE_URL, s_url)

    while current_url:
        # A "next" link that points at an already fetched page would
        # otherwise keep the crawl running forever.
        if current_url in visited:
            print(f"[错误] 检测到重复的分页地址, 停止抓取: {current_url}")
            break
        visited.add(current_url)

        print(f"[信息] 正在抓取 {current_url}")
        data = fetch_data(current_url)

        if not data:
            print(f"[错误] 无法获取数据 {current_url}")
            break

        # Validate the JSON structure before trusting any of its fields.
        if not isinstance(data, dict) or not all(key in data for key in required_keys):
            print(f"[错误] 数据结构异常: {data}")
            break

        save_data(current_url, data)

        # Move on to the next page; anything other than a dict here is
        # treated as "no further pages".
        pagination = data["pagination_params"]
        next_path = pagination.get("next") if isinstance(pagination, dict) else None
        if not next_path:
            print("[信息] 已抓取所有页面。")
            break
        current_url = urljoin(BASE_URL, next_path)

if __name__ == "__main__":
    # The listing defaults to Japanese; an optional first command-line
    # argument selects another language code (e.g. "en" or "zh").
    s_url = f'/{sys.argv[1]}/model' if len(sys.argv) >= 2 else "/ja/model"
    main(s_url)