add some scripts.

2025-02-11 16:07:43 +08:00
parent 2cab12ea34
commit 62a2fbdc77
20 changed files with 148909 additions and 4 deletions

scripts/javhd/list_fetch.py Normal file

@ -0,0 +1,108 @@
"""
Script Name:
Description: 从 javhd.com 上获取女优列表,并逐个获取女优详细信息。
list_fetch.py 从网站上获取列表, 并以json的形式把结果输出到本地文件, 支持ja,zh,en三种语言可选(一般情况下可以三种全部拉取一遍);
list_format.py 则把这些文件读取出来,合并,形成完整的列表, 主要是把三种语言的女优名字拼到一起, 使用处理后的链接地址+图片地址作为判断同一个人的依据;
model_fetch.py 则把上一步获取到的列表,读取详情页面,合并进来一些详细信息。
注意: Header部分是从浏览器中抓取的, 时间久了可能要替换。
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import requests
import os
import time
import json
import sys
from urllib.parse import urljoin, urlparse
# Initial URL settings
BASE_URL = "https://javhd.com"
#START_URL = "/ja/model"
#START_URL = "/zh/model"
START_URL = "/en/model"
HEADERS = {
    "accept": "application/json, text/plain, */*",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "content-type": "application/json",
    "cookie": 'adult-warning-popup=disabled; st_d=%7B%7D; feid=c18cd2f2cf5c034d120e5975558acc8c; xfeid=3b040b0aecba9d3df41f21732480d947; _ym_uid=1739069925634817268; _ym_d=1739069925; atas_uid=; _clck=1cd9xpy%7C2%7Cftb%7C0%7C1866; _ym_isad=2; nats=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_cookie=https%253A%252F%252Fcn.pornhub.com%252F; nats_unique=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_sess=480e7410e649efce6003c3add587a579; nats_landing=No%2BLanding%2BPage%2BURL; JAVSESSID=n42hnvj3ecr0r6tadusladpk3h; user_lang=zh; locale=ja; utm=%7B%22ads_type%22%3A%22%22%7D; sid=3679b28ec523df85ec4e7739e32f2008; _ym_visorc=w; feid_sa=62; sid_sa=2',  # captured from a browser session; replace when it expires
    "origin": "https://javhd.com",
    "priority": "u=1, i",
    "referer": "https://javhd.com/ja/model",
    "sec-ch-ua": '"Not A(Brand";v="8", "Chromium";v="132", "Microsoft Edge";v="132"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"macOS"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0",
    "x-requested-with": "XMLHttpRequest",
}
POST_DATA = {}  # empty dict: the endpoint is POSTed with no payload
def sanitize_filename(url_path):
    """Turn a URL path into a safe local filename."""
    return url_path.strip("/").replace("/", "_") + ".json"
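# For example, sanitize_filename("/en/model/popular/2") returns
# "en_model_popular_2.json" -- the naming scheme list_format.py globs for later.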
def fetch_data(url, retries=3):
    """POST to the given URL and return the parsed JSON, retrying on failure."""
    for attempt in range(retries):
        try:
            response = requests.post(url, headers=HEADERS, json=POST_DATA, timeout=10)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"[error] request failed {url}: {e}, retry {attempt + 1}/{retries}")
            time.sleep(2)
    return None
def save_data(url, data):
    """Save the response data to a local JSON file named after the URL path."""
    parsed_url = urlparse(url)
    filename = sanitize_filename(parsed_url.path)
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"[ok] data saved to {filename}")
def main(s_url):
    current_url = urljoin(BASE_URL, s_url)
    while current_url:
        print(f"[info] fetching {current_url}")
        data = fetch_data(current_url)
        if not data:
            print(f"[error] could not fetch data from {current_url}")
            break
        # Sanity-check the JSON structure
        if not all(key in data for key in ["status", "results_count", "pagination_params", "template"]):
            print(f"[error] unexpected data structure: {data}")
            break
        save_data(current_url, data)
        # Follow the next page, if any
        next_path = data.get("pagination_params", {}).get("next")
        if next_path:
            current_url = urljoin(BASE_URL, next_path)
        else:
            print("[info] all pages fetched.")
            break
if __name__ == "__main__":
    s_url = "/ja/model"  # default; pass ja/zh/en as the first argument to override
    if len(sys.argv) >= 2:
        s_url = f'/{sys.argv[1]}/model'
    main(s_url)

scripts/javhd/list_format.py Normal file

@ -0,0 +1,119 @@
"""
Script Name:
Description: 从 javhd.com 上获取女优列表,并逐个获取女优详细信息。
list_fetch.py 从网站上获取列表, 并以json的形式把结果输出到本地文件, 支持ja,zh,en三种语言可选(一般情况下可以三种全部拉取一遍);
list_format.py 则把这些文件读取出来,合并,形成完整的列表, 主要是把三种语言的女优名字拼到一起, 使用处理后的链接地址+图片地址作为判断同一个人的依据;
model_fetch.py 则把上一步获取到的列表,读取详情页面,合并进来一些详细信息。
注意: Header部分是从浏览器中抓取的, 时间久了可能要替换。
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import os
import re
import json
import glob
import logging
import csv
from collections import defaultdict
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Output locations
RESULT_DIR = "result"
RESULT_TMP_DIR = f'{RESULT_DIR}/tmp'
OUTPUT_JSON = os.path.join(RESULT_DIR, "models.json")
OUTPUT_CSV = os.path.join(RESULT_DIR, "models.csv")
# The first (most-popular) page is saved without a page number;
# rename it so the glob below picks it up as page 1
LANGS = ["ja", "en", "zh"]
for lang in LANGS:
    old_file = os.path.join(RESULT_TMP_DIR, f"{lang}_model.json")
    new_file = os.path.join(RESULT_TMP_DIR, f"{lang}_model_popular_1.json")
    if os.path.exists(old_file):
        logging.info(f"Renaming {old_file} to {new_file}")
        os.rename(old_file, new_file)
# Collect all matching JSON files
file_paths = sorted(glob.glob(os.path.join(RESULT_TMP_DIR, "*_model_popular_*.json")))
pattern = re.compile(r'(\w+)_model_popular_(\d+)\.json')
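# e.g. "ja_model_popular_3.json" matches with lang="ja", num="3"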
def normalize_url(url):
    """Strip the en/ja/zh language segment from a URL."""
    return re.sub(r'/(en|ja|zh)/', '/', url)
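# For example, "https://javhd.com/ja/model/foo" and "https://javhd.com/en/model/foo"
# (hypothetical paths) both normalize to "https://javhd.com/model/foo", so one
# person's per-language entries collapse onto the same key in main_process().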
# Main processing routine
def main_process():
    models = {}
    for file_path in file_paths:
        match = pattern.search(os.path.basename(file_path))
        if not match:
            continue
        lang, num = match.groups()
        num = int(num)
        logging.info(f"Processing {file_path} (lang={lang}, num={num})")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            logging.error(f"Failed to load {file_path}: {e}")
            continue
        template = data.get("template", "")
        thumb_components = re.findall(r'<thumb-component[^>]*>', template)
        for idx, thumb in enumerate(thumb_components, start=1):
            rank = (num - 1) * 36 + idx  # each page holds 36 entries
            link_content = re.search(r'link-content="(.*?)"', thumb)
            url_thumb = re.search(r'url-thumb="(.*?)"', thumb)
            title = re.search(r'title="(.*?)"', thumb)
            if not link_content or not url_thumb or not title:
                logging.info(f"no content for rank:{rank} title:{title} url:{url_thumb} {thumb}")
                continue
            pic = url_thumb.group(1)
            name = title.group(1)
            url = link_content.group(1) if lang == "en" else ""
            norm_url = normalize_url(link_content.group(1))
            key = (pic, norm_url)
            if key not in models:
                models[key] = {"rank": rank, "ja_name": "", "zh_name": "", "en_name": "", "url": url, "pic": pic}
            models[key][f"{lang}_name"] = name
            if lang == "en" and url:
                models[key]["url"] = url
    # Sort by rank, then write the JSON output
    sorted_models = sorted(models.values(), key=lambda x: x["rank"])
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(sorted_models, f, indent=4, ensure_ascii=False)
    logging.info(f"Saved JSON output to {OUTPUT_JSON}")
    # Write the CSV output
    headers = ["rank", "ja_name", "zh_name", "en_name", "url", "pic"]
    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()
        writer.writerows(sorted_models)
    logging.info(f"Saved CSV output to {OUTPUT_CSV}")
if __name__ == '__main__':
    main_process()

scripts/javhd/model_fetch.py Normal file

@ -0,0 +1,176 @@
"""
Script Name:
Description: 从 javhd.com 上获取女优列表,并逐个获取女优详细信息。
list_fetch.py 从网站上获取列表, 并以json的形式把结果输出到本地文件, 支持ja,zh,en三种语言可选(一般情况下可以三种全部拉取一遍);
list_format.py 则把这些文件读取出来,合并,形成完整的列表, 主要是把三种语言的女优名字拼到一起, 使用处理后的链接地址+图片地址作为判断同一个人的依据;
model_fetch.py 则把上一步获取到的列表,读取详情页面,合并进来一些详细信息。
注意: Header部分是从浏览器中抓取的, 时间久了可能要替换。
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import json
import csv
import requests
import time
import logging
import os
import re
from bs4 import BeautifulSoup
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# Input/output file paths
INPUT_FILE = "result/models.json"
OUTPUT_JSON = "result/models_detail.json"
OUTPUT_CSV = "result/models_detail.csv"
HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "cookie": 'adult-warning-popup=disabled; st_d=%7B%7D; feid=c18cd2f2cf5c034d120e5975558acc8c; xfeid=3b040b0aecba9d3df41f21732480d947; _ym_uid=1739069925634817268; _ym_d=1739069925; atas_uid=; _clck=1cd9xpy%7C2%7Cftb%7C0%7C1866; _ym_isad=2; nats=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_cookie=https%253A%252F%252Fcn.pornhub.com%252F; nats_unique=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_sess=480e7410e649efce6003c3add587a579; nats_landing=No%2BLanding%2BPage%2BURL; JAVSESSID=n42hnvj3ecr0r6tadusladpk3h; user_lang=zh; locale=ja; utm=%7B%22ads_type%22%3A%22%22%7D; sid=3679b28ec523df85ec4e7739e32f2008; _ym_visorc=w; feid_sa=62; sid_sa=2',  # captured from a browser session; replace when it expires
    "origin": "https://javhd.com",
    "priority": "u=1, i",
    "referer": "https://javhd.com/ja/model",
    "sec-ch-ua": '"Not A(Brand";v="8", "Chromium";v="132", "Microsoft Edge";v="132"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"macOS"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0",
}
# Fields to extract from each detail page
FIELDS = ["Height", "Weight", "Breast size", "Breast factor", "Hair color",
          "Eye color", "Birth date", "Ethnicity", "Birth place"]
def fetch_data(url, retries=3):
    """GET the given URL and return the response, retrying on failure."""
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            print(f"[error] request failed {url}: {e}, retry {attempt + 1}/{retries}")
            time.sleep(2)
    return None
def process_paragraph(paragraph):
    # Take the tag's full HTML rather than calling get_text() on it directly
    paragraph_html = str(paragraph)
    # Re-parse and extract the plain text (originally intended to run after
    # watermark tags were stripped; no stripping actually happens here)
    soup = BeautifulSoup(paragraph_html, 'html.parser')
    cleaned_text = soup.get_text().strip()
    return cleaned_text
# Load already-processed data so reruns can resume where they left off
def load_existing_data():
    if os.path.exists(OUTPUT_JSON):
        try:
            with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
                detailed_models = json.load(f)
            existing_names = {model["en_name"] for model in detailed_models}
        except Exception as e:
            logging.error(f"Could not read {OUTPUT_JSON}: {e}")
            detailed_models = []
            existing_names = set()
    else:
        detailed_models = []
        existing_names = set()
    return detailed_models, existing_names
def process_data():
    # Load the merged list produced by list_format.py
    try:
        with open(INPUT_FILE, "r", encoding="utf-8") as f:
            models = json.load(f)
    except Exception as e:
        logging.error(f"Could not read {INPUT_FILE}: {e}")
        return
    detailed_models, existing_names = load_existing_data()
    # Walk every entry in models.json
    for model in models:
        en_name = model.get("en_name", "")
        ja_name = model.get('ja_name', '')
        url = model.get("url", "")
        if not url or en_name in existing_names:
            logging.info(f"Skipping {en_name}: already processed or no usable URL")
            continue
        logging.info(f"Processing: {en_name} - {ja_name} - {url}")
        try:
            response = fetch_data(url, retries=100)
            if not response:
                logging.warning(f"Request failed: {url}")
                break
            soup = BeautifulSoup(response.text, "html.parser")
            info_section = soup.find("div", class_="info__features")
            if not info_section:
                logging.warning(f"info__features block not found: {url}")
                continue
            extracted_data = {field: "" for field in FIELDS}
            for li in info_section.find_all("li", class_="content-desc__list-item"):
                title_tag = li.find("strong", class_="content-desc__list-title")
                value_tag = li.find("span", class_="content-desc__list-text")
                if title_tag and value_tag:
                    title = process_paragraph(title_tag)
                    value = process_paragraph(value_tag)
                    if title in extracted_data:
                        extracted_data[title] = value
            model.update(extracted_data)
            detailed_models.append(model)
            # Rewrite the full JSON file after each model so progress survives interruption
            with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
                json.dump(detailed_models, f, ensure_ascii=False, indent=4)
            logging.info(f"Saved: {en_name}")
            time.sleep(3)  # small delay to avoid hitting the server too fast
        except Exception as e:
            logging.error(f"Processing {en_name} failed: {e}")
# Generate the CSV from the JSON output
def json_to_csv():
    if not os.path.exists(OUTPUT_JSON):
        print("No JSON file found; skipping CSV generation")
        return
    with open(OUTPUT_JSON, "r", encoding="utf-8") as jsonfile:
        data = json.load(jsonfile)
    if not data:
        print("JSON file is empty; skipping CSV generation")
        return
    fieldnames = data[0].keys()
    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
if __name__ == '__main__':
    process_data()
    json_to_csv()
    print("Processing complete!")

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large