modify scripts
This commit is contained in:
119
javhd/list_format.py
Normal file
119
javhd/list_format.py
Normal file
@ -0,0 +1,119 @@
|
||||
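"""
Script Name: list_format.py
Description: Fetch the actress list from javhd.com, then fetch the detailed information of each actress one by one.
    list_fetch.py fetches the list pages from the site and writes the results to local files as JSON; ja, zh and en are selectable (normally all three are pulled once);
    list_format.py reads those files back and merges them into one complete list, mainly joining the actress names from the three languages; the normalized link URL plus the picture URL is used to decide whether two entries are the same person;
    model_fetch.py takes the list produced in the previous step, reads each detail page and merges in some detailed information.
    Note: the Header section was captured from a browser and may need to be replaced after some time.

Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0

Modification History:
    - YYYY-MM-DD [Your Name]:
    - YYYY-MM-DD [Your Name]:
    - YYYY-MM-DD [Your Name]:
"""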
|
||||
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import glob
|
||||
import logging
|
||||
import csv
|
||||
from collections import defaultdict
|
||||
|
||||
# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Result directory and output files
RESULT_DIR = "result"
RESULT_TMP_DIR = f'{RESULT_DIR}/tmp'
OUTPUT_JSON = os.path.join(RESULT_DIR, "models.json")
OUTPUT_CSV = os.path.join(RESULT_DIR, "models.csv")

# Files that may need renaming: a bare "<lang>_model.json" is renamed to
# "<lang>_model_popular_1.json" so that it matches the glob below.
LANGS = ["ja", "en", "zh"]
for lang in LANGS:
    old_file = os.path.join(RESULT_TMP_DIR, f"{lang}_model.json")
    new_file = os.path.join(RESULT_TMP_DIR, f"{lang}_model_popular_1.json")
    if not os.path.exists(old_file):
        continue
    logging.info(f"Renaming {old_file} to {new_file}")
    os.rename(old_file, new_file)

# Collect every matching JSON file, in sorted path order
file_paths = glob.glob(os.path.join(RESULT_TMP_DIR, "*_model_popular_*.json"))
file_paths.sort()

# Captures the language prefix and the numeric suffix of a file name
pattern = re.compile(r'(\w+)_model_popular_(\d+)\.json')
|
||||
|
||||
def normalize_url(url):
    """Return *url* with every ``/en/``, ``/ja/`` or ``/zh/`` segment replaced by ``/``."""
    lang_segment = r'/(en|ja|zh)/'
    return re.sub(lang_segment, '/', url)
|
||||
|
||||
# Main processing routine
def main_process():
    """Merge the per-language list files into one ranked list and save it.

    Iterates over the module-level ``file_paths``. Each file name is matched
    against ``pattern`` to obtain the language prefix and the numeric suffix;
    files that do not match are skipped. The JSON document's ``"template"``
    string is scanned for ``<thumb-component ...>`` tags, and the
    ``link-content``, ``url-thumb`` and ``title`` attributes are read from
    each tag.

    Entries are keyed by ``(picture URL, normalized link URL)`` so that the
    same entry seen in several languages collapses into one record holding
    ``ja_name`` / ``zh_name`` / ``en_name``. The record keeps the rank of the
    first occurrence, and its ``url`` is only ever taken from an ``en`` file.

    The merged records, sorted by rank, are written to ``OUTPUT_JSON`` and
    ``OUTPUT_CSV``.

    Returns:
        None. Results are written to disk; files that cannot be loaded are
        logged and skipped.
    """
    # NOTE(review): 36 looks like the number of entries per list page -- confirm.
    page_size = 36

    # Compile the tag/attribute patterns once instead of on every iteration.
    thumb_re = re.compile(r'<thumb-component[^>]*>')
    link_re = re.compile(r'link-content="(.*?)"')
    pic_re = re.compile(r'url-thumb="(.*?)"')
    title_re = re.compile(r'title="(.*?)"')

    models = {}

    for file_path in file_paths:
        match = pattern.search(os.path.basename(file_path))
        if not match:
            continue

        lang, num = match.groups()
        num = int(num)

        logging.info(f"Processing {file_path} (lang={lang}, num={num})")

        # Best effort: one unreadable file must not abort the whole merge.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            logging.error(f"Failed to load {file_path}: {e}")
            continue

        # Valid JSON whose top level is not an object has no "template" key.
        if not isinstance(data, dict):
            logging.error(f"Failed to load {file_path}: top-level JSON value is not an object")
            continue

        template = data.get("template", "")
        thumb_components = thumb_re.findall(template)

        for idx, thumb in enumerate(thumb_components, start=1):
            rank = (num - 1) * page_size + idx

            link_content = link_re.search(thumb)
            url_thumb = pic_re.search(thumb)
            title = title_re.search(thumb)

            if not url_thumb or not title:
                logging.info(f"no countent for rank:{rank} title:{title} url:{url_thumb} {thumb}")
                continue

            pic = url_thumb.group(1)
            name = title.group(1)
            # A thumb may lack link-content; fall back to an empty link
            # instead of dereferencing a failed match.
            raw_link = link_content.group(1) if link_content else ""
            url = raw_link if lang == "en" else ""
            norm_url = normalize_url(raw_link)

            key = (pic, norm_url)
            if key not in models:
                models[key] = {"rank": rank, "ja_name": "", "zh_name": "", "en_name": "", "url": url, "pic": pic}

            models[key][f"{lang}_name"] = name
            if lang == "en" and url:
                models[key]["url"] = url

    # Sort by rank, then write the JSON output
    sorted_models = sorted(models.values(), key=lambda x: x["rank"])

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(sorted_models, f, indent=4, ensure_ascii=False)
    logging.info(f"Saved JSON output to {OUTPUT_JSON}")

    # Write the CSV output. newline="" lets the csv module control line
    # endings, which avoids blank lines between rows on Windows.
    headers = ["rank", "ja_name", "zh_name", "en_name", "url", "pic"]
    with open(OUTPUT_CSV, "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()
        writer.writerows(sorted_models)
    logging.info(f"Saved CSV output to {OUTPUT_CSV}")
|
||||
|
||||
|
||||
# Run the merge only when this file is executed as a script
if __name__ == "__main__":
    main_process()
|
||||
Reference in New Issue
Block a user