modify scripts

2025-03-17 11:30:35 +08:00
parent e6327fbe73
commit d5dc76b87f
178 changed files with 44 additions and 184447 deletions
--- a/javhd/list_format.py
+++ b/javhd/list_format.py
@ -0,0 +1,119 @@
+"""
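+Script Name: list_format.py
+Description: Fetch the actress list from javhd.com, then fetch each actress's details one by one.
+    list_fetch.py fetches the listing from the site and writes the results to local files as JSON; the language is selectable among ja, zh and en (normally all three are fetched);
+    list_format.py reads those files back and merges them into one complete list, mainly joining the actress names in the three languages; the normalized link address + picture address is used to decide whether two entries are the same person;
+    model_fetch.py takes the list produced by the previous step, reads each detail page, and merges in some detailed information.
+    Note: the Header section was captured from a browser and may need to be replaced after some time.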
+
+Author: [Your Name]
+Created Date: YYYY-MM-DD
+Last Modified: YYYY-MM-DD
+Version: 1.0
+
+Modification History:
+    - YYYY-MM-DD [Your Name]: 
+    - YYYY-MM-DD [Your Name]: 
+    - YYYY-MM-DD [Your Name]: 
+"""
+
+import os
+import re
+import json
+import glob
+import logging
+import csv
+from collections import defaultdict
+
+# Configure logging (INFO level, timestamped messages) for the whole script.
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+# Result directories and output files.
+# RESULT_TMP_DIR holds the per-language JSON pages that are read below;
+# the merged list is written to OUTPUT_JSON and OUTPUT_CSV.
+RESULT_DIR = "result"
+RESULT_TMP_DIR = f'{RESULT_DIR}/tmp'
+OUTPUT_JSON = os.path.join(RESULT_DIR, "models.json")
+OUTPUT_CSV = os.path.join(RESULT_DIR, "models.csv")
+
+# Files that may need renaming: "<lang>_model.json" becomes
+# "<lang>_model_popular_1.json" so that it matches the glob and the filename
+# pattern below.
+# NOTE: this loop runs at module level, so the rename happens on import too,
+# not only when the script is executed directly.
+LANGS = ["ja", "en", "zh"]
+for lang in LANGS:
+    old_file = os.path.join(RESULT_TMP_DIR, f"{lang}_model.json")
+    new_file = os.path.join(RESULT_TMP_DIR, f"{lang}_model_popular_1.json")
+    if os.path.exists(old_file):
+        logging.info(f"Renaming {old_file} to {new_file}")
+        os.rename(old_file, new_file)
+
+# Collect every matching JSON file. The sort is a plain string sort, so
+# "..._10.json" comes before "..._2.json"; ranks are derived from the number
+# captured by `pattern`, not from the position in this list.
+file_paths = sorted(glob.glob(os.path.join(RESULT_TMP_DIR, "*_model_popular_*.json")))
+# Group 1 is the language prefix, group 2 the numeric suffix of the file name.
+pattern = re.compile(r'(\w+)_model_popular_(\d+)\.json')
+
+def normalize_url(url):
+    """Return *url* with each ``/en/``, ``/ja/`` or ``/zh/`` segment collapsed to ``/``."""
+    language_segment = re.compile(r'/(en|ja|zh)/')
+    stripped = language_segment.sub('/', url)
+    return stripped
+
+# Main processing routine
+def main_process():
+    """Merge the per-language listing pages into a single ranked model list.
+
+    Walks the module-level ``file_paths``, extracts every
+    ``<thumb-component>`` tag from each file's ``"template"`` string, and
+    groups the entries by ``(picture URL, language-neutral link)``. The merged
+    records are written to ``OUTPUT_JSON`` and ``OUTPUT_CSV``, sorted by rank.
+
+    A record keeps the rank computed for the first file in which it was seen;
+    later files only fill in the ``<lang>_name`` field (and ``url`` when the
+    language is ``en``). Files that cannot be read or parsed are logged and
+    skipped, as are tags without a ``url-thumb`` or ``title`` attribute.
+    """
+    # Entries per listing page, used to turn (page number, index) into a rank.
+    # NOTE(review): 36 is presumably the site's page size -- confirm.
+    page_size = 36
+
+    # Compile the attribute patterns once instead of once per tag.
+    link_re = re.compile(r'link-content="(.*?)"')
+    thumb_re = re.compile(r'url-thumb="(.*?)"')
+    title_re = re.compile(r'title="(.*?)"')
+
+    models = {}
+
+    for file_path in file_paths:
+        match = pattern.search(os.path.basename(file_path))
+        if not match:
+            continue
+
+        lang, num = match.groups()
+        num = int(num)
+
+        logging.info(f"Processing {file_path} (lang={lang}, num={num})")
+
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+        except (OSError, ValueError) as e:
+            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+            logging.error(f"Failed to load {file_path}: {e}")
+            continue
+
+        template = data.get("template", "")
+        thumb_components = re.findall(r'<thumb-component[^>]*>', template)
+
+        for idx, thumb in enumerate(thumb_components, start=1):
+            rank = (num - 1) * page_size + idx
+
+            link_content = link_re.search(thumb)
+            url_thumb = thumb_re.search(thumb)
+            title = title_re.search(thumb)
+
+            if not url_thumb or not title:
+                logging.info(f"no countent for rank:{rank} title:{title} url:{url_thumb}  {thumb}")
+                continue
+
+            pic = url_thumb.group(1)
+            name = title.group(1)
+            # link-content is optional. Without it, use an empty link so the
+            # picture URL alone identifies the entry, instead of crashing on
+            # a None match object.
+            link = link_content.group(1) if link_content else ""
+            url = link if lang == "en" else ""
+            norm_url = normalize_url(link)
+
+            key = (pic, norm_url)
+            if key not in models:
+                models[key] = {"rank": rank, "ja_name": "", "zh_name": "", "en_name": "", "url": url, "pic": pic}
+
+            models[key][f"{lang}_name"] = name
+            if lang == "en" and url:
+                models[key]["url"] = url
+
+    # Sort by rank, then write the JSON output.
+    sorted_models = sorted(models.values(), key=lambda x: x["rank"])
+
+    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
+        json.dump(sorted_models, f, indent=4, ensure_ascii=False)
+    logging.info(f"Saved JSON output to {OUTPUT_JSON}")
+
+    # Write the CSV output. newline="" lets the csv module control line
+    # endings; without it, platforms that translate "\n" emit blank rows.
+    headers = ["rank", "ja_name", "zh_name", "en_name", "url", "pic"]
+    with open(OUTPUT_CSV, "w", encoding="utf-8", newline="") as csvfile:
+        writer = csv.DictWriter(csvfile, fieldnames=headers)
+        writer.writeheader()
+        writer.writerows(sorted_models)
+    logging.info(f"Saved TXT output to {OUTPUT_CSV}")
+
+
+# Run the merge only when this file is executed as a script.
+if __name__ == '__main__':
+    main_process()