modify scripts

2025-06-03 10:20:03 +08:00
parent f45886bc9f
commit e97f49bfb9
5 changed files with 825 additions and 0 deletions
--- a/javhd/src/scraper.py
+++ b/javhd/src/scraper.py
@ -0,0 +1,285 @@
+import cloudscraper
+import time
+import json
+import csv
+import logging
+import signal
+import sys
+import os
+import re
+from bs4 import BeautifulSoup
+from requests.exceptions import RequestException
+from functools import partial
+from urllib.parse import urljoin, urlparse
+import config
+import utils
+
+# Base URL of the site and a list of language codes.
+# NOTE(review): lang_prefix is not referenced in the visible code; presumably
+# these are the URL path prefixes (e.g. "/ja/model") - confirm against callers.
+host_url = "https://javhd.com"
+lang_prefix = ["ja", "en", "zh"]
+
+# Status codes that fetch_page returns next to the parsed page.
+http_code_404   = 404   # page does not exist
+http_code_login = 401   # request was redirected to a login page
+http_code_local = 99    # page came from the local raw-HTML copy (deliberately < 100, not a real HTTP code)
+
+# Module-level switches read by fetch_page.
+save_raw_html = False     # when true, every fetched page is passed to utils.write_raw_html
+load_from_local = False   # when true, utils.read_raw_html is tried before any network request
+
+# Headers sent with the JSON POST requests issued by fetch_post_page.
+# Each header name appears exactly once: the original literal listed
+# "content-type", "user-agent" and "x-requested-with" twice, which Python
+# accepts silently (the last value wins) and which hides later edits to the
+# first copy.
+# NOTE(review): the cookie below embeds a captured browser session; it will
+# expire and arguably should not live in source control.
+POST_HEADERS = {
+    "accept": "application/json, text/plain, */*",
+    "content-type": "application/json",
+    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0",
+    "x-requested-with": "XMLHttpRequest",
+    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
+    'cookie': 'adult-warning-popup=disabled; st_d=%7B%7D; feid=c18cd2f2cf5c034d120e5975558acc8c; xfeid=3b040b0aecba9d3df41f21732480d947; _ym_uid=1739069925634817268; _ym_d=1739069925; atas_uid=; _clck=1cd9xpy%7C2%7Cftb%7C0%7C1866; _ym_isad=2; nats=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_cookie=https%253A%252F%252Fcn.pornhub.com%252F; nats_unique=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_sess=480e7410e649efce6003c3add587a579; nats_landing=No%2BLanding%2BPage%2BURL; JAVSESSID=n42hnvj3ecr0r6tadusladpk3h; user_lang=zh; locale=ja; utm=%7B%22ads_type%22%3A%22%22%7D; sid=3679b28ec523df85ec4e7739e32f2008; _ym_visorc=w; feid_sa=62; sid_sa=2',
+    'origin': 'https://javhd.com',
+    'priority': 'u=1, i',
+    'referer': 'https://javhd.com/ja/model',
+    'sec-ch-ua': '"Not A(Brand";v="8", "Chromium";v="132", "Microsoft Edge";v="132"',
+    'sec-ch-ua-mobile': '?0',
+    'sec-ch-ua-platform': '"macOS"',
+    'sec-fetch-dest': 'empty',
+    'sec-fetch-mode': 'cors',
+    'sec-fetch-site': 'same-origin',
+}
+POST_DATA = {}  # empty dict: the POST carries an empty JSON object as its body
+
+# Headers sent with the HTML GET requests issued by fetch_page.
+# NOTE(review): several entries (origin, priority, sec-fetch-dest "empty",
+# sec-fetch-mode "cors") match the XHR headers in POST_HEADERS rather than a
+# normal page navigation - confirm they are intended for page GETs.
+# NOTE(review): the cookie embeds a captured browser session and will expire.
+HEADERS = {
+    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
+    'cookie': 'adult-warning-popup=disabled; st_d=%7B%7D; feid=c18cd2f2cf5c034d120e5975558acc8c; xfeid=3b040b0aecba9d3df41f21732480d947; _ym_uid=1739069925634817268; _ym_d=1739069925; atas_uid=; _clck=1cd9xpy%7C2%7Cftb%7C0%7C1866; _ym_isad=2; nats=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_cookie=https%253A%252F%252Fcn.pornhub.com%252F; nats_unique=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_sess=480e7410e649efce6003c3add587a579; nats_landing=No%2BLanding%2BPage%2BURL; JAVSESSID=n42hnvj3ecr0r6tadusladpk3h; user_lang=zh; locale=ja; utm=%7B%22ads_type%22%3A%22%22%7D; sid=3679b28ec523df85ec4e7739e32f2008; _ym_visorc=w; feid_sa=62; sid_sa=2' ,
+    'origin': 'https://javhd.com',
+    'priority': 'u=1, i',
+    'referer': 'https://javhd.com/ja/model' ,
+    'sec-ch-ua': '"Not A(Brand";v="8", "Chromium";v="132", "Microsoft Edge";v="132"' ,
+    'sec-ch-ua-mobile': '?0' ,
+    'sec-ch-ua-platform': '"macOS"' ,
+    'sec-fetch-dest': 'empty' ,
+    'sec-fetch-mode': 'cors' ,
+    'sec-fetch-site': 'same-origin' ,
+    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0' ,
+}
+
+# Single cloudscraper session shared by fetch_post_page and fetch_page.
+scraper = cloudscraper.create_scraper()
+
+# Send a POST request and return the decoded JSON body.
+def fetch_post_page(url, retries=3):
+    """POST to ``url`` and return the decoded JSON response, retrying on failure.
+
+    The request is sent through the shared ``scraper`` session with
+    ``POST_HEADERS`` and ``POST_DATA`` and a 10 second timeout.
+
+    Args:
+        url: Absolute URL to post to.
+        retries: Maximum number of attempts.
+
+    Returns:
+        The value of ``response.json()`` from the first successful attempt, or
+        ``None`` when every attempt failed (including ``retries <= 0``).
+    """
+    for attempt in range(retries):
+        try:
+            response = scraper.post(url=url, headers=POST_HEADERS, json=POST_DATA, timeout=10)
+            response.raise_for_status()
+            return response.json()
+        except cloudscraper.exceptions.CloudflareChallengeError as e:
+            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retring...")
+        except cloudscraper.exceptions.CloudflareCode1020 as e:
+            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retring...")
+        except Exception as e:
+            # Also covers HTTP errors from raise_for_status() and bodies that are not valid JSON.
+            logging.error(f"[错误] 请求失败 {url}: {e}, 重试 {attempt + 1}/{retries}")
+
+        # Pause before the next attempt whatever the failure was (the Cloudflare
+        # branches used to retry immediately); no pause after the final attempt.
+        if attempt + 1 < retries:
+            time.sleep(2)
+    return None
+
+
+# Fetch a page through CloudScraper and validate it; supports alternate
+# parsers and an optional HTML preprocessing step.
+def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
+    """Fetch ``url``, parse it with BeautifulSoup and validate the result.
+
+    Args:
+        url: Absolute page URL; must contain ``javhd.com``.
+        validator: Callable taking the parsed soup and returning a truthy value
+            when the page has the expected structure.
+        max_retries: Maximum number of network attempts.
+        parser: Parser name handed to BeautifulSoup.
+        preprocessor: Optional callable applied to the raw HTML text before
+            it is parsed.
+
+    Returns:
+        A ``(soup, status)`` tuple:
+        ``(soup, http_code_local)`` when a valid local copy was used,
+        ``(soup, response.status_code)`` when a fetched page passed validation,
+        ``(None, http_code_404)`` when the server answered 404,
+        ``(None, http_code_login)`` when the request was redirected to a login page,
+        ``(None, None)`` for a URL outside the site or when all attempts failed.
+    """
+    if load_from_local:     # try the locally stored raw HTML first
+        html = utils.read_raw_html(url)
+        if html:
+            # Apply the optional preprocessing step before parsing
+            html_text = preprocessor(html) if preprocessor else html
+
+            soup = BeautifulSoup(html_text, parser)
+            if validator(soup):  # caller-supplied structure check
+                logging.debug(f"read from local. href: {url}")
+                return soup, http_code_local     # code below 100 marks a locally served page
+
+    # The URL never changes between attempts, so reject a foreign URL once,
+    # before entering the retry loop.
+    if 'javhd.com' not in url.lower():
+        logging.error(f'wrong url format: {url}')
+        return None, None
+
+    for attempt in range(max_retries):
+        try:
+            # Without a timeout the underlying request can block indefinitely;
+            # use the same 10 seconds as fetch_post_page.
+            response = scraper.get(url, headers=HEADERS, timeout=10)
+
+            if response.status_code == http_code_404:
+                logging.debug(f"Page not found (404): {url}")
+                return None, http_code_404  # not retried; the caller can skip this page
+
+            response.raise_for_status()  # any other HTTP error goes to the retry path
+
+            # A redirect may have landed on the login page
+            if response.history:
+                logging.debug(f"Page redirected on {url}. Checking if it's a login page.")
+                soup = BeautifulSoup(response.text, parser)
+                if soup.find('nav', class_='panel form-panel'):
+                    logging.debug(f"Page redirected to login page on {url}.")
+                    return None, http_code_login
+
+            if save_raw_html:
+                utils.write_raw_html(url, response.text)
+
+            # Apply the optional preprocessing step before parsing
+            html_text = preprocessor(response.text) if preprocessor else response.text
+
+            soup = BeautifulSoup(html_text, parser)
+            if validator(soup):  # caller-supplied structure check
+                return soup, response.status_code
+
+            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
+        except cloudscraper.exceptions.CloudflareChallengeError as e:
+            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retring...")
+        except cloudscraper.exceptions.CloudflareCode1020 as e:
+            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retring...")
+        except Exception as e:
+            logging.error(f"Unexpected error on {url}: {e}, Retring...")
+
+        # Pause between attempts (same delay as fetch_post_page) instead of
+        # re-requesting immediately; no pause after the final attempt.
+        if attempt + 1 < max_retries:
+            time.sleep(2)
+
+    logging.error(f'Fetching failed after max retries. {url}')
+    return None, None  # every attempt failed
+
+# Tidy raw HTML before parsing: drop <br> tags and rewrite <a> tags.
+# (The original note says this is needed when extracting the ethnicity field.)
+def preprocess_html(html):
+    """Return ``html`` with every ``<br>`` removed and ``target="_blank"``
+    inserted right after the opening ``<a `` of each anchor tag."""
+    without_breaks = html.replace('<br>', '')
+    return without_breaks.replace('<a ', '<a target="_blank" ')
+
+# Generic HTML structure validator.
+def generic_validator(soup, tag, identifier, attr_type="id"):
+    """Report whether ``soup`` contains the element described by the arguments.
+
+    Args:
+        soup: Parsed document offering ``find`` / ``find_all``.
+        tag: Tag name to look for (used by the "id" and "class" modes).
+        identifier: The id, class or name value to match.
+        attr_type: "id", "class" or "name"; any other value yields ``False``.
+
+    Returns:
+        ``True`` when a matching element exists, otherwise ``False``.
+    """
+    if attr_type == "id":
+        match = soup.find(tag, id=identifier)
+        return match is not None
+
+    if attr_type == "class":
+        hits = soup.find_all(tag, class_=identifier)
+        return True if hits else False
+
+    if attr_type == "name":
+        # NOTE(review): this mode ignores ``tag`` and always searches <select>
+        # elements - confirm that is intended.
+        found = soup.find('select', {'name': identifier})
+        return True if found else False
+
+    return False
+
+
+# Parse one page of the list response.
+def parse_list_json(data, num, lang='en', page_size=36):
+    """Extract the entries of one list page from the JSON response.
+
+    The response's ``template`` field holds markup containing
+    ``<thumb-component ...>`` tags; the ``link-content``, ``url-thumb`` and
+    ``title`` attributes of each tag become one entry.
+
+    Args:
+        data: Decoded JSON response (a mapping with a ``template`` key).
+        num: 1-based page number, used to compute each entry's rank.
+        lang: Language code used to build the name key (``"<lang>_name"``).
+        page_size: Number of entries per page assumed for the rank computation.
+
+    Returns:
+        A list of dicts with the keys ``rank``, ``url``, ``pic`` and
+        ``<lang>_name``. Tags missing any of the three attributes are logged
+        and skipped; their rank is not reused.
+    """
+    # A missing or null template is treated as an empty page.
+    template = data.get("template") or ""
+    thumb_components = re.findall(r'<thumb-component[^>]*>', template)
+
+    list_data = []
+    for idx, thumb in enumerate(thumb_components, start=1):
+        rank = (num - 1) * page_size + idx
+
+        link_content = re.search(r'link-content="(.*?)"', thumb)
+        url_thumb = re.search(r'url-thumb="(.*?)"', thumb)
+        title = re.search(r'title="(.*?)"', thumb)
+
+        if not url_thumb or not title:
+            logging.warning(f"no countent for rank:{rank} title:{title} url:{url_thumb}  {thumb}")
+            continue
+
+        # link-content was previously dereferenced unchecked, so one tag
+        # without it aborted the whole page with AttributeError.
+        if not link_content:
+            logging.warning(f"no link-content for rank:{rank}  {thumb}")
+            continue
+
+        # Separate name so the ``data`` parameter is not shadowed.
+        entry = {
+            "rank": rank,
+            "url": link_content.group(1),
+            "pic": url_thumb.group(1),
+        }
+        entry[f"{lang}_name"] = title.group(1)
+
+        list_data.append(entry)
+
+    return list_data
+
+def process_paragraph(paragraph):
+    """Return the text content of ``paragraph`` with surrounding whitespace removed.
+
+    The element is first rendered to its HTML markup with ``str()`` and that
+    markup is parsed again with BeautifulSoup before the text is extracted,
+    instead of calling ``get_text()`` on the element itself.
+    """
+    # NOTE(review): the original comment mentions removing watermark tags, but
+    # no tags are removed here - the markup is re-parsed unchanged.
+    reparsed = BeautifulSoup(str(paragraph), 'html.parser')
+    return reparsed.get_text().strip()
+
+
+# Parse the detail page and extract the profile fields.
+def parse_actor_detail(soup, href):
+    """Extract the profile fields from a parsed detail page.
+
+    Args:
+        soup: Parsed page; the fields are read from ``div.info__features``.
+        href: URL of the page, stored under the ``url`` key of the result.
+
+    Returns:
+        ``(record, None)`` where ``record`` maps every database field name to
+        the text found on the page (``""`` when absent) plus ``url``; or
+        ``(None, None)`` when the ``info__features`` block is missing. The
+        second element is always ``None``.
+    """
+    features = soup.find("div", class_="info__features")
+    if not features:
+        logging.warning(f"未找到 info__features 区块: {href}")
+        return None, None
+
+    # Label shown on the page -> database field name
+    FIELD_MAPPING = {
+        "Height": "height",
+        "Weight": "weight",
+        "Breast size": "breast_size",
+        "Breast factor": "breast_factor",
+        "Hair color": "hair_color",
+        "Eye color": "eye_color",
+        "Birth date": "birth_date",
+        "Ethnicity": "ethnicity",
+        "Birth place": "birth_place"
+    }
+
+    # Every database field starts out empty; the page URL is stored with them
+    record = dict.fromkeys(FIELD_MAPPING.values(), "")
+    record['url'] = href
+
+    for item in features.find_all("li", class_="content-desc__list-item"):
+        label_tag = item.find("strong", class_="content-desc__list-title")
+        text_tag = item.find("span", class_="content-desc__list-text")
+        if not (label_tag and text_tag):
+            continue
+
+        label = process_paragraph(label_tag)  # label exactly as shown on the page
+        text = process_paragraph(text_tag)
+
+        # Labels without a mapping are ignored
+        column = FIELD_MAPPING.get(label)
+        if column:
+            record[column] = text
+
+    return record, None
+
+###### Manual test code below ######
+def test_actor_list():
+    """Fetch the first list page, print its parsed entries and the next-page URL.
+
+    Only one page is ever requested: the function stops after handling the
+    first response, whether or not a next page is advertised.
+    """
+    first_page = urljoin(host_url, "/ja/model")
+
+    print(f"[信息] 正在抓取 {first_page}")
+    data = fetch_post_page(first_page)
+    if not data:
+        print(f"[错误] 无法获取数据 {first_page}")
+        return
+
+    # The response must carry all of the expected top-level keys
+    required_keys = ["status", "results_count", "pagination_params", "template"]
+    missing = [key for key in required_keys if key not in data]
+    if missing:
+        print(f"[错误] 数据结构异常: {data}")
+        return
+
+    entries = parse_list_json(data, 1)
+    print(entries)
+
+    # Report where the next page would be, without fetching it
+    next_path = data.get("pagination_params", {}).get("next")
+    if not next_path:
+        print("[信息] 已抓取所有页面。")
+        return
+
+    following_url = urljoin(host_url, next_path)
+    print(f"next page: {following_url}")
+
+def test_actor():
+    """Fetch one detail page, parse it and print the collected records.
+
+    The loop follows the next URL returned by ``parse_actor_detail`` and stops
+    when there is none, or as soon as a page cannot be fetched.
+    """
+    next_url = 'https://javhd.com/en/model/Yui-Hatano'
+    all_data = []
+    while next_url:
+        print(f'fetching page {next_url}')
+        soup, http_status_code = fetch_page(next_url, partial(generic_validator, tag="div", identifier="info__features", attr_type="class"))
+        if not soup:
+            # fetch_page has already retried. next_url would stay the same, so
+            # continuing here would re-request this URL forever.
+            print(f'failed to fetch {next_url}, status: {http_status_code}')
+            break
+
+        list_data, next_url = parse_actor_detail(soup, next_url)
+        if list_data:
+            all_data.append(list_data)
+        else:
+            print('get wrong page.')
+    print(all_data)
+
+
+# Run both manual tests when the file is executed as a script. Each one goes
+# through fetch_post_page / fetch_page, i.e. issues real requests via `scraper`.
+if __name__ == "__main__":
+    test_actor_list()
+    test_actor()
+