"""Scraper for javhd.com actor list / actor detail pages.

Uses cloudscraper to get through Cloudflare, BeautifulSoup to parse,
and supports optional local HTML caching via ``utils``.

NOTE(review): this file was recovered from a whitespace-mangled source;
formatting and a few eaten spans (marked with TODO below) were
reconstructed — confirm against the original before relying on them.
"""

import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
from urllib.parse import urljoin, urlparse

import config
import utils

# Base URL and tunable module-level parameters.
host_url = "https://javhd.com"
lang_prefix = ["ja", "en", "zh"]
http_code_404 = 404
http_code_login = 401
http_code_local = 99      # pseudo status (<100): page was served from local cache
save_raw_html = False     # when True, every fetched page is written to disk
load_from_local = False   # when True, try the local cache before the network

# SECURITY NOTE(review): these headers embed a hardcoded session cookie
# (JAVSESSID / sid / nats_sess). It will expire and should be moved to
# config or injected at runtime rather than committed to source.
_COOKIE = 'adult-warning-popup=disabled; st_d=%7B%7D; feid=c18cd2f2cf5c034d120e5975558acc8c; xfeid=3b040b0aecba9d3df41f21732480d947; _ym_uid=1739069925634817268; _ym_d=1739069925; atas_uid=; _clck=1cd9xpy%7C2%7Cftb%7C0%7C1866; _ym_isad=2; nats=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_cookie=https%253A%252F%252Fcn.pornhub.com%252F; nats_unique=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_sess=480e7410e649efce6003c3add587a579; nats_landing=No%2BLanding%2BPage%2BURL; JAVSESSID=n42hnvj3ecr0r6tadusladpk3h; user_lang=zh; locale=ja; utm=%7B%22ads_type%22%3A%22%22%7D; sid=3679b28ec523df85ec4e7739e32f2008; _ym_visorc=w; feid_sa=62; sid_sa=2'

_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0"

# Headers for JSON POST endpoints (actor list pagination API).
# Fix: the original dict repeated 'content-type', 'user-agent' and
# 'x-requested-with' (duplicate keys — the later entry silently wins);
# duplicates removed, values unchanged.
POST_HEADERS = {
    "accept": "application/json, text/plain, */*",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "content-type": "application/json",
    "cookie": _COOKIE,
    "origin": "https://javhd.com",
    "priority": "u=1, i",
    "referer": "https://javhd.com/ja/model",
    "sec-ch-ua": '"Not A(Brand";v="8", "Chromium";v="132", "Microsoft Edge";v="132"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"macOS"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": _UA,
    "x-requested-with": "XMLHttpRequest",
}

POST_DATA = {}  # empty dict means the POST carries no payload

# Headers for plain HTML GET requests.
HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "cookie": _COOKIE,
    "origin": "https://javhd.com",
    "priority": "u=1, i",
    "referer": "https://javhd.com/ja/model",
    "sec-ch-ua": '"Not A(Brand";v="8", "Chromium";v="132", "Microsoft Edge";v="132"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"macOS"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": _UA,
}

scraper = cloudscraper.create_scraper()


def fetch_post_page(url, retries=3):
    """POST to *url* and return the decoded JSON body, or None.

    Retries up to *retries* times with a 2s pause between attempts.
    Cloudflare challenge / 1020 errors are logged and retried like any
    other failure.
    """
    for attempt in range(retries):
        try:
            response = scraper.post(url=url, headers=POST_HEADERS,
                                    json=POST_DATA, timeout=10)
            response.raise_for_status()
            return response.json()
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            # typo fix: "Retring" -> "Retrying"
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"[错误] 请求失败 {url}: {e}, 重试 {attempt + 1}/{retries}")
        time.sleep(2)  # back off before the next attempt
    return None


def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    """GET *url* with cloudscraper and return ``(soup, status_code)``.

    *validator* is called with the parsed soup and must return truthy for
    the page to be accepted. *preprocessor*, when given, is applied to the
    raw HTML text before parsing.

    Returns:
        (soup, status)            on success;
        (None, http_code_404)     when the page 404s (caller may skip);
        (None, http_code_login)   when redirected to the login page;
        (soup, http_code_local)   when served from the local cache;
        (None, None)              after exhausting retries or on a bad URL.
    """
    if load_from_local:
        # Try the local cache first; fall through to the network on a miss
        # or when the cached page fails validation.
        html = utils.read_raw_html(url)
        if html:
            html_text = preprocessor(html) if preprocessor else html
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):
                logging.debug(f"read from local. href: {url}")
                return soup, http_code_local

    for attempt in range(max_retries):
        try:
            if 'javhd.com' not in url.lower():
                logging.error(f'wrong url format: {url}')
                return None, None
            # Fix: added timeout for consistency with fetch_post_page —
            # the original call could hang indefinitely.
            response = scraper.get(url, headers=HEADERS, timeout=10)

            if response.status_code == 404:
                logging.debug(f"Page not found (404): {url}")
                return None, http_code_404  # caller can skip this page
            response.raise_for_status()

            # A redirect may mean the session expired and we landed on the
            # login page — detect that via its distinctive <nav> panel.
            if response.history:
                logging.debug(f"Page redirected on {url}. Checking if it's a login page.")
                soup = BeautifulSoup(response.text, parser)
                if soup.find('nav', class_='panel form-panel'):
                    logging.debug(f"Page redirected to login page on {url}.")
                    return None, http_code_login

            if save_raw_html:
                utils.write_raw_html(url, response.text)

            html_text = preprocessor(response.text) if preprocessor else response.text
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):
                return soup, response.status_code
            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, Retrying...")

    logging.error(f'Fetching failed after max retries. \n{url}')
    return None, None  # still failing after max retries


def preprocess_html(html):
    """Normalize raw HTML before parsing (needed for the ethnicity field).

    Strips newlines and normalizes ``<br>`` tags so BeautifulSoup does not
    split the surrounding text.
    TODO(review): the replacement arguments were lost in source extraction;
    '<br>' -> '<br/>' is a reconstruction — confirm against the live pages.
    """
    return html.replace('\n', '').replace('<br>', '<br/>')


def generic_validator(soup, tag, identifier, attr_type="class"):
    """Return True when *soup* contains *tag* with *identifier* as *attr_type*.

    TODO(review): this helper's definition sat in a span eaten by source
    extraction; the signature is reconstructed from its call site in
    test_actor() — confirm the original body.
    """
    return soup.find(tag, {attr_type: identifier}) is not None


def parse_list_json(data, num, lang="ja"):
    """Parse one page of the actor-list JSON API into a list of dicts.

    *num* is the 1-based page number (36 entries per page); *lang* selects
    the name column (e.g. ``ja_name``). Each entry carries rank, url, pic
    and the localized name.
    TODO(review): the ``def`` header and the re.split pattern were eaten by
    source extraction; reconstructed from the call site and the surviving
    ``[^>]*>`` fragment — confirm the component tag name.
    """
    template = data.get("template", "")
    # The API returns rendered markup; split it into one chunk per
    # <thumb-component ...> entry and pull the attributes out with regex.
    thumb_components = re.split(r'<thumb-component[^>]*>', template)
    list_data = []
    for idx, thumb in enumerate(thumb_components, start=1):
        rank = (num - 1) * 36 + idx  # 36 entries per page
        link_content = re.search(r'link-content="(.*?)"', thumb)
        url_thumb = re.search(r'url-thumb="(.*?)"', thumb)
        title = re.search(r'title="(.*?)"', thumb)
        # Fix: the original guard omitted link_content, so a missing
        # link-content attribute crashed on .group(1) below instead of
        # being skipped with a warning. (Also fixed "countent" typo.)
        if not link_content or not url_thumb or not title:
            logging.warning(f"no content for rank:{rank} title:{title} url:{url_thumb} {thumb}")
            continue
        pic = url_thumb.group(1)
        name = title.group(1)
        url = link_content.group(1)
        data_row = {"rank": rank, "url": url, "pic": pic}
        data_row[f"{lang}_name"] = name
        list_data.append(data_row)
    return list_data


def process_paragraph(paragraph):
    """Return the plain text of a bs4 element, stripped of markup."""
    # Re-parse the element's full HTML (not get_text() on the original)
    # so any preprocessing applied to the string form takes effect.
    paragraph_html = str(paragraph)
    soup = BeautifulSoup(paragraph_html, 'html.parser')
    return soup.get_text().strip()


def parse_actor_detail(soup, href):
    """Extract actor attributes from a detail page.

    Returns ``(data_dict, None)`` — the second element is a next-page URL
    slot that detail pages never have — or ``(None, None)`` when the
    info__features section is missing.
    """
    info_section = soup.find("div", class_="info__features")
    if not info_section:
        logging.warning(f"未找到 info__features 区块: {href}")
        return None, None

    # Page label -> database column mapping.
    FIELD_MAPPING = {
        "Height": "height",
        "Weight": "weight",
        "Breast size": "breast_size",
        "Breast factor": "breast_factor",
        "Hair color": "hair_color",
        "Eye color": "eye_color",
        "Birth date": "birth_date",
        "Ethnicity": "ethnicity",
        "Birth place": "birth_place",
    }

    # Pre-fill every column with "" so absent fields still appear.
    extracted_data = {db_field: "" for db_field in FIELD_MAPPING.values()}
    extracted_data['url'] = href

    for li in info_section.find_all("li", class_="content-desc__list-item"):
        title_tag = li.find("strong", class_="content-desc__list-title")
        value_tag = li.find("span", class_="content-desc__list-text")
        if title_tag and value_tag:
            title = process_paragraph(title_tag)   # raw page label
            value = process_paragraph(value_tag)
            db_field = FIELD_MAPPING.get(title)    # map label -> column
            if db_field:
                extracted_data[db_field] = value
    return extracted_data, None


###### test code below ######

def test_actor_list():
    """Smoke-test the actor-list JSON API (first page only)."""
    s_url = "/ja/model"
    current_url = urljoin(host_url, s_url)
    while current_url:
        print(f"[信息] 正在抓取 {current_url}")
        data = fetch_post_page(current_url)
        if not data:
            print(f"[错误] 无法获取数据 {current_url}")
            break
        # Sanity-check the JSON structure before parsing.
        if not all(key in data for key in ["status", "results_count", "pagination_params", "template"]):
            print(f"[错误] 数据结构异常: {data}")
            break
        all_data = parse_list_json(data, 1)
        print(all_data)
        # Resolve the next page, if any.
        next_path = data.get("pagination_params", {}).get("next")
        if next_path:
            current_url = urljoin(host_url, next_path)
            print(f"next page: {current_url}")
        else:
            print("[信息] 已抓取所有页面。")
            break
        break  # deliberate: only fetch the first page while testing


def test_actor():
    """Smoke-test a single actor detail page."""
    next_url = 'https://javhd.com/en/model/Yui-Hatano'
    all_data = []
    while next_url:
        print(f'fetching page {next_url}')
        soup, http_status_code = fetch_page(
            next_url,
            partial(generic_validator, tag="div",
                    identifier="info__features", attr_type="class"))
        if soup:
            list_data, next_url = parse_actor_detail(soup, next_url)
            if list_data:
                all_data.append(list_data)
        else:
            print('get wrong page.')
            break  # avoid looping forever on a bad page
    print(all_data)


if __name__ == "__main__":
    test_actor_list()
    test_actor()