This repository has been archived on 2026-01-07. You can view files and clone it, but cannot push or open issues or pull requests.
Files
resources/javhd/src/scraper.py
2025-06-03 10:20:03 +08:00

285 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import cloudscraper
import time
import json
import csv
import logging
import signal
import sys
import os
import re
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
from urllib.parse import urljoin, urlparse
import config
import utils
# Base URL and module-level tunables
host_url = "https://javhd.com"
lang_prefix = ["ja", "en", "zh"]
http_code_404 = 404    # page missing; callers may skip the URL
http_code_login = 401  # redirected to the login page
http_code_local = 99   # sentinel (<100): page was served from the local cache
save_raw_html = False    # when True, fetch_page writes raw HTML via utils
load_from_local = False  # when True, fetch_page tries the local cache first

# Headers for JSON POST requests (model-list pagination endpoint).
# Fix: the original literal repeated 'content-type', 'user-agent' and
# 'x-requested-with' with identical values; duplicates removed (Python keeps
# the last value anyway, so the resulting dict is unchanged).
# NOTE(review): the hard-coded session cookie will expire — presumably it must
# be refreshed from a live browser session; confirm before relying on it.
POST_HEADERS = {
    "accept": "application/json, text/plain, */*",
    "content-type": "application/json",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0",
    "x-requested-with": "XMLHttpRequest",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    'cookie': 'adult-warning-popup=disabled; st_d=%7B%7D; feid=c18cd2f2cf5c034d120e5975558acc8c; xfeid=3b040b0aecba9d3df41f21732480d947; _ym_uid=1739069925634817268; _ym_d=1739069925; atas_uid=; _clck=1cd9xpy%7C2%7Cftb%7C0%7C1866; _ym_isad=2; nats=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_cookie=https%253A%252F%252Fcn.pornhub.com%252F; nats_unique=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_sess=480e7410e649efce6003c3add587a579; nats_landing=No%2BLanding%2BPage%2BURL; JAVSESSID=n42hnvj3ecr0r6tadusladpk3h; user_lang=zh; locale=ja; utm=%7B%22ads_type%22%3A%22%22%7D; sid=3679b28ec523df85ec4e7739e32f2008; _ym_visorc=w; feid_sa=62; sid_sa=2',
    'origin': 'https://javhd.com',
    'priority': 'u=1, i',
    'referer': 'https://javhd.com/ja/model',
    'sec-ch-ua': '"Not A(Brand";v="8", "Chromium";v="132", "Microsoft Edge";v="132"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
}
POST_DATA = {}  # empty dict: POST requests carry no body
# Headers for plain GET page requests.
HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    'cookie': 'adult-warning-popup=disabled; st_d=%7B%7D; feid=c18cd2f2cf5c034d120e5975558acc8c; xfeid=3b040b0aecba9d3df41f21732480d947; _ym_uid=1739069925634817268; _ym_d=1739069925; atas_uid=; _clck=1cd9xpy%7C2%7Cftb%7C0%7C1866; _ym_isad=2; nats=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_cookie=https%253A%252F%252Fcn.pornhub.com%252F; nats_unique=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_sess=480e7410e649efce6003c3add587a579; nats_landing=No%2BLanding%2BPage%2BURL; JAVSESSID=n42hnvj3ecr0r6tadusladpk3h; user_lang=zh; locale=ja; utm=%7B%22ads_type%22%3A%22%22%7D; sid=3679b28ec523df85ec4e7739e32f2008; _ym_visorc=w; feid_sa=62; sid_sa=2',
    'origin': 'https://javhd.com',
    'priority': 'u=1, i',
    'referer': 'https://javhd.com/ja/model',
    'sec-ch-ua': '"Not A(Brand";v="8", "Chromium";v="132", "Microsoft Edge";v="132"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0',
}
scraper = cloudscraper.create_scraper()
# Issue a POST request and return the decoded JSON payload.
def fetch_post_page(url, retries=3):
    """POST to *url* with the module-level POST_HEADERS/POST_DATA and return
    the parsed JSON body, retrying on failure.

    Returns None when all *retries* attempts fail. Fixes: log typo
    ("Retring" -> "Retrying") and the 2-second back-off now applies after
    every failed attempt, not only after generic exceptions, so Cloudflare
    challenges are not retried instantly.
    """
    for attempt in range(retries):
        try:
            response = scraper.post(url=url, headers=POST_HEADERS, json=POST_DATA, timeout=10)
            response.raise_for_status()
            return response.json()
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"[错误] 请求失败 {url}: {e}, 重试 {attempt + 1}/{retries}")
        time.sleep(2)  # back off before the next attempt
    return None
# Fetch a page through CloudScraper, run a custom validator, and support
# pluggable parsers and HTML preprocessing.
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    """GET *url* and return ``(soup, status)``.

    *validator* is called with the parsed soup and must return truthy for the
    page to be accepted. *preprocessor*, when given, transforms the raw HTML
    text before parsing. Returns:
      (soup, http_code_local)  - served from the local cache (load_from_local)
      (None, http_code_404)    - page not found; caller may skip
      (None, http_code_login)  - redirected to the login page
      (soup, status_code)      - successful, validated fetch
      (None, None)             - bad URL or all retries exhausted
    Fix: log typo "Retring" -> "Retrying".
    """
    if load_from_local:  # optionally serve the page from the local raw-HTML cache
        html = utils.read_raw_html(url)
        if html:
            # Preprocess the HTML if a preprocessor was supplied.
            html_text = preprocessor(html) if preprocessor else html
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # custom page check
                logging.debug(f"read from local. href: {url}")
                return soup, http_code_local  # sentinel < 100 marks a local hit
    for attempt in range(max_retries):
        try:
            if 'javhd.com' not in url.lower():
                logging.error(f'wrong url format: {url}')
                return None, None
            response = scraper.get(url, headers=HEADERS)
            # Handle HTTP status codes.
            if response.status_code == 404:
                logging.debug(f"Page not found (404): {url}")
                return None, http_code_404  # caller can skip this URL
            response.raise_for_status()  # raise on other HTTP errors
            # Detect redirects, e.g. to the login page.
            if response.history:
                logging.debug(f"Page redirected on {url}. Checking if it's a login page.")
                soup = BeautifulSoup(response.text, parser)
                # The login page carries this nav panel.
                if soup.find('nav', class_='panel form-panel'):
                    logging.debug(f"Page redirected to login page on {url}.")
                    return None, http_code_login
            if save_raw_html:
                utils.write_raw_html(url, response.text)
            # Preprocess the HTML if a preprocessor was supplied.
            html_text = preprocessor(response.text) if preprocessor else response.text
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # custom page check
                return soup, response.status_code
            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, Retrying...")
    logging.error(f'Fetching failed after max retries. {url}')
    return None, None  # still failing after max_retries
# Repair HTML structure: strip stray <br> tags and patch <a> tags; needed
# when extracting the ethnicity field.
def preprocess_html(html):
    """Return *html* with every ``<br>`` removed and every anchor rewritten
    to open in a new tab (``target="_blank"``)."""
    substitutions = (
        ('<br>', ''),
        ('<a ', '<a target="_blank" '),
    )
    for needle, replacement in substitutions:
        html = html.replace(needle, replacement)
    return html
# Generic HTML structure validator.
def generic_validator(soup, tag, identifier, attr_type="id"):
    """Return True when *soup* contains the expected element.

    attr_type selects the lookup: "id" finds *tag* by id, "class" finds any
    *tag* with that class, "name" looks for a <select> with that name
    (NOTE: the "name" branch has always ignored *tag* and hard-codes
    'select'; preserved as-is). Unknown attr_type returns False.
    """
    checks = {
        "id": lambda: soup.find(tag, id=identifier) is not None,
        "class": lambda: bool(soup.find_all(tag, class_=identifier)),
        "name": lambda: bool(soup.find('select', {'name': identifier})),
    }
    check = checks.get(attr_type)
    return check() if check else False
# Parse one page of the model-list JSON response.
def parse_list_json(data, num, lang='en', page_size=36):
    """Extract model entries from the list API's JSON *data*.

    Args:
        data: decoded JSON dict; entries are scraped from its "template" HTML.
        num: 1-based page number, used to compute the global rank.
        lang: language tag used for the ``{lang}_name`` key.
        page_size: items per page (site default 36; now parameterized).

    Returns a list of dicts: {"rank", "url", "pic", f"{lang}_name"}.

    Fixes: the guard now also requires link-content (previously a missing
    attribute crashed with AttributeError on ``.group(1)``); the loop no
    longer shadows the *data* parameter; log typo "countent" corrected.
    """
    template = data.get("template", "")
    thumb_components = re.findall(r'<thumb-component[^>]*>', template)
    list_data = []
    for idx, thumb in enumerate(thumb_components, start=1):
        rank = (num - 1) * page_size + idx
        link_content = re.search(r'link-content="(.*?)"', thumb)
        url_thumb = re.search(r'url-thumb="(.*?)"', thumb)
        title = re.search(r'title="(.*?)"', thumb)
        if not link_content or not url_thumb or not title:
            logging.warning(f"no content for rank:{rank} title:{title} url:{url_thumb} {thumb}")
            continue
        entry = {"rank": rank, "url": link_content.group(1), "pic": url_thumb.group(1)}
        entry[f"{lang}_name"] = title.group(1)
        list_data.append(entry)
    return list_data
def process_paragraph(paragraph):
    """Serialize *paragraph* back to HTML, re-parse it, and return the
    stripped plain text of the node."""
    # Work from the full HTML markup rather than paragraph.get_text() directly.
    raw_html = str(paragraph)
    reparsed = BeautifulSoup(raw_html, 'html.parser')
    return reparsed.get_text().strip()
# Parse an actor-detail page and extract the fields we store.
def parse_actor_detail(soup, href):
    """Extract actor attributes from a model-detail page.

    Returns ``(record, None)`` where *record* maps database field names to
    text values (missing fields stay ""), plus the page ``url``. Returns
    ``(None, None)`` when the info__features section is absent. The second
    element is always None (reserved slot used by callers as a next-URL).
    """
    # Page titles mapped to database column names.
    FIELD_MAPPING = {
        "Height": "height",
        "Weight": "weight",
        "Breast size": "breast_size",
        "Breast factor": "breast_factor",
        "Hair color": "hair_color",
        "Eye color": "eye_color",
        "Birth date": "birth_date",
        "Ethnicity": "ethnicity",
        "Birth place": "birth_place"
    }
    info_section = soup.find("div", class_="info__features")
    if info_section is None:
        logging.warning(f"未找到 info__features 区块: {href}")
        return None, None
    # Seed every database field with an empty string so absent rows still
    # produce a complete record.
    record = dict.fromkeys(FIELD_MAPPING.values(), "")
    record['url'] = href
    for item in info_section.find_all("li", class_="content-desc__list-item"):
        title_tag = item.find("strong", class_="content-desc__list-title")
        value_tag = item.find("span", class_="content-desc__list-text")
        if not (title_tag and value_tag):
            continue
        # Translate the on-page title into our column name; skip unknown rows.
        db_field = FIELD_MAPPING.get(process_paragraph(title_tag))
        if db_field:
            record[db_field] = process_paragraph(value_tag)
    return record, None
###### Manual test code below ######
def test_actor_list():
    """Smoke test: crawl the model-list endpoint and print parsed entries.

    Hits the live site. The unconditional ``break`` at the bottom of the loop
    deliberately stops after the first page; the page counter added here keeps
    ranks correct if that limiter is ever removed (previously every page was
    parsed with num=1, repeating ranks 1..36).
    """
    s_url = "/ja/model"
    current_url = urljoin(host_url, s_url)
    page = 1  # fix: track the page number instead of hard-coding 1
    while current_url:
        print(f"[信息] 正在抓取 {current_url}")
        data = fetch_post_page(current_url)
        if not data:
            print(f"[错误] 无法获取数据 {current_url}")
            break
        # Sanity-check the JSON structure before parsing.
        if not all(key in data for key in ["status", "results_count", "pagination_params", "template"]):
            print(f"[错误] 数据结构异常: {data}")
            break
        all_data = parse_list_json(data, page)
        print(all_data)
        # Advance to the next page, if any.
        next_path = data.get("pagination_params", {}).get("next")
        if next_path:
            current_url = urljoin(host_url, next_path)
            page += 1
            print(f"next page: {current_url}")
        else:
            print("[信息] 已抓取所有页面。")
            break
        break  # deliberate limiter: only fetch the first page while testing
def test_actor():
    """Smoke test: fetch one actor-detail page and print the parsed record.

    Hits the live site. Fix: on a failed/invalid fetch the loop previously
    printed 'get wrong page.' without changing next_url and spun forever;
    it now breaks out.
    """
    next_url = 'https://javhd.com/en/model/Yui-Hatano'
    all_data = []
    validator = partial(generic_validator, tag="div", identifier="info__features", attr_type="class")
    while next_url:
        print(f'fetching page {next_url}')
        soup, http_status_code = fetch_page(next_url, validator)
        if soup:
            # parse_actor_detail returns (record, next_url) — next_url is
            # currently always None, which ends the loop after one page.
            list_data, next_url = parse_actor_detail(soup, next_url)
            if list_data:
                all_data.append(list_data)
        else:
            print('get wrong page.')
            break  # fix: previously looped forever on a persistently bad page
    print(all_data)
if __name__ == "__main__":
    # Manual smoke tests: both hit the live site and print their results.
    test_actor_list()
    test_actor()