# javhd.com scraper: fetches model-list JSON pages and actor-detail HTML pages
# via cloudscraper, parses them, and extracts profile fields.
import cloudscraper
|
||
import time
|
||
import json
|
||
import csv
|
||
import logging
|
||
import signal
|
||
import sys
|
||
import os
|
||
import re
|
||
from bs4 import BeautifulSoup
|
||
from requests.exceptions import RequestException
|
||
from functools import partial
|
||
from urllib.parse import urljoin, urlparse
|
||
import config
|
||
import utils
|
||
|
||
# Base URL and variable parameters
host_url = "https://javhd.com"
lang_prefix = ["ja", "en", "zh"]

# Status codes used by fetch helpers (the last one is internal, not HTTP)
http_code_404 = 404    # page not found
http_code_login = 401  # redirected to the login page
http_code_local = 99   # pseudo-code (< 100): page was served from the local cache

save_raw_html = False    # when True, persist fetched HTML via utils.write_raw_html
load_from_local = False  # when True, try utils.read_raw_html before the network

# Headers for JSON POST requests (model-list pagination endpoint).
# FIX: the original literal repeated the "content-type", "user-agent" and
# "x-requested-with" keys; Python silently keeps only the last occurrence of a
# duplicated dict key, so the duplicates (identical values) are removed here.
# NOTE(review): the cookie embeds a session id — it will expire; consider
# loading it from config instead of hard-coding it.
POST_HEADERS = {
    "accept": "application/json, text/plain, */*",
    "content-type": "application/json",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0",
    "x-requested-with": "XMLHttpRequest",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    'cookie': 'adult-warning-popup=disabled; st_d=%7B%7D; feid=c18cd2f2cf5c034d120e5975558acc8c; xfeid=3b040b0aecba9d3df41f21732480d947; _ym_uid=1739069925634817268; _ym_d=1739069925; atas_uid=; _clck=1cd9xpy%7C2%7Cftb%7C0%7C1866; _ym_isad=2; nats=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_cookie=https%253A%252F%252Fcn.pornhub.com%252F; nats_unique=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_sess=480e7410e649efce6003c3add587a579; nats_landing=No%2BLanding%2BPage%2BURL; JAVSESSID=n42hnvj3ecr0r6tadusladpk3h; user_lang=zh; locale=ja; utm=%7B%22ads_type%22%3A%22%22%7D; sid=3679b28ec523df85ec4e7739e32f2008; _ym_visorc=w; feid_sa=62; sid_sa=2',
    'origin': 'https://javhd.com',
    'priority': 'u=1, i',
    'referer': 'https://javhd.com/ja/model',
    'sec-ch-ua': '"Not A(Brand";v="8", "Chromium";v="132", "Microsoft Edge";v="132"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
}

POST_DATA = {}  # empty dict means no request payload

# Headers for regular GET page fetches.
HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    'cookie': 'adult-warning-popup=disabled; st_d=%7B%7D; feid=c18cd2f2cf5c034d120e5975558acc8c; xfeid=3b040b0aecba9d3df41f21732480d947; _ym_uid=1739069925634817268; _ym_d=1739069925; atas_uid=; _clck=1cd9xpy%7C2%7Cftb%7C0%7C1866; _ym_isad=2; nats=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_cookie=https%253A%252F%252Fcn.pornhub.com%252F; nats_unique=ODY0LjIuMi4yNi4yMzQuMC4wLjAuMA; nats_sess=480e7410e649efce6003c3add587a579; nats_landing=No%2BLanding%2BPage%2BURL; JAVSESSID=n42hnvj3ecr0r6tadusladpk3h; user_lang=zh; locale=ja; utm=%7B%22ads_type%22%3A%22%22%7D; sid=3679b28ec523df85ec4e7739e32f2008; _ym_visorc=w; feid_sa=62; sid_sa=2',
    'origin': 'https://javhd.com',
    'priority': 'u=1, i',
    'referer': 'https://javhd.com/ja/model',
    'sec-ch-ua': '"Not A(Brand";v="8", "Chromium";v="132", "Microsoft Edge";v="132"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
}
|
||
|
||
# Shared CloudScraper session used by every request helper below.
scraper = cloudscraper.create_scraper()
|
||
|
||
# POST request returning parsed JSON
def fetch_post_page(url, retries=3):
    """POST to the given URL and return the parsed JSON body, with retries.

    Args:
        url: Absolute URL to POST to.
        retries: Maximum number of attempts before giving up.

    Returns:
        The decoded JSON object, or None if every attempt failed.
    """
    for attempt in range(retries):
        try:
            response = scraper.post(url=url, headers=POST_HEADERS, json=POST_DATA, timeout=10)
            response.raise_for_status()
            return response.json()
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            # FIX: corrected "Retring" typo in the log messages below.
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"[错误] 请求失败 {url}: {e}, 重试 {attempt + 1}/{retries}")
        # FIX: only back off between attempts — no pointless sleep after the last one.
        if attempt < retries - 1:
            time.sleep(2)
    return None
|
||
|
||
|
||
# Fetch a page with CloudScraper, validate its structure, and optionally serve
# it from / save it to a local HTML cache; supports a custom parser and an HTML
# preprocessor.
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    """Fetch *url* and return ``(soup, status)`` once the page validates.

    Args:
        url: Page URL; must contain "javhd.com".
        validator: Callable taking a BeautifulSoup object; truthy when the
            page has the expected structure.
        max_retries: Number of network attempts before giving up.
        parser: BeautifulSoup parser name.
        preprocessor: Optional callable applied to the raw HTML before parsing.

    Returns:
        (soup, http_code_local) when served from the local cache;
        (None, http_code_404) on a 404; (None, http_code_login) when
        redirected to the login page; (soup, status_code) on success;
        (None, None) for a malformed URL or after all retries fail.
    """
    if load_from_local:  # local-cache path
        html = utils.read_raw_html(url)
        if html:
            # Preprocess the HTML if a preprocessor was supplied.
            html_text = preprocessor(html) if preprocessor else html

            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # custom page structure check
                logging.debug(f"read from local. href: {url}")
                return soup, http_code_local  # code < 100 marks a local-cache hit

    for attempt in range(max_retries):
        try:
            if 'javhd.com' not in url.lower():
                logging.error(f'wrong url format: {url}')
                return None, None

            # FIX: added timeout=10 for consistency with fetch_post_page, so a
            # hung connection cannot block the whole run.
            response = scraper.get(url, headers=HEADERS, timeout=10)

            # Handle HTTP status codes
            if response.status_code == 404:
                logging.debug(f"Page not found (404): {url}")
                return None, http_code_404  # caller may simply skip this page

            response.raise_for_status()  # raise on other HTTP errors

            # A redirect may mean we were bounced to the login page.
            if response.history:
                logging.debug(f"Page redirected on {url}. Checking if it's a login page.")
                soup = BeautifulSoup(response.text, parser)
                # Detect the login page by its form panel.
                if soup.find('nav', class_='panel form-panel'):
                    logging.debug(f"Page redirected to login page on {url}.")
                    return None, http_code_login

            if save_raw_html:
                utils.write_raw_html(url, response.text)

            # Preprocess the HTML if a preprocessor was supplied.
            html_text = preprocessor(response.text) if preprocessor else response.text

            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # custom page structure check
                return soup, response.status_code

            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except cloudscraper.exceptions.CloudflareChallengeError as e:
            # FIX: corrected "Retring" typo in the log messages below.
            logging.error(f"Cloudflare Challenge Error on {url}: {e}, Retrying...")
        except cloudscraper.exceptions.CloudflareCode1020 as e:
            logging.error(f"Access Denied (Error 1020) on {url}: {e}, Retrying...")
        except Exception as e:
            logging.error(f"Unexpected error on {url}: {e}, Retrying...")

    logging.error(f'Fetching failed after max retries. {url}')
    return None, None  # still failing after the maximum number of retries
|
||
|
||
# Repair the HTML structure: drop stray <br> tags and make <a> tags open in a
# new tab (needed when extracting the ethnicity field).
def preprocess_html(html):
    """Return *html* with <br> removed and target="_blank" injected into <a> tags."""
    without_breaks = html.replace('<br>', '')
    return without_breaks.replace('<a ', '<a target="_blank" ')
|
||
|
||
# Generic HTML structure validator
def generic_validator(soup, tag, identifier, attr_type="id"):
    """Return True when *soup* contains the element described by the arguments.

    attr_type selects the lookup strategy: "id" matches *tag* by id,
    "class" matches *tag* by CSS class, "name" matches a <select> by its
    name attribute (note: the "name" branch ignores *tag* — it always
    queries 'select'). Any other attr_type yields False.
    """
    checks = {
        "id": lambda: soup.find(tag, id=identifier) is not None,
        "class": lambda: bool(soup.find_all(tag, class_=identifier)),
        "name": lambda: bool(soup.find('select', {'name': identifier})),
    }
    check = checks.get(attr_type)
    return check() if check else False
|
||
|
||
|
||
# Parse a model-list page's JSON payload
def parse_list_json(data, num, lang='en'):
    """Extract model entries from a listing-page JSON response.

    Args:
        data: JSON dict whose "template" field holds rendered HTML with
            <thumb-component ...> tags.
        num: 1-based page number (each page holds 36 entries), used for rank.
        lang: Language code used to name the title key (e.g. "en_name").

    Returns:
        List of dicts with keys: rank, url, pic, and f"{lang}_name".
    """
    template = data.get("template", "")
    thumb_components = re.findall(r'<thumb-component[^>]*>', template)

    list_data = []
    for idx, thumb in enumerate(thumb_components, start=1):
        rank = (num - 1) * 36 + idx

        link_content = re.search(r'link-content="(.*?)"', thumb)
        url_thumb = re.search(r'url-thumb="(.*?)"', thumb)
        title = re.search(r'title="(.*?)"', thumb)

        # FIX: link_content was previously not None-checked, which crashed
        # with AttributeError below when the attribute was missing.
        # Also fixed the "countent" typo in the warning message.
        if not link_content or not url_thumb or not title:
            logging.warning(f"no content for rank:{rank} title:{title} url:{url_thumb} {thumb}")
            continue

        # FIX: renamed the per-entry dict so it no longer shadows the
        # `data` parameter.
        entry = {"rank": rank, "url": link_content.group(1), "pic": url_thumb.group(1)}
        entry[f"{lang}_name"] = title.group(1)

        list_data.append(entry)

    return list_data
|
||
|
||
def process_paragraph(paragraph):
    """Return the plain text of *paragraph* (a bs4 tag), stripped of whitespace."""
    # Serialize the tag back to full HTML (rather than calling get_text() on it
    # directly), then re-parse so watermark/nested markup is flattened before
    # the text is extracted.
    markup = str(paragraph)
    reparsed = BeautifulSoup(markup, 'html.parser')
    return reparsed.get_text().strip()
|
||
|
||
|
||
# Parse the actor detail HTML and extract the profile fields we need
def parse_actor_detail(soup, href):
    """Pull profile attributes out of an actor detail page.

    Args:
        soup: Parsed detail page.
        href: Source URL, stored in the result under "url".

    Returns:
        (record, None) where record maps database column names to values,
        or (None, None) when the info__features section is missing.
    """
    features = soup.find("div", class_="info__features")
    if not features:
        logging.warning(f"未找到 info__features 区块: {href}")
        return None, None

    # Page label -> database column name
    FIELD_MAPPING = {
        "Height": "height",
        "Weight": "weight",
        "Breast size": "breast_size",
        "Breast factor": "breast_factor",
        "Hair color": "hair_color",
        "Eye color": "eye_color",
        "Birth date": "birth_date",
        "Ethnicity": "ethnicity",
        "Birth place": "birth_place"
    }

    # Start with every column present but empty, plus the source URL.
    record = dict.fromkeys(FIELD_MAPPING.values(), "")
    record['url'] = href

    for item in features.find_all("li", class_="content-desc__list-item"):
        label_tag = item.find("strong", class_="content-desc__list-title")
        text_tag = item.find("span", class_="content-desc__list-text")
        if not (label_tag and text_tag):
            continue

        # Map the page's raw label onto its database column; unknown
        # labels are ignored.
        label = process_paragraph(label_tag)
        column = FIELD_MAPPING.get(label)
        if column:
            record[column] = process_paragraph(text_tag)
    return record, None
|
||
|
||
###### Test code below ######
def test_actor_list():
    """Fetch the first model-list page and print its parsed entries."""
    start_path = "/ja/model"
    current_url = urljoin(host_url, start_path)
    while current_url:
        print(f"[信息] 正在抓取 {current_url}")
        payload = fetch_post_page(current_url)

        if not payload:
            print(f"[错误] 无法获取数据 {current_url}")
            break

        # Sanity-check the JSON structure before parsing.
        required = ("status", "results_count", "pagination_params", "template")
        if not all(key in payload for key in required):
            print(f"[错误] 数据结构异常: {payload}")
            break

        print(parse_list_json(payload, 1))

        # Resolve the next page, if any.
        next_path = payload.get("pagination_params", {}).get("next")
        if next_path:
            current_url = urljoin(host_url, next_path)
            print(f"next page: {current_url}")
        else:
            print("[信息] 已抓取所有页面。")
            break

        break  # deliberately stop after the first page during testing
|
||
|
||
def test_actor():
    """Fetch a single actor detail page and print the extracted profile data."""
    next_url = 'https://javhd.com/en/model/Yui-Hatano'
    all_data = []
    while next_url:
        print(f'fetching page {next_url}')
        soup, http_status_code = fetch_page(next_url, partial(generic_validator, tag="div", identifier="info__features", attr_type="class"))
        if soup:
            # parse_actor_detail returns (data, next_url); next_url is
            # currently always None, ending the loop after one page.
            list_data, next_url = parse_actor_detail(soup, next_url)
            if list_data:
                all_data.append(list_data)
        else:
            print('get wrong page.')
            # FIX: without this break the loop spins forever when fetching
            # keeps failing, because next_url is never updated.
            break
    print(all_data)
|
||
|
||
|
||
# Manual smoke tests: hitting the live site when run as a script.
if __name__ == "__main__":
    test_actor_list()
    test_actor()
|
||
|