# spiders/pornbox_spider.py
import scrapy
import json
import os
import sys
from datetime import datetime
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime, format_timestamp
from scrapy_proj.spiders.base_spider import BaseSpider
from scrapy_proj.items import PBoxStuItem, PBoxMovItem, CommErrItem, PBoxActorIndexItem, PBoxAlternateItem, PBoxMovIndexItem, PBoxMovTagsItem
import scrapy_proj.comm.comm_def as comm
from scrapy_proj.db_wapper.spider_db_handler import PboxDBHandler
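# Module-level DB handler shared by the spider's callbacks; instantiated once at import time.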
db_tools = PboxDBHandler()
class PornboxSpider(BaseSpider):
name = comm.SPIDER_NAME_PBOX
allowed_domains = ["pornbox.com"]
custom_settings = {
'DEFAULT_REQUEST_HEADERS': {
'accept': 'application/json, text/javascript, */*; q=0.01',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
"content-type": "application/json",
'sec-ch-ua': '"Not)A;Brand";v="8", "Chromium";v="138", "Microsoft Edge";v="138"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"macOS"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'priority': 'u=1, i',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0',
'x-requested-with': 'XMLHttpRequest',
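            # NOTE: the CSRF token and cookie below are copied from a logged-in browser session and will likely expire; they presumably need to be refreshed before each run.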
'x-csrf-token' : 'GOgY0XYd-ZImUIrOID6f35AIw_ZbdvvITor4',
'cookie': '_ga=GA1.1.1702111276.1741134024; agree18=1; _al=19t837144r; http_referer=https%3A%2F%2Ftheporndude.com%2F; entry_point=https%3A%2F%2Fpornbox.com%2Fapplication%2Fstudio%2Flist%3Faff%3DUBJGFRA8UU____; pornboxcookie=19c67b1326563c7f0y83hr5TyULWp6xXf3rI2-ML9bO5yV0ej1oycJoLejz2ZWQozMsVdb4HwoUgnYfElqZf4kNTJKpIFReAUgqbvZI3QPPdvmbnKLsblHN5Dytt0MRNqj5o_ul-SOOtvuKQqGkQU9bKpbOZfOtpZk5rqA==; sxc_affiliate=UBJGFRA8UU; product_offer=; boxsessid=s%3AvJIsw1Zeq96r_j2Bq24rpsHZ1VyI5aIR.R1biCZrwUhuzYELLQWepgNblLsGATRLl6xIFVLRlvs8; _als=k84lsh1olk; _ga_E272WS0NTB=GS2.1.s1751772066$o10$g1$t1751772680$j59$l0$h0; JDIALOG3=AQ79OJ467NWQONZM1TH55VFZR19PIWUGEZUKPV7G6UWMOFZRIJ; OLD_JDIALOG=88IN5MG4EGHO9R47VPQCNQ0HXD4C4VS7KOY4T3599BP8TM8BFF',
},
        # 'COOKIES_ENABLED': True,  # enable cookies to keep the session alive
}
def __init__(self, debug='false', cmd='', begin=None, mod='all', *args, **kwargs):
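        # Example invocations (a sketch; <spider_name> is a placeholder for the real name from comm.SPIDER_NAME_PBOX,
        # and the begin date format depends on parse_date_to_datetime):
        #   scrapy crawl <spider_name> -a cmd=studio,movies -a debug=true
        #   scrapy crawl <spider_name> -a mod=update -a begin=2025-01-01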
super().__init__(*args, **kwargs)
        self.debug = str(debug).lower() in ('true', '1')
self.update_mod = False
self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")
        # Update mode: enabled only when mod == 'update' and a begin date is supplied
self.begin = parse_date_to_datetime(begin) if begin else None
if mod.lower() == 'update' and self.begin:
self.update_mod = True
self.cmd_studio = 'studio'
self.cmd_movie = 'movies'
self.cmd_actors = 'actors'
self.cmd_list = [self.cmd_studio, self.cmd_movie, self.cmd_actors]
        if cmd:
self.cmd_list = cmd.split(',')
def _build_studio_url(self, studio_id, page_id=1, sort_flag='latest'):
        # sort options: latest, recent, popular; 'latest' orders by publish date descending, which makes incremental update pulls easy
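        # The trailing "_" query parameter is just the current time in milliseconds, acting as a cache buster,
        # e.g. https://pornbox.com/studio/123/?skip=2&sort=latest&_=1752000000000  (illustrative ids/values)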
return f"https://pornbox.com/studio/{studio_id}/?skip={page_id}&sort={sort_flag}&_={int(datetime.now().timestamp()*1000)}"
def _build_studio_list_url(self, page_id=1, sort_flag='popular'):
return f"https://pornbox.com/studio/list/ppd?page={page_id}&sort={sort_flag}"
    # Entry point, invoked by the base class
def custom_start_requests(self):
        # Studio list
if self.cmd_studio in self.cmd_list:
url = self._build_studio_list_url()
yield scrapy.Request(url, callback=self.parse_studios_list)
self.crawler.stats.set_value(f"{self.name}/req_list_all", 0)
self.crawler.stats.set_value(f"{self.name}/req_list_done", 0)
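            # req_list_all / req_list_done track list pagination progress (total pages vs pages parsed);
            # req_list_all is overwritten with the real page count once the first response arrives.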
        # For each studio, fetch its movie details
        if self.cmd_movie in self.cmd_list:
            filters = {}
            if self.debug:
                filters['limit'] = 1
            stu_list = db_tools.get_studios(**filters)
for stu in stu_list:
url = self._build_studio_url(stu['label_id'])
yield scrapy.Request(url, callback=self.parse_studio, meta={'stu_id':stu['label_id'], 'name': stu['name'], 'scene_count': stu['scene_count']})
            # Register stats counters
self.crawler.stats.set_value(f"{self.name}/req_mov_all", len(stu_list))
self.crawler.stats.set_value(f"{self.name}/req_mov_done", 0)
def parse_studios_list(self, response):
        # Try to parse the JSON response
        try:
            data = json.loads(response.text)
        except json.JSONDecodeError:
            err_item = CommErrItem()
            err_item['url'] = response.url
            err_item['error'] = 'Non-JSON response'
            err_item['status'] = response.status
            err_item['partial_content'] = response.text[:500]
            yield err_item
            return  # skip further processing
        # Extract the current page number and total page count
current_page = data.get('current_page', 1)
total_pages = data.get('total_pages', 1)
self.crawler.stats.inc_value(f"{self.name}/req_list_done")
self.crawler.stats.set_value(f"{self.name}/req_list_all", total_pages)
self.logger.info(f"url: {response.url}, total: {total_pages}, items: {len(data.get('items', []))}")
        # Process each studio entry
for item in data.get('items', []):
studio_item = PBoxStuItem()
            # Extract studio info
studio_item['item_type'] = comm.ITEM_TYPE_STUDIO
studio_item['label_id'] = item.get('label_id')
studio_item['name'] = item.get('name')
studio_item['description'] = item.get('description')
studio_item['scene_count'] = item.get('scene_count')
studio_item['href'] = f"https://pornbox.com/application/studio/{item.get('label_id')}"
yield studio_item
        # Pagination: in debug mode stop after 5 pages
        if current_page < total_pages:
            if not (self.debug and current_page >= 5):
                next_url = self._build_studio_list_url(current_page + 1)
                yield scrapy.Request(next_url, callback=self.parse_studios_list)
def parse_studio(self, response):
        # Try to parse the JSON response
        try:
            data = json.loads(response.text)
        except json.JSONDecodeError:
            err_item = CommErrItem()
            err_item['url'] = response.url
            err_item['error'] = 'Non-JSON response'
            err_item['status'] = response.status
            err_item['partial_content'] = response.text[:500]
            yield err_item
            return  # skip further processing
        # Extract the current page number and total page count
current_page = data.get('currentPage', 1)
total_pages = data.get('totalPages', 1)
self.logger.debug(f"url: {response.url}, total: {total_pages}, curr: {current_page}, items: {len(data.get('contents', []))}")
need_next = False
        # Process each movie entry
for item in data.get('contents', []):
mov_item = PBoxMovItem()
            # Extract movie info
mov_item['item_type'] = comm.ITEM_TYPE_MOVIE_DETAIL
mov_item['movie_id'] = item.get('id')
mov_item['content_id'] = item.get('content_id')
mov_item['href'] = f"https://pornbox.com/application/watch-page/{item.get('id', 0)}"
mov_item['publish_date'] = item.get('publish_date')
mov_item['release_date'] = format_timestamp(item.get('release_date'))
mov_item['title'] = item.get('scene_name')
mov_item['duration'] = item.get('runtime')
mov_item['studio_id'] = int(item.get('label_id', '0'))
mov_item['is_full_data'] = 1
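            # The listing payload is treated as a complete movie record; the actor entries below are
            # index-only (is_full_data = 0) and are presumably filled in by a separate crawl.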
            # Linked actors
actors_list = []
for actor in item.get('models', []):
actor_index = PBoxActorIndexItem()
actor_index['actor_id'] = actor.get('model_id', 0)
                actor_index['name'] = actor.get('model_name', '')
actor_index['gender'] = actor.get('sex', 0)
actor_index['is_full_data'] = 0
actor_index['href'] = f"https://pornbox.com/application/model/{actor.get('model_id', 0)}"
actors_list.append(actor_index)
mov_item['actor_index_list'] = actors_list
            # Linked tags
tags_list = []
for tag in item.get('niches', []):
tag_item = PBoxMovTagsItem()
tag_item['tag_id'] = tag.get('niche_id')
tag_item['name'] = tag.get('niche')
tag_item['href'] = f"https://pornbox.com/application/niche/{tag.get('niche_id')}"
tags_list.append(tag_item)
mov_item['mov_tags_list'] = tags_list
            # Linked alternate versions
alt_list = []
alt_items = []
if isinstance(item.get('alternate'), dict):
alt_items.append(item.get('alternate'))
elif isinstance(item.get('alternate'), list):
alt_items = item.get('alternate')
for alt in alt_items:
alt_item = PBoxAlternateItem()
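                # Store the two related ids in a fixed (min, max) order so the pair is order-independent,
                # presumably letting duplicate A/B vs B/A links collapse downstream.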
alt_item['min_mov_id'] = min(alt.get('content_id'), mov_item['movie_id'])
alt_item['max_mov_id'] = max(alt.get('content_id'), mov_item['movie_id'])
alt_list.append(alt_item)
mov_item['mov_alt_list'] = alt_list
            # Decide whether to keep paging: stop only once every item on a page is older than the begin date
            # (guards against dirty data with inaccurate dates; otherwise a single older item would be enough to stop).
up_date = parse_date_to_datetime(mov_item['release_date'])
self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
            if not (up_date and self.begin and (up_date < self.begin or up_date > datetime.now())):
                need_next = True
yield mov_item
        # Everything past this point would be old data, so there is no need to keep paging
stu_id = response.meta['stu_id']
stu_name = response.meta['name']
scene_count = response.meta['scene_count']
if not need_next or current_page >= total_pages or (self.debug and current_page >= 50000):
self.crawler.stats.inc_value(f"{self.name}/req_mov_done")
total_rows = db_tools.get_stu_mov_count(stu_id)
            self.logger.info(f'Stop paging. update mode: {self.update_mod}, studio: ({stu_name}), total movies: {total_rows}, scene_count: {scene_count}, url: {response.url}')
return
        # Next page
next_url = self._build_studio_url(stu_id, current_page + 1)
yield scrapy.Request(next_url, callback=self.parse_studio, meta=response.meta)
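
# A minimal way to run this spider outside the scrapy CLI (a sketch, assuming the standard
# project layout so get_project_settings() can locate scrapy_proj's settings module):
#
#   from scrapy.crawler import CrawlerProcess
#   from scrapy.utils.project import get_project_settings
#
#   process = CrawlerProcess(get_project_settings())
#   process.crawl(PornboxSpider, cmd='studio', debug='true')
#   process.start()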