231 lines
11 KiB
Python
231 lines
11 KiB
Python
# spiders/pornbox_spider.py
|
|
import scrapy
|
|
import json
|
|
import os
|
|
import sys
|
|
from datetime import datetime
|
|
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime
|
|
from scrapy_proj.spiders.base_spider import BaseSpider
|
|
from scrapy_proj.items import PBoxStuItem, PBoxMovItem, CommErrItem, PBoxActorIndexItem, PBoxAlternateItem, PBoxMovIndexItem, PBoxMovTagsItem
|
|
import scrapy_proj.comm.comm_def as comm
|
|
from scrapy_proj.utils.utils import format_timestamp
|
|
from scrapy_proj.db_wapper.spider_db_handler import PboxDBHandler
|
|
|
|
# Module-level DB handler used by the spider below to list known studios
# (get_studios) and to count stored movies per studio (get_stu_mov_count).
db_tools = PboxDBHandler()
|
|
|
|
class PornboxSpider(BaseSpider):
    """Spider that pulls the studio list and per-studio movie listings from pornbox.com.

    Two crawl stages are selectable through the ``cmd`` spider argument:

    * ``studio`` -- page through the studio list and emit ``PBoxStuItem``.
    * ``movies`` -- for every studio returned by ``db_tools.get_studios`` page
      through its movie listing and emit ``PBoxMovItem`` (with nested actor,
      tag and alternate-version items).
    """

    name = comm.SPIDER_NAME_PBOX
    allowed_domains = ["pornbox.com"]

    # NOTE(review): 'x-csrf-token' and 'cookie' below are hard-coded session
    # values. They presumably expire and probably should not live in source
    # control -- consider loading them from settings or the environment.
    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'accept': 'application/json, text/javascript, */*; q=0.01',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            "content-type": "application/json",
            'sec-ch-ua': '"Not)A;Brand";v="8", "Chromium";v="138", "Microsoft Edge";v="138"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'priority': 'u=1, i',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0',
            'x-requested-with': 'XMLHttpRequest',
            'x-csrf-token': 'GOgY0XYd-ZImUIrOID6f35AIw_ZbdvvITor4',
            'cookie': '_ga=GA1.1.1702111276.1741134024; agree18=1; _al=19t837144r; http_referer=https%3A%2F%2Ftheporndude.com%2F; entry_point=https%3A%2F%2Fpornbox.com%2Fapplication%2Fstudio%2Flist%3Faff%3DUBJGFRA8UU____; pornboxcookie=19c67b1326563c7f0y83hr5TyULWp6xXf3rI2-ML9bO5yV0ej1oycJoLejz2ZWQozMsVdb4HwoUgnYfElqZf4kNTJKpIFReAUgqbvZI3QPPdvmbnKLsblHN5Dytt0MRNqj5o_ul-SOOtvuKQqGkQU9bKpbOZfOtpZk5rqA==; sxc_affiliate=UBJGFRA8UU; product_offer=; boxsessid=s%3AvJIsw1Zeq96r_j2Bq24rpsHZ1VyI5aIR.R1biCZrwUhuzYELLQWepgNblLsGATRLl6xIFVLRlvs8; _als=k84lsh1olk; _ga_E272WS0NTB=GS2.1.s1751772066$o10$g1$t1751772680$j59$l0$h0; JDIALOG3=AQ79OJ467NWQONZM1TH55VFZR19PIWUGEZUKPV7G6UWMOFZRIJ; OLD_JDIALOG=88IN5MG4EGHO9R47VPQCNQ0HXD4C4VS7KOY4T3599BP8TM8BFF',
        },
        # 'COOKIES_ENABLED': True,  # cookies would have to be enabled to keep the session
    }

    def __init__(self, debug='false', cmd='', begin=None, mod='all', *args, **kwargs):
        """Configure the crawl from spider arguments.

        Args:
            debug: ``'true'`` / ``'1'`` (case-insensitive) enables debug mode,
                which limits how much is crawled.
            cmd: Comma-separated list of stages to run (``studio``, ``movies``,
                ``actors``). Empty means all stages.
            begin: Start date for incremental crawling; parsed with
                ``parse_date_to_datetime`` when given.
            mod: ``'update'`` together with a valid ``begin`` switches
                ``self.update_mod`` on.
        """
        super().__init__(*args, **kwargs)
        self.debug = str(debug).lower() in ('true', '1')
        self.update_mod = False
        self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")

        # Update mode requires both mod == 'update' and a start date.
        self.begin = parse_date_to_datetime(begin) if begin else None
        if str(mod).lower() == 'update' and self.begin:
            self.update_mod = True

        self.cmd_studio = 'studio'
        self.cmd_movie = 'movies'
        self.cmd_actors = 'actors'
        self.cmd_list = [self.cmd_studio, self.cmd_movie, self.cmd_actors]
        if cmd:
            # Strip whitespace and drop empty tokens so "studio, movies" or a
            # trailing comma still selects the intended stages.
            requested = [part.strip() for part in str(cmd).split(',') if part.strip()]
            if requested:
                self.cmd_list = requested

    def _build_studio_url(self, studio_id, page_id=1, sort_flag='latest'):
        """Return the movie-listing URL for one studio.

        ``sort_flag`` is one of latest / recent / popular; ``latest`` orders by
        publish time descending, which suits incremental updates. The trailing
        ``_`` parameter is the current time in milliseconds.
        """
        return f"https://pornbox.com/studio/{studio_id}/?skip={page_id}&sort={sort_flag}&_={int(datetime.now().timestamp()*1000)}"

    def _build_studio_list_url(self, page_id=1, sort_flag='popular'):
        """Return the URL of one page of the studio list."""
        return f"https://pornbox.com/studio/list/ppd?page={page_id}&sort={sort_flag}"

    def _load_json(self, response):
        """Decode ``response`` as a JSON object.

        Returns:
            ``(data, None)`` when the body is a JSON object, otherwise
            ``(None, err_item)`` where ``err_item`` is a populated
            ``CommErrItem`` describing the failure.
        """
        try:
            data = json.loads(response.text)
        except json.JSONDecodeError:
            data = None
        if isinstance(data, dict):
            return data, None

        # Undecodable bodies and non-object payloads (the callers rely on
        # dict access) are both reported through the same error item.
        err_item = CommErrItem()
        err_item['url'] = response.url
        err_item['error'] = '非 JSON 响应'
        err_item['status'] = response.status
        err_item['partial_content'] = response.text[:500]
        return None, err_item

    def custom_start_requests(self):
        """Yield the initial requests for every enabled stage.

        Entry point; according to the original comment it is triggered by the
        base class.
        """
        stats = self.crawler.stats

        # Stage 1: studio list.
        if self.cmd_studio in self.cmd_list:
            # Initialise the counters BEFORE yielding: this generator is
            # consumed lazily, so resetting them afterwards could overwrite
            # values already updated by a callback.
            stats.set_value(f"{self.name}/req_list_all", 0)
            stats.set_value(f"{self.name}/req_list_done", 0)
            url = self._build_studio_list_url()
            yield scrapy.Request(url, callback=self.parse_studios_list)

        # Stage 2: movie listing of every known studio.
        if self.cmd_movie in self.cmd_list:
            filters = {}
            if self.debug:
                filters['limit'] = 1
            stu_list = db_tools.get_studios(**filters) or []

            # Same ordering concern as above: set the counters first.
            stats.set_value(f"{self.name}/req_mov_all", len(stu_list))
            stats.set_value(f"{self.name}/req_mov_done", 0)

            for stu in stu_list:
                url = self._build_studio_url(stu['label_id'])
                yield scrapy.Request(
                    url,
                    callback=self.parse_studio,
                    meta={'stu_id': stu['label_id'], 'name': stu['name'], 'scene_count': stu['scene_count']},
                )

    def parse_studios_list(self, response):
        """Parse one page of the studio list.

        Yields a ``CommErrItem`` when the body is not a JSON object, otherwise
        one ``PBoxStuItem`` per entry in ``items`` followed by a request for
        the next page (debug mode stops following pages from page 5 on).
        """
        data, err_item = self._load_json(response)
        if err_item is not None:
            yield err_item
            return

        current_page = data.get('current_page', 1)
        total_pages = data.get('total_pages', 1)
        items = data.get('items') or []

        self.crawler.stats.inc_value(f"{self.name}/req_list_done")
        self.crawler.stats.set_value(f"{self.name}/req_list_all", total_pages)
        self.logger.info(f"url: {response.url}, total: {total_pages}, items: {len(items)}")

        for item in items:
            studio_item = PBoxStuItem()
            studio_item['item_type'] = comm.ITEM_TYPE_STUDIO
            studio_item['label_id'] = item.get('label_id')
            studio_item['name'] = item.get('name')
            studio_item['description'] = item.get('description')
            studio_item['scene_count'] = item.get('scene_count')
            studio_item['href'] = f"https://pornbox.com/application/studio/{item.get('label_id')}"
            yield studio_item

        # Pagination.
        if current_page >= total_pages:
            return
        if self.debug and current_page >= 5:
            return
        next_url = self._build_studio_list_url(current_page + 1)
        yield scrapy.Request(next_url, callback=self.parse_studios_list)

    def _build_movie_item(self, item):
        """Build a ``PBoxMovItem`` (with actors, tags and alternates) from one listing entry."""
        mov_item = PBoxMovItem()
        movie_id = item.get('id')

        mov_item['item_type'] = comm.ITEM_TYPE_MOVIE_DETAIL
        mov_item['movie_id'] = movie_id
        mov_item['content_id'] = item.get('content_id')
        mov_item['href'] = f"https://pornbox.com/application/watch-page/{item.get('id', 0)}"
        mov_item['publish_date'] = item.get('publish_date')
        mov_item['release_date'] = format_timestamp(item.get('release_date'))
        mov_item['title'] = item.get('scene_name')
        mov_item['duration'] = item.get('runtime')
        # `or 0` also covers an explicit null / empty label_id, which int()
        # would otherwise reject.
        mov_item['studio_id'] = int(item.get('label_id') or 0)
        mov_item['is_full_data'] = 1

        # Linked actors (index entries only, hence is_full_data = 0).
        actors_list = []
        for actor in item.get('models') or []:
            actor_index = PBoxActorIndexItem()
            actor_index['actor_id'] = actor.get('model_id', 0)
            actor_index['name'] = actor.get('model_name', 0)
            actor_index['gender'] = actor.get('sex', 0)
            actor_index['is_full_data'] = 0
            actor_index['href'] = f"https://pornbox.com/application/model/{actor.get('model_id', 0)}"
            actors_list.append(actor_index)
        mov_item['actor_index_list'] = actors_list

        # Linked tags.
        tags_list = []
        for tag in item.get('niches') or []:
            tag_item = PBoxMovTagsItem()
            tag_item['tag_id'] = tag.get('niche_id')
            tag_item['name'] = tag.get('niche')
            tag_item['href'] = f"https://pornbox.com/application/niche/{tag.get('niche_id')}"
            tags_list.append(tag_item)
        mov_item['mov_tags_list'] = tags_list

        # Linked alternate versions; the API field may be a single dict or a list.
        raw_alt = item.get('alternate')
        if isinstance(raw_alt, dict):
            alt_entries = [raw_alt]
        elif isinstance(raw_alt, list):
            alt_entries = raw_alt
        else:
            alt_entries = []

        alt_list = []
        for alt in alt_entries:
            alt_id = alt.get('content_id') if isinstance(alt, dict) else None
            if alt_id is None or movie_id is None:
                # min()/max() cannot order None; skip the malformed pair
                # instead of aborting the whole page.
                continue
            alt_item = PBoxAlternateItem()
            alt_item['min_mov_id'] = min(alt_id, movie_id)
            alt_item['max_mov_id'] = max(alt_id, movie_id)
            alt_list.append(alt_item)
        mov_item['mov_alt_list'] = alt_list

        return mov_item

    def parse_studio(self, response):
        """Parse one page of a studio's movie listing.

        Yields a ``CommErrItem`` when the body is not a JSON object, otherwise
        one ``PBoxMovItem`` per entry in ``contents`` and, when more data is
        wanted, a request for the next page carrying the same ``meta``.

        Expects ``response.meta`` to hold ``stu_id``, ``name`` and
        ``scene_count`` as set in ``custom_start_requests``.
        """
        data, err_item = self._load_json(response)
        if err_item is not None:
            yield err_item
            return

        current_page = data.get('currentPage', 1)
        total_pages = data.get('totalPages', 1)
        contents = data.get('contents') or []

        self.logger.debug(f"url: {response.url}, total: {total_pages}, curr: {current_page}, items: {len(contents)}")

        now = datetime.now()
        need_next = False
        for item in contents:
            mov_item = self._build_movie_item(item)

            # Paging stops only when EVERY movie on the page falls outside the
            # window (older than `begin`, or dated in the future). A single
            # odd date must not end the crawl, because dirty timestamps exist.
            # NOTE(review): this keys off self.begin, not self.update_mod, so a
            # `begin` given without mod=update still cuts paging short --
            # confirm that is intended.
            up_date = parse_date_to_datetime(mov_item['release_date'])
            self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {now}")
            out_of_window = bool(up_date and self.begin and (up_date < self.begin or up_date > now))
            if not out_of_window:
                need_next = True

            yield mov_item

        stu_id = response.meta['stu_id']
        stu_name = response.meta['name']
        scene_count = response.meta['scene_count']

        # Everything further on is old data (or there are no more pages).
        if not need_next or current_page >= total_pages or (self.debug and current_page >= 50000):
            self.crawler.stats.inc_value(f"{self.name}/req_mov_done")
            total_rows = db_tools.get_stu_mov_count(stu_id)
            self.logger.info(f'停止翻页. 更新模式: {self.update_mod}. studio: ({stu_name}), total movies: {total_rows}, scene_count: {scene_count}, url: {response.url}')
            return

        # Next page.
        next_url = self._build_studio_url(stu_id, current_page + 1)
        yield scrapy.Request(next_url, callback=self.parse_studio, meta=response.meta)