# spiders/pornbox_spider.py
import scrapy
import json
import os
import sys
from datetime import datetime
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime, format_timestamp
from scrapy_proj.spiders.base_spider import BaseSpider
from scrapy_proj.items import PBoxStuItem, PBoxMovItem, CommErrItem, PBoxActorIndexItem, PBoxAlternateItem, PBoxMovIndexItem, PBoxMovTagsItem
import scrapy_proj.comm.comm_def as comm
from scrapy_proj.db_wapper.spider_db_handler import PboxDBHandler
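# Module-level DB handler shared by the spider's callbacks; instantiated once at import time.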
db_tools = PboxDBHandler()
class PornboxSpider(BaseSpider):
name = comm.SPIDER_NAME_PBOX
allowed_domains = ["pornbox.com"]
custom_settings = {
'DEFAULT_REQUEST_HEADERS': {
'accept': 'application/json, text/javascript, */*; q=0.01',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
"content-type": "application/json",
'sec-ch-ua': '"Not)A;Brand";v="8", "Chromium";v="138", "Microsoft Edge";v="138"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"macOS"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'priority': 'u=1, i',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0',
'x-requested-with': 'XMLHttpRequest',
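            # NOTE: the CSRF token and cookie below are copied from a logged-in browser session and will likely expire; they presumably need to be refreshed before each run.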
'x-csrf-token' : 'GOgY0XYd-ZImUIrOID6f35AIw_ZbdvvITor4',
'cookie': '_ga=GA1.1.1702111276.1741134024; agree18=1; _al=19t837144r; http_referer=https%3A%2F%2Ftheporndude.com%2F; entry_point=https%3A%2F%2Fpornbox.com%2Fapplication%2Fstudio%2Flist%3Faff%3DUBJGFRA8UU____; pornboxcookie=19c67b1326563c7f0y83hr5TyULWp6xXf3rI2-ML9bO5yV0ej1oycJoLejz2ZWQozMsVdb4HwoUgnYfElqZf4kNTJKpIFReAUgqbvZI3QPPdvmbnKLsblHN5Dytt0MRNqj5o_ul-SOOtvuKQqGkQU9bKpbOZfOtpZk5rqA==; sxc_affiliate=UBJGFRA8UU; product_offer=; boxsessid=s%3AvJIsw1Zeq96r_j2Bq24rpsHZ1VyI5aIR.R1biCZrwUhuzYELLQWepgNblLsGATRLl6xIFVLRlvs8; _als=k84lsh1olk; _ga_E272WS0NTB=GS2.1.s1751772066$o10$g1$t1751772680$j59$l0$h0; JDIALOG3=AQ79OJ467NWQONZM1TH55VFZR19PIWUGEZUKPV7G6UWMOFZRIJ; OLD_JDIALOG=88IN5MG4EGHO9R47VPQCNQ0HXD4C4VS7KOY4T3599BP8TM8BFF',
},
        # 'COOKIES_ENABLED': True,  # enable cookies to keep the session alive
}
def __init__(self, debug='false', cmd='', begin=None, mod='all', *args, **kwargs):
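        # Example invocations (a sketch; <spider_name> is a placeholder for the real name from comm.SPIDER_NAME_PBOX,
        # and the begin date format depends on parse_date_to_datetime):
        #   scrapy crawl <spider_name> -a cmd=studio,movies -a debug=true
        #   scrapy crawl <spider_name> -a mod=update -a begin=2025-01-01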
super().__init__(*args, **kwargs)
        self.debug = str(debug).lower() in ('true', '1')
self.update_mod = False
self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")
        # Update mode: enabled only when mod == 'update' and a begin date is supplied
self.begin = parse_date_to_datetime(begin) if begin else None
if mod.lower() == 'update' and self.begin:
self.update_mod = True
self.cmd_studio = 'studio'
self.cmd_movie = 'movies'
self.cmd_actors = 'actors'
self.cmd_list = [self.cmd_studio, self.cmd_movie, self.cmd_actors]
        if cmd:
self.cmd_list = cmd.split(',')
def _build_studio_url(self, studio_id, page_id=1, sort_flag='latest'):
        # sort options: latest, recent, popular; 'latest' orders by publish date descending, which makes incremental update pulls easy
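        # The trailing "_" query parameter is just the current time in milliseconds, acting as a cache buster,
        # e.g. https://pornbox.com/studio/123/?skip=2&sort=latest&_=1752000000000  (illustrative ids/values)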
return f"https://pornbox.com/studio/{studio_id}/?skip={page_id}&sort={sort_flag}&_={int(datetime.now().timestamp()*1000)}"
def _build_studio_list_url(self, page_id=1, sort_flag='popular'):
return f"https://pornbox.com/studio/list/ppd?page={page_id}&sort={sort_flag}"
    # Entry point, invoked by the base class
def custom_start_requests(self):
        # Studio list
if self.cmd_studio in self.cmd_list:
url = self._build_studio_list_url()
yield scrapy.Request(url, callback=self.parse_studios_list)
self.crawler.stats.set_value(f"{self.name}/req_list_all", 0)
self.crawler.stats.set_value(f"{self.name}/req_list_done", 0)
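            # req_list_all / req_list_done track list pagination progress (total pages vs pages parsed);
            # req_list_all is overwritten with the real page count once the first response arrives.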
        # For each studio, fetch its movie details
        if self.cmd_movie in self.cmd_list:
            filters = {}
            if self.debug:
                filters['limit'] = 1
            stu_list = db_tools.get_studios(**filters)
for stu in stu_list:
url = self._build_studio_url(stu['label_id'])
yield scrapy.Request(url, callback=self.parse_studio, meta={'stu_id':stu['label_id'], 'name': stu['name'], 'scene_count': stu['scene_count']})
            # Register stats counters
self.crawler.stats.set_value(f"{self.name}/req_mov_all", len(stu_list))
self.crawler.stats.set_value(f"{self.name}/req_mov_done", 0)
def parse_studios_list(self, response):
        # Try to parse the JSON response
        try:
            data = json.loads(response.text)
        except json.JSONDecodeError:
            err_item = CommErrItem()
            err_item['url'] = response.url
            err_item['error'] = 'Non-JSON response'
            err_item['status'] = response.status
            err_item['partial_content'] = response.text[:500]
            yield err_item
            return  # skip further processing
        # Extract the current page number and total page count
current_page = data.get('current_page', 1)
total_pages = data.get('total_pages', 1)
self.crawler.stats.inc_value(f"{self.name}/req_list_done")
self.crawler.stats.set_value(f"{self.name}/req_list_all", total_pages)
self.logger.info(f"url: {response.url}, total: {total_pages}, items: {len(data.get('items', []))}")
        # Process each studio entry
for item in data.get('items', []):
studio_item = PBoxStuItem()
            # Extract studio info
studio_item['item_type'] = comm.ITEM_TYPE_STUDIO
studio_item['label_id'] = item.get('label_id')
studio_item['name'] = item.get('name')
studio_item['description'] = item.get('description')
studio_item['scene_count'] = item.get('scene_count')
studio_item['href'] = f"https://pornbox.com/application/studio/{item.get('label_id')}"
yield studio_item
        # Pagination: in debug mode stop after 5 pages
        if current_page < total_pages:
            if not (self.debug and current_page >= 5):
                next_url = self._build_studio_list_url(current_page + 1)
                yield scrapy.Request(next_url, callback=self.parse_studios_list)
def parse_studio(self, response):
        # Try to parse the JSON response
        try:
            data = json.loads(response.text)
        except json.JSONDecodeError:
            err_item = CommErrItem()
            err_item['url'] = response.url
            err_item['error'] = 'Non-JSON response'
            err_item['status'] = response.status
            err_item['partial_content'] = response.text[:500]
            yield err_item
            return  # skip further processing
        # Extract the current page number and total page count
current_page = data.get('currentPage', 1)
total_pages = data.get('totalPages', 1)
self.logger.debug(f"url: {response.url}, total: {total_pages}, curr: {current_page}, items: {len(data.get('contents', []))}")
need_next = False
        # Process each movie entry
for item in data.get('contents', []):
mov_item = PBoxMovItem()
            # Extract movie info
mov_item['item_type'] = comm.ITEM_TYPE_MOVIE_DETAIL
mov_item['movie_id'] = item.get('id')
mov_item['content_id'] = item.get('content_id')
mov_item['href'] = f"https://pornbox.com/application/watch-page/{item.get('id', 0)}"
mov_item['publish_date'] = item.get('publish_date')
mov_item['release_date'] = format_timestamp(item.get('release_date'))
mov_item['title'] = item.get('scene_name')
mov_item['duration'] = item.get('runtime')
mov_item['studio_id'] = int(item.get('label_id', '0'))
mov_item['is_full_data'] = 1
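            # The listing payload is treated as a complete movie record; the actor entries below are
            # index-only (is_full_data = 0) and are presumably filled in by a separate crawl.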
            # Linked actors
actors_list = []
for actor in item.get('models', []):
actor_index = PBoxActorIndexItem()
actor_index['actor_id'] = actor.get('model_id', 0)
                actor_index['name'] = actor.get('model_name', '')
actor_index['gender'] = actor.get('sex', 0)
actor_index['is_full_data'] = 0
actor_index['href'] = f"https://pornbox.com/application/model/{actor.get('model_id', 0)}"
actors_list.append(actor_index)
mov_item['actor_index_list'] = actors_list
            # Linked tags
tags_list = []
for tag in item.get('niches', []):
tag_item = PBoxMovTagsItem()
tag_item['tag_id'] = tag.get('niche_id')
tag_item['name'] = tag.get('niche')
tag_item['href'] = f"https://pornbox.com/application/niche/{tag.get('niche_id')}"
tags_list.append(tag_item)
mov_item['mov_tags_list'] = tags_list
            # Linked alternate versions
alt_list = []
alt_items = []
if isinstance(item.get('alternate'), dict):
alt_items.append(item.get('alternate'))
elif isinstance(item.get('alternate'), list):
alt_items = item.get('alternate')
for alt in alt_items:
alt_item = PBoxAlternateItem()
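                # Store the two related ids in a fixed (min, max) order so the pair is order-independent,
                # presumably letting duplicate A/B vs B/A links collapse downstream.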
alt_item['min_mov_id'] = min(alt.get('content_id'), mov_item['movie_id'])
alt_item['max_mov_id'] = max(alt.get('content_id'), mov_item['movie_id'])
alt_list.append(alt_item)
mov_item['mov_alt_list'] = alt_list
            # Decide whether to keep paging: stop only once every item on a page is older than the begin date
            # (guards against dirty data with inaccurate dates; otherwise a single older item would be enough to stop).
up_date = parse_date_to_datetime(mov_item['release_date'])
self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
            if not (up_date and self.begin and (up_date < self.begin or up_date > datetime.now())):
                need_next = True
yield mov_item
        # Everything past this point would be old data, so there is no need to keep paging
stu_id = response.meta['stu_id']
stu_name = response.meta['name']
scene_count = response.meta['scene_count']
if not need_next or current_page >= total_pages or (self.debug and current_page >= 50000):
self.crawler.stats.inc_value(f"{self.name}/req_mov_done")
total_rows = db_tools.get_stu_mov_count(stu_id)
            self.logger.info(f'Stop paging. update mode: {self.update_mod}, studio: ({stu_name}), total movies: {total_rows}, scene_count: {scene_count}, url: {response.url}')
return
        # Next page
next_url = self._build_studio_url(stu_id, current_page + 1)
yield scrapy.Request(next_url, callback=self.parse_studio, meta=response.meta)
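
# A minimal way to run this spider outside the scrapy CLI (a sketch, assuming the standard
# project layout so get_project_settings() can locate scrapy_proj's settings module):
#
#   from scrapy.crawler import CrawlerProcess
#   from scrapy.utils.project import get_project_settings
#
#   process = CrawlerProcess(get_project_settings())
#   process.crawl(PornboxSpider, cmd='studio', debug='true')
#   process.start()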