From 1ab4d55483109379835925e9d060e54ffb0e4340 Mon Sep 17 00:00:00 2001
From: oscarz
Date: Sun, 6 Jul 2025 19:12:51 +0800
Subject: [PATCH] modify scripts

---
 scrapy_proj/pbox/1.json                   |  0
 scrapy_proj/pbox/2.json                   |  0
 scrapy_proj/pbox/3.json                   |  0
 scrapy_proj/pbox/4.json                   |  0
 scrapy_proj/pbox/5.json                   |  0
 .../scrapy_proj/spiders/pornbox_spider.py | 33 ++++++++++++++-----
 6 files changed, 24 insertions(+), 9 deletions(-)
 create mode 100644 scrapy_proj/pbox/1.json
 create mode 100644 scrapy_proj/pbox/2.json
 create mode 100644 scrapy_proj/pbox/3.json
 create mode 100644 scrapy_proj/pbox/4.json
 create mode 100644 scrapy_proj/pbox/5.json

diff --git a/scrapy_proj/pbox/1.json b/scrapy_proj/pbox/1.json
new file mode 100644
index 0000000..e69de29
diff --git a/scrapy_proj/pbox/2.json b/scrapy_proj/pbox/2.json
new file mode 100644
index 0000000..e69de29
diff --git a/scrapy_proj/pbox/3.json b/scrapy_proj/pbox/3.json
new file mode 100644
index 0000000..e69de29
diff --git a/scrapy_proj/pbox/4.json b/scrapy_proj/pbox/4.json
new file mode 100644
index 0000000..e69de29
diff --git a/scrapy_proj/pbox/5.json b/scrapy_proj/pbox/5.json
new file mode 100644
index 0000000..e69de29
diff --git a/scrapy_proj/scrapy_proj/spiders/pornbox_spider.py b/scrapy_proj/scrapy_proj/spiders/pornbox_spider.py
index 42aae60..b6f68f7 100644
--- a/scrapy_proj/scrapy_proj/spiders/pornbox_spider.py
+++ b/scrapy_proj/scrapy_proj/spiders/pornbox_spider.py
@@ -44,18 +44,30 @@ class PornboxSpider(BaseSpider):
         self.update = int(update)
         self.logger.info(f"debug mod: {self.debug}, cmd: {self.cmd_str}, update: {self.update}")
 
+        self.cmd_studio = 'studio'
+        self.cmd_movie = 'movies'
+        self.cmd_actors = 'actors'
+        self.cmd_list = self.cmd_str.split(',')
+        if len(self.cmd_list) == 0 :
+            self.cmd_list = [self.cmd_studio, self.cmd_movie, self.cmd_actors]
+
     # 入口函数,由基类的方法触发
     def custom_start_requests(self):
         # studios 列表
-        url = "https://pornbox.com/studio/list/ppd?page=1&sort=popular"
-        #yield scrapy.Request(url, callback=self.parse_studios_list)
+        if self.cmd_studio in self.cmd_list:
+            url = "https://pornbox.com/studio/list/ppd?page=1&sort=popular"
+            yield scrapy.Request(url, callback=self.parse_studios_list)
 
         # 获取每个stutio, 获取详情
-        stu_list = db_tools.get_studios(limit=1 if self.debug else 100)
-        for stu in stu_list:
-            stu_url = f"https://pornbox.com/studio/{stu['label_id']}"
-            url = f"{stu_url}/?skip=1&sort=recent&_={int(datetime.now().timestamp()*1000)}"
-            yield scrapy.Request(url, callback=self.parse_studio, meta={'sdu_href':stu_url})
+        if self.cmd_movie in self.cmd_list:
+            fitlers= {}
+            if self.debug :
+                fitlers['limit'] = 5
+            stu_list = db_tools.get_studios(**fitlers)
+            for stu in stu_list:
+                stu_url = f"https://pornbox.com/studio/{stu['label_id']}"
+                url = f"{stu_url}/?skip=1&sort=recent&_={int(datetime.now().timestamp()*1000)}"
+                yield scrapy.Request(url, callback=self.parse_studio, meta={'sdu_href':stu_url})
 
 
     def parse_studios_list(self, response):
@@ -106,8 +118,11 @@ class PornboxSpider(BaseSpider):
         # 处理分页
         if current_page < total_pages:
             next_page = current_page + 1
-            next_url = f"https://pornbox.com/studio/list/ppd?page={next_page}&sort=popular"
-            yield scrapy.Request(next_url, callback=self.parse_studios_list)
+            if self.debug and current_page >= 5:
+                pass
+            else:
+                next_url = f"https://pornbox.com/studio/list/ppd?page={next_page}&sort=popular"
+                yield scrapy.Request(next_url, callback=self.parse_studios_list)
 
 
     def parse_studio(self, response):