From 558ceee49c2dd36dfc5c71990b750ccdf20d4641 Mon Sep 17 00:00:00 2001
From: sophon
Date: Tue, 22 Jul 2025 16:53:04 +0800
Subject: [PATCH] modify scripts

---
 scrapy_proj/cron/cmd.txt                          |   3 +
 scrapy_proj/cron/cron_scheduler.sh                |   6 +-
 scrapy_proj/cron/cron_weekly.sh                   | 117 ------------------
 .../db_wapper/spider_db_handler.py                |   7 +-
 .../scrapy_proj/spiders/pornbox_spider.py         |  75 ++++++-----
 scrapy_proj/scrapy_proj/utils/utils.py            |   2 +-
 6 files changed, 57 insertions(+), 153 deletions(-)
 delete mode 100755 scrapy_proj/cron/cron_weekly.sh

diff --git a/scrapy_proj/cron/cmd.txt b/scrapy_proj/cron/cmd.txt
index a714ebb..21d07f5 100644
--- a/scrapy_proj/cron/cmd.txt
+++ b/scrapy_proj/cron/cmd.txt
@@ -3,6 +3,9 @@
 scrapy crawl clm -a mod='update' -a begin='2025-07-10' -s STATS_PUSH_MSG=False
 scrapy crawl clm -a mod='reload' -s STATS_PUSH_MSG=False -a file_path=./scrapy_proj/data/clm_keywords.json
 scrapy crawl u3c3 -a begin='2025-07-04' end='2024-07-12'
+
+scrapy crawl pbox -a mod='update' -a begin='2025-07-16'
 scrapy crawl pbox -a debug=1 -a cmd='studio,movies'
+scrapy crawl pbox -a debug='1' -s STATS_PUSH_MSG=False -a cmd='movies' -s LOG_LEVEL=DEBUG -a mod='update' -a begin='2025-07-16'
 scrapy crawl iafd -a debug=1 -a cmd=performers -s STATS_EXPORT_INTERVAL=60 -s LOG_LEVEL=DEBUG
\ No newline at end of file
diff --git a/scrapy_proj/cron/cron_scheduler.sh b/scrapy_proj/cron/cron_scheduler.sh
index b28624e..7462108 100755
--- a/scrapy_proj/cron/cron_scheduler.sh
+++ b/scrapy_proj/cron/cron_scheduler.sh
@@ -81,7 +81,7 @@ fi
 if [ "${PERIOD}" = "--weekly" ]; then
     COMMON_DATE_PARAM=$(date -d "8 days ago" +%Y-%m-%d)
 elif [ "${PERIOD}" = "--monthly" ]; then
-    COMMON_DATE_PARAM=$(date -d "31 days ago" +%Y-%m-%d)
+    COMMON_DATE_PARAM=$(date -d "32 days ago" +%Y-%m-%d)
 fi
 
 
@@ -97,14 +97,14 @@ register_spider() {
 
 # Weekly tasks
 if [ "${PERIOD}" = "--weekly" ]; then
-    register_spider "u3c3" "scrapy crawl u3c3 -a begin=${COMMON_DATE_PARAM}"
     register_spider "sis" "scrapy crawl sis -a begin=${COMMON_DATE_PARAM}"
+    register_spider "u3c3" "scrapy crawl u3c3 -a begin=${COMMON_DATE_PARAM}"
     register_spider "clm" "scrapy crawl clm -a begin=${COMMON_DATE_PARAM} -a mod='update' "
 fi
 
 # Monthly tasks
 if [ "${PERIOD}" = "--monthly" ]; then
-    register_spider "pbox" "scrapy crawl pbox "
+    register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' "
 fi
 
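The monthly window grows from 31 to 32 days so that runs on either side of a 31-day month still overlap by a day, mirroring the weekly job's 8-day (7 + 1) window. A minimal Python sketch of the same computation (the script itself relies on GNU date):

    from datetime import date, timedelta

    # Equivalent of: date -d "32 days ago" +%Y-%m-%d
    # 32 = 31 (longest month) + 1 day of overlap, like weekly's 8 = 7 + 1.
    begin = (date.today() - timedelta(days=32)).isoformat()
    print(begin)  # e.g. '2025-06-20' when run on 2025-07-22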
diff --git a/scrapy_proj/cron/cron_weekly.sh b/scrapy_proj/cron/cron_weekly.sh
deleted file mode 100755
index 1805fae..0000000
--- a/scrapy_proj/cron/cron_weekly.sh
+++ /dev/null
@@ -1,117 +0,0 @@
-#!/bin/bash
-
-# ==============================================
-# Configuration: adjust or extend as needed
-# ==============================================
-# Extend PATH (adjust the path according to the output of `which scrapy`)
-export PATH="/home/ubuntu/.local/bin:$PATH"
-
-# Project base path (auto-computed: one level above this script, i.e. scrapy_proj/)
-SCRAPY_PROJ_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
-LOG_DIR="${SCRAPY_PROJ_DIR}/log" # Log directory
-SLEEP_SECONDS=60 # Interval between spider runs (seconds)
-# Common date parameter (current date minus 8 days, yyyy-mm-dd)
-COMMON_DATE_PARAM=$(date -d "8 days ago" +%Y-%m-%d)
-
-# ==============================================
-# Spider registration: define the command for each spider here
-# Format: register_spider "spider name" "full command (variables and custom args supported)"
-# ==============================================
-# Registration function (no changes needed)
-declare -a SPIDER_REGISTRY=() # Registered spider entries
-register_spider() {
-    local spider_name="$1"
-    local execute_cmd="$2"
-    SPIDER_REGISTRY+=("${spider_name}|${execute_cmd}")
-}
-
-# Registration examples: adjust or extend as needed
-# Spider A: only needs the common date parameter
-register_spider "u3c3" "scrapy crawl u3c3 -a begin=${COMMON_DATE_PARAM}"
-
-# Spider B: common date plus custom arguments
-register_spider "sis" "scrapy crawl sis -a begin=${COMMON_DATE_PARAM}"
-
-# Spider B: common date plus custom arguments
-register_spider "clm" "scrapy crawl clm -a begin=${COMMON_DATE_PARAM} -a mod='update' "
-
-# Spider C: fully custom arguments (no common date)
-register_spider "pbox" "scrapy crawl pbox "
-
-# ==============================================
-# Core execution logic (no changes needed)
-# ==============================================
-# Initialize the log directory
-mkdir -p "${LOG_DIR}"
-current_time=$(date +"%Y%m%d") # Run timestamp (used in log file names)
-main_log="${LOG_DIR}/cron_${current_time}.log"
-
-# Log function: timestamped, appended to the main log
-log() {
-    local msg="$1"
-    local timestamp=$(date +"%Y-%m-%d %H:%M:%S")
-    echo "[$timestamp] $msg" | tee -a "${main_log}"
-}
-
-# Run a single spider
-execute_spider() {
-    local spider_name="$1"
-    local execute_cmd="$2"
-
-    log "===== Starting ${spider_name} ====="
-    log "Command: ${execute_cmd}"
-
-    # Per-spider log file (kept separate for easier troubleshooting)
-    local spider_log="${LOG_DIR}/${spider_name}_${current_time}.log"
-
-    # Run the command (cd into the project dir so the scrapy command resolves)
-    (cd "${SCRAPY_PROJ_DIR}" && eval "${execute_cmd}") > "${spider_log}" 2>&1
-    local exit_code=$? # Capture the result
-
-    # Check the result
-    if [ ${exit_code} -eq 0 ]; then
-        log "${spider_name} finished successfully (log: ${spider_log})"
-    else
-        log "ERROR: ${spider_name} failed (log: ${spider_log}, exit code: ${exit_code})"
-    fi
-    return ${exit_code}
-}
-
-# ==============================================
-# Main flow: iterate over registered spiders and run them sequentially
-# ==============================================
-log "===== Spider scheduler started ====="
-log "Project path: ${SCRAPY_PROJ_DIR}"
-log "Common date parameter: ${COMMON_DATE_PARAM}"
-log "Registered spiders: ${#SPIDER_REGISTRY[@]}"
-
-# Abort if no spiders are registered
-if [ ${#SPIDER_REGISTRY[@]} -eq 0 ]; then
-    log "ERROR: no spiders registered, aborting"
-    exit 1
-fi
-
-# Run all registered spiders
-for spider_info in "${SPIDER_REGISTRY[@]}"; do
-    # Parse the registration entry (split into name and command)
-    IFS="|" read -r spider_name execute_cmd <<< "${spider_info}"
-
-    # Run the current spider
-    execute_spider "${spider_name}" "${execute_cmd}"
-    last_exit_code=$?
-
-    # For fail-fast behaviour, uncomment the following (stop after the first failure)
-    # if [ ${last_exit_code} -ne 0 ]; then
-    #     log "ERROR: ${spider_name} failed, skipping the remaining spiders"
-    #     exit ${last_exit_code}
-    # fi
-
-    # Sleep unless this is the last spider
-    if [ "${spider_info}" != "${SPIDER_REGISTRY[-1]}" ]; then
-        log "Waiting ${SLEEP_SECONDS}s before the next spider..."
-        sleep ${SLEEP_SECONDS}
-    fi
-done
-
-log "===== All registered spiders finished ====="
-exit 0
\ No newline at end of file
diff --git a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
index 863e628..2f8ceae 100644
--- a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
+++ b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
@@ -499,7 +499,7 @@ class PboxDBHandler(SQLiteDBHandler):
 
     def get_studios(self, **filters):
         try:
-            sql = f"SELECT href, name, id, label_id FROM {self.tbl_studios} WHERE 1=1"
+            sql = f"SELECT href, name, id, label_id, scene_count FROM {self.tbl_studios} WHERE 1=1"
             params = []
 
             conditions = {
@@ -531,6 +531,11 @@ class PboxDBHandler(SQLiteDBHandler):
             logging.error(f"Failed to query href: {e}")
             return []
 
+    def get_stu_mov_count(self, stu_id):
+        self.cursor.execute(f"SELECT count(*) as cnt from {self.tbl_movies} WHERE studio_id = ?", (stu_id,))
+        row = self.cursor.fetchone()
+        return row[0] if row else None
+
     # Statistics helper
     def get_stat(self):
         try:
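The new get_stu_mov_count pairs with the scene_count column now returned by get_studios: the spider below only logs the two numbers side by side, but together they support a completeness check. A hypothetical sketch of such a check, assuming db_tools is the PboxDBHandler instance used by the spider (the re-crawl decision itself is illustrative, not part of the patch):

    # Compare locally stored movie counts against the site-reported scene counts.
    for stu in db_tools.get_studios(limit=10):
        local_cnt = db_tools.get_stu_mov_count(stu['label_id']) or 0
        if local_cnt < stu['scene_count']:
            # Fewer movies stored than the site reports: this studio would
            # need a full (non-update) crawl to backfill the gap.
            print(f"{stu['name']}: stored {local_cnt} / reported {stu['scene_count']}")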
f"{stu_url}/?skip=1&sort=recent&_={int(datetime.now().timestamp()*1000)}" - yield scrapy.Request(url, callback=self.parse_studio, meta={'sdu_href':stu_url}) + url = self._build_studio_url(stu['label_id']) + yield scrapy.Request(url, callback=self.parse_studio, meta={'stu_id':stu['label_id'], 'name': stu['name'], 'scene_count': stu['scene_count']}) def parse_studios_list(self, response): @@ -89,18 +102,6 @@ class PornboxSpider(BaseSpider): self.logger.info(f"url: {response.url}, total: {total_pages}, items: {len(data.get('items', []))}") - ''' - # 由于 item 中不包含页码信息,我们需要从 spider 的属性中获取 - json_dir = './pbox' - os.makedirs(json_dir, exist_ok=True) - file_path = os.path.join(json_dir, f"{current_page}.json") - - # 保存为 JSON 文件 - with open(file_path, 'w', encoding='utf-8') as f: - pass - #json.dump(data, f, ensure_ascii=False, indent=2) - ''' - # 处理每个工作室项目 for item in data.get('items', []): studio_item = PBoxStuItem() @@ -121,7 +122,7 @@ class PornboxSpider(BaseSpider): if self.debug and current_page >= 5: pass else: - next_url = f"https://pornbox.com/studio/list/ppd?page={next_page}&sort=popular" + next_url = self._build_studio_list_url(next_page) yield scrapy.Request(next_url, callback=self.parse_studios_list) @@ -142,10 +143,10 @@ class PornboxSpider(BaseSpider): # 提取当前页码和总页数 current_page = data.get('currentPage', 1) total_pages = data.get('totalPages', 1) - stu_href = response.meta['sdu_href'] - self.logger.info(f"url: {response.url}, total: {total_pages}, items: {len(data.get('contents', []))}") + self.logger.debug(f"url: {response.url}, total: {total_pages}, curr: {current_page}, items: {len(data.get('contents', []))}") + need_next = False # 处理每个工作室项目 for item in data.get('contents', []): mov_item = PBoxMovItem() @@ -198,13 +199,25 @@ class PornboxSpider(BaseSpider): alt_list.append(alt_item) mov_item['mov_alt_list'] = alt_list - yield mov_item - - # 处理分页 - if current_page < total_pages: - next_page = current_page + 1 - if self.debug and current_page >= 5: + # 判断是否还要翻页,只有满足所有页面的数据,日期均小于开始日期时,停止翻页(主要是担心一些时间不准的脏数据干扰。否则只要出现一个更早的时间,就可以停止了) + up_date = parse_date_to_datetime(mov_item['release_date']) + self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}") + if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()): pass else: - next_url = f"{stu_href}/?skip={next_page}&sort=recent&_={int(datetime.now().timestamp()*1000)}" - yield scrapy.Request(next_url, callback=self.parse_studio, meta={'sdu_href':stu_href}) \ No newline at end of file + need_next = True + + yield mov_item + + # 后面的都是旧数据了,无需继续翻页 + stu_id = response.meta['stu_id'] + stu_name = response.meta['name'] + scene_count = response.meta['scene_count'] + if not need_next or current_page >= total_pages or (self.debug and current_page >= 50000): + total_rows = db_tools.get_stu_mov_count(stu_id) + self.logger.info(f'停止翻页. 更新模式: {self.update_mod}. 
diff --git a/scrapy_proj/scrapy_proj/utils/utils.py b/scrapy_proj/scrapy_proj/utils/utils.py
index 6c671f3..a935a0b 100644
--- a/scrapy_proj/scrapy_proj/utils/utils.py
+++ b/scrapy_proj/scrapy_proj/utils/utils.py
@@ -37,7 +37,7 @@ def format_timestamp(ts, is_ms=True):
 
     format_ts = ts / 1000.0 if is_ms else ts
     dt = datetime.fromtimestamp(format_ts, tz=timezone.utc)
-    return dt.strftime('%Y-%m-%d %H:%M:%S.%f')[:-4]
+    return dt.strftime('%Y-%m-%d %H:%M:%S')
 
 ''' Parse strings such as xxxMB, xxxGB, xxxM, normalizing the unit to GB '''
 def parse_size(size_text):
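parse_date_to_datetime is imported and called by the spider above, but its implementation is not part of this patch. Judging from its call sites (a begin argument like '2025-07-16' and release_date strings, with falsy results treated as "no usable date"), it plausibly resembles the following sketch; this is an assumption, not the actual utils.py code.

    from datetime import datetime

    # Hypothetical reconstruction of parse_date_to_datetime (not in the patch):
    def parse_date_to_datetime(text, fmt='%Y-%m-%d'):
        try:
            # Take the leading date portion so values with a time suffix also parse.
            return datetime.strptime(str(text).strip()[:10], fmt)
        except (ValueError, TypeError):
            return None  # callers treat a falsy result as "no usable date"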