modify scripts
@@ -3,6 +3,9 @@ scrapy crawl clm -a mod='update' -a begin='2025-07-10' -s STATS_PUSH_MSG=False
 scrapy crawl clm -a mod='reload' -s STATS_PUSH_MSG=False -a file_path=./scrapy_proj/data/clm_keywords.json
 
 scrapy crawl u3c3 -a begin='2025-07-04' end='2024-07-12'
 
+scrapy crawl pbox -a mod='update' -a begin='2025-07-16'
 scrapy crawl pbox -a debug=1 -a cmd='studio,movies'
+
+scrapy crawl pbox -a debug='1' -s STATS_PUSH_MSG=False -a cmd='movies' -s LOG_LEVEL=DEBUG -a mod='update' -a begin='2025-07-16'
 
 scrapy crawl iafd -a debug=1 -a cmd=performers -s STATS_EXPORT_INTERVAL=60 -s LOG_LEVEL=DEBUG
@@ -81,7 +81,7 @@ fi
 if [ "${PERIOD}" = "--weekly" ]; then
     COMMON_DATE_PARAM=$(date -d "8 days ago" +%Y-%m-%d)
 elif [ "${PERIOD}" = "--monthly" ]; then
-    COMMON_DATE_PARAM=$(date -d "31 days ago" +%Y-%m-%d)
+    COMMON_DATE_PARAM=$(date -d "32 days ago" +%Y-%m-%d)
 fi
 
 
@@ -97,14 +97,14 @@ register_spider() {
 
 # Weekly tasks
 if [ "${PERIOD}" = "--weekly" ]; then
-    register_spider "u3c3" "scrapy crawl u3c3 -a begin=${COMMON_DATE_PARAM}"
     register_spider "sis" "scrapy crawl sis -a begin=${COMMON_DATE_PARAM}"
+    register_spider "u3c3" "scrapy crawl u3c3 -a begin=${COMMON_DATE_PARAM}"
     register_spider "clm" "scrapy crawl clm -a begin=${COMMON_DATE_PARAM} -a mod='update' "
 fi
 
 # Monthly tasks
 if [ "${PERIOD}" = "--monthly" ]; then
-    register_spider "pbox" "scrapy crawl pbox "
+    register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' "
 fi
 
 
@@ -1,117 +0,0 @@
-#!/bin/bash
-
-# ==============================================
-# Configuration area: modify or extend as needed
-# ==============================================
-# Extend PATH (adjust the path according to the output of `which scrapy`)
-export PATH="/home/ubuntu/.local/bin:$PATH"
-
-# Project base path (auto-computed: the parent of the script's directory, i.e. scrapy_proj/)
-SCRAPY_PROJ_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
-LOG_DIR="${SCRAPY_PROJ_DIR}/log"  # log directory
-SLEEP_SECONDS=60  # interval between spiders (seconds)
-# Shared date parameter (today minus 8 days, yyyy-mm-dd)
-COMMON_DATE_PARAM=$(date -d "8 days ago" +%Y-%m-%d)
-
-# ==============================================
-# Register spiders: define the execute command for each spider here
-# Format: register_spider "spider name" "full command (variables and custom parameters supported)"
-# ==============================================
-# Registration function (no need to modify)
-declare -a SPIDER_REGISTRY=()  # stores the registered spider entries
-register_spider() {
-    local spider_name="$1"
-    local execute_cmd="$2"
-    SPIDER_REGISTRY+=("${spider_name}|${execute_cmd}")
-}
-
-# Registration examples: modify or add as needed
-# Spider A: only needs the shared date parameter
-register_spider "u3c3" "scrapy crawl u3c3 -a begin=${COMMON_DATE_PARAM}"
-
-# Spider B: needs the shared date plus custom parameters
-register_spider "sis" "scrapy crawl sis -a begin=${COMMON_DATE_PARAM}"
-
-# Spider B: needs the shared date plus custom parameters
-register_spider "clm" "scrapy crawl clm -a begin=${COMMON_DATE_PARAM} -a mod='update' "
-
-# Spider C: fully custom parameters (does not use the shared date)
-register_spider "pbox" "scrapy crawl pbox "
-
-# ==============================================
-# Core execution logic (no need to modify)
-# ==============================================
-# Initialize the log directory
-mkdir -p "${LOG_DIR}"
-current_time=$(date +"%Y%m%d")  # run timestamp (used in log file names)
-main_log="${LOG_DIR}/cron_${current_time}.log"
-
-# Logging helper: timestamps the message and appends it to the main log
-log() {
-    local msg="$1"
-    local timestamp=$(date +"%Y-%m-%d %H:%M:%S")
-    echo "[$timestamp] $msg" | tee -a "${main_log}"
-}
-
-# Run a single spider
-execute_spider() {
-    local spider_name="$1"
-    local execute_cmd="$2"
-
-    log "===== 开始执行 ${spider_name} ====="
-    log "执行命令:${execute_cmd}"
-
-    # Per-spider log file (kept separately for easier troubleshooting)
-    local spider_log="${LOG_DIR}/${spider_name}_${current_time}.log"
-
-    # Run the command (cd into the project directory so the scrapy command resolves correctly)
-    (cd "${SCRAPY_PROJ_DIR}" && eval "${execute_cmd}") > "${spider_log}" 2>&1
-    local exit_code=$?  # capture the command's exit status
-
-    # Check the result
-    if [ ${exit_code} -eq 0 ]; then
-        log "${spider_name} 执行成功(日志:${spider_log})"
-    else
-        log "ERROR: ${spider_name} 执行失败(日志:${spider_log},退出码:${exit_code})"
-    fi
-    return ${exit_code}
-}
-
-# ==============================================
-# Main flow: iterate over the registered spiders and run them in order
-# ==============================================
-log "===== 爬虫调度脚本启动 ====="
-log "项目路径:${SCRAPY_PROJ_DIR}"
-log "公共日期参数:${COMMON_DATE_PARAM}"
-log "已注册Spider数量:${#SPIDER_REGISTRY[@]}"
-
-# Make sure at least one spider is registered
-if [ ${#SPIDER_REGISTRY[@]} -eq 0 ]; then
-    log "ERROR: 未注册任何Spider,脚本终止"
-    exit 1
-fi
-
-# Run every registered spider in turn
-for spider_info in "${SPIDER_REGISTRY[@]}"; do
-    # Parse the registration entry (split name and command)
-    IFS="|" read -r spider_name execute_cmd <<< "${spider_info}"
-
-    # Run the current spider
-    execute_spider "${spider_name}" "${execute_cmd}"
-    last_exit_code=$?
-
-    # To enable "abort on failure", uncomment the lines below (stop running the rest once one spider fails)
-    # if [ ${last_exit_code} -ne 0 ]; then
-    #     log "ERROR: 因${spider_name}执行失败,终止后续执行"
-    #     exit ${last_exit_code}
-    # fi
-
-    # Sleep unless this is the last spider
-    if [ "${spider_info}" != "${SPIDER_REGISTRY[-1]}" ]; then
-        log "等待${SLEEP_SECONDS}秒后执行下一个Spider..."
-        sleep ${SLEEP_SECONDS}
-    fi
-done
-
-log "===== 所有注册的Spider执行完毕 ====="
-exit 0
@@ -499,7 +499,7 @@ class PboxDBHandler(SQLiteDBHandler):
 
     def get_studios(self, **filters):
         try:
-            sql = f"SELECT href, name, id, label_id FROM {self.tbl_studios} WHERE 1=1"
+            sql = f"SELECT href, name, id, label_id, scene_count FROM {self.tbl_studios} WHERE 1=1"
             params = []
 
             conditions = {
@@ -531,6 +531,11 @@ class PboxDBHandler(SQLiteDBHandler):
             logging.error(f"查询 href 失败: {e}")
             return []
 
+    def get_stu_mov_count(self, stu_id):
+        self.cursor.execute(f"SELECT count(*) as cnt from {self.tbl_movies} WHERE studio_id = ?", (stu_id,))
+        row = self.cursor.fetchone()
+        return row[0] if row else None
+
     # Statistics helper
     def get_stat(self):
         try:
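Why expose `scene_count` from `get_studios()` and add `get_stu_mov_count()`? Together they let a caller compare how many movies are stored locally for a studio against the scene count reported by the site, which is exactly what the spider logs when it stops paging. A minimal sketch of that comparison, assuming a `PboxDBHandler` instance and the field names used in this diff (the helper itself is illustrative, not part of the commit):

```python
# Illustrative helper, not part of the commit: list studios whose local movie count
# still trails the scene_count reported by the site.
def studios_behind(db_handler, limit=None):
    filters = {'limit': limit} if limit else {}
    behind = []
    for stu in db_handler.get_studios(**filters):
        # the spider keys movies by the studio's label_id, so count on that
        local_total = db_handler.get_stu_mov_count(stu['label_id']) or 0
        if local_total < stu['scene_count']:
            behind.append((stu['name'], local_total, stu['scene_count']))
    return behind
```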
@@ -2,7 +2,9 @@
 import scrapy
 import json
 import os
+import sys
 from datetime import datetime
+from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime
 from scrapy_proj.spiders.base_spider import BaseSpider
 from scrapy_proj.items import PBoxStuItem, PBoxMovItem, CommErrItem, PBoxActorIndexItem, PBoxAlternateItem, PBoxMovIndexItem, PBoxMovTagsItem
 import scrapy_proj.comm.comm_def as comm
@@ -37,11 +39,16 @@ class PornboxSpider(BaseSpider):
     }
 
 
-    def __init__(self, debug='false', cmd='', update='0', *args, **kwargs):
+    def __init__(self, debug='false', cmd='', begin=None, mod='all', *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.debug = True if (str(debug).lower() == 'true' or str(debug).lower() == '1') else False
-        self.update = int(update)
-        self.logger.info(f"debug mod: {self.debug}, cmd: {cmd}, update: {self.update}")
+        self.update_mod = False
+        self.logger.info(f"RUN CMD: {' '.join(sys.argv)}")
 
+        # Add an update mode: it requires mod == 'update' plus a begin date
+        self.begin = parse_date_to_datetime(begin) if begin else None
+        if mod.lower() == 'update' and self.begin:
+            self.update_mod = True
+
         self.cmd_studio = 'studio'
         self.cmd_movie = 'movies'
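As a usage note (a sketch, not part of the diff): Scrapy forwards each `-a key=value` pair to the spider's `__init__` as a string keyword argument, so the update-mode command listed in the notes above activates `update_mod` roughly as follows. The `parse_date_to_datetime` stand-in below is an assumption; the real helper lives in `scrapy_proj.utils.utils` and is not shown in this commit.

```python
from datetime import datetime

# Hypothetical stand-in for scrapy_proj.utils.utils.parse_date_to_datetime (not shown in this diff):
# assume it turns 'YYYY-MM-DD' strings into datetime objects.
def parse_date_to_datetime(text):
    return datetime.strptime(text, '%Y-%m-%d')

# The -a arguments from `scrapy crawl pbox -a mod='update' -a begin='2025-07-16'`
# reach __init__ as strings; the constructor then derives update_mod like this:
mod, begin_arg = 'update', '2025-07-16'
begin = parse_date_to_datetime(begin_arg) if begin_arg else None
update_mod = bool(mod.lower() == 'update' and begin)
print(update_mod)  # True
```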
@@ -50,23 +57,29 @@ class PornboxSpider(BaseSpider):
         if cmd and cmd != '' :
             self.cmd_list = cmd.split(',')
 
+    def _build_studio_url(self, studio_id, page_id=1, sort_flag='latest'):
+        # sort = {latest, recent, popular}; 'latest' sorts by release date descending, which suits incremental update pulls
+        return f"https://pornbox.com/studio/{studio_id}/?skip={page_id}&sort={sort_flag}&_={int(datetime.now().timestamp()*1000)}"
+
+    def _build_studio_list_url(self, page_id=1, sort_flag='popular'):
+        return f"https://pornbox.com/studio/list/ppd?page={page_id}&sort={sort_flag}"
+
     # Entry point, triggered from the base class
     def custom_start_requests(self):
         # studios list
         if self.cmd_studio in self.cmd_list:
-            url = "https://pornbox.com/studio/list/ppd?page=1&sort=popular"
+            url = self._build_studio_list_url()
             yield scrapy.Request(url, callback=self.parse_studios_list)
 
         # Fetch the detail page for each studio
         if self.cmd_movie in self.cmd_list:
             fitlers= {}
             if self.debug :
-                fitlers['limit'] = 5
+                fitlers['limit'] = 1
             stu_list = db_tools.get_studios(**fitlers)
             for stu in stu_list:
-                stu_url = f"https://pornbox.com/studio/{stu['label_id']}"
-                url = f"{stu_url}/?skip=1&sort=recent&_={int(datetime.now().timestamp()*1000)}"
-                yield scrapy.Request(url, callback=self.parse_studio, meta={'sdu_href':stu_url})
+                url = self._build_studio_url(stu['label_id'])
+                yield scrapy.Request(url, callback=self.parse_studio, meta={'stu_id':stu['label_id'], 'name': stu['name'], 'scene_count': stu['scene_count']})
 
 
     def parse_studios_list(self, response):
@@ -89,18 +102,6 @@ class PornboxSpider(BaseSpider):
 
         self.logger.info(f"url: {response.url}, total: {total_pages}, items: {len(data.get('items', []))}")
 
-        '''
-        # the items carry no page number, so it would have to come from the spider's own attributes
-        json_dir = './pbox'
-        os.makedirs(json_dir, exist_ok=True)
-        file_path = os.path.join(json_dir, f"{current_page}.json")
-
-        # save as a JSON file
-        with open(file_path, 'w', encoding='utf-8') as f:
-            pass
-            #json.dump(data, f, ensure_ascii=False, indent=2)
-        '''
-
         # Process each studio item
         for item in data.get('items', []):
             studio_item = PBoxStuItem()
@@ -121,7 +122,7 @@ class PornboxSpider(BaseSpider):
             if self.debug and current_page >= 5:
                 pass
             else:
-                next_url = f"https://pornbox.com/studio/list/ppd?page={next_page}&sort=popular"
+                next_url = self._build_studio_list_url(next_page)
                 yield scrapy.Request(next_url, callback=self.parse_studios_list)
 
 
@@ -142,10 +143,10 @@ class PornboxSpider(BaseSpider):
         # Extract the current page number and total page count
         current_page = data.get('currentPage', 1)
         total_pages = data.get('totalPages', 1)
-        stu_href = response.meta['sdu_href']
 
-        self.logger.info(f"url: {response.url}, total: {total_pages}, items: {len(data.get('contents', []))}")
+        self.logger.debug(f"url: {response.url}, total: {total_pages}, curr: {current_page}, items: {len(data.get('contents', []))}")
 
+        need_next = False
         # Process each content entry
         for item in data.get('contents', []):
             mov_item = PBoxMovItem()
@@ -198,13 +199,25 @@ class PornboxSpider(BaseSpider):
                 alt_list.append(alt_item)
             mov_item['mov_alt_list'] = alt_list
 
-            yield mov_item
-
-            # Handle pagination
-            if current_page < total_pages:
-                next_page = current_page + 1
-                if self.debug and current_page >= 5:
+            # Decide whether to keep paging: only stop when every item on the page is dated before the begin
+            # date (this guards against the odd item with a dirty, inaccurate date; otherwise a single earlier
+            # date would already be enough to stop)
+            up_date = parse_date_to_datetime(mov_item['release_date'])
+            self.logger.debug(f"url: {response.url} update: {up_date}, begin: {self.begin}, now: {datetime.now()}")
+            if up_date and self.begin and (up_date < self.begin or up_date>datetime.now()):
                 pass
             else:
-                next_url = f"{stu_href}/?skip={next_page}&sort=recent&_={int(datetime.now().timestamp()*1000)}"
-                yield scrapy.Request(next_url, callback=self.parse_studio, meta={'sdu_href':stu_href})
+                need_next = True
+
+            yield mov_item
 
+        # Everything past this point is old data; no need to keep paging
+        stu_id = response.meta['stu_id']
+        stu_name = response.meta['name']
+        scene_count = response.meta['scene_count']
+        if not need_next or current_page >= total_pages or (self.debug and current_page >= 50000):
+            total_rows = db_tools.get_stu_mov_count(stu_id)
+            self.logger.info(f'停止翻页. 更新模式: {self.update_mod}. studio: ({stu_name}), total movies: {total_rows}, scene_count: {scene_count}, url: {response.url}')
+            return
+
+        # Next page
+        next_url = self._build_studio_url(stu_id, current_page + 1)
+        yield scrapy.Request(next_url, callback=self.parse_studio, meta=response.meta)
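The stop condition above is worth spelling out: a page only counts as "all old" if every item's release date is before `self.begin` (or implausibly in the future); a single in-range date keeps `need_next` true and triggers another page fetch. A self-contained sketch of the same rule on toy data (dates are made up, and `now` is passed in explicitly here instead of calling `datetime.now()` as the spider does):

```python
from datetime import datetime

def page_needs_next(release_dates, begin, now):
    # Mirrors the diff: need_next stays False only if every date is older than begin
    # or obviously bogus (in the future); one in-range date is enough to keep paging.
    need_next = False
    for up_date in release_dates:
        if up_date and begin and (up_date < begin or up_date > now):
            pass
        else:
            need_next = True
    return need_next

begin = datetime(2025, 7, 16)
now = datetime(2025, 7, 20)
print(page_needs_next([datetime(2025, 7, 10), datetime(2025, 7, 12)], begin, now))  # False -> stop paging
print(page_needs_next([datetime(2025, 7, 10), datetime(2025, 7, 18)], begin, now))  # True  -> fetch next page
```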
@@ -37,7 +37,7 @@ def format_timestamp(ts, is_ms=True):
 
     format_ts = ts / 1000.0 if is_ms else ts
     dt = datetime.fromtimestamp(format_ts, tz=timezone.utc)
-    return dt.strftime('%Y-%m-%d %H:%M:%S.%f')[:-4]
+    return dt.strftime('%Y-%m-%d %H:%M:%S')
 
 ''' Parse strings such as xxxMB, xxxGB, xxxM and normalize the unit to GB '''
 def parse_size(size_text):
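To make the behavioural change in `format_timestamp` concrete, a quick sketch of the old versus new return value (the sample millisecond timestamp is arbitrary):

```python
from datetime import datetime, timezone

ts = 1752658200123  # arbitrary millisecond timestamp
dt = datetime.fromtimestamp(ts / 1000.0, tz=timezone.utc)
print(dt.strftime('%Y-%m-%d %H:%M:%S.%f')[:-4])  # old: two fractional digits, '2025-07-16 09:30:00.12'
print(dt.strftime('%Y-%m-%d %H:%M:%S'))          # new: whole seconds only, '2025-07-16 09:30:00'
```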