modify scripts
@@ -3,6 +3,9 @@ scrapy crawl clm -a mod='update' -a begin='2025-07-10' -s STATS_PUSH_MSG=False
scrapy crawl clm -a mod='reload' -s STATS_PUSH_MSG=False -a file_path=./scrapy_proj/data/clm_keywords.json

scrapy crawl u3c3 -a begin='2025-07-04' -a end='2025-07-12'

scrapy crawl pbox -a mod='update' -a begin='2025-07-16'
scrapy crawl pbox -a debug=1 -a cmd='studio,movies'
scrapy crawl pbox -a debug='1' -s STATS_PUSH_MSG=False -a cmd='movies' -s LOG_LEVEL=DEBUG -a mod='update' -a begin='2025-07-16'

scrapy crawl iafd -a debug=1 -a cmd=performers -s STATS_EXPORT_INTERVAL=60 -s LOG_LEVEL=DEBUG
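# Note on the flags above: in the scrapy CLI, -a passes a spider argument
# (received by the spider's constructor) while -s overrides a setting for that
# run; STATS_PUSH_MSG and STATS_EXPORT_INTERVAL are project-specific settings,
# not built-in scrapy ones.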
@@ -81,7 +81,7 @@ fi
if [ "${PERIOD}" = "--weekly" ]; then
    COMMON_DATE_PARAM=$(date -d "8 days ago" +%Y-%m-%d)
elif [ "${PERIOD}" = "--monthly" ]; then
-   COMMON_DATE_PARAM=$(date -d "31 days ago" +%Y-%m-%d)
+   COMMON_DATE_PARAM=$(date -d "32 days ago" +%Y-%m-%d)
fi
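# For reference, a sketch of how the --weekly/--monthly flag could be driven
# from cron (the script path and the PERIOD=$1 wiring are assumptions, not
# shown in this diff):
#   0 2 * * 1 /home/ubuntu/scrapy_proj/scripts/run_spiders.sh --weekly
#   0 3 1 * * /home/ubuntu/scrapy_proj/scripts/run_spiders.sh --monthly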
@@ -97,14 +97,14 @@ register_spider() {

# Weekly tasks
if [ "${PERIOD}" = "--weekly" ]; then
-   register_spider "u3c3" "scrapy crawl u3c3 -a begin=${COMMON_DATE_PARAM}"
    register_spider "sis" "scrapy crawl sis -a begin=${COMMON_DATE_PARAM}"
+   register_spider "u3c3" "scrapy crawl u3c3 -a begin=${COMMON_DATE_PARAM}"
    register_spider "clm" "scrapy crawl clm -a begin=${COMMON_DATE_PARAM} -a mod='update'"
fi

# Monthly tasks
if [ "${PERIOD}" = "--monthly" ]; then
-   register_spider "pbox" "scrapy crawl pbox"
+   register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update'"
fi
@@ -1,117 +0,0 @@
#!/bin/bash

# ==============================================
# Configuration: adjust or extend as needed
# ==============================================
# Extend PATH (adjust the path according to the output of `which scrapy`)
export PATH="/home/ubuntu/.local/bin:$PATH"

# Project base path (computed automatically: the parent of this script's directory, i.e. scrapy_proj/)
SCRAPY_PROJ_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
LOG_DIR="${SCRAPY_PROJ_DIR}/log"  # log directory
SLEEP_SECONDS=60                  # interval between spider runs (seconds)
# Common date parameter (current date minus 8 days, yyyy-mm-dd)
COMMON_DATE_PARAM=$(date -d "8 days ago" +%Y-%m-%d)
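# Portability note (added remark, not in the original script): date -d is GNU
# coreutils syntax; the BSD/macOS equivalent would be: date -v-8d +%Y-%m-%d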
# ==============================================
# Spider registry: define each spider's execution command here
# Format: register_spider "spider name" "full command (variables and custom arguments supported)"
# ==============================================
# Registration function (no changes needed)
declare -a SPIDER_REGISTRY=()  # holds the registered spider entries
register_spider() {
    local spider_name="$1"
    local execute_cmd="$2"
    SPIDER_REGISTRY+=("${spider_name}|${execute_cmd}")
}

# Registration examples: adjust or extend as needed
# Spider A: only needs the common date parameter
register_spider "u3c3" "scrapy crawl u3c3 -a begin=${COMMON_DATE_PARAM}"

# Spider B: only needs the common date parameter
register_spider "sis" "scrapy crawl sis -a begin=${COMMON_DATE_PARAM}"

# Spider C: needs the common date plus custom arguments
register_spider "clm" "scrapy crawl clm -a begin=${COMMON_DATE_PARAM} -a mod='update'"

# Spider D: fully custom arguments (does not depend on the common date)
register_spider "pbox" "scrapy crawl pbox"
# ==============================================
# Core execution logic (no changes needed)
# ==============================================
# Initialize the log directory
mkdir -p "${LOG_DIR}"
current_time=$(date +"%Y%m%d")  # run date stamp (used in log file names)
main_log="${LOG_DIR}/cron_${current_time}.log"

# Log helper: timestamp each message and append it to the main log
log() {
    local msg="$1"
    local timestamp=$(date +"%Y-%m-%d %H:%M:%S")
    echo "[$timestamp] $msg" | tee -a "${main_log}"
}

# Run a single spider
execute_spider() {
    local spider_name="$1"
    local execute_cmd="$2"

    log "===== Starting ${spider_name} ====="
    log "Command: ${execute_cmd}"

    # Per-spider log file (kept separate for easier troubleshooting)
    local spider_log="${LOG_DIR}/${spider_name}_${current_time}.log"

    # Run the command (cd into the project directory so the scrapy command resolves)
    (cd "${SCRAPY_PROJ_DIR}" && eval "${execute_cmd}") > "${spider_log}" 2>&1
    local exit_code=$?  # capture the exit status

    # Check the result
    if [ ${exit_code} -eq 0 ]; then
        log "${spider_name} succeeded (log: ${spider_log})"
    else
        log "ERROR: ${spider_name} failed (log: ${spider_log}, exit code: ${exit_code})"
    fi
    return ${exit_code}
}
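# Reviewer note (not part of the original script): execute_cmd is expanded via
# eval, so quoting inside registered command strings must be shell-safe. A
# stricter variant would register argv arrays instead of strings, e.g.:
#   cmd=(scrapy crawl u3c3 -a "begin=${COMMON_DATE_PARAM}")
#   (cd "${SCRAPY_PROJ_DIR}" && "${cmd[@]}") > "${spider_log}" 2>&1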

# ==============================================
# Main flow: run the registered spiders in order
# ==============================================
log "===== Spider scheduler started ====="
log "Project path: ${SCRAPY_PROJ_DIR}"
log "Common date parameter: ${COMMON_DATE_PARAM}"
log "Registered spiders: ${#SPIDER_REGISTRY[@]}"

# Abort if no spiders are registered
if [ ${#SPIDER_REGISTRY[@]} -eq 0 ]; then
    log "ERROR: no spiders registered, aborting"
    exit 1
fi

# Run every registered spider
for spider_info in "${SPIDER_REGISTRY[@]}"; do
    # Parse the registry entry (split name and command)
    IFS="|" read -r spider_name execute_cmd <<< "${spider_info}"

    # Run the current spider
    execute_spider "${spider_name}" "${execute_cmd}"
    last_exit_code=$?

    # To enable fail-fast behavior (stop after the first failure), uncomment:
    # if [ ${last_exit_code} -ne 0 ]; then
    #     log "ERROR: ${spider_name} failed, aborting remaining spiders"
    #     exit ${last_exit_code}
    # fi

    # Sleep unless this is the last spider
    if [ "${spider_info}" != "${SPIDER_REGISTRY[-1]}" ]; then
        log "Waiting ${SLEEP_SECONDS} seconds before the next spider..."
        sleep ${SLEEP_SECONDS}
    fi
done

log "===== All registered spiders finished ====="
exit 0
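# Example manual run of this (now removed) scheduler; the script filename is
# an assumption, since it is not shown in the diff:
#   bash scrapy_proj/scripts/run_spiders.sh
#   tail -f scrapy_proj/log/cron_$(date +%Y%m%d).log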