resources/scrapy_proj/cron/cron_weekly.sh

#!/bin/bash
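# Sample crontab entries (assumptions: a weekly run on Monday at 03:00 and a
# checkout under /home/ubuntu/scrapy_proj; adjust both to your setup):
#   0 3 * * 1 /home/ubuntu/scrapy_proj/cron/cron_weekly.sh
# To avoid overlapping runs when a spider overruns the schedule, the entry can
# wrap the script in util-linux flock:
#   0 3 * * 1 flock -n /tmp/cron_weekly.lock /home/ubuntu/scrapy_proj/cron/cron_weekly.sh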
# ==============================================
# Configuration: adjust or extend as needed
# ==============================================
# Extend PATH so cron can find scrapy (adjust per the output of `which scrapy`)
export PATH="/home/ubuntu/.local/bin:$PATH"
# Project base path: resolved automatically as the parent of this script's directory, i.e. scrapy_proj/
SCRAPY_PROJ_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
LOG_DIR="${SCRAPY_PROJ_DIR}/log"  # log directory
SLEEP_SECONDS=60                  # pause between consecutive spiders, in seconds
# Compute the shared date parameter: today minus 8 days, formatted yyyy-mm-dd
COMMON_DATE_PARAM=$(date -d "8 days ago" +%Y-%m-%d)
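# Note: `date -d "8 days ago"` is GNU date syntax; on BSD/macOS the equivalent
# would be `date -v-8d +%Y-%m-%d`. This script assumes a Linux host.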
# ==============================================
# Spider registry: define the command for each spider here
# Format: register_spider "spider name" "full command (variables and custom arguments supported)"
# ==============================================
# Registration helper (no need to modify)
declare -a SPIDER_REGISTRY=()  # holds the registered spider entries
register_spider() {
    local spider_name="$1"
    local execute_cmd="$2"
    SPIDER_REGISTRY+=("${spider_name}|${execute_cmd}")
}
# Registration examples: modify or extend as needed
# u3c3: needs only the shared date parameter
register_spider "u3c3" "scrapy crawl u3c3 -a begin=${COMMON_DATE_PARAM}"
# sis: needs only the shared date parameter
register_spider "sis" "scrapy crawl sis -a begin=${COMMON_DATE_PARAM}"
# clm: needs the shared date plus a custom argument
register_spider "clm" "scrapy crawl clm -a begin=${COMMON_DATE_PARAM} -a mod='update'"
# pbox: fully self-contained, does not use the shared date
register_spider "pbox" "scrapy crawl pbox"
# ==============================================
# Core execution logic (no need to modify)
# ==============================================
# Initialize the log directory
mkdir -p "${LOG_DIR}"
current_time=$(date +"%Y%m%d")  # run date, used in log file names
main_log="${LOG_DIR}/cron_${current_time}.log"
# Logging helper: timestamps each message and appends it to the main log
log() {
    local msg="$1"
    local timestamp
    timestamp=$(date +"%Y-%m-%d %H:%M:%S")
    echo "[$timestamp] $msg" | tee -a "${main_log}"
}
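# Note: tee also echoes to stdout, so when the script runs under cron the same
# lines may additionally be captured (or mailed) by cron itself.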
# Run a single spider
execute_spider() {
    local spider_name="$1"
    local execute_cmd="$2"
    log "===== Starting ${spider_name} ====="
    log "Command: ${execute_cmd}"
    # Each spider logs to its own file for easier troubleshooting
    local spider_log="${LOG_DIR}/${spider_name}_${current_time}.log"
    # Run from the project directory so the scrapy command resolves correctly
    (cd "${SCRAPY_PROJ_DIR}" && eval "${execute_cmd}") > "${spider_log}" 2>&1
    local exit_code=$?  # capture the command's result
    # Evaluate the result
    if [ ${exit_code} -eq 0 ]; then
        log "${spider_name} succeeded (log: ${spider_log})"
    else
        log "ERROR: ${spider_name} failed (log: ${spider_log}, exit code: ${exit_code})"
    fi
    return ${exit_code}
}
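# A safer variant (sketch only, not wired in): store each command as a bash
# array and invoke it without eval, e.g.
#   cmd=(scrapy crawl u3c3 -a "begin=${COMMON_DATE_PARAM}")
#   (cd "${SCRAPY_PROJ_DIR}" && "${cmd[@]}") > "${spider_log}" 2>&1
# eval is used above because the registry stores each command as a plain string.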
# ==============================================
# Main flow: run the registered spiders in order
# ==============================================
log "===== Spider scheduler started ====="
log "Project path: ${SCRAPY_PROJ_DIR}"
log "Shared date parameter: ${COMMON_DATE_PARAM}"
log "Registered spiders: ${#SPIDER_REGISTRY[@]}"
# Abort if no spiders are registered
if [ ${#SPIDER_REGISTRY[@]} -eq 0 ]; then
    log "ERROR: no spiders registered; aborting"
    exit 1
fi
# Run every registered spider in turn
for spider_info in "${SPIDER_REGISTRY[@]}"; do
    # Parse the registry entry (split into name and command)
    IFS="|" read -r spider_name execute_cmd <<< "${spider_info}"
    # Run the current spider
    execute_spider "${spider_name}" "${execute_cmd}"
    last_exit_code=$?
    # To enable fail-fast behavior (stop after the first failure), uncomment:
    # if [ ${last_exit_code} -ne 0 ]; then
    #     log "ERROR: ${spider_name} failed; skipping the remaining spiders"
    #     exit ${last_exit_code}
    # fi
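    # Note: the SPIDER_REGISTRY[-1] negative index below requires bash 4.3+.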
    # Sleep unless this is the last spider
    if [ "${spider_info}" != "${SPIDER_REGISTRY[-1]}" ]; then
        log "Waiting ${SLEEP_SECONDS}s before the next spider..."
        sleep ${SLEEP_SECONDS}
    fi
done
log "===== 所有注册的Spider执行完毕 ====="
exit 0