226 lines
7.1 KiB
Bash
Executable File
226 lines
7.1 KiB
Bash
Executable File
#!/bin/bash
|
||
|
||
# ==============================================
|
||
# 配置区:按周期类型区分配置
|
||
# ==============================================
|
||
# 补充环境变量
|
||
export PATH="/home/ubuntu/.local/bin:$PATH"
|
||
|
||
# 项目基础路径
|
||
SCRAPY_PROJ_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
|
||
GIT_PROJ_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/../../" && pwd)
|
||
LOG_DIR="${SCRAPY_PROJ_DIR}/log"
|
||
mkdir -p "${LOG_DIR}" # 确保日志目录存在(锁文件依赖此目录)
|
||
SLEEP_SECONDS=60
|
||
|
||
# ==============================================
|
||
# 核心:锁文件机制(解决并发执行冲突)
|
||
# ==============================================
|
||
# 锁文件路径(放在日志目录,便于管理)
|
||
LOCK_FILE="${LOG_DIR:-/tmp}/cron_scheduler.lock"
|
||
# 最大等待时间(秒):根据实际任务时长设置(示例:5小时=18000秒)
|
||
MAX_WAIT_SECONDS=18000
|
||
# 每次检查锁的间隔时间(秒)
|
||
WAIT_INTERVAL=60
|
||
|
||
# 获取锁函数:返回0表示成功获取,1表示超时
|
||
acquire_lock() {
|
||
local start_time=$(date +%s)
|
||
local current_time
|
||
|
||
while true; do
|
||
# 原子操作创建锁文件(若不存在则创建,存在则失败)
|
||
if mkdir "${LOCK_FILE}" 2>/dev/null; then
|
||
# 成功获取锁,记录PID(便于排查异常)
|
||
echo $$ > "${LOCK_FILE}/pid"
|
||
return 0
|
||
fi
|
||
|
||
# 已存在锁文件,检查是否超时
|
||
current_time=$(date +%s)
|
||
if [ $((current_time - start_time)) -ge ${MAX_WAIT_SECONDS} ]; then
|
||
echo "等待超时(超过${MAX_WAIT_SECONDS}秒),其他实例仍在执行"
|
||
return 1
|
||
fi
|
||
|
||
# 未超时,继续等待
|
||
local running_pid=$(cat "${LOCK_FILE}/pid" 2>/dev/null || echo "unknown")
|
||
echo "检测到其他任务正在执行(PID: ${running_pid}),将在${WAIT_INTERVAL}秒后再次检查..."
|
||
sleep ${WAIT_INTERVAL}
|
||
done
|
||
}
|
||
|
||
# 释放锁函数
|
||
release_lock() {
|
||
if [ -d "${LOCK_FILE}" ]; then
|
||
# 删除锁文件(目录)
|
||
rm -rf "${LOCK_FILE}"
|
||
echo "已释放锁文件"
|
||
fi
|
||
}
|
||
|
||
# 注册退出陷阱:无论脚本正常/异常退出,都释放锁
|
||
trap release_lock EXIT
|
||
|
||
|
||
# ==============================================
|
||
# Git操作:拉取最新代码
|
||
# ==============================================
|
||
# 功能:执行git pull并检查结果
|
||
# 参数:1. 项目目录 2. 日志函数(可选)
|
||
# 返回值:0=成功,1=失败
|
||
git_pull() {
|
||
local repo_dir="$1"
|
||
local log_func="${2:-echo}" # 允许传入日志函数
|
||
|
||
if [ ! -d "${repo_dir}/.git" ]; then
|
||
$log_func "ERROR: 目录${repo_dir}不是Git仓库,无法执行git pull"
|
||
return 1
|
||
fi
|
||
|
||
$log_func "开始执行git pull更新代码..."
|
||
local pull_output=$(cd "${repo_dir}" && git pull 2>&1)
|
||
local exit_code=$?
|
||
|
||
if [ ${exit_code} -eq 0 ]; then
|
||
$log_func "git pull成功:${pull_output}"
|
||
return 0
|
||
else
|
||
$log_func "ERROR: git pull失败(退出码${exit_code}):${pull_output}"
|
||
return 1
|
||
fi
|
||
}
|
||
|
||
# ==============================================
|
||
# 参数解析:区分执行周期(每周/每月)
|
||
# ==============================================
|
||
if [ $# -ne 1 ]; then
|
||
echo "用法:$0 [--weekly|--monthly]"
|
||
exit 1
|
||
fi
|
||
|
||
PERIOD=$1
|
||
if [ "${PERIOD}" != "--weekly" ] && [ "${PERIOD}" != "--monthly" ]; then
|
||
echo "错误:参数必须是 --weekly 或 --monthly"
|
||
exit 1
|
||
fi
|
||
|
||
|
||
# 按周期设置日期参数
|
||
if [ "${PERIOD}" = "--weekly" ]; then
|
||
COMMON_DATE_PARAM=$(date -d "8 days ago" +%Y-%m-%d)
|
||
elif [ "${PERIOD}" = "--monthly" ]; then
|
||
COMMON_DATE_PARAM=$(date -d "32 days ago" +%Y-%m-%d)
|
||
fi
|
||
|
||
|
||
# ==============================================
|
||
# 注册Spider:按周期类型注册不同任务
|
||
# ==============================================
|
||
declare -a SPIDER_REGISTRY=()
|
||
register_spider() {
|
||
local spider_name="$1"
|
||
local execute_cmd="$2"
|
||
SPIDER_REGISTRY+=("${spider_name}|${execute_cmd}")
|
||
}
|
||
|
||
# 每周任务
|
||
if [ "${PERIOD}" = "--weekly" ]; then
|
||
register_spider "sis" "scrapy crawl sis -a begin=${COMMON_DATE_PARAM}"
|
||
register_spider "u3c3" "scrapy crawl u3c3 -a begin=${COMMON_DATE_PARAM}"
|
||
register_spider "clm" "scrapy crawl clm -a begin=${COMMON_DATE_PARAM} -a mod='update' "
|
||
fi
|
||
|
||
# 每月任务
|
||
if [ "${PERIOD}" = "--monthly" ]; then
|
||
register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' "
|
||
register_spider "pbox" "scrapy crawl javhd -a mod='update' "
|
||
fi
|
||
|
||
|
||
# ==============================================
|
||
# 核心执行逻辑(复用+锁机制整合)
|
||
# ==============================================
|
||
current_time=$(date +"%Y%m%d")
|
||
main_log="${LOG_DIR}/cron_${PERIOD#--}_${current_time}.log"
|
||
|
||
log() {
|
||
local msg="$1"
|
||
local timestamp=$(date +"%Y-%m-%d %H:%M:%S")
|
||
echo "[$timestamp] $msg" | tee -a "${main_log}"
|
||
}
|
||
|
||
execute_spider() {
|
||
local spider_name="$1"
|
||
local execute_cmd="$2"
|
||
|
||
log "===== 开始执行 ${spider_name} ====="
|
||
log "执行命令:${execute_cmd}"
|
||
|
||
local spider_log="${LOG_DIR}/${spider_name}_${PERIOD#--}_${current_time}.log"
|
||
|
||
(cd "${SCRAPY_PROJ_DIR}" && eval "${execute_cmd}") > "${spider_log}" 2>&1
|
||
local exit_code=$?
|
||
|
||
if [ ${exit_code} -eq 0 ]; then
|
||
log "${spider_name} 执行成功(日志:${spider_log})"
|
||
else
|
||
log "ERROR: ${spider_name} 执行失败(日志:${spider_log},退出码:${exit_code})"
|
||
fi
|
||
return ${exit_code}
|
||
}
|
||
|
||
|
||
# ==============================================
|
||
# 主流程(整合锁机制)
|
||
# ==============================================
|
||
log "===== 爬虫调度脚本启动(周期:${PERIOD#--}) ====="
|
||
log "项目路径:${SCRAPY_PROJ_DIR}"
|
||
log "公共日期参数:${COMMON_DATE_PARAM}"
|
||
log "已注册Spider数量:${#SPIDER_REGISTRY[@]}"
|
||
|
||
# 第一步:获取锁(若失败则退出)
|
||
log "尝试获取执行锁..."
|
||
if ! acquire_lock; then
|
||
log "ERROR: 无法获取执行锁,脚本终止"
|
||
exit 1
|
||
fi
|
||
log "成功获取执行锁,开始执行任务"
|
||
|
||
# 拉取最新代码(关键步骤:失败则终止执行)
|
||
if ! git_pull "${GIT_PROJ_DIR}" log; then
|
||
log "ERROR: 代码更新失败,终止后续执行"
|
||
exit 1
|
||
fi
|
||
|
||
|
||
# 第二步:检查注册任务
|
||
if [ ${#SPIDER_REGISTRY[@]} -eq 0 ]; then
|
||
log "ERROR: 未注册任何${PERIOD#--}Spider,脚本终止"
|
||
release_lock # 释放锁
|
||
exit 1
|
||
fi
|
||
|
||
# 第三步:执行所有任务
|
||
for spider_info in "${SPIDER_REGISTRY[@]}"; do
|
||
IFS="|" read -r spider_name execute_cmd <<< "${spider_info}"
|
||
execute_spider "${spider_name}" "${execute_cmd}"
|
||
last_exit_code=$?
|
||
|
||
# 可选:失败即终止(取消注释启用)
|
||
# if [ ${last_exit_code} -ne 0 ]; then
|
||
# log "ERROR: 因${spider_name}执行失败,终止后续执行"
|
||
# release_lock # 释放锁
|
||
# exit ${last_exit_code}
|
||
# fi
|
||
|
||
if [ "${spider_info}" != "${SPIDER_REGISTRY[-1]}" ]; then
|
||
log "等待${SLEEP_SECONDS}秒后执行下一个Spider..."
|
||
sleep ${SLEEP_SECONDS}
|
||
fi
|
||
done
|
||
|
||
# 第四步:执行完成,释放锁(脚本退出时也会自动释放)
|
||
log "===== 所有${PERIOD#--}Spider执行完毕 ====="
|
||
release_lock
|
||
exit 0 |