#!/bin/bash
# ==============================================
# Cron scheduler for Scrapy spiders.
#
# Usage:   cron_scheduler.sh [--weekly|--monthly]
#
# Acquires a directory-based mutex so overlapping cron invocations never run
# concurrently, pulls the latest code via git, then runs the spiders
# registered for the requested period, logging each one separately.
# ==============================================

# Make locally-installed tools (scrapy, etc.) visible under cron's sparse PATH.
export PATH="/home/ubuntu/.local/bin:$PATH"

# Project base paths, resolved relative to this script's location.
SCRAPY_PROJ_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
GIT_PROJ_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/../../" && pwd)
LOG_DIR="${SCRAPY_PROJ_DIR}/log"
mkdir -p "${LOG_DIR}"   # Ensure log dir exists (the lock lives here too).
SLEEP_SECONDS=60        # Pause between consecutive spiders.

# ==============================================
# Locking (prevents concurrent executions)
# ==============================================
# The "lock file" is actually a directory: mkdir is atomic, so creation
# doubles as a test-and-set. A pid file inside aids debugging.
LOCK_FILE="${LOG_DIR:-/tmp}/cron_scheduler.lock"
# Maximum time to wait for the lock (example: 5 hours = 18000 s).
MAX_WAIT_SECONDS=18000
# Interval between lock re-checks.
WAIT_INTERVAL=60
# BUGFIX: only the instance that actually acquired the lock may release it.
# Previously the EXIT trap removed the lock unconditionally, so an instance
# exiting early (bad args / wait timeout) deleted ANOTHER instance's lock.
LOCK_ACQUIRED=0

# Acquire the lock, waiting up to MAX_WAIT_SECONDS.
# Returns: 0 on success, 1 on timeout.
acquire_lock() {
    local start_time current_time running_pid
    start_time=$(date +%s)
    while true; do
        # Atomic create: succeeds only if no other instance holds the lock.
        if mkdir "${LOCK_FILE}" 2>/dev/null; then
            echo $$ > "${LOCK_FILE}/pid"   # Record owner PID for debugging.
            LOCK_ACQUIRED=1
            return 0
        fi

        running_pid=$(cat "${LOCK_FILE}/pid" 2>/dev/null || echo "")

        # ROBUSTNESS: reclaim a stale lock left behind by a holder that died
        # without running its EXIT trap (SIGKILL, reboot). kill -0 only probes
        # existence; assumes cron jobs run as the same user.
        if [ -n "${running_pid}" ] && ! kill -0 "${running_pid}" 2>/dev/null; then
            rm -rf "${LOCK_FILE}"
            continue
        fi

        current_time=$(date +%s)
        if [ $((current_time - start_time)) -ge "${MAX_WAIT_SECONDS}" ]; then
            echo "等待超时(超过${MAX_WAIT_SECONDS}秒),其他实例仍在执行"
            return 1
        fi

        echo "检测到其他任务正在执行(PID: ${running_pid:-unknown}),将在${WAIT_INTERVAL}秒后再次检查..."
        sleep "${WAIT_INTERVAL}"
    done
}

# Release the lock — but only if THIS process acquired it.
release_lock() {
    if [ "${LOCK_ACQUIRED}" -eq 1 ] && [ -d "${LOCK_FILE}" ]; then
        rm -rf "${LOCK_FILE}"
        LOCK_ACQUIRED=0
        echo "已释放锁文件"
    fi
}

# Release the lock on any exit path (normal or abnormal).
trap release_lock EXIT

# ==============================================
# Git: pull latest code
# ==============================================
# Arguments: $1 - repository directory
#            $2 - logging function name (optional, defaults to echo)
# Returns:   0 on success, 1 on failure.
git_pull() {
    local repo_dir="$1"
    local log_func="${2:-echo}"

    if [ ! -d "${repo_dir}/.git" ]; then
        $log_func "ERROR: 目录${repo_dir}不是Git仓库,无法执行git pull"
        return 1
    fi

    $log_func "开始执行git pull更新代码..."
    local pull_output exit_code
    # BUGFIX: 'local var=$(cmd)' masks cmd's exit status ($? reflects the
    # 'local' builtin, which always succeeds) — declare first, assign after.
    pull_output=$(cd "${repo_dir}" && git pull 2>&1)
    exit_code=$?
    if [ ${exit_code} -eq 0 ]; then
        $log_func "git pull成功:${pull_output}"
        return 0
    else
        $log_func "ERROR: git pull失败(退出码${exit_code}):${pull_output}"
        return 1
    fi
}

# ==============================================
# Argument parsing: select the schedule period
# ==============================================
if [ $# -ne 1 ]; then
    echo "用法:$0 [--weekly|--monthly]"
    exit 1
fi
PERIOD=$1
if [ "${PERIOD}" != "--weekly" ] && [ "${PERIOD}" != "--monthly" ]; then
    echo "错误:参数必须是 --weekly 或 --monthly"
    exit 1
fi

# Look-back date passed to the spiders: one day beyond the period length so
# runs overlap slightly and no items are missed at the boundary.
if [ "${PERIOD}" = "--weekly" ]; then
    COMMON_DATE_PARAM=$(date -d "8 days ago" +%Y-%m-%d)
elif [ "${PERIOD}" = "--monthly" ]; then
    COMMON_DATE_PARAM=$(date -d "32 days ago" +%Y-%m-%d)
fi

# ==============================================
# Spider registry: period-specific task list
# ==============================================
declare -a SPIDER_REGISTRY=()

# Arguments: $1 - spider name, $2 - full shell command to run it.
register_spider() {
    local spider_name="$1"
    local execute_cmd="$2"
    SPIDER_REGISTRY+=("${spider_name}|${execute_cmd}")
}

# Weekly tasks
if [ "${PERIOD}" = "--weekly" ]; then
    register_spider "sis" "scrapy crawl sis -a begin=${COMMON_DATE_PARAM}"
    register_spider "u3c3" "scrapy crawl u3c3 -a begin=${COMMON_DATE_PARAM}"
    register_spider "clm" "scrapy crawl clm -a begin=${COMMON_DATE_PARAM} -a mod='update' "
fi

# Monthly tasks
if [ "${PERIOD}" = "--monthly" ]; then
    register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update' "
    register_spider "javhd" "scrapy crawl javhd -a mod='update' "
    register_spider "lord" "scrapy crawl lord -a mod='update' "
    register_spider "javbus" "scrapy crawl javbus -a cmd='actors' -s HTTPCACHE_DIR=/home/ubuntu/sharedata/scrapy_cached/ "
fi

# ==============================================
# Execution helpers
# ==============================================
current_time=$(date +"%Y%m%d")
main_log="${LOG_DIR}/cron_${PERIOD#--}_${current_time}.log"

# Timestamped logging to stdout and the main log file.
log() {
    local msg="$1"
    local timestamp
    timestamp=$(date +"%Y-%m-%d %H:%M:%S")
    echo "[$timestamp] $msg" | tee -a "${main_log}"
}

# Run one spider, capturing its output in a dedicated log file.
# Arguments: $1 - spider name, $2 - command string (eval'd; commands come only
#            from the hard-coded registry above, never from external input).
# Returns:   the spider process's exit code.
execute_spider() {
    local spider_name="$1"
    local execute_cmd="$2"
    log "===== 开始执行 ${spider_name} ====="
    log "执行命令:${execute_cmd}"
    local spider_log="${LOG_DIR}/${spider_name}_${PERIOD#--}_${current_time}.log"
    (cd "${SCRAPY_PROJ_DIR}" && eval "${execute_cmd}") > "${spider_log}" 2>&1
    local exit_code=$?   # $? expands before 'local' runs, so this is safe here.
    if [ ${exit_code} -eq 0 ]; then
        log "${spider_name} 执行成功(日志:${spider_log})"
    else
        log "ERROR: ${spider_name} 执行失败(日志:${spider_log},退出码:${exit_code})"
    fi
    return ${exit_code}
}

# ==============================================
# Main flow
# ==============================================
log "===== 爬虫调度脚本启动(周期:${PERIOD#--}) ====="
log "项目路径:${SCRAPY_PROJ_DIR}"
log "公共日期参数:${COMMON_DATE_PARAM}"
log "已注册Spider数量:${#SPIDER_REGISTRY[@]}"

# Step 1: acquire the lock (exit if another instance keeps it too long).
log "尝试获取执行锁..."
if ! acquire_lock; then
    log "ERROR: 无法获取执行锁,脚本终止"
    exit 1
fi
log "成功获取执行锁,开始执行任务"

# Update code first; abort everything on failure (trap releases the lock).
if ! git_pull "${GIT_PROJ_DIR}" log; then
    log "ERROR: 代码更新失败,终止后续执行"
    exit 1
fi

# Step 2: sanity-check the registry.
if [ ${#SPIDER_REGISTRY[@]} -eq 0 ]; then
    log "ERROR: 未注册任何${PERIOD#--}Spider,脚本终止"
    release_lock
    exit 1
fi

# Step 3: run every registered spider; failures are logged but do not stop
# the remaining spiders (uncomment below to fail fast).
for spider_info in "${SPIDER_REGISTRY[@]}"; do
    IFS="|" read -r spider_name execute_cmd <<< "${spider_info}"
    execute_spider "${spider_name}" "${execute_cmd}"
    last_exit_code=$?
    # if [ ${last_exit_code} -ne 0 ]; then
    #     log "ERROR: 因${spider_name}执行失败,终止后续执行"
    #     release_lock
    #     exit ${last_exit_code}
    # fi
    # Throttle between spiders, but not after the final one.
    if [ "${spider_info}" != "${SPIDER_REGISTRY[-1]}" ]; then
        log "等待${SLEEP_SECONDS}秒后执行下一个Spider..."
        sleep ${SLEEP_SECONDS}
    fi
done

# Step 4: done — release the lock (the EXIT trap would do this anyway).
log "===== 所有${PERIOD#--}Spider执行完毕 ====="
release_lock
exit 0