resources/scrapy_proj/cron/cron_scheduler.sh

#!/bin/bash
# ==============================================
# Configuration: settings that differ by schedule type
# ==============================================
# Supplement the environment PATH
export PATH="/home/ubuntu/.local/bin:$PATH"
# Project base paths
SCRAPY_PROJ_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
GIT_PROJ_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/../../" && pwd)
LOG_DIR="${SCRAPY_PROJ_DIR}/log"
mkdir -p "${LOG_DIR}" # make sure the log directory exists (the lock file lives here)
SLEEP_SECONDS=60
# ==============================================
# Core: lock-file mechanism (prevents concurrent runs)
# ==============================================
# Lock file path (kept in the log directory for easier management)
LOCK_FILE="${LOG_DIR:-/tmp}/cron_scheduler.lock"
# Maximum wait time; set it according to the actual task duration (example: 5 hours = 18000 seconds)
MAX_WAIT_SECONDS=18000
# Interval between lock checks (seconds)
WAIT_INTERVAL=60
# Acquire the lock; returns 0 on success, 1 on timeout
acquire_lock() {
    local start_time=$(date +%s)
    local current_time
    while true; do
        # Atomic operation: create the lock directory (succeeds only if it does not already exist)
        if mkdir "${LOCK_FILE}" 2>/dev/null; then
            # Lock acquired; record the PID to help diagnose problems
            echo $$ > "${LOCK_FILE}/pid"
            return 0
        fi
        # The lock already exists; check for timeout
        current_time=$(date +%s)
        if [ $((current_time - start_time)) -ge ${MAX_WAIT_SECONDS} ]; then
            echo "Wait timed out (more than ${MAX_WAIT_SECONDS} seconds); another instance is still running"
            return 1
        fi
        # Not timed out yet; keep waiting
        local running_pid=$(cat "${LOCK_FILE}/pid" 2>/dev/null || echo "unknown")
        echo "Another task is running (PID: ${running_pid}); checking again in ${WAIT_INTERVAL} seconds..."
        sleep ${WAIT_INTERVAL}
    done
}
# Release the lock
release_lock() {
    if [ -d "${LOCK_FILE}" ]; then
        # Remove the lock (a directory)
        rm -rf "${LOCK_FILE}"
        echo "Lock released"
    fi
}
# Register an exit trap: release the lock whether the script exits normally or abnormally
trap release_lock EXIT
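# Note: the EXIT trap cannot fire if a run is killed with SIGKILL, so a stale lock
# directory may be left behind. Once no instance is actually running, it can be
# removed manually, for example:
#   rm -rf "${LOG_DIR}/cron_scheduler.lock"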
# ==============================================
# Git operation: pull the latest code
# ==============================================
# Purpose: run git pull and check the result
# Arguments: 1. repository directory  2. logging function (optional)
# Return value: 0 = success, 1 = failure
git_pull() {
    local repo_dir="$1"
    local log_func="${2:-echo}" # a custom logging function may be passed in
    if [ ! -d "${repo_dir}/.git" ]; then
        $log_func "ERROR: ${repo_dir} is not a Git repository; cannot run git pull"
        return 1
    fi
    $log_func "Running git pull to update the code..."
    # Assign separately so that $? reflects git pull, not the local builtin
    local pull_output
    pull_output=$(cd "${repo_dir}" && git pull 2>&1)
    local exit_code=$?
    if [ ${exit_code} -eq 0 ]; then
        $log_func "git pull succeeded: ${pull_output}"
        return 0
    else
        $log_func "ERROR: git pull failed (exit code ${exit_code}): ${pull_output}"
        return 1
    fi
}
# ==============================================
# Argument parsing: distinguish the schedule (weekly / monthly)
# ==============================================
if [ $# -ne 1 ]; then
    echo "Usage: $0 [--weekly|--monthly]"
    exit 1
fi
PERIOD=$1
if [ "${PERIOD}" != "--weekly" ] && [ "${PERIOD}" != "--monthly" ]; then
    echo "Error: the argument must be --weekly or --monthly"
    exit 1
fi
# Set the date parameter according to the schedule
if [ "${PERIOD}" = "--weekly" ]; then
    COMMON_DATE_PARAM=$(date -d "8 days ago" +%Y-%m-%d)
elif [ "${PERIOD}" = "--monthly" ]; then
    COMMON_DATE_PARAM=$(date -d "32 days ago" +%Y-%m-%d)
fi
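# Example crontab entries for this script (illustrative schedule and path only;
# adjust both to the actual deployment):
#   0 3 * * 1 /bin/bash /path/to/resources/scrapy_proj/cron/cron_scheduler.sh --weekly
#   0 4 1 * * /bin/bash /path/to/resources/scrapy_proj/cron/cron_scheduler.sh --monthly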
# ==============================================
# Register spiders: different tasks are registered per schedule type
# ==============================================
declare -a SPIDER_REGISTRY=()
register_spider() {
    local spider_name="$1"
    local execute_cmd="$2"
    SPIDER_REGISTRY+=("${spider_name}|${execute_cmd}")
}
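# Registry entries are stored as plain "name|command" strings and split on "|"
# in the main loop, so a registered command must not contain a literal "|".
# Hypothetical example for illustration:
#   register_spider "example" "scrapy crawl example -a begin=${COMMON_DATE_PARAM}"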
# Weekly tasks
if [ "${PERIOD}" = "--weekly" ]; then
    register_spider "sis" "scrapy crawl sis -a begin=${COMMON_DATE_PARAM}"
    register_spider "u3c3" "scrapy crawl u3c3 -a begin=${COMMON_DATE_PARAM}"
    register_spider "clm" "scrapy crawl clm -a begin=${COMMON_DATE_PARAM} -a mod='update'"
fi
# Monthly tasks
if [ "${PERIOD}" = "--monthly" ]; then
    register_spider "pbox" "scrapy crawl pbox -a begin=${COMMON_DATE_PARAM} -a mod='update'"
register_spider "pbox" "scrapy crawl javhd -a mod='update' "
fi
# ==============================================
# Core execution logic (reuse + lock mechanism)
# ==============================================
current_time=$(date +"%Y%m%d")
main_log="${LOG_DIR}/cron_${PERIOD#--}_${current_time}.log"
log() {
    local msg="$1"
    local timestamp=$(date +"%Y-%m-%d %H:%M:%S")
    echo "[$timestamp] $msg" | tee -a "${main_log}"
}
execute_spider() {
    local spider_name="$1"
    local execute_cmd="$2"
    log "===== Starting ${spider_name} ====="
    log "Command: ${execute_cmd}"
    local spider_log="${LOG_DIR}/${spider_name}_${PERIOD#--}_${current_time}.log"
    (cd "${SCRAPY_PROJ_DIR}" && eval "${execute_cmd}") > "${spider_log}" 2>&1
    local exit_code=$?
    if [ ${exit_code} -eq 0 ]; then
        log "${spider_name} finished successfully (log: ${spider_log})"
    else
        log "ERROR: ${spider_name} failed (log: ${spider_log}, exit code: ${exit_code})"
    fi
    return ${exit_code}
}
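# For illustration: a spider registered as "sis" during a --weekly run started on a
# hypothetical date 20250723 would write its output to ${LOG_DIR}/sis_weekly_20250723.log.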
# ==============================================
# Main flow (with the lock mechanism integrated)
# ==============================================
log "===== Spider scheduler started (period: ${PERIOD#--}) ====="
log "Project path: ${SCRAPY_PROJ_DIR}"
log "Common date parameter: ${COMMON_DATE_PARAM}"
log "Number of registered spiders: ${#SPIDER_REGISTRY[@]}"
# Step 1: acquire the lock (exit on failure)
log "Trying to acquire the execution lock..."
if ! acquire_lock; then
    log "ERROR: could not acquire the execution lock; aborting"
    exit 1
fi
log "Execution lock acquired; starting tasks"
# Pull the latest code (critical step: abort if it fails)
if ! git_pull "${GIT_PROJ_DIR}" log; then
    log "ERROR: code update failed; aborting the remaining steps"
    exit 1
fi
# Step 2: check that tasks were registered
if [ ${#SPIDER_REGISTRY[@]} -eq 0 ]; then
    log "ERROR: no ${PERIOD#--} spiders registered; aborting"
    release_lock # release the lock
    exit 1
fi
# Step 3: run all tasks
for spider_info in "${SPIDER_REGISTRY[@]}"; do
    IFS="|" read -r spider_name execute_cmd <<< "${spider_info}"
    execute_spider "${spider_name}" "${execute_cmd}"
    last_exit_code=$?
    # Optional: stop on the first failure (uncomment to enable)
    # if [ ${last_exit_code} -ne 0 ]; then
    #     log "ERROR: ${spider_name} failed; aborting the remaining spiders"
    #     release_lock # release the lock
    #     exit ${last_exit_code}
    # fi
    if [ "${spider_info}" != "${SPIDER_REGISTRY[-1]}" ]; then
        log "Waiting ${SLEEP_SECONDS} seconds before the next spider..."
        sleep ${SLEEP_SECONDS}
    fi
done
# Step 4: all done; release the lock (it would also be released automatically on exit)
log "===== All ${PERIOD#--} spiders finished ====="
release_lock
exit 0