diff --git a/scrapy_proj/cron/cron_scheduler.sh b/scrapy_proj/cron/cron_scheduler.sh
new file mode 100644
index 0000000..b28624e
--- /dev/null
+++ b/scrapy_proj/cron/cron_scheduler.sh
@@ -0,0 +1,189 @@
+#!/bin/bash
+
+# ==============================================
+# Configuration: settings split by schedule type
+# ==============================================
+# Extend PATH with user-local binaries
+export PATH="/home/ubuntu/.local/bin:$PATH"
+
+# Project base paths
+SCRAPY_PROJ_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
+LOG_DIR="${SCRAPY_PROJ_DIR}/log"
+mkdir -p "${LOG_DIR}"  # Ensure the log directory exists (the lock lives here)
+SLEEP_SECONDS=60
+
+# ==============================================
+# Core: lock mechanism (prevents concurrent runs)
+# ==============================================
+# Lock path (a directory, kept under the log dir for easy management)
+LOCK_FILE="${LOG_DIR:-/tmp}/cron_scheduler.lock"
+# Maximum wait time in seconds; size it to the real task duration (example: 5 hours = 18000 seconds)
+MAX_WAIT_SECONDS=18000
+# Interval between lock checks, in seconds
+WAIT_INTERVAL=60
+
+# Acquire the lock: returns 0 on success, 1 on timeout
+acquire_lock() {
+    local start_time=$(date +%s)
+    local current_time
+
+    while true; do
+        # mkdir is atomic: it succeeds only if the lock directory does not exist yet
+        if mkdir "${LOCK_FILE}" 2>/dev/null; then
+            # Lock acquired; record our PID to help diagnose stale locks
+            echo $$ > "${LOCK_FILE}/pid"
+            return 0
+        fi
+
+        # Lock already held; check whether we have waited too long
+        current_time=$(date +%s)
+        if [ $((current_time - start_time)) -ge ${MAX_WAIT_SECONDS} ]; then
+            echo "Timed out after ${MAX_WAIT_SECONDS} seconds; another instance is still running"
+            return 1
+        fi
+
+        # Not timed out yet; keep waiting
+        local running_pid=$(cat "${LOCK_FILE}/pid" 2>/dev/null || echo "unknown")
+        echo "Another run is in progress (PID: ${running_pid}); checking again in ${WAIT_INTERVAL} seconds..."
+        sleep ${WAIT_INTERVAL}
+    done
+}
+
+# Release the lock
+release_lock() {
+    if [ -d "${LOCK_FILE}" ]; then
+        # Remove the lock directory
+        rm -rf "${LOCK_FILE}"
+        echo "Lock released"
+    fi
+}
+
+# Exit trap: release the lock whether the script exits normally or abnormally
+trap release_lock EXIT
+
+
+# ==============================================
+# Argument parsing: weekly vs. monthly schedule
+# ==============================================
+if [ $# -ne 1 ]; then
+    echo "Usage: $0 [--weekly|--monthly]"
+    exit 1
+fi
+
+PERIOD=$1
+if [ "${PERIOD}" != "--weekly" ] && [ "${PERIOD}" != "--monthly" ]; then
+    echo "Error: argument must be --weekly or --monthly"
+    exit 1
+fi
+
+
+# Set the date parameter according to the schedule
+if [ "${PERIOD}" = "--weekly" ]; then
+    COMMON_DATE_PARAM=$(date -d "8 days ago" +%Y-%m-%d)
+elif [ "${PERIOD}" = "--monthly" ]; then
+    COMMON_DATE_PARAM=$(date -d "31 days ago" +%Y-%m-%d)
+fi
+
+
+# ==============================================
+# Spider registry: different jobs per schedule type
+# ==============================================
+declare -a SPIDER_REGISTRY=()
+register_spider() {
+    local spider_name="$1"
+    local execute_cmd="$2"
+    SPIDER_REGISTRY+=("${spider_name}|${execute_cmd}")
+}
+
+# Weekly jobs
+if [ "${PERIOD}" = "--weekly" ]; then
+    register_spider "u3c3" "scrapy crawl u3c3 -a begin=${COMMON_DATE_PARAM}"
+    register_spider "sis" "scrapy crawl sis -a begin=${COMMON_DATE_PARAM}"
+    register_spider "clm" "scrapy crawl clm -a begin=${COMMON_DATE_PARAM} -a mod='update'"
+fi
+
+# Monthly jobs
+if [ "${PERIOD}" = "--monthly" ]; then
+    register_spider "pbox" "scrapy crawl pbox"
+fi
+
+
+# ==============================================
+# Core execution logic (shared helpers plus locking)
+# ==============================================
+current_time=$(date +"%Y%m%d")
+main_log="${LOG_DIR}/cron_${PERIOD#--}_${current_time}.log"
+
+log() {
+    local msg="$1"
+    local timestamp=$(date +"%Y-%m-%d %H:%M:%S")
+    echo "[$timestamp] $msg" | tee -a "${main_log}"
+}
+
+execute_spider() {
+    local spider_name="$1"
+    local execute_cmd="$2"
+
+    log "===== Starting ${spider_name} ====="
+    log "Command: ${execute_cmd}"
+
+    local spider_log="${LOG_DIR}/${spider_name}_${PERIOD#--}_${current_time}.log"
+
+    (cd "${SCRAPY_PROJ_DIR}" && eval "${execute_cmd}") > "${spider_log}" 2>&1
+    local exit_code=$?
+
+    if [ ${exit_code} -eq 0 ]; then
+        log "${spider_name} finished successfully (log: ${spider_log})"
+    else
+        log "ERROR: ${spider_name} failed (log: ${spider_log}, exit code: ${exit_code})"
+    fi
+    return ${exit_code}
+}
+
+
+# ==============================================
+# Main flow (with locking)
+# ==============================================
+log "===== Spider scheduler started (period: ${PERIOD#--}) ====="
+log "Project path: ${SCRAPY_PROJ_DIR}"
+log "Common date parameter: ${COMMON_DATE_PARAM}"
+log "Registered spiders: ${#SPIDER_REGISTRY[@]}"
+
+# Step 1: acquire the lock (exit on failure)
+log "Trying to acquire the execution lock..."
+if ! acquire_lock; then
+    log "ERROR: could not acquire the execution lock; aborting"
+    exit 1
+fi
+log "Execution lock acquired; starting jobs"
+
+# Step 2: make sure at least one spider is registered
+if [ ${#SPIDER_REGISTRY[@]} -eq 0 ]; then
+    log "ERROR: no ${PERIOD#--} spiders registered; aborting"
+    release_lock  # release the lock
+    exit 1
+fi
+
+# Step 3: run every registered spider
+for spider_info in "${SPIDER_REGISTRY[@]}"; do
+    IFS="|" read -r spider_name execute_cmd <<< "${spider_info}"
+    execute_spider "${spider_name}" "${execute_cmd}"
+    last_exit_code=$?
+
+    # Optional: stop on the first failure (uncomment to enable)
+    # if [ ${last_exit_code} -ne 0 ]; then
+    #     log "ERROR: ${spider_name} failed; skipping the remaining spiders"
+    #     release_lock  # release the lock
+    #     exit ${last_exit_code}
+    # fi
+
+    if [ "${spider_info}" != "${SPIDER_REGISTRY[-1]}" ]; then
+        log "Sleeping ${SLEEP_SECONDS} seconds before the next spider..."
+        sleep ${SLEEP_SECONDS}
+    fi
+done
+
+# Step 4: all done; release the lock (the EXIT trap would also release it)
+log "===== All ${PERIOD#--} spiders finished ====="
+release_lock
+exit 0
\ No newline at end of file
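For reference, the scheduler above is intended to be driven from cron with the --weekly and --monthly flags. A minimal sketch of the corresponding crontab entries follows; the run times and the absolute install path are illustrative assumptions, not part of this change:

    # Hypothetical crontab entries (times and install path are placeholders)
    0 2 * * 1  /bin/bash /path/to/scrapy_proj/cron/cron_scheduler.sh --weekly
    0 3 1 * *  /bin/bash /path/to/scrapy_proj/cron/cron_scheduler.sh --monthly

Because of the mkdir-based lock, an overlapping weekly and monthly run will not crawl concurrently: the second invocation waits up to MAX_WAIT_SECONDS for the lock before giving up.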
diff --git a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
index e14246f..863e628 100644
--- a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
+++ b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
@@ -25,23 +25,6 @@ class SisDBHandler(SQLiteDBHandler):
     def insert_item(self, item):
         self.insert_or_update_common(item, tbl_name=self.tbl_name_sis, uniq_key='url', exists_do_nothing=True)
 
-    def _create_tables(self):
-        # Create the sis001 data table
-        self.cursor.execute(f'''
-            CREATE TABLE IF NOT EXISTS {self.tbl_name_sis} (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
-                plate_name TEXT,
-                title TEXT,
-                url TEXT UNIQUE,
-                size_text TEXT,
-                size_gb REAL,
-                update_date TEXT,
-                created_at TEXT DEFAULT (datetime('now', 'localtime')),
-                updated_at TEXT DEFAULT (datetime('now', 'localtime'))
-            )
-        ''')
-        self.conn.commit()
-
     # Statistics helper
     def get_stat(self):
         try:
@@ -67,30 +50,11 @@ class SisDBHandler(SQLiteDBHandler):
 class U3C3DBHandler(SQLiteDBHandler):
     def __init__(self, db_path=default_dbpath):
         super().__init__(db_path)
-        self.tbl_name_u3c3 = 'sis'
+        self.tbl_name_u3c3 = 'u3c3'
 
     def insert_item(self, item):
         self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url', exists_do_nothing=True)
 
-    def _create_tables(self):
-        # Create the u001 data table
-        self.cursor.execute(f'''
-            CREATE TABLE IF NOT EXISTS {self.tbl_name_u3c3} (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
-                category TEXT,
-                title TEXT,
-                url TEXT UNIQUE,
-                torrent_url TEXT,
-                magnet_url TEXT,
-                size_text TEXT,
-                size_gb REAL,
-                update_date TEXT,
-                created_at TEXT DEFAULT (datetime('now', 'localtime')),
-                updated_at TEXT DEFAULT (datetime('now', 'localtime'))
-            )
-        ''')
-        self.conn.commit()
-
     # Statistics helper
     def get_stat(self):
         try:
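The second file fixes U3C3DBHandler, which previously wrote into the 'sis' table because tbl_name_u3c3 was set to 'sis', and removes the per-handler _create_tables methods; the removal suggests schema creation is now handled centrally by the SQLiteDBHandler base class, though that class is not shown in this diff (an assumption). A quick sanity check after deploying could count rows in the corrected table; the database file path below is a placeholder for whatever default_dbpath points at:

    # Hypothetical check: new u3c3 items should land in the u3c3 table (adjust the db path)
    sqlite3 /path/to/scrapy_data.db "SELECT COUNT(*) FROM u3c3;"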