Modify scripts: add lock-file-based concurrency control to the cron scheduler and move table creation out of the SQLite DB handlers

This commit is contained in:
2025-07-22 11:50:39 +08:00
parent 6200e5a1d6
commit 67224f41d0
2 changed files with 190 additions and 37 deletions

View File

@ -0,0 +1,189 @@
#!/bin/bash
# ==============================================
# Config section: settings shared by both schedule types (weekly / monthly)
# ==============================================
# Extend PATH so user-local tools (e.g. scrapy) are found when run from cron
export PATH="/home/ubuntu/.local/bin:$PATH"
# Project base path: the parent directory of this script's location
SCRAPY_PROJ_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
LOG_DIR="${SCRAPY_PROJ_DIR}/log"
mkdir -p "${LOG_DIR}" # ensure the log directory exists (the lock lives here too)
SLEEP_SECONDS=60
# ==============================================
# Core: lock mechanism (prevents concurrent scheduler runs)
# ==============================================
# Lock path (kept in the log dir for easy management). NOTE: despite the
# variable name, the lock is created as a *directory* (mkdir is atomic).
LOCK_FILE="${LOG_DIR:-/tmp}/cron_scheduler.lock"
# Max time to wait for the lock; size this to the real task duration
# (example: 5 hours = 18000 seconds)
MAX_WAIT_SECONDS=18000
# Interval between lock checks (seconds)
WAIT_INTERVAL=60
# Acquire the scheduler lock. Returns 0 on success, 1 on timeout.
# The lock is a directory: mkdir either atomically creates it (lock taken)
# or fails (lock held by another instance). A pid file inside records the
# owner, both for debugging and for stale-lock detection: a process killed
# with SIGKILL never runs its EXIT trap, so without reclamation a crashed
# run would block every future run until manual cleanup.
acquire_lock() {
    local start_time current_time running_pid
    # Split declaration from assignment so a failing date isn't masked
    start_time=$(date +%s)
    while true; do
        # Atomic creation: succeeds only if no other instance holds the lock
        if mkdir "${LOCK_FILE}" 2>/dev/null; then
            # Record our PID so other instances (and humans) can inspect it
            echo $$ > "${LOCK_FILE}/pid"
            return 0
        fi
        # Lock exists — if its recorded owner is no longer alive, the
        # previous run died without cleanup; reclaim the stale lock.
        running_pid=$(cat "${LOCK_FILE}/pid" 2>/dev/null || echo "")
        if [ -n "${running_pid}" ] && ! kill -0 "${running_pid}" 2>/dev/null; then
            echo "检测到残留锁(PID: ${running_pid} 已退出),回收锁文件"
            rm -rf "${LOCK_FILE}"
            continue
        fi
        # Owner still alive — give up after MAX_WAIT_SECONDS
        current_time=$(date +%s)
        if [ $((current_time - start_time)) -ge "${MAX_WAIT_SECONDS}" ]; then
            echo "等待超时(超过${MAX_WAIT_SECONDS}秒),其他实例仍在执行"
            return 1
        fi
        # Not timed out yet — report the holder and keep waiting
        echo "检测到其他任务正在执行(PID: ${running_pid:-unknown}),将在${WAIT_INTERVAL}秒后再次检查..."
        sleep "${WAIT_INTERVAL}"
    done
}
# Release the lock directory. Safe to call repeatedly: when the lock is
# already gone this is a silent no-op, so the EXIT trap and explicit
# release calls never conflict.
release_lock() {
    [ -d "${LOCK_FILE}" ] || return 0
    rm -rf "${LOCK_FILE}"
    echo "已释放锁文件"
}
# Exit trap: release the lock on any normal or abnormal script termination
trap release_lock EXIT
# ==============================================
# Argument parsing: select the schedule period (weekly / monthly)
# ==============================================
# Exactly one argument is required
if [ $# -ne 1 ]; then
echo "用法:$0 [--weekly|--monthly]"
exit 1
fi
PERIOD=$1
if [ "${PERIOD}" != "--weekly" ] && [ "${PERIOD}" != "--monthly" ]; then
echo "错误:参数必须是 --weekly 或 --monthly"
exit 1
fi
# Derive the common crawl begin-date from the period.
# NOTE(review): 8 days (not 7) and 31 days (not 30) — presumably one extra
# day of overlap to avoid gaps between consecutive runs; confirm intent.
# `date -d` is GNU coreutils syntax (not portable to BSD/macOS date).
if [ "${PERIOD}" = "--weekly" ]; then
COMMON_DATE_PARAM=$(date -d "8 days ago" +%Y-%m-%d)
elif [ "${PERIOD}" = "--monthly" ]; then
COMMON_DATE_PARAM=$(date -d "31 days ago" +%Y-%m-%d)
fi
# ==============================================
# Spider registry: tasks are registered per period type
# ==============================================
declare -a SPIDER_REGISTRY=()
# Append one task to the registry as a single "name|command" record.
# $1 - spider name (used for per-spider log naming)
# $2 - full shell command to execute for this spider
register_spider() {
    SPIDER_REGISTRY+=("$1|$2")
}
# Register the task list for the selected period. PERIOD has already been
# validated above to be exactly one of the two accepted flags.
case "${PERIOD}" in
    --weekly)
        # Weekly spiders: incremental crawls from the common begin date
        register_spider "u3c3" "scrapy crawl u3c3 -a begin=${COMMON_DATE_PARAM}"
        register_spider "sis" "scrapy crawl sis -a begin=${COMMON_DATE_PARAM}"
        register_spider "clm" "scrapy crawl clm -a begin=${COMMON_DATE_PARAM} -a mod='update' "
        ;;
    --monthly)
        # Monthly spiders: full crawl, no date filter
        register_spider "pbox" "scrapy crawl pbox "
        ;;
esac
# ==============================================
# Core execution helpers (logging + lock integration)
# ==============================================
current_time=$(date +"%Y%m%d")
main_log="${LOG_DIR}/cron_${PERIOD#--}_${current_time}.log"
# Print a timestamped message to stdout and append it to the main log file.
# $1 - message text
log() {
    local ts
    ts=$(date +"%Y-%m-%d %H:%M:%S")
    printf '[%s] %s\n' "$ts" "$1" | tee -a "${main_log}"
}
# Run one spider from inside the project directory, capturing all of its
# output into a dedicated per-spider log file.
# $1 - spider name; $2 - command string to execute
# Returns the spider command's exit code.
execute_spider() {
    local name="$1"
    local cmd="$2"
    local out_log rc
    log "===== 开始执行 ${name} ====="
    log "执行命令:${cmd}"
    out_log="${LOG_DIR}/${name}_${PERIOD#--}_${current_time}.log"
    # Subshell keeps the cd local to this invocation; eval is needed because
    # the command is stored as one string with embedded arguments
    (cd "${SCRAPY_PROJ_DIR}" && eval "${cmd}") > "${out_log}" 2>&1
    rc=$?
    if [ ${rc} -eq 0 ]; then
        log "${name} 执行成功(日志:${out_log}"
    else
        log "ERROR: ${name} 执行失败(日志:${out_log},退出码:${rc}"
    fi
    return ${rc}
}
# ==============================================
# 主流程(整合锁机制)
# ==============================================
log "===== 爬虫调度脚本启动(周期:${PERIOD#--} ====="
log "项目路径:${SCRAPY_PROJ_DIR}"
log "公共日期参数:${COMMON_DATE_PARAM}"
log "已注册Spider数量${#SPIDER_REGISTRY[@]}"
# 第一步:获取锁(若失败则退出)
log "尝试获取执行锁..."
if ! acquire_lock; then
log "ERROR: 无法获取执行锁,脚本终止"
exit 1
fi
log "成功获取执行锁,开始执行任务"
# 第二步:检查注册任务
if [ ${#SPIDER_REGISTRY[@]} -eq 0 ]; then
log "ERROR: 未注册任何${PERIOD#--}Spider脚本终止"
release_lock # 释放锁
exit 1
fi
# 第三步:执行所有任务
for spider_info in "${SPIDER_REGISTRY[@]}"; do
IFS="|" read -r spider_name execute_cmd <<< "${spider_info}"
execute_spider "${spider_name}" "${execute_cmd}"
last_exit_code=$?
# 可选:失败即终止(取消注释启用)
# if [ ${last_exit_code} -ne 0 ]; then
# log "ERROR: 因${spider_name}执行失败,终止后续执行"
# release_lock # 释放锁
# exit ${last_exit_code}
# fi
if [ "${spider_info}" != "${SPIDER_REGISTRY[-1]}" ]; then
log "等待${SLEEP_SECONDS}秒后执行下一个Spider..."
sleep ${SLEEP_SECONDS}
fi
done
# 第四步:执行完成,释放锁(脚本退出时也会自动释放)
log "===== 所有${PERIOD#--}Spider执行完毕 ====="
release_lock
exit 0

View File

@ -25,23 +25,6 @@ class SisDBHandler(SQLiteDBHandler):
def insert_item(self, item):
    # Insert-only upsert keyed on the unique 'url' column: rows whose url
    # already exists are left untouched (exists_do_nothing=True).
    self.insert_or_update_common(item, tbl_name=self.tbl_name_sis, uniq_key='url', exists_do_nothing=True)
def _create_tables(self):
    # Create the sis data table if it does not exist yet.
    # (Original comment called it "sis001" — presumably the source site name;
    # the actual table name comes from self.tbl_name_sis.)
    # Schema: autoincrement id, plate/title metadata, unique url, size as raw
    # text plus a numeric GB value, and local-time audit timestamps.
    self.cursor.execute(f'''
        CREATE TABLE IF NOT EXISTS {self.tbl_name_sis} (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        plate_name TEXT,
        title TEXT,
        url TEXT UNIQUE,
        size_text TEXT,
        size_gb REAL,
        update_date TEXT,
        created_at TEXT DEFAULT (datetime('now', 'localtime')),
        updated_at TEXT DEFAULT (datetime('now', 'localtime'))
        )
    ''')
    # Persist the DDL immediately
    self.conn.commit()
# 统计函数
def get_stat(self):
try:
@ -67,30 +50,11 @@ class SisDBHandler(SQLiteDBHandler):
class U3C3DBHandler(SQLiteDBHandler):
def __init__(self, db_path=default_dbpath):
    """Handler for the u3c3 table; connection setup is delegated to the base class.

    Args:
        db_path: path to the SQLite database file (defaults to the
            module-level default_dbpath).
    """
    super().__init__(db_path)
    # Removed the dead assignment `self.tbl_name_u3c3 = 'sis'` that was
    # immediately overwritten — 'u3c3' is the correct table name.
    self.tbl_name_u3c3 = 'u3c3'
def insert_item(self, item):
    # Insert-only upsert keyed on the unique 'url' column: rows whose url
    # already exists are left untouched (exists_do_nothing=True).
    self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url', exists_do_nothing=True)
def _create_tables(self):
    # Create the u3c3 data table if it does not exist yet.
    # (Original comment said "u001" — presumably a typo for u3c3; the actual
    # table name comes from self.tbl_name_u3c3.)
    # Schema: autoincrement id, category/title metadata, unique page url plus
    # torrent/magnet links, size as raw text and numeric GB, and local-time
    # audit timestamps.
    self.cursor.execute(f'''
        CREATE TABLE IF NOT EXISTS {self.tbl_name_u3c3} (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        category TEXT,
        title TEXT,
        url TEXT UNIQUE,
        torrent_url TEXT,
        magnet_url TEXT,
        size_text TEXT,
        size_gb REAL,
        update_date TEXT,
        created_at TEXT DEFAULT (datetime('now', 'localtime')),
        updated_at TEXT DEFAULT (datetime('now', 'localtime'))
        )
    ''')
    # Persist the DDL immediately
    self.conn.commit()
# 统计函数
def get_stat(self):
try: