modify scripts
189  scrapy_proj/cron/cron_scheduler.sh  Normal file
@@ -0,0 +1,189 @@
#!/bin/bash

# ==============================================
# Configuration: settings split by period type
# ==============================================
# Supplement environment variables (cron provides a minimal PATH)
export PATH="/home/ubuntu/.local/bin:$PATH"

# Project base paths
SCRAPY_PROJ_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
LOG_DIR="${SCRAPY_PROJ_DIR}/log"
mkdir -p "${LOG_DIR}"  # Make sure the log directory exists (the lock lives in it)
SLEEP_SECONDS=60
# ==============================================
# Core: lock mechanism (prevents concurrent runs)
# ==============================================
# Lock path (kept under the log directory for easy management)
LOCK_FILE="${LOG_DIR:-/tmp}/cron_scheduler.lock"
# Maximum wait in seconds; size it to the actual task duration (example: 5 hours = 18000 s)
MAX_WAIT_SECONDS=18000
# Interval between lock checks, in seconds
WAIT_INTERVAL=60

# Whether this process owns the lock. Without this flag the EXIT trap
# of an instance that never got the lock would delete the lock held by
# the instance that is still running.
LOCK_ACQUIRED=0

# Acquire the lock: returns 0 on success, 1 on timeout
acquire_lock() {
    local start_time=$(date +%s)
    local current_time

    while true; do
        # mkdir is atomic: it succeeds only if the lock directory does not already exist
        if mkdir "${LOCK_FILE}" 2>/dev/null; then
            # Lock acquired; record our PID for troubleshooting
            echo $$ > "${LOCK_FILE}/pid"
            LOCK_ACQUIRED=1
            return 0
        fi

        # Lock already held; check for timeout
        current_time=$(date +%s)
        if [ $((current_time - start_time)) -ge ${MAX_WAIT_SECONDS} ]; then
            echo "Timed out (waited more than ${MAX_WAIT_SECONDS} seconds); another instance is still running"
            return 1
        fi

        # Not timed out yet; keep waiting
        local running_pid=$(cat "${LOCK_FILE}/pid" 2>/dev/null || echo "unknown")
        echo "Another task is running (PID: ${running_pid}); checking again in ${WAIT_INTERVAL} seconds..."
        sleep ${WAIT_INTERVAL}
    done
}

# Release the lock (only if this process owns it)
release_lock() {
    if [ "${LOCK_ACQUIRED}" -eq 1 ] && [ -d "${LOCK_FILE}" ]; then
        # Remove the lock directory
        rm -rf "${LOCK_FILE}"
        LOCK_ACQUIRED=0
        echo "Lock released"
    fi
}

# Exit trap: release the lock whether the script exits normally or not
trap release_lock EXIT
# ==============================================
# Argument parsing: weekly vs. monthly runs
# ==============================================
if [ $# -ne 1 ]; then
    echo "Usage: $0 [--weekly|--monthly]"
    exit 1
fi

PERIOD=$1
if [ "${PERIOD}" != "--weekly" ] && [ "${PERIOD}" != "--monthly" ]; then
    echo "Error: the argument must be --weekly or --monthly"
    exit 1
fi

# Set the date parameter for the chosen period
if [ "${PERIOD}" = "--weekly" ]; then
    COMMON_DATE_PARAM=$(date -d "8 days ago" +%Y-%m-%d)
elif [ "${PERIOD}" = "--monthly" ]; then
    COMMON_DATE_PARAM=$(date -d "31 days ago" +%Y-%m-%d)
fi
# ==============================================
# Spider registry: different tasks per period type
# ==============================================
declare -a SPIDER_REGISTRY=()
register_spider() {
    local spider_name="$1"
    local execute_cmd="$2"
    SPIDER_REGISTRY+=("${spider_name}|${execute_cmd}")
}

# Weekly tasks
if [ "${PERIOD}" = "--weekly" ]; then
    register_spider "u3c3" "scrapy crawl u3c3 -a begin=${COMMON_DATE_PARAM}"
    register_spider "sis" "scrapy crawl sis -a begin=${COMMON_DATE_PARAM}"
    register_spider "clm" "scrapy crawl clm -a begin=${COMMON_DATE_PARAM} -a mod='update'"
fi

# Monthly tasks
if [ "${PERIOD}" = "--monthly" ]; then
    register_spider "pbox" "scrapy crawl pbox"
fi
# ==============================================
# Core execution logic (shared helpers, integrated with the lock)
# ==============================================
current_time=$(date +"%Y%m%d")
main_log="${LOG_DIR}/cron_${PERIOD#--}_${current_time}.log"

log() {
    local msg="$1"
    local timestamp=$(date +"%Y-%m-%d %H:%M:%S")
    echo "[$timestamp] $msg" | tee -a "${main_log}"
}

execute_spider() {
    local spider_name="$1"
    local execute_cmd="$2"

    log "===== Starting ${spider_name} ====="
    log "Command: ${execute_cmd}"

    local spider_log="${LOG_DIR}/${spider_name}_${PERIOD#--}_${current_time}.log"

    (cd "${SCRAPY_PROJ_DIR}" && eval "${execute_cmd}") > "${spider_log}" 2>&1
    local exit_code=$?

    if [ ${exit_code} -eq 0 ]; then
        log "${spider_name} succeeded (log: ${spider_log})"
    else
        log "ERROR: ${spider_name} failed (log: ${spider_log}, exit code: ${exit_code})"
    fi
    return ${exit_code}
}
# ==============================================
# Main flow (with locking)
# ==============================================
log "===== Spider scheduler started (period: ${PERIOD#--}) ====="
log "Project path: ${SCRAPY_PROJ_DIR}"
log "Common date parameter: ${COMMON_DATE_PARAM}"
log "Registered spiders: ${#SPIDER_REGISTRY[@]}"

# Step 1: acquire the lock (abort on failure)
log "Trying to acquire the execution lock..."
if ! acquire_lock; then
    log "ERROR: could not acquire the execution lock; aborting"
    exit 1
fi
log "Execution lock acquired; starting tasks"

# Step 2: check that tasks are registered
if [ ${#SPIDER_REGISTRY[@]} -eq 0 ]; then
    log "ERROR: no ${PERIOD#--} spiders registered; aborting"
    release_lock
    exit 1
fi

# Step 3: run all registered tasks
for spider_info in "${SPIDER_REGISTRY[@]}"; do
    IFS="|" read -r spider_name execute_cmd <<< "${spider_info}"
    execute_spider "${spider_name}" "${execute_cmd}"
    last_exit_code=$?

    # Optional: stop on first failure (uncomment to enable)
    # if [ ${last_exit_code} -ne 0 ]; then
    #     log "ERROR: ${spider_name} failed; aborting the remaining spiders"
    #     release_lock
    #     exit ${last_exit_code}
    # fi

    if [ "${spider_info}" != "${SPIDER_REGISTRY[-1]}" ]; then
        log "Sleeping ${SLEEP_SECONDS} seconds before the next spider..."
        sleep ${SLEEP_SECONDS}
    fi
done

# Step 4: done; release the lock (the EXIT trap also covers abnormal exits)
log "===== All ${PERIOD#--} spiders finished ====="
release_lock
exit 0
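
The script above is meant to be driven by cron. A minimal sketch of matching crontab entries follows; the schedule times are illustrative, and the install path is an assumption based on the PATH export in the script:

    # Illustrative schedules; adjust to the real task windows.
    # Path assumed from the script's /home/ubuntu PATH export.
    0 2 * * 1  /home/ubuntu/scrapy_proj/cron/cron_scheduler.sh --weekly
    0 3 1 * *  /home/ubuntu/scrapy_proj/cron/cron_scheduler.sh --monthly

Because both entries share one lock, a monthly run that collides with a weekly run waits (up to MAX_WAIT_SECONDS) instead of executing concurrently.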
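The mkdir-based lock is portable, but the same serialization is available from util-linux flock(1), which the kernel releases automatically when the process dies, so no EXIT trap is needed. A minimal sketch, assuming flock is installed and using a hypothetical lock path:

    #!/bin/bash
    LOCK_PATH="/tmp/cron_scheduler.flock"    # hypothetical path

    exec 200>"${LOCK_PATH}"                  # open fd 200 on the lock file
    if ! flock --wait 18000 200; then        # wait up to 5 h, like MAX_WAIT_SECONDS
        echo "Timed out waiting for the lock; another instance is still running"
        exit 1
    fi
    # ...spider execution goes here; the lock drops automatically on exit...
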
@@ -25,23 +25,6 @@ class SisDBHandler(SQLiteDBHandler):

    def insert_item(self, item):
        self.insert_or_update_common(item, tbl_name=self.tbl_name_sis, uniq_key='url', exists_do_nothing=True)

    def _create_tables(self):
        # Create the sis001 data table
        self.cursor.execute(f'''
            CREATE TABLE IF NOT EXISTS {self.tbl_name_sis} (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                plate_name TEXT,
                title TEXT,
                url TEXT UNIQUE,
                size_text TEXT,
                size_gb REAL,
                update_date TEXT,
                created_at TEXT DEFAULT (datetime('now', 'localtime')),
                updated_at TEXT DEFAULT (datetime('now', 'localtime'))
            )
        ''')
        self.conn.commit()

    # Statistics helper
    def get_stat(self):
        try:
@@ -67,30 +50,11 @@ class SisDBHandler(SQLiteDBHandler):

class U3C3DBHandler(SQLiteDBHandler):
    def __init__(self, db_path=default_dbpath):
        super().__init__(db_path)
        self.tbl_name_u3c3 = 'sis'
        self.tbl_name_u3c3 = 'u3c3'

    def insert_item(self, item):
        self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url', exists_do_nothing=True)

    def _create_tables(self):
        # Create the u001 data table
        self.cursor.execute(f'''
            CREATE TABLE IF NOT EXISTS {self.tbl_name_u3c3} (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                category TEXT,
                title TEXT,
                url TEXT UNIQUE,
                torrent_url TEXT,
                magnet_url TEXT,
                size_text TEXT,
                size_gb REAL,
                update_date TEXT,
                created_at TEXT DEFAULT (datetime('now', 'localtime')),
                updated_at TEXT DEFAULT (datetime('now', 'localtime'))
            )
        ''')
        self.conn.commit()

    # Statistics helper
    def get_stat(self):
        try:
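
After a crawl, the tables these handlers manage can be inspected from the shell with the sqlite3 CLI. A quick sketch; the database path is hypothetical, so substitute whatever default_dbpath resolves to in this project:

    DB=/home/ubuntu/scrapy_proj/data/spiders.db   # hypothetical path

    sqlite3 "$DB" ".schema u3c3"                  # table definition
    sqlite3 "$DB" "SELECT COUNT(*) FROM u3c3;"    # row count after a crawl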