From 7c0cf12704b121894449b0d6484385e8870b61b3 Mon Sep 17 00:00:00 2001 From: oscar Date: Sat, 15 Mar 2025 17:39:25 +0800 Subject: [PATCH] modify scripts --- stockapp/reports_em/config.py | 3 +- stockapp/reports_em/deploy.sh | 80 +++++++++++++++++++++++++++++++ stockapp/reports_em/em_reports.py | 25 +++++++++- stockapp/reports_em/fetch.py | 21 ++++---- 4 files changed, 117 insertions(+), 12 deletions(-) create mode 100755 stockapp/reports_em/deploy.sh diff --git a/stockapp/reports_em/config.py b/stockapp/reports_em/config.py index 4e9c1a8..f22fa38 100644 --- a/stockapp/reports_em/config.py +++ b/stockapp/reports_em/config.py @@ -6,7 +6,8 @@ from datetime import datetime from logging.handlers import RotatingFileHandler from collections import defaultdict -global_host_data_dir = '/root/hostdir/stock_data' +home_dir = os.path.expanduser("~") +global_host_data_dir = f'{home_dir}/hostdir/stock_data' # 统计日志频率 log_count = defaultdict(int) # 记录日志的次数 diff --git a/stockapp/reports_em/deploy.sh b/stockapp/reports_em/deploy.sh new file mode 100755 index 0000000..e78fd9a --- /dev/null +++ b/stockapp/reports_em/deploy.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# 远程服务器列表,按需修改 +SERVERS=("175.178.54.98" "1.12.218.143" "43.139.169.25" "129.204.180.174" "42.194.142.169") +REMOTE_USER="ubuntu" +REMOTE_SCRIPT_DIR="/home/ubuntu/pyscripts/stockapp/reports_em" +DATA_DIR="/home/ubuntu/hostdir/stock_data/pdfs" +LOCAL_FILES=("config.py" "em_reports.py" "fetch.py") + +# 远程任务参数配置(每台服务器的不同参数) +TASK_PARAMS=( + "--mode=down --begin=2024-01-01 --end=2024-06-30" + "--mode=down --begin=2023-01-01 --end=2023-06-30" + "--mode=down --begin=2022-01-01 --end=2022-06-30" + "--mode=down --begin=2021-01-01 --end=2021-06-30" + "--mode=down --begin=2020-01-01 --end=2020-06-30" +) + +# 推送代码到所有服务器 +function push_code() { + for SERVER in "${SERVERS[@]}"; do + echo "Pushing code to $SERVER..." + scp "${LOCAL_FILES[@]}" "$REMOTE_USER@$SERVER:$REMOTE_SCRIPT_DIR/" + done +} + +# 启动任务 +function start_tasks() { + for i in "${!SERVERS[@]}"; do + SERVER="${SERVERS[$i]}" + PARAMS="${TASK_PARAMS[$i]}" + echo "Starting task on $SERVER with params: $PARAMS" + #ssh "$REMOTE_USER@$SERVER" "cd $REMOTE_SCRIPT_DIR && nohup python3 ./fetch.py $PARAMS > ../log/nohup.log 2>&1 &" + #ssh "$REMOTE_USER@$SERVER" "cd $REMOTE_SCRIPT_DIR && nohup python3 ./fetch.py $PARAMS > ../log/nohup.log 2>&1 < /dev/null & disown" + # nohup ... < /dev/null:防止 nohup 等待 stdin,立即释放控制权。 + # & disown:确保进程与 SSH 彻底分离,避免 SIGHUP 信号影响,SSH 立即返回。 + # ssh -n:禁用 ssh 的 stdin,防止远程进程等待输入。 + ssh -n "$REMOTE_USER@$SERVER" "cd $REMOTE_SCRIPT_DIR && nohup python3 ./fetch.py $PARAMS > ../log/nohup.log 2>&1 < /dev/null & disown" + done +} + +# 停止任务 +function stop_tasks() { + for SERVER in "${SERVERS[@]}"; do + echo "Stopping task on $SERVER..." + ssh "$REMOTE_USER@$SERVER" "pkill -f 'python3 ./fetch.py'" + done +} + +# 获取任务进度 +function check_progress() { + for SERVER in "${SERVERS[@]}"; do + echo -e "\nChecking progress on $SERVER..." + FILE_COUNT=$(ssh "$REMOTE_USER@$SERVER" "ls -lRh $DATA_DIR | grep pdf | wc -l") + FILE_SIZE=$(ssh "$REMOTE_USER@$SERVER" "du -sh $DATA_DIR") + #PROCESS_COUNT=$(ssh "$REMOTE_USER@$SERVER" "pgrep -f 'python3 ./fetch.py' | wc -l") + PROCESS_COUNT=$(ssh "$REMOTE_USER@$SERVER" "ps aux | grep '[f]etch.py' | wc -l") + + if [ "$PROCESS_COUNT" -gt 0 ]; then + echo "Process status: Running ($PROCESS_COUNT instances), if 2, include parent progress" + else + echo "Process status: Not running" + fi + + echo "Total files: $FILE_COUNT" + echo "Total size : $FILE_SIZE" + done +} + +# 脚本菜单 +case "$1" in + push) push_code ;; + start) start_tasks ;; + stop) stop_tasks ;; + check) check_progress ;; + *) + echo "Usage: $0 {push|start|stop|check}" + exit 1 + ;; +esac diff --git a/stockapp/reports_em/em_reports.py b/stockapp/reports_em/em_reports.py index 6538eb9..390f6a7 100644 --- a/stockapp/reports_em/em_reports.py +++ b/stockapp/reports_em/em_reports.py @@ -268,13 +268,34 @@ def fetch_pdf_link(url, max_retries = 3): except requests.RequestException as e: logging.error(f"请求失败: {url} {e}") - return None logging.error(f'Fetching failed after max retries. {url}') return None # 达到最大重试次数仍然失败 +def is_valid_pdf(file_path): + try: + with open(file_path, "rb") as f: + header = f.read(4) + return header == b"%PDF" + except Exception as e: + logging.error(f"验证 PDF 失败: {e}") + return False + +def download_pdf_wget(pdf_url, save_path): + cmd = f'wget -O "{save_path}" "{pdf_url}" --quiet --user-agent="Mozilla/5.0"' + os.system(cmd) + return os.path.exists(save_path) and is_valid_pdf(save_path) + + # 下载 PDF 并保存到本地 -def download_pdf(pdf_url, save_path): +def download_pdf(pdf_url, save_path, max_retries=5): + for attempt in range(max_retries): + down = download_pdf_wget(pdf_url, save_path) + if down: + return True + + return False + headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36" } diff --git a/stockapp/reports_em/fetch.py b/stockapp/reports_em/fetch.py index 00f1572..55e698d 100644 --- a/stockapp/reports_em/fetch.py +++ b/stockapp/reports_em/fetch.py @@ -18,7 +18,7 @@ config.setup_logging() debug = False force = False -pdf_base_dir = "/root/hostdir/stock_data/pdfs" # 下载 PDF 存放目录 +pdf_base_dir = f"{config.global_host_data_dir}/pdfs" # 下载 PDF 存放目录 map_pdf_page = { @@ -128,15 +128,18 @@ def download_pdf_stock_general(parse_func, tbl_name, querystr='', s_date=start_d if file_path and os.path.isfile(file_path): logging.info(f'{file_path} already exists. skipping...') continue - # 获取pdf链接地址 - if url: - pdf_url = em.fetch_pdf_link(url) - if pdf_url: - # 下载 PDF - down = em.download_pdf(pdf_url, file_path) - if down: - logging.info(f'saved file {file_path}') + # 获取pdf链接地址 + pdf_url = em.fetch_pdf_link(url) + if pdf_url: + # 下载 PDF + down = em.download_pdf(pdf_url, file_path) + if down: + logging.info(f'saved file {file_path}') + else: + logging.warning(f'download pdf file error. file_path: {pdf_url}, save_path: {file_path}') + else: + logging.warning(f'cannot get pdf link. url: {url}, save_path: {file_path}') time.sleep(1) # 避免请求过快