modify scripts

This commit is contained in:
2025-03-15 17:39:25 +08:00
parent 3b76c00500
commit 7c0cf12704
4 changed files with 117 additions and 12 deletions

View File

@ -6,7 +6,8 @@ from datetime import datetime
from logging.handlers import RotatingFileHandler
from collections import defaultdict
global_host_data_dir = '/root/hostdir/stock_data'
home_dir = os.path.expanduser("~")
global_host_data_dir = f'{home_dir}/hostdir/stock_data'
# 统计日志频率
log_count = defaultdict(int) # 记录日志的次数

80
stockapp/reports_em/deploy.sh Executable file
View File

@ -0,0 +1,80 @@
#!/bin/bash
# 远程服务器列表,按需修改
SERVERS=("175.178.54.98" "1.12.218.143" "43.139.169.25" "129.204.180.174" "42.194.142.169")
REMOTE_USER="ubuntu"
REMOTE_SCRIPT_DIR="/home/ubuntu/pyscripts/stockapp/reports_em"
DATA_DIR="/home/ubuntu/hostdir/stock_data/pdfs"
LOCAL_FILES=("config.py" "em_reports.py" "fetch.py")
# 远程任务参数配置(每台服务器的不同参数)
TASK_PARAMS=(
"--mode=down --begin=2024-01-01 --end=2024-06-30"
"--mode=down --begin=2023-01-01 --end=2023-06-30"
"--mode=down --begin=2022-01-01 --end=2022-06-30"
"--mode=down --begin=2021-01-01 --end=2021-06-30"
"--mode=down --begin=2020-01-01 --end=2020-06-30"
)
# 推送代码到所有服务器
function push_code() {
for SERVER in "${SERVERS[@]}"; do
echo "Pushing code to $SERVER..."
scp "${LOCAL_FILES[@]}" "$REMOTE_USER@$SERVER:$REMOTE_SCRIPT_DIR/"
done
}
# 启动任务
function start_tasks() {
for i in "${!SERVERS[@]}"; do
SERVER="${SERVERS[$i]}"
PARAMS="${TASK_PARAMS[$i]}"
echo "Starting task on $SERVER with params: $PARAMS"
#ssh "$REMOTE_USER@$SERVER" "cd $REMOTE_SCRIPT_DIR && nohup python3 ./fetch.py $PARAMS > ../log/nohup.log 2>&1 &"
#ssh "$REMOTE_USER@$SERVER" "cd $REMOTE_SCRIPT_DIR && nohup python3 ./fetch.py $PARAMS > ../log/nohup.log 2>&1 < /dev/null & disown"
# nohup ... < /dev/null防止 nohup 等待 stdin立即释放控制权。
# & disown确保进程与 SSH 彻底分离,避免 SIGHUP 信号影响SSH 立即返回。
# ssh -n禁用 ssh 的 stdin防止远程进程等待输入。
ssh -n "$REMOTE_USER@$SERVER" "cd $REMOTE_SCRIPT_DIR && nohup python3 ./fetch.py $PARAMS > ../log/nohup.log 2>&1 < /dev/null & disown"
done
}
# 停止任务
function stop_tasks() {
for SERVER in "${SERVERS[@]}"; do
echo "Stopping task on $SERVER..."
ssh "$REMOTE_USER@$SERVER" "pkill -f 'python3 ./fetch.py'"
done
}
# 获取任务进度
function check_progress() {
for SERVER in "${SERVERS[@]}"; do
echo -e "\nChecking progress on $SERVER..."
FILE_COUNT=$(ssh "$REMOTE_USER@$SERVER" "ls -lRh $DATA_DIR | grep pdf | wc -l")
FILE_SIZE=$(ssh "$REMOTE_USER@$SERVER" "du -sh $DATA_DIR")
#PROCESS_COUNT=$(ssh "$REMOTE_USER@$SERVER" "pgrep -f 'python3 ./fetch.py' | wc -l")
PROCESS_COUNT=$(ssh "$REMOTE_USER@$SERVER" "ps aux | grep '[f]etch.py' | wc -l")
if [ "$PROCESS_COUNT" -gt 0 ]; then
echo "Process status: Running ($PROCESS_COUNT instances), if 2, include parent progress"
else
echo "Process status: Not running"
fi
echo "Total files: $FILE_COUNT"
echo "Total size : $FILE_SIZE"
done
}
# 脚本菜单
case "$1" in
push) push_code ;;
start) start_tasks ;;
stop) stop_tasks ;;
check) check_progress ;;
*)
echo "Usage: $0 {push|start|stop|check}"
exit 1
;;
esac

View File

@ -268,13 +268,34 @@ def fetch_pdf_link(url, max_retries = 3):
except requests.RequestException as e:
logging.error(f"请求失败: {url} {e}")
return None
logging.error(f'Fetching failed after max retries. {url}')
return None # 达到最大重试次数仍然失败
def is_valid_pdf(file_path):
try:
with open(file_path, "rb") as f:
header = f.read(4)
return header == b"%PDF"
except Exception as e:
logging.error(f"验证 PDF 失败: {e}")
return False
def download_pdf_wget(pdf_url, save_path):
cmd = f'wget -O "{save_path}" "{pdf_url}" --quiet --user-agent="Mozilla/5.0"'
os.system(cmd)
return os.path.exists(save_path) and is_valid_pdf(save_path)
# 下载 PDF 并保存到本地
def download_pdf(pdf_url, save_path):
def download_pdf(pdf_url, save_path, max_retries=5):
for attempt in range(max_retries):
down = download_pdf_wget(pdf_url, save_path)
if down:
return True
return False
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
}

View File

@ -18,7 +18,7 @@ config.setup_logging()
debug = False
force = False
pdf_base_dir = "/root/hostdir/stock_data/pdfs" # 下载 PDF 存放目录
pdf_base_dir = f"{config.global_host_data_dir}/pdfs" # 下载 PDF 存放目录
map_pdf_page = {
@ -128,15 +128,18 @@ def download_pdf_stock_general(parse_func, tbl_name, querystr='', s_date=start_d
if file_path and os.path.isfile(file_path):
logging.info(f'{file_path} already exists. skipping...')
continue
# 获取pdf链接地址
if url:
pdf_url = em.fetch_pdf_link(url)
if pdf_url:
# 下载 PDF
down = em.download_pdf(pdf_url, file_path)
if down:
logging.info(f'saved file {file_path}')
# 获取pdf链接地址
pdf_url = em.fetch_pdf_link(url)
if pdf_url:
# 下载 PDF
down = em.download_pdf(pdf_url, file_path)
if down:
logging.info(f'saved file {file_path}')
else:
logging.warning(f'download pdf file error. file_path: {pdf_url}, save_path: {file_path}')
else:
logging.warning(f'cannot get pdf link. url: {url}, save_path: {file_path}')
time.sleep(1) # 避免请求过快