modify scripts
This commit is contained in:
@ -6,7 +6,8 @@ from datetime import datetime
|
||||
from logging.handlers import RotatingFileHandler
|
||||
from collections import defaultdict
|
||||
|
||||
global_host_data_dir = '/root/hostdir/stock_data'
|
||||
home_dir = os.path.expanduser("~")
|
||||
global_host_data_dir = f'{home_dir}/hostdir/stock_data'
|
||||
|
||||
# 统计日志频率
|
||||
log_count = defaultdict(int) # 记录日志的次数
|
||||
|
||||
80
stockapp/reports_em/deploy.sh
Executable file
80
stockapp/reports_em/deploy.sh
Executable file
@ -0,0 +1,80 @@
|
||||
#!/bin/bash
|
||||
|
||||
# 远程服务器列表,按需修改
|
||||
SERVERS=("175.178.54.98" "1.12.218.143" "43.139.169.25" "129.204.180.174" "42.194.142.169")
|
||||
REMOTE_USER="ubuntu"
|
||||
REMOTE_SCRIPT_DIR="/home/ubuntu/pyscripts/stockapp/reports_em"
|
||||
DATA_DIR="/home/ubuntu/hostdir/stock_data/pdfs"
|
||||
LOCAL_FILES=("config.py" "em_reports.py" "fetch.py")
|
||||
|
||||
# 远程任务参数配置(每台服务器的不同参数)
|
||||
TASK_PARAMS=(
|
||||
"--mode=down --begin=2024-01-01 --end=2024-06-30"
|
||||
"--mode=down --begin=2023-01-01 --end=2023-06-30"
|
||||
"--mode=down --begin=2022-01-01 --end=2022-06-30"
|
||||
"--mode=down --begin=2021-01-01 --end=2021-06-30"
|
||||
"--mode=down --begin=2020-01-01 --end=2020-06-30"
|
||||
)
|
||||
|
||||
# 推送代码到所有服务器
|
||||
function push_code() {
|
||||
for SERVER in "${SERVERS[@]}"; do
|
||||
echo "Pushing code to $SERVER..."
|
||||
scp "${LOCAL_FILES[@]}" "$REMOTE_USER@$SERVER:$REMOTE_SCRIPT_DIR/"
|
||||
done
|
||||
}
|
||||
|
||||
# 启动任务
|
||||
function start_tasks() {
|
||||
for i in "${!SERVERS[@]}"; do
|
||||
SERVER="${SERVERS[$i]}"
|
||||
PARAMS="${TASK_PARAMS[$i]}"
|
||||
echo "Starting task on $SERVER with params: $PARAMS"
|
||||
#ssh "$REMOTE_USER@$SERVER" "cd $REMOTE_SCRIPT_DIR && nohup python3 ./fetch.py $PARAMS > ../log/nohup.log 2>&1 &"
|
||||
#ssh "$REMOTE_USER@$SERVER" "cd $REMOTE_SCRIPT_DIR && nohup python3 ./fetch.py $PARAMS > ../log/nohup.log 2>&1 < /dev/null & disown"
|
||||
# nohup ... < /dev/null:防止 nohup 等待 stdin,立即释放控制权。
|
||||
# & disown:确保进程与 SSH 彻底分离,避免 SIGHUP 信号影响,SSH 立即返回。
|
||||
# ssh -n:禁用 ssh 的 stdin,防止远程进程等待输入。
|
||||
ssh -n "$REMOTE_USER@$SERVER" "cd $REMOTE_SCRIPT_DIR && nohup python3 ./fetch.py $PARAMS > ../log/nohup.log 2>&1 < /dev/null & disown"
|
||||
done
|
||||
}
|
||||
|
||||
# 停止任务
|
||||
function stop_tasks() {
|
||||
for SERVER in "${SERVERS[@]}"; do
|
||||
echo "Stopping task on $SERVER..."
|
||||
ssh "$REMOTE_USER@$SERVER" "pkill -f 'python3 ./fetch.py'"
|
||||
done
|
||||
}
|
||||
|
||||
# 获取任务进度
|
||||
function check_progress() {
|
||||
for SERVER in "${SERVERS[@]}"; do
|
||||
echo -e "\nChecking progress on $SERVER..."
|
||||
FILE_COUNT=$(ssh "$REMOTE_USER@$SERVER" "ls -lRh $DATA_DIR | grep pdf | wc -l")
|
||||
FILE_SIZE=$(ssh "$REMOTE_USER@$SERVER" "du -sh $DATA_DIR")
|
||||
#PROCESS_COUNT=$(ssh "$REMOTE_USER@$SERVER" "pgrep -f 'python3 ./fetch.py' | wc -l")
|
||||
PROCESS_COUNT=$(ssh "$REMOTE_USER@$SERVER" "ps aux | grep '[f]etch.py' | wc -l")
|
||||
|
||||
if [ "$PROCESS_COUNT" -gt 0 ]; then
|
||||
echo "Process status: Running ($PROCESS_COUNT instances), if 2, include parent progress"
|
||||
else
|
||||
echo "Process status: Not running"
|
||||
fi
|
||||
|
||||
echo "Total files: $FILE_COUNT"
|
||||
echo "Total size : $FILE_SIZE"
|
||||
done
|
||||
}
|
||||
|
||||
# 脚本菜单
|
||||
case "$1" in
|
||||
push) push_code ;;
|
||||
start) start_tasks ;;
|
||||
stop) stop_tasks ;;
|
||||
check) check_progress ;;
|
||||
*)
|
||||
echo "Usage: $0 {push|start|stop|check}"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
@ -268,13 +268,34 @@ def fetch_pdf_link(url, max_retries = 3):
|
||||
|
||||
except requests.RequestException as e:
|
||||
logging.error(f"请求失败: {url} {e}")
|
||||
return None
|
||||
logging.error(f'Fetching failed after max retries. {url}')
|
||||
return None # 达到最大重试次数仍然失败
|
||||
|
||||
|
||||
def is_valid_pdf(file_path):
|
||||
try:
|
||||
with open(file_path, "rb") as f:
|
||||
header = f.read(4)
|
||||
return header == b"%PDF"
|
||||
except Exception as e:
|
||||
logging.error(f"验证 PDF 失败: {e}")
|
||||
return False
|
||||
|
||||
def download_pdf_wget(pdf_url, save_path):
|
||||
cmd = f'wget -O "{save_path}" "{pdf_url}" --quiet --user-agent="Mozilla/5.0"'
|
||||
os.system(cmd)
|
||||
return os.path.exists(save_path) and is_valid_pdf(save_path)
|
||||
|
||||
|
||||
# 下载 PDF 并保存到本地
|
||||
def download_pdf(pdf_url, save_path):
|
||||
def download_pdf(pdf_url, save_path, max_retries=5):
|
||||
for attempt in range(max_retries):
|
||||
down = download_pdf_wget(pdf_url, save_path)
|
||||
if down:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
|
||||
}
|
||||
|
||||
@ -18,7 +18,7 @@ config.setup_logging()
|
||||
|
||||
debug = False
|
||||
force = False
|
||||
pdf_base_dir = "/root/hostdir/stock_data/pdfs" # 下载 PDF 存放目录
|
||||
pdf_base_dir = f"{config.global_host_data_dir}/pdfs" # 下载 PDF 存放目录
|
||||
|
||||
|
||||
map_pdf_page = {
|
||||
@ -128,15 +128,18 @@ def download_pdf_stock_general(parse_func, tbl_name, querystr='', s_date=start_d
|
||||
if file_path and os.path.isfile(file_path):
|
||||
logging.info(f'{file_path} already exists. skipping...')
|
||||
continue
|
||||
# 获取pdf链接地址
|
||||
if url:
|
||||
pdf_url = em.fetch_pdf_link(url)
|
||||
|
||||
if pdf_url:
|
||||
# 下载 PDF
|
||||
down = em.download_pdf(pdf_url, file_path)
|
||||
if down:
|
||||
logging.info(f'saved file {file_path}')
|
||||
# 获取pdf链接地址
|
||||
pdf_url = em.fetch_pdf_link(url)
|
||||
if pdf_url:
|
||||
# 下载 PDF
|
||||
down = em.download_pdf(pdf_url, file_path)
|
||||
if down:
|
||||
logging.info(f'saved file {file_path}')
|
||||
else:
|
||||
logging.warning(f'download pdf file error. file_path: {pdf_url}, save_path: {file_path}')
|
||||
else:
|
||||
logging.warning(f'cannot get pdf link. url: {url}, save_path: {file_path}')
|
||||
|
||||
time.sleep(1) # 避免请求过快
|
||||
|
||||
|
||||
Reference in New Issue
Block a user