modify scripts
This commit is contained in:
@ -6,7 +6,8 @@ from datetime import datetime
|
|||||||
from logging.handlers import RotatingFileHandler
|
from logging.handlers import RotatingFileHandler
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
global_host_data_dir = '/root/hostdir/stock_data'
|
home_dir = os.path.expanduser("~")
|
||||||
|
global_host_data_dir = f'{home_dir}/hostdir/stock_data'
|
||||||
|
|
||||||
# 统计日志频率
|
# 统计日志频率
|
||||||
log_count = defaultdict(int) # 记录日志的次数
|
log_count = defaultdict(int) # 记录日志的次数
|
||||||
|
|||||||
80
stockapp/reports_em/deploy.sh
Executable file
80
stockapp/reports_em/deploy.sh
Executable file
@ -0,0 +1,80 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# 远程服务器列表,按需修改
|
||||||
|
SERVERS=("175.178.54.98" "1.12.218.143" "43.139.169.25" "129.204.180.174" "42.194.142.169")
|
||||||
|
REMOTE_USER="ubuntu"
|
||||||
|
REMOTE_SCRIPT_DIR="/home/ubuntu/pyscripts/stockapp/reports_em"
|
||||||
|
DATA_DIR="/home/ubuntu/hostdir/stock_data/pdfs"
|
||||||
|
LOCAL_FILES=("config.py" "em_reports.py" "fetch.py")
|
||||||
|
|
||||||
|
# 远程任务参数配置(每台服务器的不同参数)
|
||||||
|
TASK_PARAMS=(
|
||||||
|
"--mode=down --begin=2024-01-01 --end=2024-06-30"
|
||||||
|
"--mode=down --begin=2023-01-01 --end=2023-06-30"
|
||||||
|
"--mode=down --begin=2022-01-01 --end=2022-06-30"
|
||||||
|
"--mode=down --begin=2021-01-01 --end=2021-06-30"
|
||||||
|
"--mode=down --begin=2020-01-01 --end=2020-06-30"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 推送代码到所有服务器
|
||||||
|
function push_code() {
|
||||||
|
for SERVER in "${SERVERS[@]}"; do
|
||||||
|
echo "Pushing code to $SERVER..."
|
||||||
|
scp "${LOCAL_FILES[@]}" "$REMOTE_USER@$SERVER:$REMOTE_SCRIPT_DIR/"
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
# 启动任务
|
||||||
|
function start_tasks() {
|
||||||
|
for i in "${!SERVERS[@]}"; do
|
||||||
|
SERVER="${SERVERS[$i]}"
|
||||||
|
PARAMS="${TASK_PARAMS[$i]}"
|
||||||
|
echo "Starting task on $SERVER with params: $PARAMS"
|
||||||
|
#ssh "$REMOTE_USER@$SERVER" "cd $REMOTE_SCRIPT_DIR && nohup python3 ./fetch.py $PARAMS > ../log/nohup.log 2>&1 &"
|
||||||
|
#ssh "$REMOTE_USER@$SERVER" "cd $REMOTE_SCRIPT_DIR && nohup python3 ./fetch.py $PARAMS > ../log/nohup.log 2>&1 < /dev/null & disown"
|
||||||
|
# nohup ... < /dev/null:防止 nohup 等待 stdin,立即释放控制权。
|
||||||
|
# & disown:确保进程与 SSH 彻底分离,避免 SIGHUP 信号影响,SSH 立即返回。
|
||||||
|
# ssh -n:禁用 ssh 的 stdin,防止远程进程等待输入。
|
||||||
|
ssh -n "$REMOTE_USER@$SERVER" "cd $REMOTE_SCRIPT_DIR && nohup python3 ./fetch.py $PARAMS > ../log/nohup.log 2>&1 < /dev/null & disown"
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
# 停止任务
|
||||||
|
function stop_tasks() {
|
||||||
|
for SERVER in "${SERVERS[@]}"; do
|
||||||
|
echo "Stopping task on $SERVER..."
|
||||||
|
ssh "$REMOTE_USER@$SERVER" "pkill -f 'python3 ./fetch.py'"
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
# 获取任务进度
|
||||||
|
function check_progress() {
|
||||||
|
for SERVER in "${SERVERS[@]}"; do
|
||||||
|
echo -e "\nChecking progress on $SERVER..."
|
||||||
|
FILE_COUNT=$(ssh "$REMOTE_USER@$SERVER" "ls -lRh $DATA_DIR | grep pdf | wc -l")
|
||||||
|
FILE_SIZE=$(ssh "$REMOTE_USER@$SERVER" "du -sh $DATA_DIR")
|
||||||
|
#PROCESS_COUNT=$(ssh "$REMOTE_USER@$SERVER" "pgrep -f 'python3 ./fetch.py' | wc -l")
|
||||||
|
PROCESS_COUNT=$(ssh "$REMOTE_USER@$SERVER" "ps aux | grep '[f]etch.py' | wc -l")
|
||||||
|
|
||||||
|
if [ "$PROCESS_COUNT" -gt 0 ]; then
|
||||||
|
echo "Process status: Running ($PROCESS_COUNT instances), if 2, include parent progress"
|
||||||
|
else
|
||||||
|
echo "Process status: Not running"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Total files: $FILE_COUNT"
|
||||||
|
echo "Total size : $FILE_SIZE"
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
# 脚本菜单
|
||||||
|
case "$1" in
|
||||||
|
push) push_code ;;
|
||||||
|
start) start_tasks ;;
|
||||||
|
stop) stop_tasks ;;
|
||||||
|
check) check_progress ;;
|
||||||
|
*)
|
||||||
|
echo "Usage: $0 {push|start|stop|check}"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
@ -268,13 +268,34 @@ def fetch_pdf_link(url, max_retries = 3):
|
|||||||
|
|
||||||
except requests.RequestException as e:
|
except requests.RequestException as e:
|
||||||
logging.error(f"请求失败: {url} {e}")
|
logging.error(f"请求失败: {url} {e}")
|
||||||
return None
|
|
||||||
logging.error(f'Fetching failed after max retries. {url}')
|
logging.error(f'Fetching failed after max retries. {url}')
|
||||||
return None # 达到最大重试次数仍然失败
|
return None # 达到最大重试次数仍然失败
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid_pdf(file_path):
|
||||||
|
try:
|
||||||
|
with open(file_path, "rb") as f:
|
||||||
|
header = f.read(4)
|
||||||
|
return header == b"%PDF"
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"验证 PDF 失败: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def download_pdf_wget(pdf_url, save_path):
|
||||||
|
cmd = f'wget -O "{save_path}" "{pdf_url}" --quiet --user-agent="Mozilla/5.0"'
|
||||||
|
os.system(cmd)
|
||||||
|
return os.path.exists(save_path) and is_valid_pdf(save_path)
|
||||||
|
|
||||||
|
|
||||||
# 下载 PDF 并保存到本地
|
# 下载 PDF 并保存到本地
|
||||||
def download_pdf(pdf_url, save_path):
|
def download_pdf(pdf_url, save_path, max_retries=5):
|
||||||
|
for attempt in range(max_retries):
|
||||||
|
down = download_pdf_wget(pdf_url, save_path)
|
||||||
|
if down:
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
|
||||||
}
|
}
|
||||||
|
|||||||
@ -18,7 +18,7 @@ config.setup_logging()
|
|||||||
|
|
||||||
debug = False
|
debug = False
|
||||||
force = False
|
force = False
|
||||||
pdf_base_dir = "/root/hostdir/stock_data/pdfs" # 下载 PDF 存放目录
|
pdf_base_dir = f"{config.global_host_data_dir}/pdfs" # 下载 PDF 存放目录
|
||||||
|
|
||||||
|
|
||||||
map_pdf_page = {
|
map_pdf_page = {
|
||||||
@ -128,15 +128,18 @@ def download_pdf_stock_general(parse_func, tbl_name, querystr='', s_date=start_d
|
|||||||
if file_path and os.path.isfile(file_path):
|
if file_path and os.path.isfile(file_path):
|
||||||
logging.info(f'{file_path} already exists. skipping...')
|
logging.info(f'{file_path} already exists. skipping...')
|
||||||
continue
|
continue
|
||||||
# 获取pdf链接地址
|
|
||||||
if url:
|
|
||||||
pdf_url = em.fetch_pdf_link(url)
|
|
||||||
|
|
||||||
if pdf_url:
|
# 获取pdf链接地址
|
||||||
# 下载 PDF
|
pdf_url = em.fetch_pdf_link(url)
|
||||||
down = em.download_pdf(pdf_url, file_path)
|
if pdf_url:
|
||||||
if down:
|
# 下载 PDF
|
||||||
logging.info(f'saved file {file_path}')
|
down = em.download_pdf(pdf_url, file_path)
|
||||||
|
if down:
|
||||||
|
logging.info(f'saved file {file_path}')
|
||||||
|
else:
|
||||||
|
logging.warning(f'download pdf file error. file_path: {pdf_url}, save_path: {file_path}')
|
||||||
|
else:
|
||||||
|
logging.warning(f'cannot get pdf link. url: {url}, save_path: {file_path}')
|
||||||
|
|
||||||
time.sleep(1) # 避免请求过快
|
time.sleep(1) # 避免请求过快
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user