From 26cd6b52ca3c0ac1b32ca7f8ab189c1a38a3871c Mon Sep 17 00:00:00 2001 From: oscarz Date: Sun, 13 Apr 2025 16:51:58 +0800 Subject: [PATCH] modify scripts --- commit.sh | 11 +++--- src/db_utils/reports.py | 13 ++++++++ src/em_reports/fetch.py | 59 ++++++++++++++++++++++++++++++--- src/static/akshare_get_index.py | 8 ++++- 4 files changed, 80 insertions(+), 11 deletions(-) diff --git a/commit.sh b/commit.sh index 6a5de8f..a92c570 100755 --- a/commit.sh +++ b/commit.sh @@ -14,11 +14,12 @@ commit_msg="$1" # 如果没有提供 commit message,提示用户输入 if [ -z "$commit_msg" ]; then - read -p "请输入 commit message: " commit_msg - if [ -z "$commit_msg" ]; then - echo "❌ 提交信息不能为空!" - exit 1 - fi + commit_msg="modify scripts" + #read -p "请输入 commit message: " commit_msg + #if [ -z "$commit_msg" ]; then + # echo "❌ 提交信息不能为空!" + # exit 1 + #fi fi # 添加所有更改 diff --git a/src/db_utils/reports.py b/src/db_utils/reports.py index 783b771..4d3f9cc 100644 --- a/src/db_utils/reports.py +++ b/src/db_utils/reports.py @@ -89,6 +89,19 @@ class StockReportDB: logging.error(f"Error inserting or updating data: {e}") return None + def update_pages(self, data, tbl_name, uniq_key='infoCode'): + try: + sql = f''' + update {tbl_name} SET attachPages={data['attachPages']} where id={data['id']} + ''' + self.cursor.execute(sql) + self.conn.commit() + + return data['id'] + except sqlite3.Error as e: + logging.error(f"Error inserting or updating data: {e}") + return None + def query_reports_comm(self, tbl_name, querystr='', limit=None): try: if tbl_name in [StockReportDB.TBL_STOCK, StockReportDB.TBL_NEW_STOCK, StockReportDB.TBL_INDUSTRY, StockReportDB.TBL_MACRESEARCH, StockReportDB.TBL_STRATEGY]: diff --git a/src/em_reports/fetch.py b/src/em_reports/fetch.py index d5a1f31..c06ea52 100644 --- a/src/em_reports/fetch.py +++ b/src/em_reports/fetch.py @@ -14,6 +14,7 @@ import src.utils.utils as utils from src.config.config import global_host_data_dir, global_share_db_dir from src.db_utils.reports import StockReportDB, DatabaseConnectionError from src.logger.logger import setup_logging +import PyPDF2 # 初始化日志 setup_logging() @@ -52,6 +53,19 @@ start_date = two_years_ago.strftime("%Y-%m-%d") end_date = current_date.strftime("%Y-%m-%d") this_week_date = seven_days_ago.strftime("%Y-%m-%d") +min_down_pages = 10 +def get_pdf_page_count(pdf_path): + try: + # 以二进制只读模式打开 PDF 文件 + with open(pdf_path, 'rb') as file: + # 创建一个 PdfReader 对象 + pdf_reader = PyPDF2.PdfReader(file) + # 获取 PDF 文件的页数 + page_count = len(pdf_reader.pages) + return page_count + except Exception as e: + logging.warning(f"处理文件 {pdf_path} 时出错: {e}") + return None def fetch_reports_list_general(fetch_func, table_name, s_date, e_date, data_dir_prefix): # 示例:获取前 3 页的数据 @@ -111,18 +125,49 @@ def parse_func_general(row, tbl_name): url = url.format(info_code) # 拼目录 dir_year = publish_date[:4] if len(publish_date)>4 else '' - dir_path = f'{pdf_base_dir}/{dir_year}/{map_tbl_name[tbl_name]}' + #dir_path = f'{pdf_base_dir}/{dir_year}/{map_tbl_name[tbl_name]}' + dir_path = f'{pdf_base_dir}/{dir_year}' os.makedirs(dir_path, exist_ok=True) return url, os.path.join(dir_path, file_name) +# 检查pdf的页数,如果小于限定的值,则移动到其他目录 +def check_pdf_pages(file_path, row, tbl_name): + pages = get_pdf_page_count(file_path) + if pages is None or pages < min_down_pages: + # 获取文件所在目录 + file_dir = os.path.dirname(file_path) + # 创建 tmp 子目录 + tmp_dir = os.path.join(file_dir, 'tmp') + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + # 移动文件到 tmp 子目录 + file_name = os.path.basename(file_path) + new_path = os.path.join(tmp_dir, file_name) + shutil.move(file_path, new_path) + logging.debug(f"move {file_name} to {tmp_dir}") + + # macro 和 stra 表,需要更新页码回去 + if tbl_name == StockReportDB.TBL_MACRESEARCH or tbl_name == StockReportDB.TBL_STRATEGY: + data={} + data['infoCode'] = row['infoCode'] + data['id'] = row['id'] + data['attachPages'] = pages + row_id = db_tools.update_pages(data, tbl_name) + if row_id: + logging.debug(f"update one row. tbl: {tbl_name}, rowid:{row_id}") + else: + logging.warning(f"update data failed. tbl: {tbl_name}, rowid:{row['id']}") + return False # 通用下载函数 -def download_pdf_stock_general(parse_func, tbl_name, querystr='', s_date=start_date, e_date=end_date, limit=None): +def download_pdf_stock_general(parse_func, tbl_name, querystr='', s_date=start_date, e_date=end_date, limit=None, min_page=None): # 下载pdf if s_date: querystr += f" AND publishDate >= '{s_date} 00:00:00.000' " if e_date: querystr += f" AND publishDate <= '{e_date} 23:59:59.999' " + if min_page: + querystr += f" AND attachPages >= {min_page} " rows = db_tools.query_reports_comm(tbl_name, querystr=querystr, limit=limit) if rows is None: @@ -145,6 +190,7 @@ def download_pdf_stock_general(parse_func, tbl_name, querystr='', s_date=start_d down = em.download_pdf(pdf_url, file_path) if down: logging.info(f'saved file {file_path}') + check_pdf_pages(file_path, row, tbl_name) else: logging.warning(f'download pdf file error. file_path: {pdf_url}, save_path: {file_path}') else: @@ -176,13 +222,13 @@ def fetch_reports_list_strategy(s_date=start_date, e_date=end_date): # 下载股票pdf def download_pdf_stock(s_date=start_date, e_date=end_date): - download_pdf_stock_general(parse_func_general, StockReportDB.TBL_STOCK, ' ', s_date, e_date, limit=2 if debug else None) + download_pdf_stock_general(parse_func_general, StockReportDB.TBL_STOCK, ' ', s_date, e_date, limit=2 if debug else None, min_page=min_down_pages) def download_pdf_newstock(s_date=start_date, e_date=end_date): - download_pdf_stock_general(parse_func_general, StockReportDB.TBL_NEW_STOCK, ' ', s_date, e_date, limit=2 if debug else None) + download_pdf_stock_general(parse_func_general, StockReportDB.TBL_NEW_STOCK, ' ', s_date, e_date, limit=2 if debug else None, min_page=min_down_pages) def download_pdf_industry(s_date=start_date, e_date=end_date): - download_pdf_stock_general(parse_func_general, StockReportDB.TBL_INDUSTRY, ' ', s_date, e_date, limit=2 if debug else None) + download_pdf_stock_general(parse_func_general, StockReportDB.TBL_INDUSTRY, ' ', s_date, e_date, limit=2 if debug else None, min_page=min_down_pages) def download_pdf_macresearch(s_date=start_date, e_date=end_date): download_pdf_stock_general(parse_func_general, StockReportDB.TBL_MACRESEARCH, ' ', s_date, e_date, limit=2 if debug else None) @@ -263,6 +309,9 @@ def run_func(function_names, function_map): def main(cmd, mode, args_debug, args_force, begin, end): global debug debug = args_debug + if debug: + logger = logging.getLogger() + logger.setLevel(logging.DEBUG) global force force = args_force diff --git a/src/static/akshare_get_index.py b/src/static/akshare_get_index.py index 6478bfa..5c00248 100644 --- a/src/static/akshare_get_index.py +++ b/src/static/akshare_get_index.py @@ -56,13 +56,19 @@ def get_stock_by_fs(): save_to_csv(df, config.global_host_input_dir, f"em_index_all.csv") return df +# 东财-概念板块成分股 +def get_stock_by_xx(sy): + df = ak.stock_board_concept_cons_em(symbol=sy) + save_to_csv(df, config.global_host_input_dir, f"em_{sy}.csv") + def refresh_main_index(): #get_csindex(symbol='000300') #get_csindex(symbol='000510') #get_sina_index_stock(symbol='000300') #get_ah_stock() #get_hk_ggt_components() - get_stock_by_fs() + #get_stock_by_fs() + get_stock_by_xx('半导体概念')