modify scripts
This commit is contained in:
11
commit.sh
11
commit.sh
@ -14,11 +14,12 @@ commit_msg="$1"
|
||||
|
||||
# 如果没有提供 commit message,提示用户输入
|
||||
if [ -z "$commit_msg" ]; then
|
||||
read -p "请输入 commit message: " commit_msg
|
||||
if [ -z "$commit_msg" ]; then
|
||||
echo "❌ 提交信息不能为空!"
|
||||
exit 1
|
||||
fi
|
||||
commit_msg="modify scripts"
|
||||
#read -p "请输入 commit message: " commit_msg
|
||||
#if [ -z "$commit_msg" ]; then
|
||||
# echo "❌ 提交信息不能为空!"
|
||||
# exit 1
|
||||
#fi
|
||||
fi
|
||||
|
||||
# 添加所有更改
|
||||
|
||||
@ -89,6 +89,19 @@ class StockReportDB:
|
||||
logging.error(f"Error inserting or updating data: {e}")
|
||||
return None
|
||||
|
||||
def update_pages(self, data, tbl_name, uniq_key='infoCode'):
|
||||
try:
|
||||
sql = f'''
|
||||
update {tbl_name} SET attachPages={data['attachPages']} where id={data['id']}
|
||||
'''
|
||||
self.cursor.execute(sql)
|
||||
self.conn.commit()
|
||||
|
||||
return data['id']
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"Error inserting or updating data: {e}")
|
||||
return None
|
||||
|
||||
def query_reports_comm(self, tbl_name, querystr='', limit=None):
|
||||
try:
|
||||
if tbl_name in [StockReportDB.TBL_STOCK, StockReportDB.TBL_NEW_STOCK, StockReportDB.TBL_INDUSTRY, StockReportDB.TBL_MACRESEARCH, StockReportDB.TBL_STRATEGY]:
|
||||
|
||||
@ -14,6 +14,7 @@ import src.utils.utils as utils
|
||||
from src.config.config import global_host_data_dir, global_share_db_dir
|
||||
from src.db_utils.reports import StockReportDB, DatabaseConnectionError
|
||||
from src.logger.logger import setup_logging
|
||||
import PyPDF2
|
||||
|
||||
# 初始化日志
|
||||
setup_logging()
|
||||
@ -52,6 +53,19 @@ start_date = two_years_ago.strftime("%Y-%m-%d")
|
||||
end_date = current_date.strftime("%Y-%m-%d")
|
||||
this_week_date = seven_days_ago.strftime("%Y-%m-%d")
|
||||
|
||||
min_down_pages = 10
|
||||
def get_pdf_page_count(pdf_path):
|
||||
try:
|
||||
# 以二进制只读模式打开 PDF 文件
|
||||
with open(pdf_path, 'rb') as file:
|
||||
# 创建一个 PdfReader 对象
|
||||
pdf_reader = PyPDF2.PdfReader(file)
|
||||
# 获取 PDF 文件的页数
|
||||
page_count = len(pdf_reader.pages)
|
||||
return page_count
|
||||
except Exception as e:
|
||||
logging.warning(f"处理文件 {pdf_path} 时出错: {e}")
|
||||
return None
|
||||
|
||||
def fetch_reports_list_general(fetch_func, table_name, s_date, e_date, data_dir_prefix):
|
||||
# 示例:获取前 3 页的数据
|
||||
@ -111,18 +125,49 @@ def parse_func_general(row, tbl_name):
|
||||
url = url.format(info_code)
|
||||
# 拼目录
|
||||
dir_year = publish_date[:4] if len(publish_date)>4 else ''
|
||||
dir_path = f'{pdf_base_dir}/{dir_year}/{map_tbl_name[tbl_name]}'
|
||||
#dir_path = f'{pdf_base_dir}/{dir_year}/{map_tbl_name[tbl_name]}'
|
||||
dir_path = f'{pdf_base_dir}/{dir_year}'
|
||||
os.makedirs(dir_path, exist_ok=True)
|
||||
return url, os.path.join(dir_path, file_name)
|
||||
|
||||
# 检查pdf的页数,如果小于限定的值,则移动到其他目录
|
||||
def check_pdf_pages(file_path, row, tbl_name):
|
||||
pages = get_pdf_page_count(file_path)
|
||||
if pages is None or pages < min_down_pages:
|
||||
# 获取文件所在目录
|
||||
file_dir = os.path.dirname(file_path)
|
||||
# 创建 tmp 子目录
|
||||
tmp_dir = os.path.join(file_dir, 'tmp')
|
||||
if not os.path.exists(tmp_dir):
|
||||
os.makedirs(tmp_dir)
|
||||
# 移动文件到 tmp 子目录
|
||||
file_name = os.path.basename(file_path)
|
||||
new_path = os.path.join(tmp_dir, file_name)
|
||||
shutil.move(file_path, new_path)
|
||||
logging.debug(f"move {file_name} to {tmp_dir}")
|
||||
|
||||
# macro 和 stra 表,需要更新页码回去
|
||||
if tbl_name == StockReportDB.TBL_MACRESEARCH or tbl_name == StockReportDB.TBL_STRATEGY:
|
||||
data={}
|
||||
data['infoCode'] = row['infoCode']
|
||||
data['id'] = row['id']
|
||||
data['attachPages'] = pages
|
||||
row_id = db_tools.update_pages(data, tbl_name)
|
||||
if row_id:
|
||||
logging.debug(f"update one row. tbl: {tbl_name}, rowid:{row_id}")
|
||||
else:
|
||||
logging.warning(f"update data failed. tbl: {tbl_name}, rowid:{row['id']}")
|
||||
return False
|
||||
|
||||
# 通用下载函数
|
||||
def download_pdf_stock_general(parse_func, tbl_name, querystr='', s_date=start_date, e_date=end_date, limit=None):
|
||||
def download_pdf_stock_general(parse_func, tbl_name, querystr='', s_date=start_date, e_date=end_date, limit=None, min_page=None):
|
||||
# 下载pdf
|
||||
if s_date:
|
||||
querystr += f" AND publishDate >= '{s_date} 00:00:00.000' "
|
||||
if e_date:
|
||||
querystr += f" AND publishDate <= '{e_date} 23:59:59.999' "
|
||||
if min_page:
|
||||
querystr += f" AND attachPages >= {min_page} "
|
||||
|
||||
rows = db_tools.query_reports_comm(tbl_name, querystr=querystr, limit=limit)
|
||||
if rows is None:
|
||||
@ -145,6 +190,7 @@ def download_pdf_stock_general(parse_func, tbl_name, querystr='', s_date=start_d
|
||||
down = em.download_pdf(pdf_url, file_path)
|
||||
if down:
|
||||
logging.info(f'saved file {file_path}')
|
||||
check_pdf_pages(file_path, row, tbl_name)
|
||||
else:
|
||||
logging.warning(f'download pdf file error. file_path: {pdf_url}, save_path: {file_path}')
|
||||
else:
|
||||
@ -176,13 +222,13 @@ def fetch_reports_list_strategy(s_date=start_date, e_date=end_date):
|
||||
|
||||
# 下载股票pdf
|
||||
def download_pdf_stock(s_date=start_date, e_date=end_date):
|
||||
download_pdf_stock_general(parse_func_general, StockReportDB.TBL_STOCK, ' ', s_date, e_date, limit=2 if debug else None)
|
||||
download_pdf_stock_general(parse_func_general, StockReportDB.TBL_STOCK, ' ', s_date, e_date, limit=2 if debug else None, min_page=min_down_pages)
|
||||
|
||||
def download_pdf_newstock(s_date=start_date, e_date=end_date):
|
||||
download_pdf_stock_general(parse_func_general, StockReportDB.TBL_NEW_STOCK, ' ', s_date, e_date, limit=2 if debug else None)
|
||||
download_pdf_stock_general(parse_func_general, StockReportDB.TBL_NEW_STOCK, ' ', s_date, e_date, limit=2 if debug else None, min_page=min_down_pages)
|
||||
|
||||
def download_pdf_industry(s_date=start_date, e_date=end_date):
|
||||
download_pdf_stock_general(parse_func_general, StockReportDB.TBL_INDUSTRY, ' ', s_date, e_date, limit=2 if debug else None)
|
||||
download_pdf_stock_general(parse_func_general, StockReportDB.TBL_INDUSTRY, ' ', s_date, e_date, limit=2 if debug else None, min_page=min_down_pages)
|
||||
|
||||
def download_pdf_macresearch(s_date=start_date, e_date=end_date):
|
||||
download_pdf_stock_general(parse_func_general, StockReportDB.TBL_MACRESEARCH, ' ', s_date, e_date, limit=2 if debug else None)
|
||||
@ -263,6 +309,9 @@ def run_func(function_names, function_map):
|
||||
def main(cmd, mode, args_debug, args_force, begin, end):
|
||||
global debug
|
||||
debug = args_debug
|
||||
if debug:
|
||||
logger = logging.getLogger()
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
global force
|
||||
force = args_force
|
||||
|
||||
@ -56,13 +56,19 @@ def get_stock_by_fs():
|
||||
save_to_csv(df, config.global_host_input_dir, f"em_index_all.csv")
|
||||
return df
|
||||
|
||||
# 东财-概念板块成分股
|
||||
def get_stock_by_xx(sy):
|
||||
df = ak.stock_board_concept_cons_em(symbol=sy)
|
||||
save_to_csv(df, config.global_host_input_dir, f"em_{sy}.csv")
|
||||
|
||||
def refresh_main_index():
|
||||
#get_csindex(symbol='000300')
|
||||
#get_csindex(symbol='000510')
|
||||
#get_sina_index_stock(symbol='000300')
|
||||
#get_ah_stock()
|
||||
#get_hk_ggt_components()
|
||||
get_stock_by_fs()
|
||||
#get_stock_by_fs()
|
||||
get_stock_by_xx('半导体概念')
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user