modify scripts

This commit is contained in:
2025-03-15 08:02:25 +08:00
parent af92229a3e
commit 3b76c00500
4 changed files with 89 additions and 49 deletions

View File

@ -29,12 +29,12 @@ map_pdf_page = {
utils.tbl_industry : "https://data.eastmoney.com/report/zw_industry.jshtml?infocode={}"
}
map_pdf_path = {
utils.tbl_stock : f'{pdf_base_dir}/stock',
utils.tbl_new_stock : f'{pdf_base_dir}/newstock',
utils.tbl_strategy : f'{pdf_base_dir}/strategy',
utils.tbl_macresearch : f'{pdf_base_dir}/macresearch',
utils.tbl_industry : f'{pdf_base_dir}/industry'
map_tbl_name = {
utils.tbl_stock : '个股研报',
utils.tbl_new_stock : '新股研报',
utils.tbl_strategy : '策略报告',
utils.tbl_macresearch : '宏观研究',
utils.tbl_industry : '行业研报'
}
current_date = datetime.now()
@ -76,7 +76,7 @@ def fetch_reports_list_general(fetch_func, table_name, s_date, e_date, data_dir_
# 股票所用的url
def parse_func_stock(row, tbl_name):
def parse_func_general(row, tbl_name):
info_code = row['infoCode']
title = row['title'].replace("/", "_").replace("\\", "_")
org_sname = row['orgSName']
@ -84,34 +84,28 @@ def parse_func_stock(row, tbl_name):
industry_name = row['industryName']
publish_date = row['publishDate'].split(" ")[0]
file_name = f"{publish_date}_{org_sname}_{stock_name}_{title}.pdf"
# 建表的时候默认值有点问题
if stock_name == '' or stock_name=="''":
stock_name = 'None'
if industry_name == '':
industry_name = 'None'
if org_sname == '':
org_sname = 'None'
report_type = map_tbl_name.get(tbl_name, 'None')
file_name = f"{publish_date}_{report_type}_{org_sname}_{industry_name}_{stock_name}_{title}.pdf"
url = map_pdf_page.get(tbl_name, None)
if url is None:
logging.warning(f'wrong table name: {tbl_name}')
return None, None, None
return None, None
url = url.format(info_code)
os.makedirs(map_pdf_path[tbl_name], exist_ok=True)
return url, os.path.join(map_pdf_path[tbl_name], file_name), None
# 拼目录
dir_year = publish_date[:4] if len(publish_date)>4 else ''
dir_path = f'{pdf_base_dir}/{dir_year}/{map_tbl_name[tbl_name]}'
os.makedirs(dir_path, exist_ok=True)
return url, os.path.join(dir_path, file_name)
# 其它所用的url
def parse_func_other(row, tbl_name):
    """Derive the report page URL and local PDF paths for one report row.

    Args:
        row: Mapping providing 'infoCode', 'title', 'orgSName',
            'industryName' and 'publishDate'.
        tbl_name: Key looked up in ``map_pdf_page`` (URL template) and
            ``map_pdf_path`` (target directory).

    Returns:
        A ``(url, file_path, old_file_path)`` tuple. ``file_path`` uses the
        ``date_org_industry_title`` naming and ``old_file_path`` the
        ``date_industry_org_title`` naming. ``(None, None, None)`` is
        returned when ``tbl_name`` has no entry in ``map_pdf_page``.
    """
    # Pull every field up front so a malformed row fails before any lookup.
    code = row['infoCode']
    # Path separators in the title would otherwise create sub-directories.
    safe_title = row['title'].replace("/", "_").replace("\\", "_")
    org = row['orgSName']
    industry = row['industryName']
    # Keep only the part of publishDate before the first space.
    day = row['publishDate'].split(" ")[0]

    template = map_pdf_page.get(tbl_name, None)
    if template is None:
        logging.warning(f'wrong table name: {tbl_name}')
        return None, None, None

    page_url = template.format(code)
    target_dir = map_pdf_path[tbl_name]
    os.makedirs(target_dir, exist_ok=True)

    current_name = f"{day}_{org}_{industry}_{safe_title}.pdf"
    legacy_name = f"{day}_{industry}_{org}_{safe_title}.pdf"
    return (
        page_url,
        os.path.join(target_dir, current_name),
        os.path.join(target_dir, legacy_name),
    )
# 通用下载函数
def download_pdf_stock_general(parse_func, tbl_name, querystr='', s_date=start_date, e_date=end_date, limit=None):
@ -126,7 +120,7 @@ def download_pdf_stock_general(parse_func, tbl_name, querystr='', s_date=start_d
rows = []
for row in rows:
url, file_path, old_file_path = parse_func(row, tbl_name)
url, file_path = parse_func(row, tbl_name)
if url is None or file_path is None:
logging.warning(f'wrong url or file_path. tbl_name: {tbl_name}')
continue
@ -134,11 +128,6 @@ def download_pdf_stock_general(parse_func, tbl_name, querystr='', s_date=start_d
if file_path and os.path.isfile(file_path):
logging.info(f'{file_path} already exists. skipping...')
continue
# 旧方式命名的rename
if old_file_path and os.path.isfile(old_file_path):
shutil.move(old_file_path, file_path)
logging.info(f'rename existed file to {file_path}')
continue
# 获取pdf链接地址
if url:
pdf_url = em.fetch_pdf_link(url)
@ -175,19 +164,19 @@ def fetch_reports_list_strategy(s_date=start_date, e_date=end_date):
# 下载股票pdf
def download_pdf_stock(s_date=start_date, e_date=end_date):
download_pdf_stock_general(parse_func_stock, utils.tbl_stock, ' AND attachPages>=30', s_date, e_date, limit=2 if debug else None)
download_pdf_stock_general(parse_func_general, utils.tbl_stock, ' ', s_date, e_date, limit=2 if debug else None)
def download_pdf_newstock(s_date=start_date, e_date=end_date):
download_pdf_stock_general(parse_func_stock, utils.tbl_new_stock, ' AND attachPages>=30', s_date, e_date, limit=2 if debug else None)
download_pdf_stock_general(parse_func_general, utils.tbl_new_stock, ' ', s_date, e_date, limit=2 if debug else None)
def download_pdf_industry(s_date=start_date, e_date=end_date):
download_pdf_stock_general(parse_func_other, utils.tbl_industry, ' AND attachPages>=30', s_date, e_date, limit=2 if debug else None)
download_pdf_stock_general(parse_func_general, utils.tbl_industry, ' ', s_date, e_date, limit=2 if debug else None)
def download_pdf_macresearch(s_date=start_date, e_date=end_date):
download_pdf_stock_general(parse_func_other, utils.tbl_macresearch, ' ', s_date, e_date, limit=2 if debug else None)
download_pdf_stock_general(parse_func_general, utils.tbl_macresearch, ' ', s_date, e_date, limit=2 if debug else None)
def download_pdf_strategy(s_date=start_date, e_date=end_date):
download_pdf_stock_general(parse_func_other, utils.tbl_strategy, ' ', s_date, e_date, limit=2 if debug else None)
download_pdf_stock_general(parse_func_general, utils.tbl_strategy, ' ', s_date, e_date, limit=2 if debug else None)
# 建立缩写到函数的映射

View File

@ -82,10 +82,8 @@ def insert_or_update_common(data, tbl_name, uniq_key='infoCode'):
# 查询数据
def query_reports_comm(tbl_name, querystr='', limit=None ):
try:
if tbl_name in [utils.tbl_stock, utils.tbl_new_stock] :
if tbl_name in [utils.tbl_stock, utils.tbl_new_stock, utils.tbl_industry, utils.tbl_macresearch, utils.tbl_strategy] :
sql = f"SELECT id, infoCode, title, orgSName, industryName, stockName, publishDate FROM {tbl_name} WHERE 1=1 {querystr}"
elif tbl_name in [utils.tbl_industry, utils.tbl_macresearch, utils.tbl_strategy] :
sql = f"SELECT id, infoCode, title, orgSName, industryName, publishDate FROM {tbl_name} WHERE 1=1 {querystr}"
else:
logging.warning(f'wrong table name: {tbl_name}')
return None