Add crawling scripts and fetch Eastmoney (EM) data files.

This commit is contained in:
2024-10-25 08:05:03 +08:00
parent 593c6ce419
commit c8342de5a2
21 changed files with 818 additions and 12922 deletions

607
stockapp/src/code.txt Normal file
View File

@ -0,0 +1,607 @@
000001
000002
000063
000100
000157
000166
000301
000333
000338
000408
000425
000538
000568
000596
000617
000625
000651
000661
000708
000725
000733
000768
000776
000786
000792
000800
000807
000858
000876
000895
000938
000963
000977
000983
000999
001289
001965
001979
002001
002007
002027
002049
002050
002074
002129
002142
002179
002180
002230
002236
002241
002252
002271
002304
002311
002352
002371
002410
002415
002459
002460
002466
002475
002493
002555
002594
002601
002603
002648
002709
002714
002736
002812
002821
002841
002916
002920
002938
003816
300014
300015
300033
300059
300122
300124
300142
300223
300274
300308
300316
300347
300408
300413
300418
300433
300442
300450
300454
300496
300498
300628
300661
300750
300751
300759
300760
300782
300832
300896
300919
300957
300979
300999
301269
600000
600009
600010
600011
600015
600016
600018
600019
600023
600025
600026
600027
600028
600029
600030
600031
600036
600039
600048
600050
600061
600085
600089
600104
600111
600115
600132
600150
600161
600176
600183
600188
600196
600219
600233
600276
600309
600332
600346
600362
600372
600406
600415
600426
600436
600438
600460
600489
600515
600519
600547
600570
600584
600585
600588
600600
600660
600674
600690
600732
600741
600745
600760
600795
600803
600809
600837
600845
600875
600886
600887
600893
600900
600905
600918
600919
600926
600938
600941
600958
600989
600999
601006
601009
601012
601021
601059
601066
601088
601100
601111
601117
601138
601166
601169
601186
601211
601225
601229
601236
601238
601288
601318
601319
601328
601336
601360
601377
601390
601398
601600
601601
601607
601618
601628
601633
601658
601668
601669
601688
601689
601698
601699
601728
601766
601788
601799
601800
601808
601816
601818
601838
601857
601865
601868
601872
601877
601878
601881
601888
601898
601899
601901
601916
601919
601939
601985
601988
601989
601995
601998
603019
603195
603259
603260
603288
603296
603369
603392
603501
603659
603799
603806
603833
603899
603986
603993
605117
605499
688008
688009
688012
688036
688041
688082
688111
688126
688187
688223
688256
688271
688303
688363
688396
688599
688981
000009
000034
000035
000039
000066
000069
000400
000423
000519
000547
000591
000623
000629
000630
000683
000690
000738
000818
000830
000831
000878
000887
000932
000933
000960
000967
000975
000988
000998
001914
002008
002025
002028
002044
002064
002065
002078
002080
002081
002085
002091
002120
002123
002131
002138
002145
002151
002152
002156
002176
002185
002192
002195
002202
002212
002223
002240
002245
002266
002268
002273
002281
002292
002294
002312
002340
002353
002368
002372
002384
002389
002396
002405
002407
002409
002414
002422
002432
002436
002439
002444
002456
002463
002465
002472
002497
002508
002511
002517
002532
002541
002544
002558
002572
002602
002607
002624
002625
002738
002739
002756
002791
002831
300001
300002
300003
300012
300017
300024
300037
300054
300058
300068
300070
300073
300088
300118
300133
300136
300144
300182
300207
300212
300251
300253
300285
300296
300315
300339
300346
300383
300390
300394
300395
300438
300457
300459
300474
300502
300529
300558
300567
300568
300573
300595
300601
300604
300627
300676
300699
300724
300763
300769
300866
301236
301558
600004
600007
600008
600038
600066
600096
600118
600129
600131
600141
600143
600153
600157
600160
600166
600167
600170
600177
600256
600258
600316
600323
600325
600352
600392
600398
600399
600418
600482
600486
600497
600498
600499
600516
600521
600529
600535
600549
600563
600637
600655
600667
600699
600704
600754
600755
600763
600765
600771
600816
600820
600839
600859
600862
600867
600879
600884
600885
600895
600959
600988
600998
601058
601155
601168
601216
601233
601615
601636
601677
601727
601866
601880
601966
603000
603077
603129
603156
603236
603290
603444
603456
603486
603568
603588
603596
603605
603606
603613
603650
603688
603737
603816
603882
603885
603939
605358
688002
688005
688063
688072
688099
688122
688169
688188
688390
688536
688598
688617
688777
688001
688019
688037
688047
688048
688052
688106
688107
688110
688120
688123
688141
688146
688153
688172
688200
688213
688220
688234
688249
688279
688347
688352
688361
688362
688385
688409
688432
688484
688498
688521
688525
688582
688596
688608
688702
688728
688798
688349
688472
688506

View File

@ -1,7 +1,8 @@
import logging
import os
import inspect
import datetime
from datetime import datetime
from pathlib import Path
# MySQL 配置
db_config = {
@ -13,18 +14,41 @@ db_config = {
log_dir_prefix = '../log'
# 获取log目录
def get_log_directory():
    """Return the project-level ``log`` directory, creating it if needed.

    Walks upward from this file's directory until a directory named
    ``src`` is found (or the filesystem root is reached), treats its
    parent as the project root, and ensures ``<root>/log`` exists.
    """
    root = Path(__file__).resolve().parent
    # Climb until we sit on the 'src' directory or hit the filesystem root.
    while root.name != 'src' and root != root.parent:
        root = root.parent
    # One level above 'src' is the project root; keep logs under it.
    log_dir = root.parent / 'log'
    log_dir.mkdir(parents=True, exist_ok=True)
    return log_dir
# Return the extension-free base name of the script that invoked
# setup_logging(); used as the default log-file name.
def get_caller_filename():
    # stack()[2]: frame 0 is this function, frame 1 is setup_logging,
    # frame 2 is setup_logging's caller. NOTE(review): this index is only
    # correct when called exactly one level below setup_logging — confirm
    # before reusing this helper elsewhere.
    caller_frame = inspect.stack()[2]
    caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
    return caller_filename
# 设置日志配置
def setup_logging(log_filename=None):
# 如果未传入 log_filename则使用当前脚本名称作为日志文件名
if log_filename is None:
# 获取调用 setup_logging 的脚本文件名
caller_frame = inspect.stack()[1]
caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
# 获取当前日期,格式为 yyyymmdd
caller_filename = get_caller_filename()
common_log_dir = get_log_directory()
current_date = datetime.now().strftime('%Y%m%d')
# 拼接 log 文件名,将日期加在扩展名前
log_filename = f'{log_dir_prefix}/{caller_filename}_{current_date}.log'
log_filename = f'{common_log_dir}/{caller_filename}_{current_date}.log'
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
handlers=[

View File

@ -233,6 +233,70 @@ def code_id_map_em() -> dict:
temp_df_sz = pd.DataFrame(data_json["data"]["diff"])
temp_df_sz["bj_id"] = 0
code_id_dict.update(dict(zip(temp_df_sz["f12"], temp_df_sz["bj_id"])))
params = {
"pn": "1",
"pz": "50000",
"po": "1",
"np": "1",
"ut": "bd1d9ddb04089700cf9c27f6f7426281",
"fltt": "2",
"invt": "2",
"fid": "f3",
"fs": "m:128 t:3",
"fields": "f12",
"_": "1623833739532",
}
r = requests.get(url, params=params)
data_json = r.json()
if not data_json["data"]["diff"]:
return dict()
temp_df_sz = pd.DataFrame(data_json["data"]["diff"])
temp_df_sz["hk_main"] = 116
code_id_dict.update(dict(zip(temp_df_sz["f12"], temp_df_sz["hk_main"])))
params = {
"pn": "1",
"pz": "50000",
"po": "1",
"np": "1",
"ut": "bd1d9ddb04089700cf9c27f6f7426281",
"fltt": "2",
"invt": "2",
"fid": "f3",
"fs": "m:128 t:4",
"fields": "f12",
"_": "1623833739532",
}
r = requests.get(url, params=params)
data_json = r.json()
if not data_json["data"]["diff"]:
return dict()
temp_df_sz = pd.DataFrame(data_json["data"]["diff"])
temp_df_sz["hk_cyb"] = 116
code_id_dict.update(dict(zip(temp_df_sz["f12"], temp_df_sz["hk_cyb"])))
params = {
"pn": "1",
"pz": "50000",
"po": "1",
"np": "1",
"ut": "bd1d9ddb04089700cf9c27f6f7426281",
"fltt": "2",
"invt": "2",
"fid": "f3",
"fs": "m:105,m:106,m:107",
"fields": "f12",
"_": "1623833739532",
}
r = requests.get(url, params=params)
data_json = r.json()
if not data_json["data"]["diff"]:
return dict()
temp_df_sz = pd.DataFrame(data_json["data"]["diff"])
temp_df_sz["us_all"] = 105
code_id_dict.update(dict(zip(temp_df_sz["f12"], temp_df_sz["us_all"])))
return code_id_dict

View File

@ -0,0 +1,108 @@
import time
import logging
import pandas as pd
import config
import crawling.stock_hist_em as his_em
config.setup_logging()
def flush_code_map():
    """Fetch the Eastmoney code -> market-id mapping, echo it, and return it."""
    mapping = his_em.code_id_map_em()
    print(mapping)
    return mapping
# Fetch the daily history K-line for one code, retrying on failure.
def fetch_with_retry(code: str, adjust: str = '', max_retries: int = 5) -> pd.DataFrame:
    """Fetch daily history K-line data for *code*, retrying on failure.

    Args:
        code: stock code, e.g. ``'600000'``.
        adjust: price adjustment — ``''`` (none), ``'qfq'`` or ``'hfq'``.
        max_retries: number of attempts before giving up.

    Returns:
        The history DataFrame, or an empty DataFrame when every attempt
        returned no data or raised.
    """
    for attempt in range(1, max_retries + 1):
        try:
            df = his_em.stock_zh_a_hist(
                symbol=code,
                period="daily",
                start_date="19000101",
                end_date="20241020",
                adjust=adjust,
            )
            if not df.empty:
                return df
            # Empty result: record it and retry (original retried silently).
            logging.warning(f'empty his data, will retry. code: ({code}), attempt: {attempt}/{max_retries}')
        except Exception as e:
            # Log the failure instead of silently swallowing the exception.
            logging.warning(f'fetch his data raised, will retry. code: ({code}), attempt: {attempt}/{max_retries}, error: {e}')
        time.sleep(3)  # back off before the next attempt
    return pd.DataFrame()
# Read the stock-code list file, one code per line.
def read_stock_codes(filename: str) -> list:
    """Read stock codes from *filename*, one code per line.

    Blank / whitespace-only lines are skipped. Returns an empty list
    (and logs an error) when the file does not exist.
    """
    try:
        with open(filename, 'r') as f:
            codes = [line.strip() for line in f if line.strip()]
        return codes
    except FileNotFoundError:
        # Bug fix: the original logged a fixed placeholder and never
        # interpolated the actual path into the message.
        logging.error(f"文件 {filename} 未找到。")
        return []
# Fetch history K-lines only for the codes listed in code.txt.
def fetch_parts():
    """Fetch daily history K-lines for the codes listed in ``code.txt``.

    Each code is fetched once per adjustment mode ('' / 'qfq' / 'hfq')
    and written to ``../data/<market-id>/<code>_<adjust>_his_data.csv``.
    """
    codes = read_stock_codes('code.txt')
    if not codes:
        logging.error("没有找到有效的股票代码,程序终止。")
        return
    code_map = his_em.code_id_map_em()
    for adjust in ('', 'qfq', 'hfq'):
        adjust_str = 'none' if adjust == '' else adjust
        for key in codes:
            val = code_map.get(key)
            if val is None:
                logging.error(f'cannot find stock code. code: ({key}), adjust: ({adjust_str})')
                continue
            hist_df = fetch_with_retry(key, adjust)
            if hist_df.empty:
                logging.info(f'fetch his data error. code: ({key}), adjust: ({adjust_str})')
            else:
                # Persist the result as CSV under the market-id folder.
                hist_df.to_csv(f'../data/{val}/{key}_{adjust_str}_his_data.csv', index=False, encoding='utf-8')
                lines = hist_df.shape[0]
                logging.info(f'fetch his data and write to file. code: ({key}), adjust: ({adjust_str}), lines: ({lines})')
            time.sleep(5)  # throttle between codes
        time.sleep(10)  # longer pause between adjustment passes
# Fetch history K-lines for every code Eastmoney knows about.
def fetch_all():
    """Fetch daily history K-lines for the full Eastmoney code universe.

    Every code is fetched once per adjustment mode ('' / 'qfq' / 'hfq')
    and written to ``../data/<market-id>/<code>_<adjust>_his_data.csv``.
    """
    code_map = his_em.code_id_map_em()
    for adjust in ('', 'qfq', 'hfq'):
        adjust_str = 'none' if adjust == '' else adjust
        for key, val in code_map.items():
            hist_df = fetch_with_retry(key, adjust)
            if hist_df.empty:
                logging.error(f'fetch his data error. code: ({key}), adjust: ({adjust_str})')
            else:
                # Persist the result as CSV under the market-id folder.
                hist_df.to_csv(f'../data/{val}/{key}_{adjust_str}_his_data.csv', index=False, encoding='utf-8')
                lines = hist_df.shape[0]
                logging.info(f'fetch his data and write to file. code: ({key}), adjust: ({adjust_str}), lines: ({lines})')
            time.sleep(5)  # throttle between codes
        time.sleep(10)  # longer pause between adjustment passes
# Script entry point: crawl the full code universe by default.
if __name__ == '__main__':
    fetch_all()
    #fetch_parts()  # alternative: only the codes listed in code.txt

View File

@ -0,0 +1,9 @@
# Minimal futu-api demo: fetch a market snapshot for two codes from a
# locally running OpenD gateway (127.0.0.1:11111) and print it.
from futu import *

quote_ctx = OpenQuoteContext(host='127.0.0.1', port=11111)
ret, data = quote_ctx.get_market_snapshot(['SH.600000', 'HK.00700'])
if ret == RET_OK:
    print(data)
else:
    print('error:', data)
quote_ctx.close()  # close when done to avoid exhausting the connection quota

View File

@ -0,0 +1,62 @@
import pymysql
import pandas as pd
from futu import *
import logging
from config import db_config # 引用config.py中的数据库配置
# 设置日志
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s - %(message)s')
logger = logging.getLogger()
# Open a MySQL connection using the shared settings from config.db_config.
def connect_to_db():
    return pymysql.connect(**db_config)
# Fetch every stock code from the index-constituent table as a DataFrame.
# NOTE(review): the function name and comments say "sp300" but the query
# reads the `hs300` table (CSI 300) — confirm which index is intended.
def fetch_sp300_codes(connection):
    query = "SELECT code FROM hs300"
    return pd.read_sql(query, connection)
# Fetch a market snapshot for *stock_codes* via the local futu OpenD
# gateway and write the resulting DataFrame to *output_file* as CSV.
def get_market_snapshot_and_save_to_csv(stock_codes, output_file):
    quote_ctx = OpenQuoteContext(host='127.0.0.1', port=11111)
    ret, data = quote_ctx.get_market_snapshot(stock_codes)
    if ret == RET_OK:
        logger.info(f"Successfully fetched market snapshot for {len(stock_codes)} codes.")
        # Persist the snapshot DataFrame.
        data.to_csv(output_file, index=False)
        logger.info(f"Snapshot data saved to {output_file}")
    else:
        logger.error(f"Error fetching market snapshot: {data}")
    # Always release the gateway connection, success or failure.
    quote_ctx.close()
# Entry point: load constituent codes from MySQL, dump a snapshot to CSV.
def main():
    """Load index-constituent codes from MySQL and save a market snapshot CSV."""
    # Bug fix: initialize before the try-block — otherwise the finally
    # clause raises NameError when connect_to_db() itself fails.
    connection = None
    try:
        connection = connect_to_db()
        # Codes from the constituent table.
        sp300_codes_df = fetch_sp300_codes(connection)
        stock_codes = sp300_codes_df['code'].tolist()
        if not stock_codes:
            logger.warning("No stock codes found in sp300 table.")
            return
        # Fetch the snapshot and persist it.
        output_file = "market_snapshot.csv"
        get_market_snapshot_and_save_to_csv(stock_codes, output_file)
    except Exception as e:
        logger.error(f"An error occurred: {e}")
    finally:
        if connection:
            connection.close()

if __name__ == "__main__":
    main()

View File

@ -0,0 +1,18 @@
# futu-api demo: page through historical K-lines for HK.00700.
from futu import *

quote_ctx = OpenQuoteContext(host='127.0.0.1', port=11111)
# Request the first page (up to 50 rows per page).
ret, data, page_req_key = quote_ctx.request_history_kline('HK.00700', autype=AuType.NONE, start='2021-10-03', end='2021-11-08', max_count=50)
if ret == RET_OK:
    print(data)
    print(data['code'][0])  # stock code of the first row
    print(data['close'].values.tolist())  # first page's close prices as a list
else:
    print('error:', data)
while page_req_key != None:  # keep requesting until no further pages
    print('*************************************')
    # NOTE(review): this paged request uses a different date range
    # (2024-04-11..2024-06-18) than the first call — confirm intent.
    ret, data, page_req_key = quote_ctx.request_history_kline('HK.00700', start='2024-04-11', end='2024-06-18', max_count=50, page_req_key=page_req_key)
    if ret == RET_OK:
        print(data)
    else:
        print('error:', data)
print('All pages are finished!')
quote_ctx.close()  # close when done to avoid exhausting the connection quota

View File

@ -0,0 +1,12 @@
# futu-api demo: print the user's HK watchlist.
from futu import *

quote_ctx = OpenQuoteContext(host='127.0.0.1', port=11111)
ret, data = quote_ctx.get_user_security("港股")
if ret == RET_OK:
    print(data)
    if data.shape[0] > 0:  # watchlist is not empty
        print(data['code'][0])  # first entry's stock code
        print(data['code'].values.tolist())  # all codes as a list
else:
    print('error:', data)
quote_ctx.close()  # close when done to avoid exhausting the connection quota

View File

@ -0,0 +1,11 @@
# futu-api demo: list HK concept plates (sectors).
from futu import *

quote_ctx = OpenQuoteContext(host='127.0.0.1', port=11111)
ret, data = quote_ctx.get_plate_list(Market.HK, Plate.CONCEPT)
if ret == RET_OK:
    print(data)
    print(data['plate_name'][0])  # first plate's name
    print(data['plate_name'].values.tolist())  # all plate names as a list
else:
    print('error:', data)
quote_ctx.close()  # close when done to avoid exhausting the connection quota

View File

@ -0,0 +1,11 @@
from futu import *
quote_ctx = OpenQuoteContext(host='127.0.0.1', port=11111)
ret, data = quote_ctx.get_plate_stock('SH.LIST3000005')
if ret == RET_OK:
print(data)
#print(data['stock_name'][0]) # 取第一条的股票名称
#print(data['stock_name'].values.tolist()) # 转为 list
else:
print('error:', data)
quote_ctx.close() # 结束后记得关闭当条连接,防止连接条数用尽

View File

@ -0,0 +1,11 @@
from futu import *
quote_ctx = OpenQuoteContext(host='127.0.0.1', port=11111)
ret, data = quote_ctx.get_rehab("US.AAPL")
if ret == RET_OK:
print(data)
print(data['ex_div_date'][0]) # 取第一条的除权除息日
print(data['ex_div_date'].values.tolist()) # 转为 list
else:
print('error:', data)
quote_ctx.close() # 结束后记得关闭当条连接,防止连接条数用尽

View File

@ -0,0 +1,28 @@
import yfinance as yf
import pandas as pd

# Download ten years of daily K-lines for one ticker via yfinance and
# save both the adjusted and the unadjusted series to CSV.
code = 'KDP'

stock = yf.Ticker(code)

# Ten years of daily data, dividend/split adjusted.
hist_data = stock.history(period="10y", auto_adjust=True)
# Year-over-year return of each year's last close.
# Bug fix: pct_change was referenced without calling it, so the original
# printed the bound method object instead of the return series.
print(hist_data['Close'].resample('Y').last().pct_change())
print(hist_data.head())
hist_data.to_csv(f"{code}_10year_data_adjust.csv")

# Ten years of daily data, unadjusted.
hist_data = stock.history(period="10y", auto_adjust=False)
print(hist_data.head())
hist_data.to_csv(f"{code}_10year_data.csv")

View File

@ -0,0 +1,17 @@
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta

# Download the last ten years of daily AAPL data and save it to CSV.

# Today's date, formatted as yfinance expects.
end_date = datetime.today().strftime('%Y-%m-%d')
# Ten years ago (365*10 days — ignores leap days, close enough here).
start_date = (datetime.today() - timedelta(days=365*10)).strftime('%Y-%m-%d')

data = yf.download('AAPL', start=start_date, end=end_date)

data.to_csv('AAPL.csv')
print(f"Downloaded AAPL stock data from {start_date} to {end_date} and saved to AAPL.csv")