From fcf6f8a9453ba603b1a5a0bc5ee9d9430b0ce323 Mon Sep 17 00:00:00 2001
From: oscar <oscar@easyprompt8.com>
Date: Fri, 25 Oct 2024 09:59:08 +0800
Subject: [PATCH] modify fetch em data files.

---
 stockapp/{data => input/index}/000300cons.csv |   0
 stockapp/{data => input/index}/000510cons.csv |   0
 stockapp/{data => input/index}/000685cons.csv |   0
 stockapp/{data => input/index}/930050cons.csv |   0
 stockapp/{data => input/index}/931643cons.csv |   0
 ...SP-500-Index-Constituents-Sept-23-2024.csv |   0
 .../his_kline_em_codes.txt}                   |   0
 .../src/cursor/his_kline_em_done_codes.txt    |  11 ++
 stockapp/src/get_his_kline_em.py              | 144 +++++++++++++++++-
 9 files changed, 150 insertions(+), 5 deletions(-)
 rename stockapp/{data => input/index}/000300cons.csv (100%)
 rename stockapp/{data => input/index}/000510cons.csv (100%)
 rename stockapp/{data => input/index}/000685cons.csv (100%)
 rename stockapp/{data => input/index}/930050cons.csv (100%)
 rename stockapp/{data => input/index}/931643cons.csv (100%)
 rename stockapp/{data => input/index}/Complete-List-of-SP-500-Index-Constituents-Sept-23-2024.csv (100%)
 rename stockapp/src/{code.txt => cursor/his_kline_em_codes.txt} (100%)
 create mode 100644 stockapp/src/cursor/his_kline_em_done_codes.txt

diff --git a/stockapp/data/000300cons.csv b/stockapp/input/index/000300cons.csv
similarity index 100%
rename from stockapp/data/000300cons.csv
rename to stockapp/input/index/000300cons.csv
diff --git a/stockapp/data/000510cons.csv b/stockapp/input/index/000510cons.csv
similarity index 100%
rename from stockapp/data/000510cons.csv
rename to stockapp/input/index/000510cons.csv
diff --git a/stockapp/data/000685cons.csv b/stockapp/input/index/000685cons.csv
similarity index 100%
rename from stockapp/data/000685cons.csv
rename to stockapp/input/index/000685cons.csv
diff --git a/stockapp/data/930050cons.csv b/stockapp/input/index/930050cons.csv
similarity index 100%
rename from stockapp/data/930050cons.csv
rename to stockapp/input/index/930050cons.csv
diff --git a/stockapp/data/931643cons.csv b/stockapp/input/index/931643cons.csv
similarity index 100%
rename from stockapp/data/931643cons.csv
rename to stockapp/input/index/931643cons.csv
diff --git a/stockapp/data/Complete-List-of-SP-500-Index-Constituents-Sept-23-2024.csv b/stockapp/input/index/Complete-List-of-SP-500-Index-Constituents-Sept-23-2024.csv
similarity index 100%
rename from stockapp/data/Complete-List-of-SP-500-Index-Constituents-Sept-23-2024.csv
rename to stockapp/input/index/Complete-List-of-SP-500-Index-Constituents-Sept-23-2024.csv
diff --git a/stockapp/src/code.txt b/stockapp/src/cursor/his_kline_em_codes.txt
similarity index 100%
rename from stockapp/src/code.txt
rename to stockapp/src/cursor/his_kline_em_codes.txt
diff --git a/stockapp/src/cursor/his_kline_em_done_codes.txt b/stockapp/src/cursor/his_kline_em_done_codes.txt
new file mode 100644
index 0000000..5a7dc70
--- /dev/null
+++ b/stockapp/src/cursor/his_kline_em_done_codes.txt
@@ -0,0 +1,11 @@
+689009
+688981
+688517
+000001
+000002
+000001
+688353
+688093
+688303
+000063
+000100
diff --git a/stockapp/src/get_his_kline_em.py b/stockapp/src/get_his_kline_em.py
index 6d2404d..04366d7 100644
--- a/stockapp/src/get_his_kline_em.py
+++ b/stockapp/src/get_his_kline_em.py
@@ -1,18 +1,25 @@
 import time
 import logging
 import pandas as pd
+import os
+import sys
 import config
 import crawling.stock_hist_em as his_em
 
+file_selected_codes = './cursor/his_kline_em_codes.txt'  # 指定拉取的代码列表，每行一个代码
+file_done_codes = './cursor/his_kline_em_done_codes.txt' # 已完成拉取的代码列表，每行一个代码
+dir_his_kline_em = '../data/his_kline_em'
+
 config.setup_logging()
 
+# 刷新代码列表，并返回
 def flush_code_map():
     code_id_map_em_df = his_em.code_id_map_em()
     print(code_id_map_em_df)
     return code_id_map_em_df
 
 # 获取历史K线，如果失败，就重试
-def fetch_with_retry(code: str, adjust: str = '', max_retries: int = 5) -> pd.DataFrame :
+def fetch_with_retry(code: str, adjust: str = '', max_retries: int = 20) -> pd.DataFrame :
     retries = 0
     while retries < max_retries:
         try:
@@ -37,6 +44,13 @@ def fetch_with_retry(code: str, adjust: str = '', max_retries: int = 5) -> pd.Da
     return pd.DataFrame()
 
 
+# 检查子目录是否存在，不存在则创建
+def create_directory_if_not_exists(dir_name):
+    if not os.path.exists(dir_name):
+        os.makedirs(dir_name)
+        logging.info(f"Created directory: {dir_name}")
+
+
 # 读取 code.txt 文件，并获取每个股票代码
 def read_stock_codes(filename: str) -> list:
     try:
@@ -47,10 +61,115 @@ def read_stock_codes(filename: str) -> list:
         logging.error(f"文件 {filename} 未找到。")
         return []
 
+
 # 从文件获取指定的代码，并拉取历史K线
+def fetch_parts_by_codes():
+    # 读取股票代码列表
+    codes = read_stock_codes(file_selected_codes)
+    # 如果没有代码，结束程序
+    if not codes:
+        logging.error("没有找到有效的股票代码，程序终止。")
+        return
+
+    # 读取已经下载的代码列表，后续下载时忽略
+    done_codes = []
+    if os.path.exists(file_done_codes):
+        with open(file_done_codes, 'r', encoding='utf-8') as f:
+            done_codes = [line.strip() for line in f]  # 使用strip()去掉每行的换行符和多余的空白
+
+    adjust_values = ['', 'qfq', 'hfq']
+    code_id_map_em_df = his_em.code_id_map_em()    
+    for key in codes:
+        val = code_id_map_em_df.get(key)
+        if key in done_codes:
+            logging.info(f'Skipping already code. code: ({key})')
+            continue
+
+        if val is None:
+            logging.error(f'cannot find stock code. code: ({key}), adjust: ({adjust_str})')
+            continue
+
+        succ = True
+        start_time = time.time()  # 在函数执行前获取当前时间
+        for adjust in adjust_values:
+            adjust_str = adjust if adjust != '' else 'none'
+        
+            stock_zh_a_hist_df =fetch_with_retry(key, adjust)
+            if stock_zh_a_hist_df.empty:
+                logging.info(f'fetch his data error. code: ({key}), adjust: ({adjust_str})')
+                succ = False
+            else:
+                # 将 DataFrame 输出为 CSV 文件
+                curr_dir = f'{dir_his_kline_em}/{val}_{adjust_str}'
+                create_directory_if_not_exists(curr_dir)
+                curr_file = f'{curr_dir}/{key}_{adjust_str}_his_data.csv'
+
+                stock_zh_a_hist_df.to_csv(curr_file, index=False, encoding='utf-8')
+                lines = stock_zh_a_hist_df.shape[0]
+                logging.info(f'fetch his data and write to file. code: ({key}), adjust: ({adjust_str}), file: ({curr_file}) lines: ({lines})')
+                time.sleep(5)
+
+        end_time = time.time()  # 在函数执行后获取当前时间
+        elapsed_time = int(end_time - start_time)  # 计算时间差,秒
+        if succ:
+            # 下载后，记录日志
+            with open(file_done_codes, 'a', encoding='utf-8') as done_list:
+                done_list.write(f"{key}\n")
+            logging.info(f"Downloaded and recorded: ({key}) total lines: {lines} time cost: {elapsed_time} s")
+
+        time.sleep(10)    
+
+# 获取全量代码的历史K线
+def fetch_all_by_codes():
+    # 读取已经下载的代码列表，后续下载时忽略
+    done_codes = []
+    if os.path.exists(file_done_codes):
+        with open(file_done_codes, 'r', encoding='utf-8') as f:
+            done_codes = [line.strip() for line in f]  # 使用strip()去掉每行的换行符和多余的空白
+
+    adjust_values = ['', 'qfq', 'hfq']
+    code_id_map_em_df = his_em.code_id_map_em()
+
+    for key, val in code_id_map_em_df.items():
+        if key in done_codes:
+            logging.info(f'Skipping already code. code: ({key})')
+            continue
+
+        succ = True
+        start_time = time.time()  # 在函数执行前获取当前时间
+        for adjust in adjust_values:
+            adjust_str = adjust if adjust != '' else 'none'
+            stock_zh_a_hist_df =fetch_with_retry(key, adjust)
+
+            if stock_zh_a_hist_df.empty:
+                logging.error(f'fetch his data error. code: ({key}), adjust: ({adjust_str})')
+                succ = False
+            else:
+                # 将 DataFrame 输出为 CSV 文件
+                curr_dir = f'{dir_his_kline_em}/{val}_{adjust_str}'
+                create_directory_if_not_exists(curr_dir)
+                curr_file = f'{curr_dir}/{key}_{adjust_str}_his_data.csv'
+                
+                stock_zh_a_hist_df.to_csv(curr_file, index=False, encoding='utf-8')
+                lines = stock_zh_a_hist_df.shape[0]
+                logging.info(f'fetch his data and write to file. code: ({key}), adjust: ({adjust_str}), file: ({curr_file}) lines: ({lines})')
+                time.sleep(5)                
+
+        end_time = time.time()  # 在函数执行后获取当前时间
+        elapsed_time = int(end_time - start_time)  # 计算时间差,秒
+        if succ:
+            # 下载后，记录日志
+            with open(file_done_codes, 'a', encoding='utf-8') as done_list:
+                done_list.write(f"{key}\n")
+            logging.info(f"Downloaded and recorded: ({key}) total lines: {lines} time cost: {elapsed_time} s")
+
+        time.sleep(10)    
+
+
+# 从文件获取指定的代码，并拉取历史K线，废弃
 def fetch_parts():
     # 读取股票代码列表
-    codes = read_stock_codes('code.txt')
+    codes = read_stock_codes(file_selected_codes)
     # 如果没有代码，结束程序
     if not codes:
         logging.error("没有找到有效的股票代码，程序终止。")
@@ -80,7 +199,7 @@ def fetch_parts():
 
         time.sleep(10)    
 
-# 获取全量代码的历史K线
+# 获取全量代码的历史K线，废弃
 def fetch_all():
     adjust_values = ['', 'qfq', 'hfq']
     code_id_map_em_df = his_em.code_id_map_em()
@@ -104,5 +223,20 @@ def fetch_all():
 
 # 主函数
 if __name__ == '__main__':
-    fetch_all()
-    #fetch_parts()
\ No newline at end of file
+    if len(sys.argv) != 2:
+        print("Usage: python script.py <cmd>")
+        print("cmd: all, parts")
+        sys.exit(1)
+
+    cmd = sys.argv[1]
+
+    if cmd == "all":
+        fetch_all_by_codes()  # 拉取所有的代码
+    elif cmd == "parts":
+        fetch_parts_by_codes()  # 拉取指定的代码
+    elif cmd == "all_other":
+        fetch_all()
+    elif cmd == "parts_other":
+        fetch_parts()
+    else:
+        print(f"Unknown command: {cmd}")