modify scripts

This commit is contained in:
oscarz
2025-03-24 10:48:35 +08:00
parent 7ded7c5a19
commit 1521ff1fc0
11 changed files with 248 additions and 565 deletions

View File

@@ -1,86 +0,0 @@
import logging
import os
import inspect
import time
from datetime import datetime
from logging.handlers import RotatingFileHandler
from collections import defaultdict

home_dir = os.path.expanduser("~")
global_host_data_dir = f'{home_dir}/hostdir/stock_data'
global_share_db_dir = f'{home_dir}/sharedata/sqlite'

# Track log frequency per message
log_count = defaultdict(int)        # number of times each message has been logged
last_log_time = defaultdict(float)  # timestamp of the last write for each message

class RateLimitFilter(logging.Filter):
    """
    Rate-limit filter: within a 60-second window, the same message is
    written at most LOG_LIMIT times; anything beyond that is dropped.
    """
    LOG_LIMIT = 600  # at most 600 identical messages per minute

    def filter(self, record):
        global log_count, last_log_time
        message_key = record.getMessage()  # the log message text
        # Current time and elapsed time since the last write of this message
        now = time.time()
        elapsed = now - last_log_time[message_key]
        # Throttle repeated writes of the same message
        if elapsed < 60:  # within the 60-second window
            log_count[message_key] += 1
            if log_count[message_key] > self.LOG_LIMIT:
                return False  # drop the record
        else:
            log_count[message_key] = 1  # window expired, restart the count
            last_log_time[message_key] = now
        return True  # allow the record through

def setup_logging(log_filename=None):
    if log_filename is None:
        caller_frame = inspect.stack()[1]
        caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
        current_date = datetime.now().strftime('%Y%m%d')
        log_filename = f'../log/{caller_filename}_{current_date}.log'
    max_log_size = 100 * 1024 * 1024  # 100 MB
    max_log_files = 10                # keep at most 10 rotated log files
    file_handler = RotatingFileHandler(log_filename, maxBytes=max_log_size, backupCount=max_log_files)
    file_handler.setFormatter(logging.Formatter(
        '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
    ))
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(
        '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
    ))
    # Configure the root logger
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logger.handlers = []  # avoid adding duplicate handlers
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)
    # Attach the rate limiter
    rate_limit_filter = RateLimitFilter()
    file_handler.addFilter(rate_limit_filter)
    console_handler.addFilter(rate_limit_filter)

# Usage example
if __name__ == "__main__":
    setup_logging()
    for i in range(1000):
        logging.info("test log message for the rate-limit check")
        time.sleep(0.01)  # simulate rapid logging

View File

@@ -1,115 +0,0 @@
#!/bin/bash

# Remote server list; edit as needed
SERVERS=("175.178.54.98" "1.12.218.143" "43.139.169.25" "129.204.180.174" "42.194.142.169")
REMOTE_USER="ubuntu"
REMOTE_SCRIPT_DIR="/home/ubuntu/pyscripts/stockapp/reports_em"
REMOTE_LOG_DIR="/home/ubuntu/pyscripts/stockapp/log"
DATA_DIR="/home/ubuntu/hostdir/stock_data/pdfs"
LOCAL_FILES=("config.py" "em_reports.py" "fetch.py")

# Per-server task parameters (each server downloads a different date range)
TASK_PARAMS=(
    "--mode=down --begin=2024-01-01 --end=2024-06-30"
    "--mode=down --begin=2023-01-01 --end=2023-06-30"
    "--mode=down --begin=2022-01-01 --end=2022-06-30"
    "--mode=down --begin=2021-01-01 --end=2021-06-30"
    "--mode=down --begin=2020-01-01 --end=2020-06-30"
)

# Push the code to all servers
function push_code() {
    for SERVER in "${SERVERS[@]}"; do
        echo "Pushing code to $SERVER..."
        scp "${LOCAL_FILES[@]}" "$REMOTE_USER@$SERVER:$REMOTE_SCRIPT_DIR/"
    done
}

# Start the tasks
function start_tasks() {
    for i in "${!SERVERS[@]}"; do
        SERVER="${SERVERS[$i]}"
        PARAMS="${TASK_PARAMS[$i]}"
        echo "Starting task on $SERVER with params: $PARAMS"
        #ssh "$REMOTE_USER@$SERVER" "cd $REMOTE_SCRIPT_DIR && nohup python3 ./fetch.py $PARAMS > ../log/nohup.log 2>&1 &"
        #ssh "$REMOTE_USER@$SERVER" "cd $REMOTE_SCRIPT_DIR && nohup python3 ./fetch.py $PARAMS > ../log/nohup.log 2>&1 < /dev/null & disown"
        # nohup ... < /dev/null: stops nohup from waiting on stdin, so control is released immediately.
        # & disown: fully detaches the process from the SSH session so SIGHUP cannot reach it; ssh returns at once.
        # ssh -n: disables ssh's stdin, so the remote process cannot block waiting for input.
        ssh -n "$REMOTE_USER@$SERVER" "cd $REMOTE_SCRIPT_DIR && nohup python3 ./fetch.py $PARAMS > ../log/nohup.log 2>&1 < /dev/null & disown"
    done
}

# Stop the tasks
function stop_tasks() {
    for SERVER in "${SERVERS[@]}"; do
        echo "Stopping task on $SERVER..."
        ssh "$REMOTE_USER@$SERVER" "pkill -f 'python3 ./fetch.py'"
    done
}

# Check task progress
function check_progress() {
    for SERVER in "${SERVERS[@]}"; do
        echo -e "\nChecking progress on $SERVER..."
        FILE_COUNT=$(ssh "$REMOTE_USER@$SERVER" "ls -lRh $DATA_DIR | grep pdf | wc -l")
        FILE_SIZE=$(ssh "$REMOTE_USER@$SERVER" "du -sh $DATA_DIR")
        PROCESS_COUNT=$(ssh "$REMOTE_USER@$SERVER" "ps aux | grep '[f]etch.py' | wc -l")
        if [ "$PROCESS_COUNT" -gt 0 ]; then
            echo "Process status: Running ($PROCESS_COUNT instances; a count of 2 includes the parent process)"
        else
            echo "Process status: Not running"
        fi
        echo "Total files: $FILE_COUNT"
        echo "Total size : $FILE_SIZE"
        ERROR_LINES=$(ssh "$REMOTE_USER@$SERVER" "grep -v INFO $REMOTE_LOG_DIR/fetch_202503* | wc -l")
        echo "Error lines: $ERROR_LINES"
        TASK_COUNT=$(ssh "$REMOTE_USER@$SERVER" "grep 'running task. id' ~/pyscripts/stockapp/log/fetch_20250316.log | wc -l")
        echo "Task count: $TASK_COUNT"
    done
}

# Check task progress and send the summary through the notification bot
function check_progress_robot() {
    result=""
    for SERVER in "${SERVERS[@]}"; do
        result+="\nChecking progress on $SERVER...\n"
        FILE_COUNT=$(ssh "$REMOTE_USER@$SERVER" "ls -lRh $DATA_DIR | grep pdf | wc -l")
        FILE_SIZE=$(ssh "$REMOTE_USER@$SERVER" "du -sh $DATA_DIR")
        PROCESS_COUNT=$(ssh "$REMOTE_USER@$SERVER" "ps aux | grep '[f]etch.py' | wc -l")
        if [ "$PROCESS_COUNT" -gt 0 ]; then
            result+="Process status: Running ($PROCESS_COUNT instances; a count of 2 includes the parent process)\n"
        else
            result+="Process status: Not running\n"
        fi
        result+="Total files: $FILE_COUNT\n"
        result+="Total size : $FILE_SIZE\n"
        ERROR_LINES=$(ssh "$REMOTE_USER@$SERVER" "grep -v INFO $REMOTE_LOG_DIR/fetch_202503* | wc -l")
        result+="Error lines: $ERROR_LINES\n"
        TASK_COUNT=$(ssh "$REMOTE_USER@$SERVER" "grep 'running task. id' ~/pyscripts/stockapp/log/fetch_20250316.log | wc -l")
        result+="Task count: $TASK_COUNT\n"
    done
    echo -e "$result"
    # Hand the summary to the Python helper that posts the message
    python3 ./robot.py "$result"
}

# Command menu
case "$1" in
    push)  push_code ;;
    start) start_tasks ;;
    stop)  stop_tasks ;;
    check) check_progress ;;
    robot) check_progress_robot ;;
    *)
        echo "Usage: $0 {push|start|stop|check|robot}"
        exit 1
        ;;
esac

View File

@@ -1,248 +0,0 @@
import sqlite3
import json
import config
import utils
import logging
import sys
from datetime import datetime

# Connect to the SQLite database
DB_PATH = f"{config.global_share_db_dir}/stock_report.db"  # replace with your database file
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

# Get the table's column names and default values
def get_table_columns_and_defaults(tbl_name):
    try:
        cursor.execute(f"PRAGMA table_info({tbl_name})")
        columns = cursor.fetchall()
        column_info = {}
        for col in columns:
            col_name = col[1]
            default_value = col[4]
            column_info[col_name] = default_value
        return column_info
    except sqlite3.Error as e:
        logging.error(f"Error getting table columns: {e}")
        return None

# Validate and normalize the incoming data
def check_and_process_data(data, tbl_name):
    column_info = get_table_columns_and_defaults(tbl_name=tbl_name)
    if column_info is None:
        return None
    processed_data = {}
    for col, default in column_info.items():
        if col == 'id':  # auto-increment primary key; not supplied by the caller
            continue
        if col == 'created_at' or col == 'updated_at':  # set via SQL date functions, not by the caller
            continue
        if col in ['author', 'authorID']:
            values = data.get(col, [])
            processed_data[col] = ','.join(values)
        elif col in data:
            processed_data[col] = data[col]
        else:
            if default is not None:
                processed_data[col] = default
            else:
                processed_data[col] = None
    return processed_data

# Insert or update a row
def insert_or_update_common(data, tbl_name, uniq_key='infoCode'):
    try:
        processed_data = check_and_process_data(data, tbl_name)
        if processed_data is None:
            return None
        columns = ', '.join(processed_data.keys())
        values = list(processed_data.values())
        placeholders = ', '.join(['?' for _ in values])
        update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != 'infoCode']) + ', updated_at=datetime(\'now\', \'localtime\')'
        sql = f'''
            INSERT INTO {tbl_name} ({columns}, updated_at)
            VALUES ({placeholders}, datetime('now', 'localtime'))
            ON CONFLICT (infoCode) DO UPDATE SET {update_clause}
        '''
        cursor.execute(sql, values)
        conn.commit()
        # Fetch the report_id of the inserted or updated row
        cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data["infoCode"],))
        report_id = cursor.fetchone()[0]
        return report_id
    except sqlite3.Error as e:
        logging.error(f"Error inserting or updating data: {e}")
        return None

# Query report rows
def query_reports_comm(tbl_name, querystr='', limit=None):
    try:
        if tbl_name in [utils.tbl_stock, utils.tbl_new_stock, utils.tbl_industry, utils.tbl_macresearch, utils.tbl_strategy]:
            sql = f"SELECT id, infoCode, title, orgSName, industryName, stockName, publishDate FROM {tbl_name} WHERE 1=1 {querystr}"
        else:
            logging.warning(f'wrong table name: {tbl_name}')
            return None
        if limit:
            sql = sql + f' limit {limit}'
        cursor.execute(sql)
        results = cursor.fetchall()
        # Column names
        column_names = [description[0] for description in cursor.description]
        # Convert rows to a list of dicts
        result_dict_list = []
        for row in results:
            row_dict = {column_names[i]: value for i, value in enumerate(row)}
            result_dict_list.append(row_dict)
        return result_dict_list
    except sqlite3.Error as e:
        logging.error(f"Report query failed: {e}")
        return None
'''
# Insert or update an industry_report row
def insert_or_update_report(report):
    try:
        sql = """
            INSERT INTO industry_report (infoCode, title, orgCode, orgName, orgSName, publishDate,
                                         industryCode, industryName, emRatingCode, emRatingValue,
                                         emRatingName, attachSize, attachPages, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
            ON CONFLICT(infoCode) DO UPDATE SET
                title=excluded.title,
                orgCode=excluded.orgCode,
                orgName=excluded.orgName,
                orgSName=excluded.orgSName,
                publishDate=excluded.publishDate,
                industryCode=excluded.industryCode,
                industryName=excluded.industryName,
                emRatingCode=excluded.emRatingCode,
                emRatingValue=excluded.emRatingValue,
                emRatingName=excluded.emRatingName,
                attachSize=excluded.attachSize,
                attachPages=excluded.attachPages,
                updated_at=datetime('now', 'localtime')
        """
        values = (
            report["infoCode"], report["title"], report["orgCode"], report["orgName"],
            report["orgSName"], report["publishDate"], report["industryCode"],
            report["industryName"], report["emRatingCode"], report["emRatingValue"],
            report["emRatingName"], report.get("attachSize", 0), report.get("attachPages", 0)
        )
        cursor.execute(sql, values)
        conn.commit()
        # Fetch the report_id of the inserted or updated row
        cursor.execute("SELECT id FROM industry_report WHERE infoCode = ?", (report["infoCode"],))
        report_id = cursor.fetchone()[0]
        return report_id
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"Database error: {e}")
        return None
    except Exception as e:
        conn.rollback()
        logging.error(f"Unexpected error: {e}")
        return None

# Query industry report rows
def query_industry_reports(querystr='', limit=None):
    try:
        sql = f"SELECT id, infoCode, title, orgSName, industryName, publishDate FROM industry_report WHERE 1=1 {querystr}"
        if limit:
            sql = sql + f' limit {limit}'
        cursor.execute(sql)
        results = cursor.fetchall()
        return results
    except sqlite3.Error as e:
        logging.error(f"Report query failed: {e}")
        return None

# Insert or update a stock_report row
def insert_or_update_stock_report(report):
    try:
        sql = """
            INSERT INTO stock_report (infoCode, title, stockName, stockCode, orgCode, orgName, orgSName,
                                      publishDate, industryCode, industryName, emIndustryCode, emRatingCode,
                                      emRatingValue, emRatingName, attachPages, attachSize, author, authorID, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now', 'localtime'))
            ON CONFLICT(infoCode) DO UPDATE SET
                title=excluded.title,
                stockName=excluded.stockName,
                stockCode=excluded.stockCode,
                orgCode=excluded.orgCode,
                orgName=excluded.orgName,
                orgSName=excluded.orgSName,
                publishDate=excluded.publishDate,
                industryCode=excluded.industryCode,
                industryName=excluded.industryName,
                emIndustryCode=excluded.emIndustryCode,
                emRatingCode=excluded.emRatingCode,
                emRatingValue=excluded.emRatingValue,
                emRatingName=excluded.emRatingName,
                attachPages=excluded.attachPages,
                attachSize=excluded.attachSize,
                author=excluded.author,
                authorID=excluded.authorID,
                updated_at=datetime('now', 'localtime')
        """
        values = (
            report["infoCode"], report["title"], report["stockName"], report["stockCode"],
            report["orgCode"], report["orgName"], report["orgSName"], report["publishDate"],
            report.get("industryCode", ""), report.get("industryName", ""), report.get("emIndustryCode", ""),
            report["emRatingCode"], report["emRatingValue"], report["emRatingName"],
            report.get("attachPages", 0), report.get("attachSize", 0),
            ",".join(report.get("author", [])), ",".join(report.get("authorID", []))
        )
        cursor.execute(sql, values)
        conn.commit()
        # Fetch the report_id of the inserted or updated row
        cursor.execute("SELECT id FROM stock_report WHERE infoCode = ?", (report["infoCode"],))
        report_id = cursor.fetchone()[0]
        return report_id
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"Database error: {e}")
        return None
    except Exception as e:
        conn.rollback()
        logging.error(f"Unexpected error: {e}")
        return None

# Query stock report rows
def query_stock_reports(querystr='', limit=None):
    try:
        sql = f"SELECT id, infoCode, title, orgSName, stockName, publishDate FROM stock_report WHERE 1=1 {querystr}"
        if limit:
            sql = sql + f' limit {limit}'
        cursor.execute(sql)
        results = cursor.fetchall()
        return results
    except sqlite3.Error as e:
        logging.error(f"Report query failed: {e}")
        return None
'''

View File

@@ -1,65 +0,0 @@
import logging
import os
import inspect
from datetime import datetime
from pathlib import Path

# MySQL configuration
db_config = {
    'host': 'testdb',
    'user': 'root',
    'password': 'mysqlpw',
    'database': 'stockdb'
}

log_dir_prefix = '../log'
global_share_data_dir = '/root/sharedata'
global_stock_data_dir = '/root/hostdir/stock_data'

# Directory containing this file
current_dir = os.path.dirname(os.path.abspath(__file__))
# Project root (assumes this file lives under src/strategy)
project_root = os.path.abspath(os.path.join(current_dir, '..', '..'))

# Locate the log directory
def get_log_directory():
    """
    Return the path of the log directory under the project root,
    creating it if it does not exist.
    """
    # Directory containing this file
    current_dir = Path(__file__).resolve().parent
    # Walk up to the project root, assumed to be the parent of the src directory
    project_root = current_dir
    while project_root.name != 'src' and project_root != project_root.parent:
        project_root = project_root.parent
    project_root = project_root.parent  # step up to the project root
    # Make sure the log directory exists
    log_dir = project_root / 'log'
    log_dir.mkdir(parents=True, exist_ok=True)
    return log_dir

def get_caller_filename():
    # Name of the script that called setup_logging
    caller_frame = inspect.stack()[2]
    caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
    return caller_filename

# Configure logging
def setup_logging(log_filename=None):
    # If no log_filename is given, use the calling script's name
    if log_filename is None:
        caller_filename = get_caller_filename()
        common_log_dir = get_log_directory()
        current_date = datetime.now().strftime('%Y%m%d')
        # Build the log file name, inserting the date before the extension
        log_filename = f'{common_log_dir}/{caller_filename}_{current_date}.log'
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
                        handlers=[
                            logging.FileHandler(log_filename),
                            logging.StreamHandler()
                        ])

View File

@@ -9,8 +9,13 @@ db_config = {
     'database': 'stockdb'
 }
-global_share_data_dir = '/root/sharedata'
-global_stock_data_dir = '/root/hostdir/stock_data'
+home_dir = os.path.expanduser("~")
+global_host_data_dir = f'{home_dir}/hostdir/stock_data'
+global_share_db_dir = f'{home_dir}/sharedata/sqlite'
+# Backward-compatible aliases for the old names
+global_stock_data_dir = global_host_data_dir
+global_share_data_dir = f'{home_dir}/sharedata'
 # Directory containing this file
 current_dir = Path(__file__).resolve().parent

View File

@@ -4,8 +4,6 @@ import requests
 import time
 import logging
 from bs4 import BeautifulSoup
-import sqlite_utils as db_tools
-import config
 # Fetch the given page of the per-stock report list
 def fetch_reports_by_stock(page_no, start_date="2023-03-10", end_date="2025-03-10", page_size=50, max_retries = 3):

src/db_utils/reports.py Normal file
View File

@@ -0,0 +1,123 @@
import sqlite3
import logging
import sys
from datetime import datetime

class DatabaseConnectionError(Exception):
    pass

class StockReportDB:
    # Class-level constants (table names)
    TBL_STOCK = 'reports_stock'
    TBL_NEW_STOCK = 'reports_newstrock'
    TBL_STRATEGY = 'reports_strategy'
    TBL_MACRESEARCH = 'reports_macresearch'
    TBL_INDUSTRY = 'reports_industry'

    def __init__(self, db_path):
        self.DB_PATH = db_path
        self.conn = None
        self.cursor = None
        try:
            self.conn = sqlite3.connect(self.DB_PATH)
            self.cursor = self.conn.cursor()
        except sqlite3.Error as e:
            logging.error(f"Database connection failed: {e}")
            raise DatabaseConnectionError("Database connection failed")

    def __get_table_columns_and_defaults(self, tbl_name):
        try:
            self.cursor.execute(f"PRAGMA table_info({tbl_name})")
            columns = self.cursor.fetchall()
            column_info = {}
            for col in columns:
                col_name = col[1]
                default_value = col[4]
                column_info[col_name] = default_value
            return column_info
        except sqlite3.Error as e:
            logging.error(f"Error getting table columns: {e}")
            return None

    def __check_and_process_data(self, data, tbl_name):
        column_info = self.__get_table_columns_and_defaults(tbl_name=tbl_name)
        if column_info is None:
            return None
        processed_data = {}
        for col, default in column_info.items():
            if col == 'id':  # auto-increment primary key; not supplied by the caller
                continue
            if col == 'created_at' or col == 'updated_at':  # set via SQL date functions, not by the caller
                continue
            if col in ['author', 'authorID']:
                values = data.get(col, [])
                processed_data[col] = ','.join(values)
            elif col in data:
                processed_data[col] = data[col]
            else:
                if default is not None:
                    processed_data[col] = default
                else:
                    processed_data[col] = None
        return processed_data

    def insert_or_update_common(self, data, tbl_name, uniq_key='infoCode'):
        try:
            processed_data = self.__check_and_process_data(data, tbl_name)
            if processed_data is None:
                return None
            columns = ', '.join(processed_data.keys())
            values = list(processed_data.values())
            placeholders = ', '.join(['?' for _ in values])
            update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != 'infoCode']) + ', updated_at=datetime(\'now\', \'localtime\')'
            sql = f'''
                INSERT INTO {tbl_name} ({columns}, updated_at)
                VALUES ({placeholders}, datetime('now', 'localtime'))
                ON CONFLICT (infoCode) DO UPDATE SET {update_clause}
            '''
            self.cursor.execute(sql, values)
            self.conn.commit()
            # Fetch the report_id of the inserted or updated row
            self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data["infoCode"],))
            report_id = self.cursor.fetchone()[0]
            return report_id
        except sqlite3.Error as e:
            logging.error(f"Error inserting or updating data: {e}")
            return None

    def query_reports_comm(self, tbl_name, querystr='', limit=None):
        try:
            if tbl_name in [StockReportDB.TBL_STOCK, StockReportDB.TBL_NEW_STOCK, StockReportDB.TBL_INDUSTRY, StockReportDB.TBL_MACRESEARCH, StockReportDB.TBL_STRATEGY]:
                sql = f"SELECT id, infoCode, title, orgSName, industryName, stockName, publishDate FROM {tbl_name} WHERE 1=1 {querystr}"
            else:
                logging.warning(f'wrong table name: {tbl_name}')
                return None
            if limit:
                sql = sql + f' limit {limit}'
            self.cursor.execute(sql)
            results = self.cursor.fetchall()
            # Column names
            column_names = [description[0] for description in self.cursor.description]
            # Convert rows to a list of dicts
            result_dict_list = []
            for row in results:
                row_dict = {column_names[i]: value for i, value in enumerate(row)}
                result_dict_list.append(row_dict)
            return result_dict_list
        except sqlite3.Error as e:
            logging.error(f"Report query failed: {e}")
            return None

    def __del__(self):
        if self.conn:
            self.conn.close()
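
A minimal usage sketch for the new class; the database path and the sample row below are hypothetical, and it assumes the reports_stock table already exists with the expected schema, including a UNIQUE constraint on infoCode (which the ON CONFLICT upsert requires):

# Hypothetical caller; path and row values are illustrative only.
from src.db_utils.reports import StockReportDB, DatabaseConnectionError

try:
    db = StockReportDB("/tmp/stock_report.db")  # hypothetical path
    # Upsert one row; columns missing from the dict fall back to table defaults
    report_id = db.insert_or_update_common(
        {"infoCode": "AP0000000001", "title": "sample report",
         "author": ["author-a"], "authorID": ["0001"]},
        StockReportDB.TBL_STOCK)
    # Read it back as a list of dicts
    rows = db.query_reports_comm(StockReportDB.TBL_STOCK,
                                 querystr=" AND publishDate >= '2025-01-01'",
                                 limit=10)
except DatabaseConnectionError:
    pass  # the constructor has already logged the failure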

View File

@@ -9,34 +9,41 @@ import shutil
 import logging
 from datetime import datetime, timedelta
 from functools import partial
-import config
-import sqlite_utils as db_tools
-import em_reports as em
-import utils
+import src.crawler.em.reports as em
+import src.utils.utils as utils
+from src.config.config import global_host_data_dir, global_share_db_dir
+from src.db_utils.reports import StockReportDB, DatabaseConnectionError
+from src.logger.logger import setup_logging

-config.setup_logging()
+# Initialize logging
+setup_logging()

 debug = False
 force = False
-pdf_base_dir = f"{config.global_host_data_dir}/pdfs"  # directory for downloaded PDFs
+pdf_base_dir = f"{global_host_data_dir}/pdfs"  # directory for downloaded PDFs

+# Links to the report download pages
 map_pdf_page = {
-    utils.tbl_stock : "https://data.eastmoney.com/report/info/{}.html",
-    utils.tbl_new_stock : "https://data.eastmoney.com/report/info/{}.html",
-    utils.tbl_strategy : "https://data.eastmoney.com/report/zw_strategy.jshtml?encodeUrl={}",
-    utils.tbl_macresearch : "https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={}",
-    utils.tbl_industry : "https://data.eastmoney.com/report/zw_industry.jshtml?infocode={}"
+    StockReportDB.TBL_STOCK : "https://data.eastmoney.com/report/info/{}.html",
+    StockReportDB.TBL_NEW_STOCK : "https://data.eastmoney.com/report/info/{}.html",
+    StockReportDB.TBL_STRATEGY : "https://data.eastmoney.com/report/zw_strategy.jshtml?encodeUrl={}",
+    StockReportDB.TBL_MACRESEARCH : "https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={}",
+    StockReportDB.TBL_INDUSTRY : "https://data.eastmoney.com/report/zw_industry.jshtml?infocode={}"
 }
+# Map table names to display names, used for the storage paths
 map_tbl_name = {
-    utils.tbl_stock : '个股研报',
-    utils.tbl_new_stock : '新股研报',
-    utils.tbl_strategy : '策略报告',
-    utils.tbl_macresearch : '宏观研究',
-    utils.tbl_industry : '行业研报'
+    StockReportDB.TBL_STOCK : '个股研报',
+    StockReportDB.TBL_NEW_STOCK : '新股研报',
+    StockReportDB.TBL_STRATEGY : '策略报告',
+    StockReportDB.TBL_MACRESEARCH : '宏观研究',
+    StockReportDB.TBL_INDUSTRY : '行业研报'
 }
+# Database connection, initialized in main()
+db_path = f"{global_share_db_dir}/stock_report.db"
+db_tools = None

 current_date = datetime.now()
 seven_days_ago = current_date - timedelta(days=7)
 two_years_ago = current_date - timedelta(days=2*365)
@@ -45,6 +52,7 @@ start_date = two_years_ago.strftime("%Y-%m-%d")
 end_date = current_date.strftime("%Y-%m-%d")
 this_week_date = seven_days_ago.strftime("%Y-%m-%d")
+
 def fetch_reports_list_general(fetch_func, table_name, s_date, e_date, data_dir_prefix):
     # Fetch up to max_pages pages of the report list
     max_pages = 100000
@@ -148,39 +156,39 @@ def download_pdf_stock_general(parse_func, tbl_name, querystr='', s_date=start_d
 # Fetch the stock report list
 def fetch_reports_list_stock(s_date=start_date, e_date=end_date):
-    return fetch_reports_list_general(em.fetch_reports_by_stock, utils.tbl_stock, s_date, e_date, 'stock')
+    return fetch_reports_list_general(em.fetch_reports_by_stock, StockReportDB.TBL_STOCK, s_date, e_date, 'stock')

 # Fetch the new-stock report list
 def fetch_reports_list_newstock(s_date=start_date, e_date=end_date):
-    return fetch_reports_list_general(em.fetch_reports_by_newstock, utils.tbl_new_stock, s_date, e_date, 'new')
+    return fetch_reports_list_general(em.fetch_reports_by_newstock, StockReportDB.TBL_NEW_STOCK, s_date, e_date, 'new')

 # Fetch the industry report list
 def fetch_reports_list_industry(s_date=start_date, e_date=end_date):
-    return fetch_reports_list_general(em.fetch_reports_by_industry, utils.tbl_industry, s_date, e_date, 'industry')
+    return fetch_reports_list_general(em.fetch_reports_by_industry, StockReportDB.TBL_INDUSTRY, s_date, e_date, 'industry')

 # Fetch the macro research report list
 def fetch_reports_list_macresearch(s_date=start_date, e_date=end_date):
-    return fetch_reports_list_general(em.fetch_reports_by_macresearch, utils.tbl_macresearch, s_date, e_date, 'macresearch')
+    return fetch_reports_list_general(em.fetch_reports_by_macresearch, StockReportDB.TBL_MACRESEARCH, s_date, e_date, 'macresearch')

 # Fetch the strategy report list
 def fetch_reports_list_strategy(s_date=start_date, e_date=end_date):
-    return fetch_reports_list_general(em.fetch_reports_by_strategy, utils.tbl_strategy, s_date, e_date, 'strategy')
+    return fetch_reports_list_general(em.fetch_reports_by_strategy, StockReportDB.TBL_STRATEGY, s_date, e_date, 'strategy')

 # Download the report PDFs for each table
 def download_pdf_stock(s_date=start_date, e_date=end_date):
-    download_pdf_stock_general(parse_func_general, utils.tbl_stock, ' ', s_date, e_date, limit=2 if debug else None)
+    download_pdf_stock_general(parse_func_general, StockReportDB.TBL_STOCK, ' ', s_date, e_date, limit=2 if debug else None)

 def download_pdf_newstock(s_date=start_date, e_date=end_date):
-    download_pdf_stock_general(parse_func_general, utils.tbl_new_stock, ' ', s_date, e_date, limit=2 if debug else None)
+    download_pdf_stock_general(parse_func_general, StockReportDB.TBL_NEW_STOCK, ' ', s_date, e_date, limit=2 if debug else None)

 def download_pdf_industry(s_date=start_date, e_date=end_date):
-    download_pdf_stock_general(parse_func_general, utils.tbl_industry, ' ', s_date, e_date, limit=2 if debug else None)
+    download_pdf_stock_general(parse_func_general, StockReportDB.TBL_INDUSTRY, ' ', s_date, e_date, limit=2 if debug else None)

 def download_pdf_macresearch(s_date=start_date, e_date=end_date):
-    download_pdf_stock_general(parse_func_general, utils.tbl_macresearch, ' ', s_date, e_date, limit=2 if debug else None)
+    download_pdf_stock_general(parse_func_general, StockReportDB.TBL_MACRESEARCH, ' ', s_date, e_date, limit=2 if debug else None)

 def download_pdf_strategy(s_date=start_date, e_date=end_date):
-    download_pdf_stock_general(parse_func_general, utils.tbl_strategy, ' ', s_date, e_date, limit=2 if debug else None)
+    download_pdf_stock_general(parse_func_general, StockReportDB.TBL_STRATEGY, ' ', s_date, e_date, limit=2 if debug else None)

 # Map command abbreviations to functions
@@ -265,6 +273,15 @@ def main(cmd, mode, args_debug, args_force, begin, end):
     global end_date
     end_date = end if end else end_date

+    # Initialize the DB connection
+    global db_tools
+    try:
+        db_tools = StockReportDB(db_path)
+        # database operations can proceed from here
+    except DatabaseConnectionError as e:
+        logging.error(f"Database connection failed: {e}")
+        return False
+
     # Start the task
     #task_id = db_tools.insert_task_log()
     task_id = 0

View File

@@ -1,10 +1,46 @@
 import logging
 import os
 import inspect
+import time
 from datetime import datetime
 from pathlib import Path
+from logging.handlers import RotatingFileHandler
+from collections import defaultdict

 from src.config.config import get_log_directory, get_src_directory

+# Track log frequency per message
+log_count = defaultdict(int)        # number of times each message has been logged
+last_log_time = defaultdict(float)  # timestamp of the last write for each message
+
+class RateLimitFilter(logging.Filter):
+    """
+    Rate-limit filter: within a 60-second window, the same message is
+    written at most LOG_LIMIT times; anything beyond that is dropped.
+    """
+    LOG_LIMIT = 600  # at most 600 identical messages per minute
+
+    def filter(self, record):
+        global log_count, last_log_time
+        message_key = record.getMessage()  # the log message text
+        # Current time and elapsed time since the last write of this message
+        now = time.time()
+        elapsed = now - last_log_time[message_key]
+        # Throttle repeated writes of the same message
+        if elapsed < 60:  # within the 60-second window
+            log_count[message_key] += 1
+            if log_count[message_key] > self.LOG_LIMIT:
+                return False  # drop the record
+        else:
+            log_count[message_key] = 1  # window expired, restart the count
+            last_log_time[message_key] = now
+        return True  # allow the record through
+
 def get_caller_filename():
     # Walk the call stack
     stack = inspect.stack()
@@ -26,7 +62,7 @@ def get_caller_filename():
             return os.path.splitext(os.path.basename(frame_info.filename))[0]
     return None

-# Configure logging
+
 def setup_logging(log_filename=None):
     # If no log_filename is given, use the calling script's name
     if log_filename is None:
@@ -36,8 +72,28 @@ def setup_logging(log_filename=None):
         # Build the log file name, inserting the date before the extension
         log_filename = f'{common_log_dir}/{caller_filename}_{current_date}.log'
-    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
-                        handlers=[
-                            logging.FileHandler(log_filename),
-                            logging.StreamHandler()
-                        ])
+    max_log_size = 100 * 1024 * 1024  # 100 MB
+    max_log_files = 10                # keep at most 10 rotated log files
+
+    file_handler = RotatingFileHandler(log_filename, maxBytes=max_log_size, backupCount=max_log_files)
+    file_handler.setFormatter(logging.Formatter(
+        '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
+    ))
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(logging.Formatter(
+        '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
+    ))
+    # Configure the root logger
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    logger.handlers = []  # avoid adding duplicate handlers
+    logger.addHandler(file_handler)
+    logger.addHandler(console_handler)
+    # Attach the rate limiter
+    rate_limit_filter = RateLimitFilter()
+    file_handler.addFilter(rate_limit_filter)
+    console_handler.addFilter(rate_limit_filter)
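
For reference, a short caller sketch exercising the refactored setup_logging and its rate limiter; this mirrors the demo from the deleted module and assumes src is importable as laid out in this commit (the script itself is hypothetical):

# Hypothetical caller script
import logging
import time
from src.logger.logger import setup_logging

setup_logging()  # log file name is derived from this script's name plus today's date
for i in range(1000):
    # identical messages beyond RateLimitFilter.LOG_LIMIT within 60 s are dropped
    logging.info("test log message for the rate-limit check")
    time.sleep(0.01)  # simulate rapid logging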

View File

@@ -1,6 +1,7 @@
 import pandas as pd
 import numpy as np
 import os
+import warnings
 from src.strategy.prepare import fetch_his_kline
 import src.config.config as config
 import src.crawler.em.stock as em_stock
@@ -15,10 +16,15 @@ def select_stocks(stock_map):
         df = fetch_his_kline(stock_code)
         close_prices = df['close'].values

-        rsi_6 = RSI(close_prices, 6)
-        rsi_12 = RSI(close_prices, 12)
-        rsi_24 = RSI(close_prices, 24)
+        # Capture RuntimeWarning raised during the RSI computation
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always", RuntimeWarning)
+            # Compute RSI over several periods using the MyTT library
+            rsi_6 = RSI(close_prices, 6)
+            rsi_12 = RSI(close_prices, 12)
+            rsi_24 = RSI(close_prices, 24)
+        if w:
+            print(f"Stock {stock_code} {stock_name}: warning while computing RSI: {w[0].message}")

         df['rsi_6'] = rsi_6
         df['rsi_12'] = rsi_12
@@ -96,7 +102,8 @@ if __name__ == "__main__":
     codes = ['105.QFIN', '105.FUTU']

     # Or fetch the list from the network
-    stock_map = em_stock.code_by_fs('hk_famous', em_stock.em_market_fs_types['hk_famous'])
+    plat_id = 'cn_hs300'
+    stock_map = em_stock.code_by_fs(plat_id, em_stock.em_market_fs_types[plat_id])
     if stock_map:
         select_stocks(stock_map)

View File

@@ -5,15 +5,6 @@ import time
 import csv
 import logging
 from datetime import datetime
-import config
-
-tbl_stock = 'reports_stock'
-tbl_new_stock = 'reports_newstrock'
-tbl_strategy = 'reports_strategy'
-tbl_macresearch = 'reports_macresearch'
-tbl_industry = 'reports_industry'
-
-json_data_dir = f'{config.global_host_data_dir}/em_reports/json_data'
 # Save JSON data to a local file
 def save_json_to_file(data, file_path, file_name):