modify scripts

2025-03-15 08:02:25 +08:00
parent af92229a3e
commit 3b76c00500
4 changed files with 89 additions and 49 deletions

commit.sh Executable file

@@ -0,0 +1,45 @@
#!/bin/bash
# Make sure the script is executable (only needs to be done once)
# chmod +x commit.sh
# Check that we are inside a Git repository
if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
    echo "❌ The current directory is not a Git repository; run git init first"
    exit 1
fi
# Get the commit message
commit_msg="$1"
# If no commit message was provided, prompt the user for one
if [ -z "$commit_msg" ]; then
    read -p "Enter commit message: " commit_msg
    if [ -z "$commit_msg" ]; then
        echo "❌ Commit message cannot be empty!"
        exit 1
    fi
fi
# Stage all changes
git add .
if [ $? -ne 0 ]; then
    echo "❌ git add failed!"
    exit 1
fi
# Commit the changes
git commit -m "$commit_msg"
if [ $? -ne 0 ]; then
    echo "❌ git commit failed!"
    exit 1
fi
# Push to the remote repository
git push -u origin master
if [ $? -ne 0 ]; then
    echo "❌ git push failed! Check the remote repository settings."
    exit 1
fi
echo "✅ Code committed successfully!"


@@ -233,6 +233,7 @@ def fetch_movies_by_stu():
# Update performer info, single pass over the list
def fetch_performers_detail_once(perfomers_list):
last_performer_id = 0
for performer in perfomers_list:
url = performer['href']
person = performer['name']
@@ -247,7 +248,8 @@ def fetch_performers_detail_once(perfomers_list):
**data
})
if performer_id:
logging.info(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
last_performer_id = performer_id
else:
logging.warning(f'insert person: ({person}) {url} failed.')
@@ -264,10 +266,11 @@ def fetch_performers_detail_once(perfomers_list):
else:
logging.warning(f'fetch_page error. person: ({person}), url: {url}')
time.sleep(1)
return last_performer_id
# Update performer info
def fetch_performers_detail():
limit_count = 5 if debug else 1000
limit_count = 5 if debug else 100
perfomers_list = []
# Get the list of new performers
@@ -276,7 +279,8 @@ def fetch_performers_detail():
if len(perfomers_list) < 1:
logging.info(f'all new performers fetched. ')
break
fetch_performers_detail_once(perfomers_list)
last_perfomer_id = fetch_performers_detail_once(perfomers_list)
logging.info(f'insert {len(perfomers_list)} person. last performer id: {last_perfomer_id}')
if debug:
break
@@ -286,19 +290,21 @@ def fetch_performers_detail():
if len(perfomers_list) < 1:
logging.info(f'all existed performers updated. ')
break
fetch_performers_detail_once(perfomers_list)
last_perfomer_id = fetch_performers_detail_once(perfomers_list)
logging.info(f'insert {len(perfomers_list)} person. last performer id: {last_perfomer_id}')
if debug:
break
# Update movie info
def fetch_movies_detail():
limit_count = 10 if debug else 1000
limit_count = 10 if debug else 100
movies_list = []
while True:
movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
if len(movies_list) < 1:
logging.info(f'all movies fetched.')
break
last_movie_id = 0
for movie in movies_list:
url = movie['href']
title = movie['title']
@@ -314,7 +320,8 @@ def fetch_movies_detail():
movie_data['StudioHref'] = utils.dist_stu_href_rewrite(movie_data['StudioHref'].lower())
movie_id = db_tools.insert_or_update_movie(movie_data)
if movie_id:
logging.info(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
logging.debug(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
last_movie_id = movie_id
else:
logging.warning(f'insert movie {url} failed.')
@@ -327,6 +334,7 @@ def fetch_movies_detail():
else:
logging.warning(f'fetch_page error. url: {url}')
time.sleep(1)
logging.info(f'insert {len(movies_list)} movies. last movie id: {last_movie_id}')
# Break out early when debugging
if debug:
return True


@@ -29,12 +29,12 @@ map_pdf_page = {
utils.tbl_industry : "https://data.eastmoney.com/report/zw_industry.jshtml?infocode={}"
}
map_pdf_path = {
utils.tbl_stock : f'{pdf_base_dir}/stock',
utils.tbl_new_stock : f'{pdf_base_dir}/newstock',
utils.tbl_strategy : f'{pdf_base_dir}/strategy',
utils.tbl_macresearch : f'{pdf_base_dir}/macresearch',
utils.tbl_industry : f'{pdf_base_dir}/industry'
map_tbl_name = {
utils.tbl_stock : '个股研报',
utils.tbl_new_stock : '新股研报',
utils.tbl_strategy : '策略报告',
utils.tbl_macresearch : '宏观研究',
utils.tbl_industry : '行业研报'
}
current_date = datetime.now()
@@ -76,7 +76,7 @@ def fetch_reports_list_general(fetch_func, table_name, s_date, e_date, data_dir_
# URL used for stock reports
def parse_func_stock(row, tbl_name):
def parse_func_general(row, tbl_name):
info_code = row['infoCode']
title = row['title'].replace("/", "_").replace("\\", "_")
org_sname = row['orgSName']
@@ -84,34 +84,28 @@ def parse_func_stock(row, tbl_name):
industry_name = row['industryName']
publish_date = row['publishDate'].split(" ")[0]
file_name = f"{publish_date}_{org_sname}_{stock_name}_{title}.pdf"
# The default values set when the table was created are a bit off
if stock_name == '' or stock_name=="''":
stock_name = 'None'
if industry_name == '':
industry_name = 'None'
if org_sname == '':
org_sname = 'None'
report_type = map_tbl_name.get(tbl_name, 'None')
file_name = f"{publish_date}_{report_type}_{org_sname}_{industry_name}_{stock_name}_{title}.pdf"
url = map_pdf_page.get(tbl_name, None)
if url is None:
logging.warning(f'wrong table name: {tbl_name}')
return None, None, None
return None, None
url = url.format(info_code)
os.makedirs(map_pdf_path[tbl_name], exist_ok=True)
return url, os.path.join(map_pdf_path[tbl_name], file_name), None
# Build the directory path
dir_year = publish_date[:4] if len(publish_date)>4 else ''
dir_path = f'{pdf_base_dir}/{dir_year}/{map_tbl_name[tbl_name]}'
os.makedirs(dir_path, exist_ok=True)
return url, os.path.join(dir_path, file_name)
# URL used for the other report types
def parse_func_other(row, tbl_name):
info_code = row['infoCode']
title = row['title'].replace("/", "_").replace("\\", "_")
org_sname = row['orgSName']
industry_name = row['industryName']
publish_date = row['publishDate'].split(" ")[0]
file_name = f"{publish_date}_{org_sname}_{industry_name}_{title}.pdf"
old_file_name = f"{publish_date}_{industry_name}_{org_sname}_{title}.pdf"
url = map_pdf_page.get(tbl_name, None)
if url is None:
logging.warning(f'wrong table name: {tbl_name}')
return None, None, None
url = url.format(info_code)
os.makedirs(map_pdf_path[tbl_name], exist_ok=True)
return url, os.path.join(map_pdf_path[tbl_name], file_name), os.path.join(map_pdf_path[tbl_name], old_file_name)
# Generic download function
def download_pdf_stock_general(parse_func, tbl_name, querystr='', s_date=start_date, e_date=end_date, limit=None):
@@ -126,7 +120,7 @@ def download_pdf_stock_general(parse_func, tbl_name, querystr='', s_date=start_d
rows = []
for row in rows:
url, file_path, old_file_path = parse_func(row, tbl_name)
url, file_path = parse_func(row, tbl_name)
if url is None or file_path is None:
logging.warning(f'wrong url or file_path. tbl_name: {tbl_name}')
continue
@@ -134,11 +128,6 @@ def download_pdf_stock_general(parse_func, tbl_name, querystr='', s_date=start_d
if file_path and os.path.isfile(file_path):
logging.info(f'{file_path} already exists. skipping...')
continue
# Rename files that used the old naming scheme
if old_file_path and os.path.isfile(old_file_path):
shutil.move(old_file_path, file_path)
logging.info(f'rename existed file to {file_path}')
continue
# Get the pdf link address
if url:
pdf_url = em.fetch_pdf_link(url)
@@ -175,19 +164,19 @@ def fetch_reports_list_strategy(s_date=start_date, e_date=end_date):
# Download stock pdfs
def download_pdf_stock(s_date=start_date, e_date=end_date):
download_pdf_stock_general(parse_func_stock, utils.tbl_stock, ' AND attachPages>=30', s_date, e_date, limit=2 if debug else None)
download_pdf_stock_general(parse_func_general, utils.tbl_stock, ' ', s_date, e_date, limit=2 if debug else None)
def download_pdf_newstock(s_date=start_date, e_date=end_date):
download_pdf_stock_general(parse_func_stock, utils.tbl_new_stock, ' AND attachPages>=30', s_date, e_date, limit=2 if debug else None)
download_pdf_stock_general(parse_func_general, utils.tbl_new_stock, ' ', s_date, e_date, limit=2 if debug else None)
def download_pdf_industry(s_date=start_date, e_date=end_date):
download_pdf_stock_general(parse_func_other, utils.tbl_industry, ' AND attachPages>=30', s_date, e_date, limit=2 if debug else None)
download_pdf_stock_general(parse_func_general, utils.tbl_industry, ' ', s_date, e_date, limit=2 if debug else None)
def download_pdf_macresearch(s_date=start_date, e_date=end_date):
download_pdf_stock_general(parse_func_other, utils.tbl_macresearch, ' ', s_date, e_date, limit=2 if debug else None)
download_pdf_stock_general(parse_func_general, utils.tbl_macresearch, ' ', s_date, e_date, limit=2 if debug else None)
def download_pdf_strategy(s_date=start_date, e_date=end_date):
download_pdf_stock_general(parse_func_other, utils.tbl_strategy, ' ', s_date, e_date, limit=2 if debug else None)
download_pdf_stock_general(parse_func_general, utils.tbl_strategy, ' ', s_date, e_date, limit=2 if debug else None)
# Build a mapping from abbreviations to functions


@@ -82,10 +82,8 @@ def insert_or_update_common(data, tbl_name, uniq_key='infoCode'):
# Query data
def query_reports_comm(tbl_name, querystr='', limit=None ):
try:
if tbl_name in [utils.tbl_stock, utils.tbl_new_stock] :
if tbl_name in [utils.tbl_stock, utils.tbl_new_stock, utils.tbl_industry, utils.tbl_macresearch, utils.tbl_strategy] :
sql = f"SELECT id, infoCode, title, orgSName, industryName, stockName, publishDate FROM {tbl_name} WHERE 1=1 {querystr}"
elif tbl_name in [utils.tbl_industry, utils.tbl_macresearch, utils.tbl_strategy] :
sql = f"SELECT id, infoCode, title, orgSName, industryName, publishDate FROM {tbl_name} WHERE 1=1 {querystr}"
else:
logging.warning(f'wrong table name: {tbl_name}')
return None