import os
import json
import requests
import time
import logging
from bs4 import BeautifulSoup
import sqlite_utils as db_tools
import config


# Fetch one page of the per-stock research report list
def fetch_reports_by_stock(page_no, start_date="2023-03-10", end_date="2025-03-10",
                           page_size=50, max_retries=3):
    # Request headers
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Content-Type": "application/json",
        "Origin": "https://data.eastmoney.com",
        "Referer": "https://data.eastmoney.com/report/stock.jshtml",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
    }
    # Request URL
    url = "https://reportapi.eastmoney.com/report/list2"
    payload = {
        "beginTime": start_date,
        "endTime": end_date,
        "industryCode": "*",
        "ratingChange": None,
        "rating": None,
        "orgCode": None,
        "code": "*",
        "rcode": "",
        "pageSize": page_size,
        # The API is inconsistent about its paging parameter name, so send all known aliases
        "p": page_no,
        "pageNo": page_no,
        "pageNum": page_no,
        "pageNumber": page_no,
    }
    logging.debug(f'begin: {start_date}, end: {end_date}')
    for attempt in range(max_retries):
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=10)
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            logging.warning(f"network error on {url}: {e}, retrying...")
            time.sleep(1)  # Brief backoff before the next attempt
    logging.error(f'Fetching failed after max retries. {url}')
    return None  # Still failing after max retries


# Fetch one page of the industry research report list
def fetch_reports_by_industry(page_no, start_date="2023-03-10", end_date="2025-03-10",
                              page_size=50, max_retries=3):
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        "Referer": "https://data.eastmoney.com/report/industry.jshtml"
    }
    url = "https://reportapi.eastmoney.com/report/list"
    params = {
        "cb": "datatable1413600",
        "industryCode": "*",
        "pageSize": page_size,
        "industry": "*",
        "rating": "*",
        "ratingChange": "*",
        "beginTime": start_date,
        "endTime": end_date,
        "pageNo": page_no,
        "fields": "",
        "qType": 1,
        "orgCode": "",
        "rcode": "",
        "p": page_no,
        "pageNum": page_no,
        "pageNumber": page_no,
        "_": int(time.time() * 1000)  # Cache-busting timestamp
    }
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, params=params, timeout=10)
            response.raise_for_status()
            # Strip the JSONP callback wrapper. Note that str.strip() removes a
            # character *set*, not a prefix, so slice between the outer parentheses instead.
            text = response.text
            json_text = text[text.find("(") + 1 : text.rfind(")")]
            return json.loads(json_text)
        except requests.RequestException as e:
            logging.warning(f"network error on {url}: {e}, retrying...")
            time.sleep(1)
        except json.JSONDecodeError as e:
            logging.warning(f"json decode error on {url}: {e}, retrying...")
            time.sleep(1)
    logging.error(f'Fetching failed after max retries. {url}')
    return None  # Still failing after max retries
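
# A minimal paging sketch over fetch_reports_by_stock. It assumes the JSON
# payload carries a "TotalPage" field alongside "data", which is how the
# eastmoney list endpoints usually shape their responses; verify against a
# live response before relying on it.
def iter_stock_reports(start_date="2023-03-10", end_date="2025-03-10"):
    page_no = 1
    while True:
        data = fetch_reports_by_stock(page_no, start_date, end_date)
        if not data or not data.get("data"):
            break
        yield from data["data"]
        # "TotalPage" is an assumed field name (see note above)
        if page_no >= int(data.get("TotalPage", page_no)):
            break
        page_no += 1
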

# Fetch one page of the macro research report list
def fetch_reports_by_macresearch(page_no, start_date="2023-03-10", end_date="2025-03-10",
                                 page_size=50, max_retries=3):
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        "Referer": "https://data.eastmoney.com/report/macresearch.jshtml"
    }
    url = "https://reportapi.eastmoney.com/report/jg"
    params = {
        "cb": "datatable2612129",
        "industryCode": "*",
        "pageSize": page_size,
        "author": "",
        "beginTime": start_date,
        "endTime": end_date,
        "pageNo": page_no,
        "fields": "",
        "qType": 3,
        "orgCode": "",
        "rcode": "",
        "p": page_no,
        "pageNum": page_no,
        "pageNumber": page_no,
        "_": int(time.time() * 1000)  # Cache-busting timestamp
    }
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, params=params, timeout=10)
            response.raise_for_status()
            # Strip the JSONP callback wrapper by slicing between the outer parentheses
            text = response.text
            json_text = text[text.find("(") + 1 : text.rfind(")")]
            return json.loads(json_text)
        except requests.RequestException as e:
            logging.warning(f"network error on {url}: {e}, retrying...")
            time.sleep(1)
        except json.JSONDecodeError as e:
            logging.warning(f"json decode error on {url}: {e}, retrying...")
            time.sleep(1)
    logging.error(f'Fetching failed after max retries. {url}')
    return None  # Still failing after max retries


# Fetch one page of the strategy research report list
def fetch_reports_by_strategy(page_no, start_date="2023-03-10", end_date="2025-03-10",
                              page_size=50, max_retries=3):
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        "Referer": "https://data.eastmoney.com/report/strategyreport.jshtml"
    }
    url = "https://reportapi.eastmoney.com/report/jg"
    params = {
        "cb": "datatable5349866",
        "industryCode": "*",
        "pageSize": page_size,
        "author": "",
        "beginTime": start_date,
        "endTime": end_date,
        "pageNo": page_no,
        "fields": "",
        "qType": 2,
        "orgCode": "",
        "rcode": "",
        "p": page_no,
        "pageNum": page_no,
        "pageNumber": page_no,
        "_": int(time.time() * 1000)  # Cache-busting timestamp
    }
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, params=params, timeout=10)
            response.raise_for_status()
            # Strip the JSONP callback wrapper by slicing between the outer parentheses
            text = response.text
            json_text = text[text.find("(") + 1 : text.rfind(")")]
            return json.loads(json_text)
        except requests.RequestException as e:
            logging.warning(f"network error on {url}: {e}, retrying...")
            time.sleep(1)
        except json.JSONDecodeError as e:
            logging.warning(f"json decode error on {url}: {e}, retrying...")
            time.sleep(1)
    logging.error(f'Fetching failed after max retries. {url}')
    return None  # Still failing after max retries
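
# Sketch: persisting fetched rows with the sqlite_utils import above. The
# table name, database path, and the "infoCode" primary key are assumptions
# made for illustration, not fields confirmed by this module.
def save_reports(rows, db_path="reports.db"):
    database = db_tools.Database(db_path)
    # upsert_all deduplicates on the primary key across repeated crawls
    database["reports"].upsert_all(rows, pk="infoCode")
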

# Fetch one page of the new-stock (IPO) research report list
def fetch_reports_by_newstock(page_no, start_date="2023-03-10", end_date="2025-03-10",
                              page_size=50, max_retries=3):
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        "Referer": "https://data.eastmoney.com/report/newstock.jshtml"
    }
    url = "https://reportapi.eastmoney.com/report/newStockList"
    params = {
        "cb": "datatable5144183",
        "pageSize": page_size,
        "author": "",
        "beginTime": start_date,
        "endTime": end_date,
        "pageNo": page_no,
        "fields": "",
        "qType": 4,
        "orgCode": "",
        "rcode": "",
        "p": page_no,
        "pageNum": page_no,
        "pageNumber": page_no,
        "_": int(time.time() * 1000)  # Cache-busting timestamp
    }
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, params=params, timeout=10)
            response.raise_for_status()
            # Strip the JSONP callback wrapper by slicing between the outer parentheses
            text = response.text
            json_text = text[text.find("(") + 1 : text.rfind(")")]
            return json.loads(json_text)
        except requests.RequestException as e:
            logging.warning(f"network error on {url}: {e}, retrying...")
            time.sleep(1)
        except json.JSONDecodeError as e:
            logging.warning(f"json decode error on {url}: {e}, retrying...")
            time.sleep(1)
    logging.error(f'Fetching failed after max retries. {url}')
    return None  # Still failing after max retries


# Visit the detail page for a given infoCode and extract the PDF download link
def fetch_pdf_link(url, max_retries=3):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
    }
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            # Parse the HTML and look for the PDF anchor
            soup = BeautifulSoup(response.text, "html.parser")
            pdf_link = soup.find("a", class_="pdf-link")
            if pdf_link and "href" in pdf_link.attrs:
                return pdf_link["href"]
            # A missing link is not a transient failure, so don't retry
            logging.warning(f"No PDF link found: {url}")
            return None
        except requests.RequestException as e:
            logging.warning(f"Request failed: {url} {e}, retrying...")
            time.sleep(1)
    logging.error(f'Fetching failed after max retries. {url}')
    return None  # Still failing after max retries


# Download a PDF and save it locally
def download_pdf(pdf_url, save_path):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
    }
    try:
        response = requests.get(pdf_url, headers=headers, stream=True, timeout=20)
        response.raise_for_status()
        with open(save_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=1024):
                file.write(chunk)
        return True
    except requests.RequestException as e:
        logging.error(f"PDF download failed: {e}")
        return False
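
# End-to-end usage sketch: fetch one page of stock reports, build the detail
# URL, resolve the PDF link, and download it. The detail-page URL pattern and
# the "infoCode" field name are assumptions based on eastmoney's public pages;
# confirm them against a live response before use.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    page = fetch_reports_by_stock(1)
    if page and page.get("data"):
        report = page["data"][0]
        # Assumed URL pattern for the report detail page
        detail_url = f"https://data.eastmoney.com/report/info/{report['infoCode']}.html"
        pdf_url = fetch_pdf_link(detail_url)
        if pdf_url and download_pdf(pdf_url, "report.pdf"):
            logging.info("saved report.pdf")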