modify scripts
This commit is contained in:
314
src/crawler/em/reports.py
Normal file
314
src/crawler/em/reports.py
Normal file
@ -0,0 +1,314 @@
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
import time
|
||||
import logging
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Fetch one page of the per-stock research-report list.
def fetch_reports_by_stock(page_no, start_date="2023-03-10", end_date="2025-03-10", page_size=50, max_retries=3):
    """Fetch page *page_no* of eastmoney's per-stock research reports.

    POSTs to the report/list2 API and returns the decoded JSON payload
    (a dict), or None after *max_retries* failed attempts.

    :param page_no: 1-based page index to request.
    :param start_date: inclusive begin date, "YYYY-MM-DD".
    :param end_date: inclusive end date, "YYYY-MM-DD".
    :param page_size: number of reports per page.
    :param max_retries: attempts before giving up.
    """
    # Request headers — the Origin/Referer pair is required by the API.
    HEADERS = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Content-Type": "application/json",
        "Origin": "https://data.eastmoney.com",
        "Referer": "https://data.eastmoney.com/report/stock.jshtml",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
    }

    # Request URL
    URL = "https://reportapi.eastmoney.com/report/list2"

    payload = {
        "beginTime": start_date,
        "endTime": end_date,
        "industryCode": "*",
        "ratingChange": None,
        "rating": None,
        "orgCode": None,
        "code": "*",
        "rcode": "",
        "pageSize": page_size,
        # The API is inconsistent about which paging key it honours,
        # so every known variant is sent with the same value.
        "p": page_no,
        "pageNo": page_no,
        "pageNum": page_no,
        "pageNumber": page_no
    }
    logging.debug(f'begin: {start_date}, end: {end_date}')
    for attempt in range(max_retries):
        try:
            response = requests.post(URL, headers=HEADERS, json=payload, timeout=10)
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            # Log and fall through so the loop retries.
            logging.warning(f"network error on {URL}: {e}, Retrying...")
        except ValueError as e:
            # response.json() raises ValueError/JSONDecodeError on a bad body;
            # previously this escaped the retry loop entirely.
            logging.warning(f"json decode error on {URL}: {e}, Retrying...")

    logging.error(f'Fetching failed after max retries. {URL}')
    return None  # still failing after max_retries attempts
|
||||
|
||||
|
||||
# Fetch one page of the industry research-report list.
def fetch_reports_by_industry(page_no, start_date="2023-03-10", end_date="2025-03-10", page_size=50, max_retries=3):
    """Fetch page *page_no* of eastmoney's industry research reports.

    Returns the decoded JSON payload (a dict), or None after
    *max_retries* failed attempts.

    :param page_no: 1-based page index to request.
    :param start_date: inclusive begin date, "YYYY-MM-DD".
    :param end_date: inclusive end date, "YYYY-MM-DD".
    :param page_size: number of reports per page.
    :param max_retries: attempts before giving up.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        "Referer": "https://data.eastmoney.com/report/industry.jshtml"
    }

    url = "https://reportapi.eastmoney.com/report/list"

    params = {
        "cb": "datatable1413600",  # JSONP callback name echoed by the API
        "industryCode": "*",
        "pageSize": page_size,
        "industry": "*",
        "rating": "*",
        "ratingChange": "*",
        "beginTime": start_date,
        "endTime": end_date,
        "pageNo": page_no,
        "fields": "",
        "qType": 1,
        "orgCode": "",
        "rcode": "",
        # Redundant paging keys — the API is inconsistent about which it reads.
        "p": page_no,
        "pageNum": page_no,
        "pageNumber": page_no,
        "_": int(time.time() * 1000)  # cache-busting timestamp
    }

    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, params=params, timeout=10)
            response.raise_for_status()

            # Unwrap the JSONP callback by slicing between the first "(" and
            # the last ")".  BUG FIX: the previous
            # text.strip("datatable1413600(") treated its argument as a
            # character SET and could eat characters of the JSON body itself.
            text = response.text
            json_text = text[text.index("(") + 1:text.rindex(")")]
            return json.loads(json_text)
        except requests.RequestException as e:
            # BUG FIX: previously returned None here, which defeated the
            # retry loop.  Log and let the loop try again.
            logging.warning(f"network error on {url}: {e}, Retrying...")
        except ValueError as e:
            # Covers json.JSONDecodeError and a missing "(" / ")" pair.
            logging.warning(f"json decode error on {url}: {e}, Retrying...")

    logging.error(f'Fetching failed after max retries. {url}')
    return None  # still failing after max_retries attempts
|
||||
|
||||
|
||||
# Fetch one page of the macro research-report list.
def fetch_reports_by_macresearch(page_no, start_date="2023-03-10", end_date="2025-03-10", page_size=50, max_retries=3):
    """Fetch page *page_no* of eastmoney's macro research reports.

    Returns the decoded JSON payload (a dict), or None after
    *max_retries* failed attempts.

    :param page_no: 1-based page index to request.
    :param start_date: inclusive begin date, "YYYY-MM-DD".
    :param end_date: inclusive end date, "YYYY-MM-DD".
    :param page_size: number of reports per page.
    :param max_retries: attempts before giving up.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        "Referer": "https://data.eastmoney.com/report/macresearch.jshtml"
    }

    url = "https://reportapi.eastmoney.com/report/jg"

    params = {
        "cb": "datatable2612129",  # JSONP callback name echoed by the API
        "industryCode": "*",
        "pageSize": page_size,
        "author": "",
        "beginTime": start_date,
        "endTime": end_date,
        "pageNo": page_no,
        "fields": "",
        "qType": 3,  # 3 = macro research on the shared /report/jg endpoint
        "orgCode": "",
        "rcode": "",
        # Redundant paging keys — the API is inconsistent about which it reads.
        "p": page_no,
        "pageNum": page_no,
        "pageNumber": page_no,
        "_": int(time.time() * 1000)  # cache-busting timestamp
    }

    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, params=params, timeout=10)
            response.raise_for_status()

            # Unwrap the JSONP callback by slicing between the first "(" and
            # the last ")".  BUG FIX: str.strip() treated its argument as a
            # character SET and could eat characters of the JSON body itself.
            text = response.text
            json_text = text[text.index("(") + 1:text.rindex(")")]
            return json.loads(json_text)
        except requests.RequestException as e:
            # BUG FIX: previously returned None here, which defeated the
            # retry loop.  Log and let the loop try again.
            logging.warning(f"network error on {url}: {e}, Retrying...")
        except ValueError as e:
            # Covers json.JSONDecodeError and a missing "(" / ")" pair.
            logging.warning(f"json decode error on {url}: {e}, Retrying...")

    logging.error(f'Fetching failed after max retries. {url}')
    return None  # still failing after max_retries attempts
|
||||
|
||||
# Fetch one page of the strategy research-report list.
def fetch_reports_by_strategy(page_no, start_date="2023-03-10", end_date="2025-03-10", page_size=50, max_retries=3):
    """Fetch page *page_no* of eastmoney's strategy research reports.

    Returns the decoded JSON payload (a dict), or None after
    *max_retries* failed attempts.

    :param page_no: 1-based page index to request.
    :param start_date: inclusive begin date, "YYYY-MM-DD".
    :param end_date: inclusive end date, "YYYY-MM-DD".
    :param page_size: number of reports per page.
    :param max_retries: attempts before giving up.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        "Referer": "https://data.eastmoney.com/report/strategyreport.jshtml"
    }

    url = "https://reportapi.eastmoney.com/report/jg"

    params = {
        "cb": "datatable5349866",  # JSONP callback name echoed by the API
        "industryCode": "*",
        "pageSize": page_size,
        "author": "",
        "beginTime": start_date,
        "endTime": end_date,
        "pageNo": page_no,
        "fields": "",
        "qType": 2,  # 2 = strategy reports on the shared /report/jg endpoint
        "orgCode": "",
        "rcode": "",
        # Redundant paging keys — the API is inconsistent about which it reads.
        "p": page_no,
        "pageNum": page_no,
        "pageNumber": page_no,
        "_": int(time.time() * 1000)  # cache-busting timestamp
    }

    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, params=params, timeout=10)
            response.raise_for_status()

            # Unwrap the JSONP callback by slicing between the first "(" and
            # the last ")".  BUG FIX: str.strip() treated its argument as a
            # character SET and could eat characters of the JSON body itself.
            text = response.text
            json_text = text[text.index("(") + 1:text.rindex(")")]
            return json.loads(json_text)
        except requests.RequestException as e:
            # BUG FIX: previously returned None here, which defeated the
            # retry loop.  Log and let the loop try again.
            logging.warning(f"network error on {url}: {e}, Retrying...")
        except ValueError as e:
            # Covers json.JSONDecodeError and a missing "(" / ")" pair.
            logging.warning(f"json decode error on {url}: {e}, Retrying...")

    logging.error(f'Fetching failed after max retries. {url}')
    return None  # still failing after max_retries attempts
|
||||
|
||||
|
||||
# Fetch one page of the new-stock (IPO) research-report list.
def fetch_reports_by_newstock(page_no, start_date="2023-03-10", end_date="2025-03-10", page_size=50, max_retries=3):
    """Fetch page *page_no* of eastmoney's new-stock research reports.

    Returns the decoded JSON payload (a dict), or None after
    *max_retries* failed attempts.

    :param page_no: 1-based page index to request.
    :param start_date: inclusive begin date, "YYYY-MM-DD".
    :param end_date: inclusive end date, "YYYY-MM-DD".
    :param page_size: number of reports per page.
    :param max_retries: attempts before giving up.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        "Referer": "https://data.eastmoney.com/report/newstock.jshtml"
    }

    url = "https://reportapi.eastmoney.com/report/newStockList"

    params = {
        "cb": "datatable5144183",  # JSONP callback name echoed by the API
        "pageSize": page_size,
        "author": "",
        "beginTime": start_date,
        "endTime": end_date,
        "pageNo": page_no,
        "fields": "",
        "qType": 4,
        "orgCode": "",
        "rcode": "",
        # Redundant paging keys — the API is inconsistent about which it reads.
        "p": page_no,
        "pageNum": page_no,
        "pageNumber": page_no,
        "_": int(time.time() * 1000)  # cache-busting timestamp
    }

    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, params=params, timeout=10)
            response.raise_for_status()

            # Unwrap the JSONP callback by slicing between the first "(" and
            # the last ")".  BUG FIX: str.strip() treated its argument as a
            # character SET and could eat characters of the JSON body itself.
            text = response.text
            json_text = text[text.index("(") + 1:text.rindex(")")]
            return json.loads(json_text)
        except requests.RequestException as e:
            # BUG FIX: previously returned None here, which defeated the
            # retry loop.  Log and let the loop try again.
            logging.warning(f"network error on {url}: {e}, Retrying...")
        except ValueError as e:
            # Covers json.JSONDecodeError and a missing "(" / ")" pair.
            logging.warning(f"json decode error on {url}: {e}, Retrying...")

    logging.error(f'Fetching failed after max retries. {url}')
    return None  # still failing after max_retries attempts
|
||||
|
||||
|
||||
# Visit the report detail page at *url* and extract the PDF download link.
def fetch_pdf_link(url, max_retries=3):
    """Return the href of the first ``<a class="pdf-link">`` on *url*.

    Retries the HTTP request up to *max_retries* times on network errors.
    Returns None when all attempts fail or the page has no such link.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
    }

    for _ in range(max_retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()

            # Parse the HTML and look for the download anchor.
            document = BeautifulSoup(resp.text, "html.parser")
            anchor = document.find("a", class_="pdf-link")

            if anchor is None or "href" not in anchor.attrs:
                logging.warning(f"未找到 PDF 链接: {url}")
                return None
            return anchor["href"]

        except requests.RequestException as exc:
            logging.error(f"请求失败: {url} {exc}")
    logging.error(f'Fetching failed after max retries. {url}')
    return None  # still failing after max_retries attempts
|
||||
|
||||
|
||||
def is_valid_pdf(file_path):
    """Return True if the file at *file_path* starts with the %PDF magic bytes.

    Any failure to open or read the file (missing file, permissions, ...)
    is logged and reported as False rather than raised.
    """
    try:
        with open(file_path, "rb") as handle:
            magic = handle.read(4)
    except Exception as exc:
        logging.error(f"验证 PDF 失败: {exc}")
        return False
    return magic == b"%PDF"
|
||||
|
||||
def download_pdf_wget(pdf_url, save_path):
    """Download *pdf_url* to *save_path* with the external ``wget`` tool.

    Returns True only when wget exits cleanly, the file exists, and it
    starts with the %PDF magic bytes.

    SECURITY FIX: the previous implementation built a shell string with
    os.system, so metacharacters in the URL or path could inject shell
    commands.  subprocess.run with an argument list never invokes a shell.
    """
    result = subprocess.run(
        ["wget", "-O", save_path, pdf_url, "--quiet",
         "--user-agent=Mozilla/5.0"],
        check=False,  # a failed download is reported via the return value
    )
    return (
        result.returncode == 0
        and os.path.exists(save_path)
        and is_valid_pdf(save_path)
    )
|
||||
|
||||
|
||||
# Download a PDF and save it locally.
def download_pdf(pdf_url, save_path, max_retries=5):
    """Download *pdf_url* to *save_path*; return True on success.

    Tries wget up to *max_retries* times, then falls back to a streaming
    requests download.  BUG FIX: the requests fallback below used to sit
    after an unconditional ``return False`` and was unreachable dead code.

    :param pdf_url: direct URL of the PDF file.
    :param save_path: local filesystem path to write the file to.
    :param max_retries: number of wget attempts before the fallback.
    """
    for attempt in range(max_retries):
        if download_pdf_wget(pdf_url, save_path):
            return True

    # Fallback: stream the file with requests.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
    }

    try:
        response = requests.get(pdf_url, headers=headers, stream=True, timeout=20)
        response.raise_for_status()

        with open(save_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=1024):
                file.write(chunk)

        # Validate the payload the same way the wget path does.
        return is_valid_pdf(save_path)

    except requests.RequestException as e:
        logging.error(f"PDF 下载失败: {e}")
        return False
|
||||
|
||||
Reference in New Issue
Block a user