modify scripts

2025-03-16 15:19:52 +08:00
parent e136de53f2
commit dab493f8e7
5 changed files with 80 additions and 7 deletions


@@ -262,7 +262,8 @@ def fetch_performers_detail_once(perfomers_list):
                 else:
                     logging.warning(f'parse_page_performer error. person: ({person}), url: {url}')
             elif status_code and status_code == 404:
-                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
+                performer_id = db_tools.insert_or_update_performer_404(name=person, href=url)
+                logging.warning(f'404 page. id: {performer_id}, name: {person}, url: {url}, Skipping...')
             else:
                 logging.warning(f'fetch_page error. person: ({person}), url: {url}')
             time.sleep(1)
@@ -305,10 +306,11 @@ def fetch_movies_detail():
             logging.info(f'all movies fetched.')
             break
         last_movie_id = 0
+        succ_count = 0
         for movie in movies_list:
             url = movie['href']
             title = movie['title']
-            logging.info(f"Fetching data for movie ({title}), url {url} ...")
+            logging.debug(f"Fetching data for movie ({title}), url {url} ...")
             soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
             if soup:
                 movie_data = scraper.parse_page_movie(soup, url, title)
@@ -322,6 +324,7 @@ def fetch_movies_detail():
                     if movie_id:
                         logging.debug(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
                         last_movie_id = movie_id
+                        succ_count += 1
                     else:
                         logging.warning(f'insert movie {url} failed.')
@@ -330,11 +333,13 @@ def fetch_movies_detail():
                 else:
                     logging.warning(f'parse_page_movie error. url: {url}')
             elif status_code and status_code == 404:
-                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
+                # mark the page as handled so it is not fetched again
+                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url)
+                logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skipping...')
             else:
                 logging.warning(f'fetch_page error. url: {url}')
             time.sleep(1)
-        logging.info(f'insert {len(movies_list)} movies. last movie id: {last_movie_id}')
+        logging.info(f'total request: {len(movies_list)}, succ: {succ_count}. last movie id: {last_movie_id}')
         # break added for debugging
         if debug:
             return True

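The loop above validates each fetched page through `partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class")`. The validator itself is not part of this commit; a minimal sketch of what it might look like, assuming it receives the parsed soup and checks for the element that only appears on a fully rendered detail page:

# Sketch only: the real generic_validator is not shown in this diff.
from functools import partial
from bs4 import BeautifulSoup

def generic_validator(soup: BeautifulSoup, tag: str, identifier: str, attr_type: str) -> bool:
    # True when the expected element is present, e.g. a div with
    # class "col-xs-12 col-sm-3" on a complete movie page
    return soup.find(tag, attrs={attr_type: identifier}) is not None

validator = partial(generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class")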

@@ -52,6 +52,11 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
             response.raise_for_status()  # handle HTTP errors
 
+            # treat expired/outdated pages the same as a 404
+            if "invalid or outdated page" in response.text.lower():
+                logging.warning(f"invalid or outdated page: {url}")
+                return None, 404  # return 404 directly; the caller can skip the page
+
             # preprocess the HTML (if a preprocessor was provided)
             html_text = preprocessor(response.text) if preprocessor else response.text

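Only the outdated-page check is new in this hunk. For context, a minimal sketch of how a fetch_page with this signature is typically structured (retry loop, validation, synthetic 404); everything outside the diffed lines is an assumption, not the project's actual code:

# Sketch under assumptions: requests for HTTP, bs4 for parsing,
# (None, None) after exhausted retries so the caller's else branch fires.
import logging
import time
import requests
from bs4 import BeautifulSoup

def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=30)
            if response.status_code == 404:
                return None, 404
            response.raise_for_status()
            # treat expired/outdated pages the same as a 404 (the new check)
            if "invalid or outdated page" in response.text.lower():
                logging.warning(f"invalid or outdated page: {url}")
                return None, 404
            html_text = preprocessor(response.text) if preprocessor else response.text
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):
                return soup, response.status_code
        except requests.RequestException as e:
            logging.warning(f"fetch attempt {attempt + 1} failed for {url}: {e}")
        time.sleep(2)
    return None, None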

@@ -251,6 +251,39 @@ def insert_or_update_performer(data):
         logging.error(f"unknown error: {e}")
         return None
 
+# Insert or update performer data (handles abnormal URLs, e.g. 404 links)
+def insert_or_update_performer_404(name, href):
+    try:
+        cursor.execute("""
+            INSERT INTO iafd_performers (href, name, is_full_data, updated_at)
+            VALUES (?, ?, 1, datetime('now', 'localtime'))
+            ON CONFLICT(href) DO UPDATE SET
+                name = excluded.name,
+                is_full_data = 1,
+                updated_at = datetime('now', 'localtime')
+        """, (
+            href, name
+        ))
+        conn.commit()
+        # fetch the performer_id
+        performer_id = get_id_by_href('iafd_performers', href)
+        if performer_id is None:
+            return None
+        logging.debug(f'insert one performer, id: {performer_id}, name: {name}, href: {href}')
+        return performer_id
+    except sqlite3.Error as e:
+        conn.rollback()
+        logging.error(f"database error: {e}")
+        return None
+    except Exception as e:
+        conn.rollback()
+        logging.error(f"unknown error: {e}")
+        return None
+
 # delete a performer by id or href
 def delete_performer(identifier):
     try:
@@ -610,6 +643,35 @@ def insert_or_update_movie(movie_data):
         logging.error("Error inserting movie: %s", e)
         return None
 
+# Insert or update movie data (handles abnormal URLs, e.g. 404 links)
+def insert_or_update_movie_404(title, href):
+    try:
+        # insert or update the movie record
+        cursor.execute(
+            """
+            INSERT INTO iafd_movies (title, href, is_full_data, updated_at)
+            VALUES (?, ?, 1, datetime('now', 'localtime'))
+            ON CONFLICT(href) DO UPDATE SET
+                title = excluded.title, is_full_data = 1, updated_at = datetime('now', 'localtime')
+            """,
+            (title, href)
+        )
+        conn.commit()
+        # fetch the inserted movie_id
+        movie_id = get_id_by_href('iafd_movies', href)
+        if movie_id is None:
+            return None
+        return movie_id
+    except Exception as e:
+        conn.rollback()
+        logging.error("Error inserting movie: %s", e)
+        return None
+
 # delete movie data
 def delete_movie(identifier):
     try:

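Both upserts rely on `ON CONFLICT(href)`, which only works if href carries a UNIQUE constraint, and on a `get_id_by_href` helper that is not part of this diff. A sketch of both under those assumptions (all column names other than href are guesses):

# Assumed schema: ON CONFLICT(href) requires a unique index on href.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS iafd_performers (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        href TEXT UNIQUE NOT NULL,
        name TEXT,
        is_full_data INTEGER DEFAULT 0,
        updated_at TEXT
    )
""")

# Sketch of the helper used above, assuming the key column is named id.
def get_id_by_href(table, href):
    cursor.execute(f"SELECT id FROM {table} WHERE href = ?", (href,))
    row = cursor.fetchone()
    return row[0] if row else None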

@@ -6,7 +6,7 @@ from pathlib import Path
 
 # MySQL configuration
 db_config = {
-    'host': '172.18.0.4',
+    'host': 'testdb',
     'user': 'root',
     'password': 'mysqlpw',
     'database': 'stockdb'

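Switching host from a fixed container IP to the name testdb assumes that name resolves where the script runs, e.g. as a Docker Compose service name or network alias. The code that consumes db_config is not shown; a minimal connection sketch assuming pymysql:

import pymysql

# db_config comes from the file above; 'testdb' must resolve on the
# network the script runs in (e.g. a Docker network alias).
conn = pymysql.connect(**db_config)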

@@ -23,7 +23,7 @@ def fetch_with_retries_em(url, params, max_retries=3, delay=2):
             time.sleep(delay)
     return None
 
-def stock_zh_a_spot_em(fs='m:0 t:6,m:0 t:80,m:1 t:2,m:1 t:23,m:0 t:81 s:2048', pz=200) -> pd.DataFrame:
+def stock_zh_a_spot_em(fs='m:0 t:6,m:0 t:80,m:1 t:2,m:1 t:23,m:0 t:81 s:2048', pz=100) -> pd.DataFrame:
     """
     Eastmoney - Shanghai/Shenzhen/Beijing A-shares - real-time quotes
     https://quote.eastmoney.com/center/gridlist.html#hs_a_board
@@ -61,6 +61,7 @@ def stock_zh_a_spot_em(fs='m:0 t:6,m:0 t:80,m:1 t:2,m:1 t:23,m:0 t:81 s:2048', pz=100) -> pd.DataFrame:
         # use total to update pn_max
         if pn == 1:
             pn_max = (data_json["data"].get("total", 0) + pz - 1) // pz
+            print(f'total pages: {pn_max}, total data lines: {data_json["data"].get("total", 0)}, curr lines: {len(diff_data)}, page size: {pz}')
         pn += 1
         time.sleep(0.5)  # avoid requesting too quickly
@@ -440,7 +441,7 @@ def code_id_map_em() -> dict:
         if pn == 1 and "total" in data_json["data"]:
             total = int(data_json["data"]["total"])
             pn_max = (total // pz) + 1  # compute the maximum page count
-            print(f"market {market_id} total records: {total}, pages needed: {pn_max}")
+            print(f"market {market_id} total records: {total}, pages needed: {pn_max}, rows fetched so far: {len(temp_df)}, max rows per page: {pz}")
 
         # group by f13 and store in a dict
         grouped = temp_df.groupby('f13')
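Note that the two hunks above compute page counts differently: stock_zh_a_spot_em uses ceiling division, while code_id_map_em uses (total // pz) + 1, which requests one extra, empty page whenever total is an exact multiple of pz. A small worked example:

total, pz = 200, 100
print((total + pz - 1) // pz)  # 2 -- ceiling division, exact
print((total // pz) + 1)       # 3 -- one empty page when total % pz == 0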