From dab493f8e798a3507f030ce59c839ddd5c72fb99 Mon Sep 17 00:00:00 2001
From: oscar
Date: Sun, 16 Mar 2025 15:19:52 +0800
Subject: [PATCH] iafd: handle 404 and outdated pages; stockapp: tune paging
 and fetch logging

---
 scripts/iafd/src/fetch.py              | 13 ++++--
 scripts/iafd/src/iafd_scraper.py       |  5 +++
 scripts/iafd/src/sqlite_utils.py       | 63 ++++++++++++++++++++++++++++++++
 stockapp/src/config.py                 |  2 +-
 stockapp/src/crawling/stock_hist_em.py |  5 ++-
 5 files changed, 81 insertions(+), 7 deletions(-)

diff --git a/scripts/iafd/src/fetch.py b/scripts/iafd/src/fetch.py
index 227af58..e02eb8d 100644
--- a/scripts/iafd/src/fetch.py
+++ b/scripts/iafd/src/fetch.py
@@ -262,7 +262,8 @@ def fetch_performers_detail_once(perfomers_list):
             else:
                 logging.warning(f'parse_page_performer error. person: ({person}), url: {url}')
         elif status_code and status_code == 404:
-            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
+            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url)
+            logging.warning(f'404 page. id: {performer_id}, name: {person}, url: {url}, Skipping...')
         else:
             logging.warning(f'fetch_page error. person: ({person}), url: {url}')
         time.sleep(1)
@@ -305,10 +306,11 @@ def fetch_movies_detail():
             logging.info(f'all movies fetched.')
             break
         last_movie_id = 0
+        succ_count = 0
         for movie in movies_list:
             url = movie['href']
             title = movie['title']
-            logging.info(f"Fetching data for movie ({title}), url {url} ...")
+            logging.debug(f"Fetching data for movie ({title}), url {url} ...")
             soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
             if soup:
                 movie_data = scraper.parse_page_movie(soup, url, title)
@@ -322,6 +324,7 @@ def fetch_movies_detail():
                 if movie_id:
                     logging.debug(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
                     last_movie_id = movie_id
+                    succ_count += 1
                 else:
                     logging.warning(f'insert movie {url} failed.')
 
@@ -330,11 +333,13 @@ def fetch_movies_detail():
                 else:
                     logging.warning(f'parse_page_movie error. url: {url}')
             elif status_code and status_code == 404:
-                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
+                # Mark as handled
+                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url)
+                logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skipping...')
             else:
                 logging.warning(f'fetch_page error. url: {url}')
             time.sleep(1)
-        logging.info(f'insert {len(movies_list)} movies. last movie id: {last_movie_id}')
+        logging.info(f'total requests: {len(movies_list)}, succeeded: {succ_count}, last movie id: {last_movie_id}')
         # Break early when debugging
         if debug:
             return True
diff --git a/scripts/iafd/src/iafd_scraper.py b/scripts/iafd/src/iafd_scraper.py
index b4a2de8..823a2b9 100644
--- a/scripts/iafd/src/iafd_scraper.py
+++ b/scripts/iafd/src/iafd_scraper.py
@@ -52,6 +52,11 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
 
             response.raise_for_status()  # Handle HTTP errors
 
+            # An outdated page is handled the same way as a 404
+            if "invalid or outdated page" in response.text.lower():
+                logging.warning(f"invalid or outdated page: {url}")
+                return None, 404  # Return 404 directly so the caller can skip it
+
             # Preprocess the HTML (if a preprocessor is provided)
             html_text = preprocessor(response.text) if preprocessor else response.text
 
diff --git a/scripts/iafd/src/sqlite_utils.py b/scripts/iafd/src/sqlite_utils.py
index e2288f0..912f7a6 100644
--- a/scripts/iafd/src/sqlite_utils.py
+++ b/scripts/iafd/src/sqlite_utils.py
@@ -251,6 +251,40 @@ def insert_or_update_performer(data):
         logging.error(f"Unknown error: {e}")
         return None
 
+
+# Insert or update performer data (handles bad URLs such as 404 links)
+def insert_or_update_performer_404(name, href):
+    try:
+        cursor.execute("""
+            INSERT INTO iafd_performers (href, name, is_full_data, updated_at)
+            VALUES (?, ?, 1, datetime('now', 'localtime'))
+            ON CONFLICT(href) DO UPDATE SET
+                name = excluded.name,
+                is_full_data = 1,
+                updated_at = datetime('now', 'localtime')
+        """, (
+            href, name
+        ))
+        conn.commit()  # Persist the row; mirrors insert_or_update_movie_404 below
+
+        # Get the performer_id
+        performer_id = get_id_by_href('iafd_performers', href)
+        if performer_id is None:
+            return None
+        logging.debug(f'insert one performer, id: {performer_id}, name: {name}, href: {href}')
+
+        return performer_id
+
+    except sqlite3.Error as e:
+        conn.rollback()
+        logging.error(f"Database error: {e}")
+        return None
+    except Exception as e:
+        conn.rollback()
+        logging.error(f"Unknown error: {e}")
+        return None
+
+
 # Delete a performer by id or href
 def delete_performer(identifier):
     try:
@@ -610,6 +644,35 @@ def insert_or_update_movie(movie_data):
         logging.error("Error inserting movie: %s", e)
         return None
 
+
+# Insert or update movie data (handles bad URLs such as 404 links)
+def insert_or_update_movie_404(title, href):
+    try:
+        # Insert or update the movie record
+        cursor.execute(
+            """
+            INSERT INTO iafd_movies (title, href, is_full_data, updated_at)
+            VALUES (?, ?, 1, datetime('now', 'localtime'))
+            ON CONFLICT(href) DO UPDATE SET
+                title = excluded.title, is_full_data = 1, updated_at = datetime('now', 'localtime')
+            """,
+            (title, href)
+        )
+        conn.commit()
+
+        # Get the movie_id that was just inserted or updated
+        movie_id = get_id_by_href('iafd_movies', href)
+        if movie_id is None:
+            return None
+
+        return movie_id
+
+    except Exception as e:
+        conn.rollback()
+        logging.error("Error inserting movie: %s", e)
+        return None
+
+
 # Delete movie data
 def delete_movie(identifier):
     try:
diff --git a/stockapp/src/config.py b/stockapp/src/config.py
index 17f1d5c..bd609f4 100644
--- a/stockapp/src/config.py
+++ b/stockapp/src/config.py
@@ -6,7 +6,7 @@ from pathlib import Path
 
 # MySQL configuration
 db_config = {
-    'host': '172.18.0.4',
+    'host': 'testdb',
     'user': 'root',
     'password': 'mysqlpw',
     'database': 'stockdb'
diff --git a/stockapp/src/crawling/stock_hist_em.py b/stockapp/src/crawling/stock_hist_em.py
index 72eaec9..086b145 100644
--- a/stockapp/src/crawling/stock_hist_em.py
+++ b/stockapp/src/crawling/stock_hist_em.py
@@ -23,7 +23,7 @@ def fetch_with_retries_em(url, params, max_retries=3, delay=2):
             time.sleep(delay)
     return None
 
-def stock_zh_a_spot_em(fs='m:0 t:6,m:0 t:80,m:1 t:2,m:1 t:23,m:0 t:81 s:2048', pz=200) -> pd.DataFrame:
+def stock_zh_a_spot_em(fs='m:0 t:6,m:0 t:80,m:1 t:2,m:1 t:23,m:0 t:81 s:2048', pz=100) -> pd.DataFrame:
     """
     Eastmoney Shanghai-Shenzhen-Beijing A-share real-time quotes
     https://quote.eastmoney.com/center/gridlist.html#hs_a_board
@@ -61,6 +61,7 @@ def stock_zh_a_spot_em(fs='m:0 t:6,m:0 t:80,m:1 t:2,m:1 t:23,m:0 t:81 s:2048', p
         # Use the total field to update pn_max
         if pn == 1:
             pn_max = (data_json["data"].get("total", 0) + pz - 1) // pz
+            print(f'total pages: {pn_max}, total data lines: {data_json["data"].get("total", 0)}, curr lines: {len(diff_data)}, page size: {pz}')
 
         pn += 1
         time.sleep(0.5)  # Throttle requests
@@ -440,7 +441,7 @@ def code_id_map_em() -> dict:
         if pn == 1 and "total" in data_json["data"]:
             total = int(data_json["data"]["total"])
             pn_max = (total // pz) + 1  # Compute the maximum page count
-            print(f"Market {market_id} total records: {total}, pages needed: {pn_max}")
+            print(f"Market {market_id} total records: {total}, pages needed: {pn_max}, fetched so far: {len(temp_df)}, max rows per page: {pz}")
 
     # Group by f13 and store in a dict
     grouped = temp_df.groupby('f13')
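
Note: both new insert_or_update_*_404 helpers rely on SQLite's UPSERT clause
(INSERT ... ON CONFLICT ... DO UPDATE, available since SQLite 3.24), and setting
is_full_data = 1 appears to mark the dead URL as already handled so the crawler
skips it on later runs. A minimal self-contained sketch of the pattern, using a
hypothetical pages table rather than the project's actual schema:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    # Hypothetical table; ON CONFLICT(href) requires a UNIQUE constraint on href.
    conn.execute("""
        CREATE TABLE pages (
            id INTEGER PRIMARY KEY,
            href TEXT UNIQUE,
            title TEXT,
            is_full_data INTEGER DEFAULT 0
        )
    """)

    def mark_processed(conn, title, href):
        # Insert a new row, or update the existing row keyed by href.
        conn.execute(
            """
            INSERT INTO pages (href, title, is_full_data)
            VALUES (?, ?, 1)
            ON CONFLICT(href) DO UPDATE SET
                title = excluded.title,
                is_full_data = 1
            """,
            (href, title),
        )
        conn.commit()

    mark_processed(conn, "Gone", "https://example.com/404")
    mark_processed(conn, "Gone (renamed)", "https://example.com/404")  # updates in place
    print(conn.execute("SELECT id, title, is_full_data FROM pages").fetchall())
    # -> [(1, 'Gone (renamed)', 1)]

Because the upsert is keyed on href, retrying the same dead URL is idempotent:
the existing row is updated rather than duplicated, and the is_full_data flag
presumably keeps it out of the "still to fetch" queries.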