From 2e2218b623f5444e92d1691fb737bb68704b2ead Mon Sep 17 00:00:00 2001 From: oscarz Date: Wed, 23 Apr 2025 17:43:30 +0800 Subject: [PATCH] modify scripts --- javdb/src/fetch.py | 27 ++++++++++++++++++++++----- javdb/src/sqlite_utils.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/javdb/src/fetch.py b/javdb/src/fetch.py index da70bf8..781fb5a 100644 --- a/javdb/src/fetch.py +++ b/javdb/src/fetch.py @@ -160,6 +160,8 @@ def fetch_performers_detail(): if len(perfomers_list) < 1: logging.info(f'all performers fetched.') break + + succ_rows = 0 for performer in perfomers_list: url = performer['href'] person = performer['name'] @@ -168,8 +170,9 @@ def fetch_performers_detail(): next_url = url all_movies = [] + need_insert = True while next_url: - logging.info(f"Fetching data for actor ({person}), url {next_url} ...") + logging.debug(f"Fetching data for actor ({person}), url {next_url} ...") soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="span", identifier="actor-section-name", attr_type="class")) if soup: data, next_url = scraper.parse_actor_detail(soup, next_url) @@ -179,11 +182,22 @@ def fetch_performers_detail(): all_movies.extend(data.get('movies', [])) elif status_code and status_code == 404: - logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}, Skiping...') + actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=2) + logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, Skiping...') + need_insert = False + break + elif status_code and status_code == 401: + actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=3) + logging.warning(f'401 page(need login). id: {actor_id}, name: ({person}), url: {url}, Skiping...') + need_insert = False break else: - logging.warning(f'fetch_page error. person: ({person}), url: {url}') - + logging.warning(f'fetch_page error. 
url: {url}') + + # A 401 or 404 was already recorded above, so skip the normal insert + if not need_insert: + continue + # 获取完了个人的所有影片,开始插入数据 performer_id = db_tools.insert_or_update_actor({ 'href': url, @@ -193,11 +207,14 @@ def fetch_performers_detail(): 'credits':all_movies }) if performer_id: - logging.info(f'insert one person, id: {performer_id}, person: ({person}), url: {url}') + logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}') last_perfomer_id = performer_id + succ_rows += 1 else: logging.warning(f'insert person: ({person}) {url} failed.') time.sleep(0.5) + + logging.info(f'total request: {len(perfomers_list)}, succ: {succ_rows}, last performer id: {last_perfomer_id}') # 调试break if debug: return True diff --git a/javdb/src/sqlite_utils.py b/javdb/src/sqlite_utils.py index baa0749..6bc2596 100644 --- a/javdb/src/sqlite_utils.py +++ b/javdb/src/sqlite_utils.py @@ -172,6 +172,34 @@ def insert_or_update_actor(actor): logging.error(f"插入/更新演员 {actor['name']} 失败: {e}") conn.rollback() +# Insert or update an actor row for an abnormal URL (e.g. a 404 page) +def insert_or_update_actor_404(name, href, is_full_data=1): + try: + # Insert or update the actor row + cursor.execute( + """ + INSERT INTO javdb_actors (name, href, is_full_data, updated_at) + VALUES (?, ?, ?, datetime('now', 'localtime')) + ON CONFLICT(href) DO UPDATE SET + name=excluded.name, is_full_data=excluded.is_full_data, updated_at = datetime('now', 'localtime') + """, + (name, href, is_full_data) + ) + conn.commit() + + # Look up the id of the actor row that was just written + actor_id = get_id_by_href('javdb_actors', href) + if actor_id is None: + return None + + return actor_id + + except Exception as e: + conn.rollback() + logging.error("Error inserting actor: %s", e) + return None + + # 删除演员 def delete_actor_by_href(href): try: