From 54c073532ca8f9e00133de2b4e4aac37ff8709b5 Mon Sep 17 00:00:00 2001 From: oscarz Date: Thu, 26 Jun 2025 09:11:41 +0800 Subject: [PATCH] modify scripts --- src/javbus/fetch.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/src/javbus/fetch.py b/src/javbus/fetch.py index fe21e04..d5591e8 100644 --- a/src/javbus/fetch.py +++ b/src/javbus/fetch.py @@ -61,7 +61,7 @@ def fetch_actor_list_lang(lang="en", uncensored=None): if debug: return True -# 获取演员列表 +# 获取演员列表,控制逻辑,多语言 def fetch_actor_list(): if g_uncensored == 1: for lang in ["en", "ja", "zh"]: @@ -95,10 +95,13 @@ def fetch_movies_common(tbl): url = row['href'] row_id = row['id'] uncensored = row['uncensored'] if row['uncensored'] > 0 else None + if not utils.is_valid_url(url): + logging.info(f'invalid url {url} in {tbl}, skipping...') + continue # 去掉可下载的标志(如果有) next_url = url while next_url: - logging.info(f"Fetching data for maker url {next_url} ...") + logging.info(f"Fetching data from {tbl} url {next_url} ...") soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id")) if soup: list_data, next_url = scraper.parse_studios_labels_series_detail(soup, next_url) @@ -125,6 +128,10 @@ def fetch_movies_common(tbl): elif status_code and status_code == 404: logging.warning(f"fetch page error. httpcode: {status_code}, url: {next_url}") break + else: # 达到失败上限,加上休眠继续重试 + time.sleep(5) + + time.sleep(0.3) # 调试增加brak if debug: @@ -142,7 +149,7 @@ def fetch_movies_by_label(): def fetch_movies_by_series(): fetch_movies_common('series') -# 从studio/label/series中获取影片 +# 从studio/label/series首页获取他们的多语言表述 def update_multilang_common(tbl): if debug: url_list = db_tools.query_list_common(tbl=tbl, limit=3) @@ -162,7 +169,7 @@ def update_multilang_common(tbl): langs_url = utils.generate_multilang_urls(url) for lang, next_url in langs_url.items(): while next_url: - logging.info(f"Fetching data for url {next_url} ..., raw url: {url}") + logging.info(f"Fetching data for url {next_url} ...") soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id")) if soup: list_data, next_url = scraper.parse_studios_labels_series_detail(soup, next_url) @@ -186,13 +193,21 @@ def update_multilang_common(tbl): logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}') break + else: # 达到失败上限,加上休眠继续重试 + time.sleep(5) + + time.sleep(0.3) + + if debug: + break + # 更新series列表中的影片信息 def update_multi_langs(): update_multilang_common('studio') update_multilang_common('label') update_multilang_common('series') -# 从studio/label/series中获取影片 +# 获取影片tags的多语言表述 def update_multilang_tags(): if debug: url_list = db_tools.query_tags(limit=5) @@ -231,6 +246,13 @@ def update_multilang_tags(): logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}') break + else: # 达到失败上限,加上休眠继续重试 + time.sleep(5) + + time.sleep(0.3) + if debug: + break + # 更新演员信息 def fetch_performers_detail(): limit_count = 5 if debug else 100 @@ -296,6 +318,7 @@ def fetch_performers_detail(): break else: logging.warning(f'fetch_page error. url: {url}') + time.sleep(2) # 如果出现了401或者404,已经处理,直接跳过 if not need_insert: @@ -388,6 +411,7 @@ def fetch_movies_detail(): logging.warning(f'insert movie {url} failed.') else: logging.warning(f'parse_page_movie error. url: {url}') + time.sleep(2) elif status_code and status_code == craw.http_code_404: movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_404})