modify scripts

2025-06-26 09:11:41 +08:00
parent ad9c9094c9
commit 54c073532c
1 changed files with 29 additions and 5 deletions
--- a/src/javbus/fetch.py
+++ b/src/javbus/fetch.py
@ -61,7 +61,7 @@ def fetch_actor_list_lang(lang="en", uncensored=None):
        if debug:
            return True

-# 获取演员列表
+# 获取演员列表，控制逻辑，多语言
 def fetch_actor_list():
    if g_uncensored == 1:
        for lang in ["en", "ja", "zh"]:
@ -95,10 +95,13 @@ def fetch_movies_common(tbl):
        url = row['href']
        row_id = row['id']
        uncensored = row['uncensored'] if row['uncensored'] > 0 else None
+        if not utils.is_valid_url(url):
+            logging.info(f'invalid url {url} in {tbl}, skipping...')
+            continue
        # 去掉可下载的标志（如果有）
        next_url = url
        while next_url:
-            logging.info(f"Fetching data for maker url {next_url} ...")
+            logging.info(f"Fetching data from {tbl} url {next_url} ...")
            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
            if soup:
                list_data, next_url = scraper.parse_studios_labels_series_detail(soup, next_url)
@ -125,6 +128,10 @@ def fetch_movies_common(tbl):
            elif status_code  and status_code == 404:
                logging.warning(f"fetch page error. httpcode: {status_code}, url: {next_url}")
                break
+            else:   # 达到失败上限，加上休眠继续重试
+                time.sleep(5)
+            
+            time.sleep(0.3)

            # 调试增加break
            if debug:
@ -142,7 +149,7 @@ def fetch_movies_by_label():
 def fetch_movies_by_series():
    """Run fetch_movies_common against the 'series' table."""
    fetch_movies_common('series')

-# 从studio/label/series中获取影片
+# 从studio/label/series首页获取他们的多语言表述
 def update_multilang_common(tbl):
    if debug:
        url_list = db_tools.query_list_common(tbl=tbl, limit=3)
@ -162,7 +169,7 @@ def update_multilang_common(tbl):
        langs_url = utils.generate_multilang_urls(url)
        for lang, next_url in langs_url.items():
            while next_url:
-                logging.info(f"Fetching data for url {next_url} ..., raw url: {url}")
+                logging.info(f"Fetching data for url {next_url} ...")
                soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
                if soup:
                    list_data, next_url = scraper.parse_studios_labels_series_detail(soup, next_url)
@ -186,13 +193,21 @@ def update_multilang_common(tbl):
                    logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
                    break

+                else:   # 达到失败上限，加上休眠继续重试
+                    time.sleep(5)
+                
+                time.sleep(0.3)
+
+        if debug:
+            break
+
 # Update the multi-language data for the studio, label and series tables
 # (the original comment mentioned only the series list).
 def update_multi_langs():
    """Call update_multilang_common for 'studio', 'label' and 'series', in that order."""
    update_multilang_common('studio')
    update_multilang_common('label')
    update_multilang_common('series')

-# 从studio/label/series中获取影片
+# 获取影片tags的多语言表述
 def update_multilang_tags():
    if debug:
        url_list = db_tools.query_tags(limit=5)
@ -231,6 +246,13 @@ def update_multilang_tags():
                    logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
                    break

+                else:   # 达到失败上限，加上休眠继续重试
+                    time.sleep(5)
+                
+                time.sleep(0.3)
+        if debug:
+            break
+
 # 更新演员信息
 def fetch_performers_detail():
    limit_count = 5 if debug else 100
@ -296,6 +318,7 @@ def fetch_performers_detail():
                    break
                else:
                    logging.warning(f'fetch_page error. url: {url}')
+                    time.sleep(2)

            # 如果出现了401或者404，已经处理，直接跳过
            if not need_insert:
@ -388,6 +411,7 @@ def fetch_movies_detail():
                        logging.warning(f'insert movie {url} failed.')
                else:
                    logging.warning(f'parse_page_movie error. url: {url}')
+                    time.sleep(2)

            elif status_code  and status_code == craw.http_code_404:
                movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_404})