Return (result, status) tuples from fetch_page; add avatar/alias parsing to actor detail; return empty lists instead of None when no movies div is found.

This commit is contained in:
2025-03-07 19:11:41 +08:00
parent 6cebf3f8ac
commit f5929811c7
27 changed files with 778 additions and 2724974 deletions

View File

@ -30,9 +30,15 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
try:
if 'javdb.com' not in url.lower():
logging.error(f'wrong url format: {url}')
return None
return None, None
response = scraper.get(url, headers=headers)
# 处理 HTTP 状态码
if response.status_code == 404:
logging.warning(f"Page not found (404): {url}")
return None, 404 # 直接返回 404调用方可以跳过
response.raise_for_status() # 处理 HTTP 错误
# 预处理 HTML如果提供了 preprocessor
@ -40,7 +46,7 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
soup = BeautifulSoup(html_text, parser)
if validator(soup): # 进行自定义页面检查
return soup
return soup, response.status_code
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
except cloudscraper.exceptions.CloudflareChallengeError as e:
@ -51,7 +57,7 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
logging.error(f"Unexpected error on {url}: {e}, Retring...")
logging.error(f'Fetching failed after max retries. {url}')
return None # 达到最大重试次数仍然失败
return None, None # 达到最大重试次数仍然失败
# 修复 HTML 结构,去除多余标签并修正 <a> 标签,在获取人种的时候需要
def preprocess_html(html):
@ -78,6 +84,21 @@ def url_page_num(href):
else:
return None
# <span class="avatar" style="background-image: url(https://c0.jdbstatic.com/avatars/md/mdRn.jpg)"></span>
def parse_avatar_image(soup):
    """Extract the avatar image URL from a profile-page fragment.

    Expects markup like
    <span class="avatar" style="background-image: url(https://...jpg)"></span>
    and pulls the URL out of the inline ``style`` attribute.

    Args:
        soup: a BeautifulSoup element (or any object exposing ``.find``)
            that may contain the avatar ``<span>``.

    Returns:
        str: the avatar image URL, or ``""`` when the span is missing,
        the style has no ``url(...)``, or any parsing error occurs.
        This is a deliberate best-effort helper — it never raises.
    """
    try:
        span = soup.find("span", class_="avatar")
        if not span:
            return ""  # no avatar <span> in this fragment
        style = span.get("style", "")
        # background-image: url("...") — the quotes are optional, match both.
        match = re.search(r'url\(["\']?(.*?)["\']?\)', style)
        return match.group(1) if match else ""
    except Exception:
        # Best-effort: malformed markup must not break the caller.
        return ""
# 解析 HTML 内容,提取需要的数据
def parse_actors_uncensored(soup, href):
div_actors = soup.find("div", id='actors')
@ -123,6 +144,29 @@ def parse_actors_uncensored(soup, href):
# 解析 HTML 内容,提取需要的数据
def parse_actor_detail(soup, href):
# 先找一下别名
alias_list = []
div_meta = soup.find('span', class_='actor-section-name')
if not div_meta:
logging.warning(f'warning: no meta data found in page {href}')
return None, None
alias_div = soup.find('div', class_='column section-title')
if alias_div:
meta_list = alias_div.find_all('span', class_='section-meta')
if len(meta_list) > 1:
alias_list = meta_list[0].text.strip().split(", ")
# 头像
pic = ''
avatar = soup.find("div", class_="column actor-avatar")
if avatar:
pic = parse_avatar_image(avatar)
# 返回数据
actor = {}
div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
if not div_movies:
logging.warning(f"Warning: No movies div found ")
@ -157,7 +201,13 @@ def parse_actor_detail(soup, href):
if next_page_number and next_page_number > current_page_number :
next_url = host_url + next_page_url
return list_data, next_url
actor = {
'pic' : pic,
'alias' : alias_list,
'movies' : list_data
}
return actor, next_url
# 解析 HTML 内容,提取需要的数据
@ -257,7 +307,7 @@ def parse_series_detail(soup, href):
div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
if not div_movies:
logging.warning(f"Warning: No movies div found ")
return None, None
return [], None
# 解析元素
rows = div_movies.find_all('div', class_='item')
@ -337,7 +387,7 @@ def parse_maker_detail(soup, href):
div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
if not div_movies:
logging.warning(f"Warning: No movies div found ")
return None, None
return [], None
# 解析元素
rows = div_movies.find_all('div', class_='item')