modify scripts

2025-06-26 15:59:56 +08:00
parent 6fce587aad
commit d42e9b0456
2 changed files with 51 additions and 12 deletions
--- a/src/crawling/craw.py
+++ b/src/crawling/craw.py
@ -364,10 +364,31 @@ class JavbusCrawler(GenericCrawler):
                    # 提取前两个元素作为工作室和角色
                    studio = parts[video_index - 2]
                    role = parts[video_index - 1]
-                    result['meta'] = {'title': studio, 'role': role}
+                    result['meta']['title'] = studio
+                    result['meta']['role'] = role
                else:
                    logging.debug(f"无法按规则解析: {' - '.join(parts)}")
-     
+                
+            # 提取全部影片和已有磁力的数量            
+            # 查找a标签
+            a_tags = soup.select('.alert.alert-success.alert-common a.mypointer')
+            if not a_tags:
+                logging.warning(f'found no movie cnt. href: {href}')
+            else:
+                for a in a_tags:
+                    text = a.get_text(strip=True)                    
+                    # 提取全部影片数量
+                    if '全部影片' in text:
+                        match = re.search(r'全部影片\s*(\d+)\s*', text)
+                        if match:
+                            result['meta']['movies_cnt'] = int(match.group(1))
+                    
+                    # 提取已有磁力数量
+                    if '已有磁力' in text:
+                        match = re.search(r'已有磁力\s*(\d+)\s*', text)
+                        if match:
+                            result['meta']['magnet_cnt'] = int(match.group(1))
+
            div_waterfall = soup.find('div', id='waterfall')
            if not div_waterfall:
                logging.warning(f"found no records. href: {href}")
@ -415,19 +436,19 @@ class JavbusCrawler(GenericCrawler):
            # 提取标题
            div_container = soup.find('div', class_='container')
            if not div_container:
-                logging.warning(f"found no container tag.")
+                logging.warning(f"found no container tag. href: {href}")
                return None
            
            title_element = div_container.find('h3')
            if title_element:
                result['title'] = title_element.get_text(strip=True)
            else:
-                logging.debug("未找到影片标题")
+                logging.debug(f"no title found. href: {href}")
            
            # 提取基本信息（识别码、发行日期等）
            info_div = div_container.find('div', class_='info')
            if not info_div:
-                logging.warning(f"found no div info tag.")
+                logging.warning(f"found no div info tag. href: {href}")
                return None
            
            # 定义字段映射关系（多种语言支持）
@ -511,11 +532,11 @@ class JavbusCrawler(GenericCrawler):
                        else:
                            logging.debug(f"actors not found.")
                else:
-                    logging.warning("未找到演员列表区域")
+                    logging.debug(f"no star-name area. href: {href}")
            else:
-                logging.warning("未找到演员标题")
+                logging.debug(f"no star-show area.  href: {href}")
                
        except Exception as e:
-            logging.error(f"解析影片详情时发生错误: {str(e)}", exc_info=True)
+            logging.warning(f"parse movie detail error. href: {href}, error: {str(e)}", exc_info=True)
            
        return result
--- a/src/javbus/fetch.py
+++ b/src/javbus/fetch.py
@ -82,7 +82,7 @@ def fetch_actor_list():
 # 从studio/label/series中获取影片
 def fetch_movies_common(tbl):
    if debug:
-        url_list = db_tools.query_list_common(tbl=tbl)
+        url_list = db_tools.query_list_common(tbl=tbl, limit=2)
    else:
        if g_uncensored==1:
            url_list = db_tools.query_list_common(tbl=tbl, uncensored=1)
@ -101,6 +101,8 @@ def fetch_movies_common(tbl):
        if not utils.is_valid_url(url):
            logging.info(f'invalid url ({url}) in {tbl}, row id: {row_id}. skipping...')
            continue
+
+        meta_data = None
        # 去掉可下载的标志（如果有）
        next_url = url
        while next_url:
@ -108,7 +110,11 @@ def fetch_movies_common(tbl):
            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
            if soup:
                list_data, next_url = scraper.parse_studios_labels_series_detail(soup, next_url)
-                if list_data:     
+                if list_data:
+                    # 更新metadata   
+                    if meta_data is None:
+                        meta_data = list_data.get('meta', {})
+                        
                    # 根据tbl的值动态构建额外参数
                    extra_kwargs = {}
                    if tbl == 'studio':
@ -138,7 +144,17 @@ def fetch_movies_common(tbl):

            # 调试增加brak
            if debug:
-                return True
+                break
+            
+        # Update metadata
+        if meta_data and meta_data.get('movies_cnt') is not None:
+            meta_data['href'] = url            
+            tmp_id = db_tools.update_pubs_multilang(meta_data, tbl)
+            if tmp_id:
+                logging.debug(f'update pubs multi lang. data: {meta_data}')
+            else:
+                logging.warning(f'update pubs multi lang failed. data: {meta_data}')
+

 # 更新makers列表中的影片信息
 def fetch_movies_by_studio():
@ -484,7 +500,9 @@ def main(cmd, args):
    db_tools.finalize_task_log(task_id)

    # TODO:
-    # 1, tags 和 studio / label / series 的多语言
+    # 1, tags 和 studio / label / series 的多语言 ---done
+    # 2, studio / label / series 表增加字段： movies_cnt magnet_cnt
+    # 3, movie 页面保存磁力链接

 # 设置环境变量
 def set_env(args):