modify scripts

2025-06-25 11:38:06 +08:00
parent 9cf521a0d6
commit 5ebfe7cb8c
4 changed files with 306 additions and 309 deletions
--- a/src/crawling/craw.py
+++ b/src/crawling/craw.py
@ -333,6 +333,69 @@ class JavbusCrawler(GenericCrawler):
            
        return movie_info

+
+    # Parse a studio / label / series detail page (title metadata + movie list)
+    def parse_studios_labels_series_detail(self, soup, href):
+        """Parse a Javbus listing page (studio/label/series, per the method name).
+        Returns (result, next_url): result = {'meta': {'title', 'role'} or {}, 'movies': [...]};
+        next_url is the next-page link resolved against href, or None if no link is found."""
+        result = {
+            'meta': {},
+            'movies': []
+        }
+        
+        try:           
+            # Parse the title shown in the page's alert banner
+            b_tag = soup.select_one('.alert.alert-success.alert-common p b')
+            if not b_tag:
+                logging.warning(f'found no title. href: {href}')
+            else:                
+                # Get the text content
+                title_text = b_tag.get_text(strip=True)                
+                # Split the text on hyphens (NOTE(review): a name that itself contains '-' is split too)
+                parts = [part.strip() for part in title_text.split('-')]
+                # Words for "movies" in the languages the title may be rendered in
+                video_keywords = ['影片', 'Video', '映画', 'Videos', 'Movies']
+                
+                # Index of the first part that is exactly one of those keywords, or None
+                video_index = next((i for i, part in enumerate(parts) if part in video_keywords), None)
+                
+                if video_index is not None and video_index >= 2:  # need two parts before the keyword
+                    # The two parts just before the keyword are taken as the name and the role
+                    studio = parts[video_index - 2]
+                    role = parts[video_index - 1]
+                    result['meta'] = {'title': studio, 'role': role}
+                else:
+                    logging.debug(f"无法按规则解析: {' - '.join(parts)}")
+     
+            div_waterfall = soup.find('div', id='waterfall')
+            if not div_waterfall:
+                logging.warning(f"found no records. href: {href}")
+            else:            
+                # Parse the movie list: one entry per <a class="movie-box"> that parse_movie_info accepts
+                movie_boxes = div_waterfall.find_all('a', class_='movie-box')
+                if movie_boxes:
+                    for movie_box in movie_boxes:
+                        movie_info = self.parse_movie_info(movie_box)
+                        if movie_info:
+                            result['movies'].append(movie_info)
+                else:
+                    logging.debug(f"movie-box not found. href: {href}")
+                
+        except Exception as e:  # best-effort: log and continue with whatever was parsed so far
+            logging.warning(f"parse html error: {str(e)}, href: {href}", exc_info=True)
+            
+        # Look for the "next page" link; only attempted when the pagination container div exists
+        next_url = None
+        div_link = soup.find("div", class_='text-center hidden-xs')
+        if div_link:
+            next_page_element = soup.find('a', id='next')  # NOTE(review): searches the whole document, not div_link - confirm intended
+            if next_page_element:
+                next_page_url = next_page_element['href']  # NOTE(review): outside the try block; KeyError if the anchor has no href
+                next_url = urljoin(href, next_page_url)
+   
+        return result, next_url
+
    # 解析Javbus影片详情页内容
    def parse_movie_detail(self, soup, href, title):
        result = {
@ -456,231 +519,3 @@ class JavbusCrawler(GenericCrawler):
            logging.error(f"解析影片详情时发生错误: {str(e)}", exc_info=True)
            
        return result
-
-    def parse_series_uncensored(self, soup, href):
-        div_series = soup.find("div", id='series')
-        if not div_series:
-            logging.warning(f"Warning: No div_series div found ")
-            return None, None
-
-        # 解析元素
-        rows = div_series.find_all('a', class_='box')
-
-        list_data = []
-        next_url = None
-        for row in rows:
-            name = row.find('strong').text.strip()
-            href = row['href']
-            div_movies = row.find('span')
-            movies = 0
-            if div_movies:
-                match = re.search(r'\((\d+)\)', div_movies.text.strip())
-                if match:
-                    movies = int(match.group(1))
-
-            list_data.append({
-                'name': name,
-                'href': host_url + href if href else '',
-                'movies': movies
-            })
-
-        # 查找 "下一页" 按钮
-        next_page_element = soup.find('a', class_='pagination-next')
-        if next_page_element:
-            next_page_url = next_page_element['href']
-            next_page_number = self.url_page_num(next_page_url)
-            current_page_number = self.url_page_num(href)
-            if current_page_number is None:
-                current_page_number = 0
-            if next_page_number and next_page_number > current_page_number:
-                next_url = host_url + next_page_url
-
-        return list_data, next_url
-
-    def parse_series_detail(self, soup, href):
-        # div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
-        div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
-        if not div_movies:
-            logging.warning(f"Warning: No movies div found ")
-            return [], None
-
-        # 解析元素
-        rows = div_movies.find_all('div', class_='item')
-
-        list_data = []
-        next_url = None
-        for row in rows:
-            link = row.find('a', class_='box')['href']
-            serial_number = row.find('strong').text.strip()
-            title = row.find('div', class_='video-title').text.strip()
-            release_date = row.find('div', class_='meta').text.strip()
-            list_data.append({
-                'href': host_url + link if link else '',
-                'serial_number': serial_number,
-                'title': title,
-                'release_date': release_date
-            })
-
-        # 查找 "下一页" 按钮
-        next_page_element = soup.find('a', class_='pagination-next')
-        if next_page_element:
-            next_page_url = next_page_element['href']
-            next_page_number = self.url_page_num(next_page_url)
-            current_page_number = self.url_page_num(href)
-            if current_page_number is None:
-                current_page_number = 0
-            if next_page_number and next_page_number > current_page_number:
-                next_url = host_url + next_page_url
-
-        return list_data, next_url
-
-    def parse_makers_uncensored(self, soup, href):
-        div_series = soup.find("div", id='makers')
-        if not div_series:
-            logging.warning(f"Warning: No makers div found ")
-            return None, None
-
-        # 解析元素
-        rows = div_series.find_all('a', class_='box')
-
-        list_data = []
-        next_url = None
-        for row in rows:
-            name = row.find('strong').text.strip()
-            href = row['href']
-            div_movies = row.find('span')
-            movies = 0
-            if div_movies:
-                match = re.search(r'\((\d+)\)', div_movies.text.strip())
-                if match:
-                    movies = int(match.group(1))
-
-            list_data.append({
-                'name': name,
-                'href': host_url + href if href else '',
-                'movies': movies
-            })
-
-        # 查找 "下一页" 按钮
-        next_page_element = soup.find('a', class_='pagination-next')
-        if next_page_element:
-            next_page_url = next_page_element['href']
-            next_page_number = self.url_page_num(next_page_url)
-            current_page_number = self.url_page_num(href)
-            if current_page_number is None:
-                current_page_number = 0
-            if next_page_number and next_page_number > current_page_number:
-                next_url = host_url + next_page_url
-
-        return list_data, next_url
-
-    def parse_maker_detail(self, soup, href):
-        # div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
-        div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
-        if not div_movies:
-            logging.warning(f"Warning: No movies div found ")
-            return [], None
-
-        # 解析元素
-        rows = div_movies.find_all('div', class_='item')
-
-        list_data = []
-        next_url = None
-        for row in rows:
-            link = row.find('a', class_='box')['href']
-            serial_number = row.find('strong').text.strip()
-            title = row.find('div', class_='video-title').text.strip()
-            release_date = row.find('div', class_='meta').text.strip()
-            list_data.append({
-                'href': host_url + link if link else '',
-                'serial_number': serial_number,
-                'title': title,
-                'release_date': release_date
-            })
-
-        # 查找 "下一页" 按钮
-        next_page_element = soup.find('a', class_='pagination-next')
-        if next_page_element:
-            next_page_url = next_page_element['href']
-            next_page_number = self.url_page_num(next_page_url)
-            current_page_number = self.url_page_num(href)
-            if current_page_number is None:
-                current_page_number = 0
-            if next_page_number and next_page_number > current_page_number:
-                next_url = host_url + next_page_url
-
-        return list_data, next_url
-
-    def parse_publisher_detail(self, soup, href):
-        # div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
-        div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
-        if not div_movies:
-            logging.warning(f"Warning: No movies div found ")
-            return [], None
-
-        # 解析元素
-        rows = div_movies.find_all('div', class_='item')
-
-        list_data = []
-        next_url = None
-        for row in rows:
-            link = row.find('a', class_='box')['href']
-            serial_number = row.find('strong').text.strip()
-            title = row.find('div', class_='video-title').text.strip()
-            release_date = row.find('div', class_='meta').text.strip()
-            list_data.append({
-                'href': host_url + link if link else '',
-                'serial_number': serial_number,
-                'title': title,
-                'release_date': release_date
-            })
-
-        # 查找 "下一页" 按钮
-        next_page_element = soup.find('a', class_='pagination-next')
-        if next_page_element:
-            next_page_url = next_page_element['href']
-            next_page_number = self.url_page_num(next_page_url)
-            current_page_number = self.url_page_num(href)
-            if current_page_number is None:
-                current_page_number = 0
-            if next_page_number and next_page_number > current_page_number:
-                next_url = host_url + next_page_url
-
-        return list_data, next_url
-
-    def parse_uncensored(self, soup, href):
-        # div_movies = soup.find("div", class_='movie-list h cols-4 vcols-8')
-        div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
-        if not div_movies:
-            logging.warning(f"Warning: No movies div found ")
-            return [], None
-
-        # 解析元素
-        rows = div_movies.find_all('div', class_='item')
-
-        list_data = []
-        next_url = None
-        for row in rows:
-            link = row.find('a', class_='box')['href']
-            serial_number = row.find('strong').text.strip()
-            title = row.find('div', class_='video-title').text.strip()
-            release_date = row.find('div', class_='meta').text.strip()
-            list_data.append({
-                'href': host_url + link if link else '',
-                'serial_number': serial_number,
-                'title': title,
-                'release_date': release_date
-            })
-
-        # 查找 "下一页" 按钮
-        next_page_element = soup.find('a', class_='pagination-next')
-        if next_page_element:
-            next_page_url = next_page_element['href']
-            next_page_number = self.url_page_num(next_page_url)
-            current_page_number = self.url_page_num(href)
-            if current_page_number is None:
-                current_page_number = 0
-            if next_page_number and next_page_number > current_page_number:
-                next_url = host_url + next_page_url
-
-        return list_data, next_url