import logging import sys import requests import re from bs4 import BeautifulSoup from urllib.parse import urljoin import src.utils.utils as utils http_code_404 = 404 http_code_redirect = 401 http_code_url = 601 http_code_local = 99 # 通用的爬取类,主要实现了底层的网络交互封装 class GenericCrawler: def __init__(self, use_cloudscraper=None, headers=None, cookies=None, max_retries=3, html_parser='html.parser'): if use_cloudscraper is None: use_cloudscraper = sys.version_info >= (3, 8) self.use_cloudscraper = use_cloudscraper self.headers = headers or { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36 Edg/137.0.0.0' } self.cookies = cookies or {} self.scraper = None # 延迟初始化 self.max_retries = max_retries self.parser = html_parser # 不在这里导入 cloudscraper,而是在需要时导入 def _initialize_scraper(self): """延迟初始化请求客户端,避免不必要的 cloudscraper 导入""" if self.scraper is not None: return if self.use_cloudscraper: try: # 延迟导入 cloudscraper import cloudscraper self.scraper = cloudscraper.create_scraper() logging.info("Using cloudscraper for requests") except ImportError: logging.warning("cloudscraper not installed. Falling back to requests.") self.use_cloudscraper = False self.scraper = requests.Session() else: self.scraper = requests.Session() logging.info("Using requests for HTTP operations") def fetch_page(self, url, validator): # 在使用前初始化 scraper self._initialize_scraper() for attempt in range(self.max_retries): try: if not utils.is_valid_url(url): logging.error(f'wrong url format: {url}') return None, http_code_url response = self.scraper.get(url, headers=self.headers, cookies=self.cookies) # 处理 HTTP 状态码 if response.status_code == http_code_404: logging.debug(f"Page not found (404): {url}") return None, http_code_404 # 直接返回 404,调用方可以跳过 response.raise_for_status() # 处理 HTTP 错误 # 检查是否发生跳转,比如到登录页面 if response.history: logging.debug(f"Page redirected on {url}. Checking if it's a verify page.") soup = BeautifulSoup(response.text, self.parser) if self.check_redirect(soup) : logging.warning(f"Page redirected to verify page on {url}.") return None, http_code_redirect # 判断是否为登录页面 #if soup.find('div', id='ageVerify'): # 预处理 HTML(如果提供了 preprocessor) html_text = self.preprocessor(response.text) soup = BeautifulSoup(html_text, self.parser) if validator(soup): # 进行自定义页面检查 return soup, response.status_code logging.warning(f"Validation failed on attempt {attempt + 1} for {url}") except Exception as e: logging.error(f"Unexpected error on {url}: {e}, Retrying...") logging.error(f'Fetching failed after max retries. {url}') return None, None # 达到最大重试次数仍然失败 # 对页面的预处理,通常是修复标签之类的 def preprocessor(self, html): return html # 检查是否发生了跳转,偏离了正常解析 def check_redirect(self, soup): """默认的页面验证器,子类可重写""" return False # 默认验证通过 @staticmethod def generic_validator(soup, tag, identifier, attr_type="id"): if attr_type == "id": return soup.find(tag, id=identifier) is not None elif attr_type == "class": return bool(soup.find_all(tag, class_=identifier)) elif attr_type == "name": return bool(soup.find('select', {'name': identifier})) return False # javbus.com 网页爬取类 class JavbusCrawler(GenericCrawler): def __init__(self, use_cloudscraper=None): headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Sec-Fetch-Site": "none", "Accept-Encoding": "gzip, deflate, br", "Sec-Fetch-Mode": "navigate", "Host": "www.javbus.com", "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15", "Accept-Language": "zh-CN,zh-Hans;q=0.9", "Sec-Fetch-Dest": "document", "Connection": "keep-alive", } cookies = { 'PHPSESSID': 'l9m4ugaaao1hgvl3micr22u3o6', 'existmag': 'all', 'age': 'verified' } super().__init__(use_cloudscraper, headers=headers, cookies=cookies) self.host_url = "https://www.javbus.com" # 以下是原有的解析函数,保持不变 def parse_actors_list(self, soup, href): div_actors = soup.find("div", id='waterfall') if not div_actors: logging.warning(f"Warning: No actors div found ") return None, None # 解析元素 rows = div_actors.find_all('div', class_='item') list_data = [] next_url = None for row in rows: # 获取演员详情链接 actor_link = row.find('a')['href'] # 获取演员名字 actor_name = row.find('span').text.strip() # 获取头像图片链接 avatar_url = row.find('img')['src'] list_data.append({ 'name': actor_name, 'href': urljoin(self.host_url, actor_link), 'pic': avatar_url }) # 查找 "下一页" 按钮 div_link = soup.find("div", class_='text-center hidden-xs') if div_link: next_page_element = soup.find('a', id='next') if next_page_element: next_page_url = next_page_element['href'] next_url = urljoin(href, next_page_url) return list_data, next_url # 获取演员详情 def parse_actor_detail(self, soup, href): """ 解析Javbus网页内容,提取演员信息和影片列表 """ result = { 'avatar': {}, 'movies': [] } try: # 解析演员信息 avatar_box = soup.find('div', class_='avatar-box') if avatar_box: result['avatar'] = self.parse_avatar_info(avatar_box) else: logging.debug(f"avatar-box not found. href: {href}") # 解析影片列表 movie_boxes = soup.find_all('a', class_='movie-box') if movie_boxes: for movie_box in movie_boxes: movie_info = self.parse_movie_info(movie_box) if movie_info: result['movies'].append(movie_info) else: logging.debug(f"movie-box not found. href: {href}") except Exception as e: logging.warning(f"parse html error: {str(e)}, href: {href}", exc_info=True) # 查找 "下一页" 按钮 next_url = None div_link = soup.find("div", class_='text-center hidden-xs') if div_link: next_page_element = soup.find('a', id='next') if next_page_element: next_page_url = next_page_element['href'] next_url = urljoin(href, next_page_url) return result, next_url def parse_avatar_info(self, avatar_box): """ 解析演员信息 """ avatar_info = {} # 定义映射关系:包含各种语言的字段名称及其对应的目标键名 field_mapping = { 'birth_date': ['生日', 'D.O.B', '生年月日', 'Birthday'], 'age': ['年齡', 'Age', '年龄'], 'height': ['身高', 'Height', '身長'], 'breast_size': ['罩杯', 'Cup', 'ブラのサイズ'], 'bust': ['胸圍', 'Bust', 'バスト'], 'waist': ['腰圍', 'Waist', 'ウエスト'], 'hip': ['臀圍', 'Hips', 'ヒップ'], 'hobby': ['愛好', 'Hobby', '趣味'] } # 提取演员名称 name_span = avatar_box.find('span', class_='pb10') if name_span: avatar_info['name'] = name_span.get_text(strip=True) else: logging.debug("未找到演员名称") # 提取生日、年龄等信息 p_tags = avatar_box.find_all('p') for p in p_tags: text = p.get_text(strip=True) # 使用正则表达式匹配冒号前后的内容 match = re.search(r'^(.*?)[::](.*)$', text) if match: key = match.group(1).strip() value = match.group(2).strip() # 查找对应的目标键名 target_key = next((k for k, v in field_mapping.items() if any(x in key for x in v)), None) if target_key: # 特殊处理数字类型和单位转换 if target_key in ['age', 'height', 'bust', 'waist', 'hip']: # 提取数字部分 num_match = re.search(r'(\d+\.?\d*)', value) if num_match: try: avatar_info[target_key] = float(num_match.group(1)) # 保留整数(如果是整数) if avatar_info[target_key].is_integer(): avatar_info[target_key] = int(avatar_info[target_key]) except ValueError: logging.debug(f"转换数字失败: {value}") avatar_info[target_key] = value else: logging.debug(f"未找到数字部分: {value}") avatar_info[target_key] = value else: avatar_info[target_key] = value else: logging.debug(f"未知的演员信息类型: {key}") else: logging.debug(f"无法解析的演员信息: {text}") avatar_info['measurements'] = f"{avatar_info.get('bust', '')}-{avatar_info.get('waist', '')}-{avatar_info.get('hip', '') }" return avatar_info def parse_movie_info(self, movie_box): """ 解析影片信息 """ movie_info = {} try: # 提取影片链接 href = movie_box.get('href') if href: movie_info['href'] = href else: logging.warning("未找到影片链接") return None # 提取图片链接 img_tag = movie_box.find('img') if img_tag and 'src' in img_tag.attrs: movie_info['cover_url'] = img_tag['src'] movie_info['title'] = img_tag['title'] else: logging.warning("未找到影片图片链接") # 提取标题、番号和发布日期 photo_info = movie_box.find('div', class_='photo-info') if photo_info: # 提取标题 (span标签中的文本,排除date标签) span_tag = photo_info.find('span') if span_tag: # 获取span下的纯文本内容 (不包含date标签) title = ''.join(span_tag.find_all(text=True, recursive=False)).strip() # 移除常见的分隔符模式 if title.endswith('\n\n /'): clean_title = title[:-4].strip() elif title.endswith('\n /'): clean_title = title[:-3].strip() else: clean_title = title movie_info['title'] = clean_title # 提取番号和日期 (date标签) date_tags = span_tag.find_all('date') if len(date_tags) >= 2: movie_info['serial_number'] = date_tags[0].get_text(strip=True) movie_info['release_date'] = date_tags[1].get_text(strip=True) else: logging.warning(f"date标签数量不足,无法提取番号和日期") else: logging.warning("未找到span标签") else: logging.warning("未找到影片信息区域") except Exception as e: logging.error(f"解析影片信息时发生错误: {str(e)}", exc_info=True) return None return movie_info # 获取演员详情 def parse_studios_labels_series_detail(self, soup, href): """ 解析Javbus网页内容,提取演员信息和影片列表 """ result = { 'meta': {}, 'movies': [] } try: # 解析标题 b_tag = soup.select_one('.alert.alert-success.alert-common p b') if not b_tag: logging.warning(f'found no title. href: {href}') else: # 获取文本内容 title_text = b_tag.get_text(strip=True) # 使用横线分割文本 parts = [part.strip() for part in title_text.split('-')] # 定义"影片"的多种语言表示 video_keywords = ['影片', 'Video', '映画', 'Videos', 'Movies'] # 查找"影片"关键词的位置 video_index = next((i for i, part in enumerate(parts) if part in video_keywords), None) if video_index is not None and video_index >= 2: # 提取前两个元素作为工作室和角色 studio = parts[video_index - 2] role = parts[video_index - 1] result['meta']['title'] = studio result['meta']['role'] = role else: logging.debug(f"无法按规则解析: {' - '.join(parts)}") # 提取全部影片和已有磁力的数量 # 查找a标签 a_tags = soup.select('.alert.alert-success.alert-common a.mypointer') if not a_tags: logging.warning(f'found no movie cnt. href: {href}') else: for a in a_tags: text = a.get_text(strip=True) # 提取全部影片数量 if '全部影片' in text: match = re.search(r'全部影片\s*(\d+)\s*', text) if match: result['meta']['movies_cnt'] = int(match.group(1)) # 提取已有磁力数量 if '已有磁力' in text: match = re.search(r'已有磁力\s*(\d+)\s*', text) if match: result['meta']['magnet_cnt'] = int(match.group(1)) div_waterfall = soup.find('div', id='waterfall') if not div_waterfall: logging.warning(f"found no records. href: {href}") else: # 解析影片列表 movie_boxes = div_waterfall.find_all('a', class_='movie-box') if movie_boxes: for movie_box in movie_boxes: movie_info = self.parse_movie_info(movie_box) if movie_info: result['movies'].append(movie_info) else: logging.debug(f"movie-box not found. href: {href}") except Exception as e: logging.warning(f"parse html error: {str(e)}, href: {href}", exc_info=True) # 查找 "下一页" 按钮 next_url = None div_link = soup.find("div", class_='text-center hidden-xs') if div_link: next_page_element = soup.find('a', id='next') if next_page_element: next_page_url = next_page_element['href'] next_url = urljoin(href, next_page_url) return result, next_url # 解析Javbus影片详情页内容 def parse_movie_detail(self, soup, href, title): result = { 'title': title, 'href': href, 'serial_number': '', 'release_date': '', 'duration': '', 'studio': {'name': '', 'href': ''}, 'label': {'name': '', 'href': ''}, 'series': {'name': '', 'href': ''}, 'tags': [], 'actors': [] } try: # 提取标题 div_container = soup.find('div', class_='container') if not div_container: logging.warning(f"found no container tag. href: {href}") return None title_element = div_container.find('h3') if title_element: result['title'] = title_element.get_text(strip=True) else: logging.debug("no title found. href: {href}") # 提取基本信息(识别码、发行日期等) info_div = div_container.find('div', class_='info') if not info_div: logging.warning(f"found no div info tag. href: {href}") return None # 定义字段映射关系(多种语言支持) field_mapping = { 'serial_number': ['識別碼:', '识别码:', 'ID:', '品番:'], 'release_date': ['發行日期:', '发行日期:', 'Release Date:', '発売日:'], 'duration': ['長度:', '长度:', 'Length:', '収録時間:'], 'studio': ['製作商:', '制作商:', 'Studio:', 'メーカー:'], 'label': ['發行商:', '发行商:', 'Label:', 'レーベル:'], 'series': ['系列:', 'Series:', 'シリーズ:'] } # 遍历所有p标签查找信息 p_tags = info_div.find_all('p') for p in p_tags: # 查找header标签 header = p.find('span', class_='header') if header: header_text = header.get_text(strip=True) # 查找匹配的目标键名 target_key = next((k for k, v in field_mapping.items() if header_text in v), None) if target_key: # 获取值(处理文本和链接) if target_key in ['studio', 'label', 'series']: # 处理有链接的字段 a_tag = p.find('a') if a_tag: result[target_key]['name'] = a_tag.get_text(strip=True) result[target_key]['href'] = a_tag.get('href', '') else: # 没有链接,直接获取文本 value_text = p.get_text(strip=True) # 移除header文本 value_text = value_text.replace(header_text, '').strip() result[target_key]['name'] = value_text logging.debug(f"{header_text} 没有链接,直接提取文本") else: # 处理普通文本字段 value_text = p.get_text(strip=True) # 移除header文本 value_text = value_text.replace(header_text, '').strip() # 特殊处理:提取时长的数字部分(咱不处理) if target_key == 'duration' and False: num_match = re.search(r'(\d+)', value_text) if num_match: result[target_key] = num_match.group(1) else: result[target_key] = value_text else: result[target_key] = value_text # 处理类别字段 tag_lables = info_div.find_all('label') for item in tag_lables: link = item.find('a') if link: genre = { 'name': link.get_text(strip=True), 'href': link.get('href', '') } result['tags'].append(genre) # 提取演员信息 star_p = info_div.find('p', class_='star-show') if star_p: # 查找演员列表 star_list = star_p.find_next('ul') if star_list: star_items = star_list.find_all('div', class_='star-name') for item in star_items: link = item.find('a') if link: actor = { 'name': link.get_text(strip=True), 'href': link.get('href', '') } result['actors'].append(actor) else: logging.debug(f"actors not found.") else: logging.debug("no star-name area. href: {href}") else: logging.debug("no star-show area. href: {href}") except Exception as e: logging.warning(f"parse movie detail error. href: {href}, error: {str(e)}", exc_info=True) return result