modify scripts
@@ -333,6 +333,69 @@ class JavbusCrawler(GenericCrawler):
        return movie_info

    # Fetch studio / label / series details
    def parse_studios_labels_series_detail(self, soup, href):
        """
        Parse the Javbus page content, extracting the page metadata
        (studio and role taken from the title) and the movie list.
        """
        result = {
            'meta': {},
            'movies': []
        }

        try:
            # Parse the title
            b_tag = soup.select_one('.alert.alert-success.alert-common p b')
            if not b_tag:
                logging.warning(f'found no title. href: {href}')
            else:
                # Get the text content
                title_text = b_tag.get_text(strip=True)
                # Split the text on hyphens
                parts = [part.strip() for part in title_text.split('-')]
                # The "movies" keyword in the languages the site serves
                video_keywords = ['影片', 'Video', '映画', 'Videos', 'Movies']

                # Locate the "movies" keyword
                video_index = next((i for i, part in enumerate(parts) if part in video_keywords), None)

                if video_index is not None and video_index >= 2:
                    # The two elements before the keyword are the studio and the role
                    studio = parts[video_index - 2]
                    role = parts[video_index - 1]
                    result['meta'] = {'title': studio, 'role': role}
                else:
                    logging.debug(f"cannot parse title by rule: {' - '.join(parts)}")

            div_waterfall = soup.find('div', id='waterfall')
            if not div_waterfall:
                logging.warning(f"found no records. href: {href}")
            else:
                # Parse the movie list
                movie_boxes = div_waterfall.find_all('a', class_='movie-box')
                if movie_boxes:
                    for movie_box in movie_boxes:
                        movie_info = self.parse_movie_info(movie_box)
                        if movie_info:
                            result['movies'].append(movie_info)
                else:
                    logging.debug(f"movie-box not found. href: {href}")

        except Exception as e:
            logging.warning(f"parse html error: {str(e)}, href: {href}", exc_info=True)

        # Look for the "next page" button
        next_url = None
        div_link = soup.find("div", class_='text-center hidden-xs')
        if div_link:
            next_page_element = soup.find('a', id='next')
            if next_page_element:
                next_page_url = next_page_element['href']
                next_url = urljoin(href, next_page_url)

        return result, next_url
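
The (result, next_url) pair returned above is convenient to drive from a pagination loop. A minimal sketch, assuming a hypothetical fetch_soup(url) helper (not part of this commit) that downloads a page and returns a BeautifulSoup tree:

# Pagination-driver sketch. `fetch_soup` is a hypothetical helper, not part
# of this commit; it fetches `url` and returns a parsed BeautifulSoup tree.
def crawl_all_pages(crawler, start_url, fetch_soup):
    all_movies = []
    url = start_url
    while url:
        soup = fetch_soup(url)
        result, next_url = crawler.parse_studios_labels_series_detail(soup, url)
        all_movies.extend(result['movies'])
        url = next_url  # becomes None once there is no "next page" link
    return all_movies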

    # Parse the Javbus movie detail page content
    def parse_movie_detail(self, soup, href, title):
        result = {
@@ -456,231 +519,3 @@ class JavbusCrawler(GenericCrawler):
            logging.error(f"error while parsing movie detail: {str(e)}", exc_info=True)

        return result

    def parse_series_uncensored(self, soup, href):
        div_series = soup.find("div", id='series')
        if not div_series:
            logging.warning(f"no series div found. href: {href}")
            return None, None

        # Parse the list entries
        rows = div_series.find_all('a', class_='box')

        list_data = []
        next_url = None
        for row in rows:
            name = row.find('strong').text.strip()
            # Do not shadow the `href` parameter: it is needed below to
            # work out the current page number.
            row_href = row['href']
            div_movies = row.find('span')
            movies = 0
            if div_movies:
                match = re.search(r'\((\d+)\)', div_movies.text.strip())
                if match:
                    movies = int(match.group(1))

            list_data.append({
                'name': name,
                'href': host_url + row_href if row_href else '',
                'movies': movies
            })

        # Look for the "next page" button
        next_page_element = soup.find('a', class_='pagination-next')
        if next_page_element:
            next_page_url = next_page_element['href']
            next_page_number = self.url_page_num(next_page_url)
            current_page_number = self.url_page_num(href)
            if current_page_number is None:
                current_page_number = 0
            if next_page_number and next_page_number > current_page_number:
                next_url = host_url + next_page_url

        return list_data, next_url
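
The pagination check relies on self.url_page_num, which is outside this excerpt. As a labelled assumption about its contract (return the page number embedded in a URL, or None when absent), a minimal sketch assuming query-style ?page=N URLs:

from urllib.parse import urlparse, parse_qs

# Hypothetical sketch of url_page_num; the real helper is not in this diff.
# Assumes query-style pagination such as ...?page=3 and returns None when
# the URL carries no usable page parameter.
def url_page_num(self, url):
    if not url:
        return None
    query = parse_qs(urlparse(url).query)
    try:
        return int(query['page'][0])
    except (KeyError, ValueError):
        return None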

    def parse_series_detail(self, soup, href):
        # div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
        div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
        if not div_movies:
            logging.warning(f"no movies div found. href: {href}")
            return [], None

        # Parse the list entries
        rows = div_movies.find_all('div', class_='item')

        list_data = []
        next_url = None
        for row in rows:
            link = row.find('a', class_='box')['href']
            serial_number = row.find('strong').text.strip()
            title = row.find('div', class_='video-title').text.strip()
            release_date = row.find('div', class_='meta').text.strip()
            list_data.append({
                'href': host_url + link if link else '',
                'serial_number': serial_number,
                'title': title,
                'release_date': release_date
            })

        # Look for the "next page" button
        next_page_element = soup.find('a', class_='pagination-next')
        if next_page_element:
            next_page_url = next_page_element['href']
            next_page_number = self.url_page_num(next_page_url)
            current_page_number = self.url_page_num(href)
            if current_page_number is None:
                current_page_number = 0
            if next_page_number and next_page_number > current_page_number:
                next_url = host_url + next_page_url

        return list_data, next_url
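
Each row access above assumes the expected child elements exist; row.find('strong') returns None when a card lacks a serial number, and .text then raises AttributeError. A defensive variant, sketch only and not part of this commit:

# Defensive lookup sketch, not part of this commit: returns '' instead of
# raising AttributeError when the expected child element is missing.
def _text_or_empty(parent, *find_args, **find_kwargs):
    el = parent.find(*find_args, **find_kwargs)
    return el.text.strip() if el else ''

# e.g. serial_number = _text_or_empty(row, 'strong')
#      title = _text_or_empty(row, 'div', class_='video-title')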

    def parse_makers_uncensored(self, soup, href):
        div_series = soup.find("div", id='makers')
        if not div_series:
            logging.warning(f"no makers div found. href: {href}")
            return None, None

        # Parse the list entries
        rows = div_series.find_all('a', class_='box')

        list_data = []
        next_url = None
        for row in rows:
            name = row.find('strong').text.strip()
            # Keep the `href` parameter intact for the pagination check below.
            row_href = row['href']
            div_movies = row.find('span')
            movies = 0
            if div_movies:
                match = re.search(r'\((\d+)\)', div_movies.text.strip())
                if match:
                    movies = int(match.group(1))

            list_data.append({
                'name': name,
                'href': host_url + row_href if row_href else '',
                'movies': movies
            })

        # Look for the "next page" button
        next_page_element = soup.find('a', class_='pagination-next')
        if next_page_element:
            next_page_url = next_page_element['href']
            next_page_number = self.url_page_num(next_page_url)
            current_page_number = self.url_page_num(href)
            if current_page_number is None:
                current_page_number = 0
            if next_page_number and next_page_number > current_page_number:
                next_url = host_url + next_page_url

        return list_data, next_url

    def parse_maker_detail(self, soup, href):
        # div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
        div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
        if not div_movies:
            logging.warning(f"no movies div found. href: {href}")
            return [], None

        # Parse the list entries
        rows = div_movies.find_all('div', class_='item')

        list_data = []
        next_url = None
        for row in rows:
            link = row.find('a', class_='box')['href']
            serial_number = row.find('strong').text.strip()
            title = row.find('div', class_='video-title').text.strip()
            release_date = row.find('div', class_='meta').text.strip()
            list_data.append({
                'href': host_url + link if link else '',
                'serial_number': serial_number,
                'title': title,
                'release_date': release_date
            })

        # Look for the "next page" button
        next_page_element = soup.find('a', class_='pagination-next')
        if next_page_element:
            next_page_url = next_page_element['href']
            next_page_number = self.url_page_num(next_page_url)
            current_page_number = self.url_page_num(href)
            if current_page_number is None:
                current_page_number = 0
            if next_page_number and next_page_number > current_page_number:
                next_url = host_url + next_page_url

        return list_data, next_url

    def parse_publisher_detail(self, soup, href):
        # div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
        div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
        if not div_movies:
            logging.warning(f"no movies div found. href: {href}")
            return [], None

        # Parse the list entries
        rows = div_movies.find_all('div', class_='item')

        list_data = []
        next_url = None
        for row in rows:
            link = row.find('a', class_='box')['href']
            serial_number = row.find('strong').text.strip()
            title = row.find('div', class_='video-title').text.strip()
            release_date = row.find('div', class_='meta').text.strip()
            list_data.append({
                'href': host_url + link if link else '',
                'serial_number': serial_number,
                'title': title,
                'release_date': release_date
            })

        # Look for the "next page" button
        next_page_element = soup.find('a', class_='pagination-next')
        if next_page_element:
            next_page_url = next_page_element['href']
            next_page_number = self.url_page_num(next_page_url)
            current_page_number = self.url_page_num(href)
            if current_page_number is None:
                current_page_number = 0
            if next_page_number and next_page_number > current_page_number:
                next_url = host_url + next_page_url

        return list_data, next_url

    def parse_uncensored(self, soup, href):
        # div_movies = soup.find("div", class_='movie-list h cols-4 vcols-8')
        div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
        if not div_movies:
            logging.warning(f"no movies div found. href: {href}")
            return [], None

        # Parse the list entries
        rows = div_movies.find_all('div', class_='item')

        list_data = []
        next_url = None
        for row in rows:
            link = row.find('a', class_='box')['href']
            serial_number = row.find('strong').text.strip()
            title = row.find('div', class_='video-title').text.strip()
            release_date = row.find('div', class_='meta').text.strip()
            list_data.append({
                'href': host_url + link if link else '',
                'serial_number': serial_number,
                'title': title,
                'release_date': release_date
            })

        # Look for the "next page" button
        next_page_element = soup.find('a', class_='pagination-next')
        if next_page_element:
            next_page_url = next_page_element['href']
            next_page_number = self.url_page_num(next_page_url)
            current_page_number = self.url_page_num(href)
            if current_page_number is None:
                current_page_number = 0
            if next_page_number and next_page_number > current_page_number:
                next_url = host_url + next_page_url

        return list_data, next_url
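
parse_series_detail, parse_maker_detail, parse_publisher_detail and parse_uncensored above share an identical body, and the two uncensored index parsers differ only in the div id they look up. A refactor sketch (the helper name _parse_movie_grid is hypothetical, not part of this commit):

# Refactor sketch only; `_parse_movie_grid` is a hypothetical method of
# JavbusCrawler, not part of this commit. The shared grid-parsing body
# (identical in the four methods above) moves into one helper, and each
# public method keeps its signature.
def _parse_movie_grid(self, soup, href):
    # ... the shared body of parse_series_detail, unchanged ...
    pass

def parse_series_detail(self, soup, href):
    return self._parse_movie_grid(soup, href)

def parse_maker_detail(self, soup, href):
    return self._parse_movie_grid(soup, href)

def parse_publisher_detail(self, soup, href):
    return self._parse_movie_grid(soup, href)

def parse_uncensored(self, soup, href):
    return self._parse_movie_grid(soup, href)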