modify scripts

This commit is contained in:
oscarz
2025-06-25 11:38:06 +08:00
parent 9cf521a0d6
commit 5ebfe7cb8c
4 changed files with 306 additions and 309 deletions

View File

@ -333,6 +333,69 @@ class JavbusCrawler(GenericCrawler):
return movie_info
# Parse a studio / label / series listing page
def parse_studios_labels_series_detail(self, soup, href):
    """Parse a Javbus studio / label / series listing page.

    Args:
        soup: Parsed page. Must offer ``select_one`` and ``find`` the way a
            BeautifulSoup document does.
        href: URL the page was fetched from. Used in log messages and as the
            base for resolving the next-page link.

    Returns:
        A ``(result, next_url)`` tuple. ``result`` is a dict with two keys:
        ``'meta'`` is ``{'title': ..., 'role': ...}`` when the page heading
        could be split, otherwise ``{}``; ``'movies'`` holds every truthy
        value returned by ``self.parse_movie_info`` for the ``a.movie-box``
        elements. ``next_url`` is the absolute URL of the next page, or
        ``None`` when there is none.
    """
    # The word for "movies" in the languages the heading may be rendered in.
    video_keywords = ('影片', 'Video', '映画', 'Videos', 'Movies')
    result = {
        'meta': {},
        'movies': []
    }

    try:
        # Heading: dash-separated text; the two segments immediately before
        # the "movies" keyword are taken as the name and its role.
        b_tag = soup.select_one('.alert.alert-success.alert-common p b')
        if not b_tag:
            logging.warning(f'found no title. href: {href}')
        else:
            title_text = b_tag.get_text(strip=True)
            parts = [part.strip() for part in title_text.split('-')]
            video_index = next((i for i, part in enumerate(parts) if part in video_keywords), None)
            if video_index is not None and video_index >= 2:
                result['meta'] = {
                    'title': parts[video_index - 2],
                    'role': parts[video_index - 1],
                }
            else:
                logging.debug(f"无法按规则解析: {' - '.join(parts)}")

        # Movie list
        div_waterfall = soup.find('div', id='waterfall')
        if not div_waterfall:
            logging.warning(f"found no records. href: {href}")
        else:
            movie_boxes = div_waterfall.find_all('a', class_='movie-box')
            if movie_boxes:
                for movie_box in movie_boxes:
                    movie_info = self.parse_movie_info(movie_box)
                    if movie_info:
                        result['movies'].append(movie_info)
            else:
                logging.debug(f"movie-box not found. href: {href}")
    except Exception as e:
        # Best effort: return whatever was collected before the failure.
        logging.warning(f"parse html error: {str(e)}, href: {href}", exc_info=True)

    # Next-page link. This runs outside the try block, so it must not raise:
    # read the attribute with .get() instead of indexing, and ignore an empty
    # href (urljoin would otherwise hand back the current page URL).
    next_url = None
    if soup.find("div", class_='text-center hidden-xs'):
        next_page_element = soup.find('a', id='next')
        next_page_url = next_page_element.get('href') if next_page_element else None
        if next_page_url:
            next_url = urljoin(href, next_page_url)

    return result, next_url
# 解析Javbus影片详情页内容
def parse_movie_detail(self, soup, href, title):
result = {
@ -456,231 +519,3 @@ class JavbusCrawler(GenericCrawler):
logging.error(f"解析影片详情时发生错误: {str(e)}", exc_info=True)
return result
def parse_series_uncensored(self, soup, href):
    """Parse a series index page into a list of series entries.

    Args:
        soup: Parsed page offering a BeautifulSoup-style ``find``.
        href: URL of the page being parsed. Its page number is compared with
            the next-page link's to decide whether to follow that link.

    Returns:
        ``(None, None)`` when the page has no ``div#series``. Otherwise
        ``(list_data, next_url)``, where each entry of ``list_data`` is
        ``{'name': str, 'href': str, 'movies': int}`` and ``next_url`` is the
        next page's URL or ``None``.
    """
    div_series = soup.find("div", id='series')
    if not div_series:
        logging.warning("Warning: No div_series div found ")
        return None, None

    list_data = []
    for row in div_series.find_all('a', class_='box'):
        name_tag = row.find('strong')
        if name_tag is None:
            # One malformed entry must not discard the rest of the page.
            logging.warning(f"skip series entry without a name. href: {href}")
            continue

        # Kept in its own variable: reusing the name `href` here used to
        # overwrite the page URL that the pagination check below relies on.
        row_href = row.get('href')

        # The count is rendered as "(N)" inside the entry's <span>.
        movies = 0
        div_movies = row.find('span')
        if div_movies:
            match = re.search(r'\((\d+)\)', div_movies.text.strip())
            if match:
                movies = int(match.group(1))

        list_data.append({
            'name': name_tag.text.strip(),
            'href': host_url + row_href if row_href else '',
            'movies': movies
        })

    # Follow the "next" link only when it points past the current page.
    # NOTE(review): url_page_num presumably extracts the page number from a
    # URL and returns None when there is none - confirm against its definition.
    next_url = None
    next_page_element = soup.find('a', class_='pagination-next')
    next_page_url = next_page_element.get('href') if next_page_element else None
    if next_page_url:
        next_page_number = self.url_page_num(next_page_url)
        current_page_number = self.url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url
def parse_series_detail(self, soup, href):
    """Parse the movie list of a series page.

    Args:
        soup: Parsed page offering a BeautifulSoup-style ``find``.
        href: URL of the page being parsed. Used in log messages and to decide
            whether the next-page link points forward.

    Returns:
        ``([], None)`` when the movie-list container is missing. Otherwise
        ``(list_data, next_url)``, where each entry of ``list_data`` is
        ``{'href', 'serial_number', 'title', 'release_date'}`` and
        ``next_url`` is the next page's URL or ``None``.
    """
    # The pattern accepts both container variants: vcols-5 and vcols-8.
    div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
    if not div_movies:
        logging.warning("Warning: No movies div found ")
        return [], None

    list_data = []
    for row in div_movies.find_all('div', class_='item'):
        anchor = row.find('a', class_='box')
        serial_tag = row.find('strong')
        title_tag = row.find('div', class_='video-title')
        date_tag = row.find('div', class_='meta')
        if any(tag is None for tag in (anchor, serial_tag, title_tag, date_tag)):
            # One malformed item must not discard the rest of the page.
            logging.warning(f"skip malformed movie item. href: {href}")
            continue

        link = anchor.get('href')
        list_data.append({
            'href': host_url + link if link else '',
            'serial_number': serial_tag.text.strip(),
            'title': title_tag.text.strip(),
            'release_date': date_tag.text.strip()
        })

    # Follow the "next" link only when it points past the current page.
    # NOTE(review): url_page_num presumably extracts the page number from a
    # URL and returns None when there is none - confirm against its definition.
    next_url = None
    next_page_element = soup.find('a', class_='pagination-next')
    next_page_url = next_page_element.get('href') if next_page_element else None
    if next_page_url:
        next_page_number = self.url_page_num(next_page_url)
        current_page_number = self.url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url
def parse_makers_uncensored(self, soup, href):
    """Parse a makers index page into a list of maker entries.

    Args:
        soup: Parsed page offering a BeautifulSoup-style ``find``.
        href: URL of the page being parsed. Its page number is compared with
            the next-page link's to decide whether to follow that link.

    Returns:
        ``(None, None)`` when the page has no ``div#makers``. Otherwise
        ``(list_data, next_url)``, where each entry of ``list_data`` is
        ``{'name': str, 'href': str, 'movies': int}`` and ``next_url`` is the
        next page's URL or ``None``.
    """
    div_makers = soup.find("div", id='makers')
    if not div_makers:
        logging.warning("Warning: No makers div found ")
        return None, None

    list_data = []
    for row in div_makers.find_all('a', class_='box'):
        name_tag = row.find('strong')
        if name_tag is None:
            # One malformed entry must not discard the rest of the page.
            logging.warning(f"skip maker entry without a name. href: {href}")
            continue

        # Kept in its own variable: reusing the name `href` here used to
        # overwrite the page URL that the pagination check below relies on.
        row_href = row.get('href')

        # The count is rendered as "(N)" inside the entry's <span>.
        movies = 0
        div_movies = row.find('span')
        if div_movies:
            match = re.search(r'\((\d+)\)', div_movies.text.strip())
            if match:
                movies = int(match.group(1))

        list_data.append({
            'name': name_tag.text.strip(),
            'href': host_url + row_href if row_href else '',
            'movies': movies
        })

    # Follow the "next" link only when it points past the current page.
    # NOTE(review): url_page_num presumably extracts the page number from a
    # URL and returns None when there is none - confirm against its definition.
    next_url = None
    next_page_element = soup.find('a', class_='pagination-next')
    next_page_url = next_page_element.get('href') if next_page_element else None
    if next_page_url:
        next_page_number = self.url_page_num(next_page_url)
        current_page_number = self.url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url
def parse_maker_detail(self, soup, href):
    """Parse the movie list of a maker page.

    Args:
        soup: Parsed page offering a BeautifulSoup-style ``find``.
        href: URL of the page being parsed. Used in log messages and to decide
            whether the next-page link points forward.

    Returns:
        ``([], None)`` when the movie-list container is missing. Otherwise
        ``(list_data, next_url)``, where each entry of ``list_data`` is
        ``{'href', 'serial_number', 'title', 'release_date'}`` and
        ``next_url`` is the next page's URL or ``None``.
    """
    # The pattern accepts both container variants: vcols-5 and vcols-8.
    div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
    if not div_movies:
        logging.warning("Warning: No movies div found ")
        return [], None

    list_data = []
    for row in div_movies.find_all('div', class_='item'):
        anchor = row.find('a', class_='box')
        serial_tag = row.find('strong')
        title_tag = row.find('div', class_='video-title')
        date_tag = row.find('div', class_='meta')
        if any(tag is None for tag in (anchor, serial_tag, title_tag, date_tag)):
            # One malformed item must not discard the rest of the page.
            logging.warning(f"skip malformed movie item. href: {href}")
            continue

        link = anchor.get('href')
        list_data.append({
            'href': host_url + link if link else '',
            'serial_number': serial_tag.text.strip(),
            'title': title_tag.text.strip(),
            'release_date': date_tag.text.strip()
        })

    # Follow the "next" link only when it points past the current page.
    # NOTE(review): url_page_num presumably extracts the page number from a
    # URL and returns None when there is none - confirm against its definition.
    next_url = None
    next_page_element = soup.find('a', class_='pagination-next')
    next_page_url = next_page_element.get('href') if next_page_element else None
    if next_page_url:
        next_page_number = self.url_page_num(next_page_url)
        current_page_number = self.url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url
def parse_publisher_detail(self, soup, href):
    """Parse the movie list of a publisher page.

    Args:
        soup: Parsed page offering a BeautifulSoup-style ``find``.
        href: URL of the page being parsed. Used in log messages and to decide
            whether the next-page link points forward.

    Returns:
        ``([], None)`` when the movie-list container is missing. Otherwise
        ``(list_data, next_url)``, where each entry of ``list_data`` is
        ``{'href', 'serial_number', 'title', 'release_date'}`` and
        ``next_url`` is the next page's URL or ``None``.
    """
    # The pattern accepts both container variants: vcols-5 and vcols-8.
    div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
    if not div_movies:
        logging.warning("Warning: No movies div found ")
        return [], None

    list_data = []
    for row in div_movies.find_all('div', class_='item'):
        anchor = row.find('a', class_='box')
        serial_tag = row.find('strong')
        title_tag = row.find('div', class_='video-title')
        date_tag = row.find('div', class_='meta')
        if any(tag is None for tag in (anchor, serial_tag, title_tag, date_tag)):
            # One malformed item must not discard the rest of the page.
            logging.warning(f"skip malformed movie item. href: {href}")
            continue

        link = anchor.get('href')
        list_data.append({
            'href': host_url + link if link else '',
            'serial_number': serial_tag.text.strip(),
            'title': title_tag.text.strip(),
            'release_date': date_tag.text.strip()
        })

    # Follow the "next" link only when it points past the current page.
    # NOTE(review): url_page_num presumably extracts the page number from a
    # URL and returns None when there is none - confirm against its definition.
    next_url = None
    next_page_element = soup.find('a', class_='pagination-next')
    next_page_url = next_page_element.get('href') if next_page_element else None
    if next_page_url:
        next_page_number = self.url_page_num(next_page_url)
        current_page_number = self.url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url
def parse_uncensored(self, soup, href):
    """Parse the movie list of an uncensored listing page.

    Args:
        soup: Parsed page offering a BeautifulSoup-style ``find``.
        href: URL of the page being parsed. Used in log messages and to decide
            whether the next-page link points forward.

    Returns:
        ``([], None)`` when the movie-list container is missing. Otherwise
        ``(list_data, next_url)``, where each entry of ``list_data`` is
        ``{'href', 'serial_number', 'title', 'release_date'}`` and
        ``next_url`` is the next page's URL or ``None``.
    """
    # The pattern accepts both container variants: vcols-5 and vcols-8.
    div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-4 vcols-(5|8)'))
    if not div_movies:
        logging.warning("Warning: No movies div found ")
        return [], None

    list_data = []
    for row in div_movies.find_all('div', class_='item'):
        anchor = row.find('a', class_='box')
        serial_tag = row.find('strong')
        title_tag = row.find('div', class_='video-title')
        date_tag = row.find('div', class_='meta')
        if any(tag is None for tag in (anchor, serial_tag, title_tag, date_tag)):
            # One malformed item must not discard the rest of the page.
            logging.warning(f"skip malformed movie item. href: {href}")
            continue

        link = anchor.get('href')
        list_data.append({
            'href': host_url + link if link else '',
            'serial_number': serial_tag.text.strip(),
            'title': title_tag.text.strip(),
            'release_date': date_tag.text.strip()
        })

    # Follow the "next" link only when it points past the current page.
    # NOTE(review): url_page_num presumably extracts the page number from a
    # URL and returns None when there is none - confirm against its definition.
    next_url = None
    next_page_element = soup.find('a', class_='pagination-next')
    next_page_url = next_page_element.get('href') if next_page_element else None
    if next_page_url:
        next_page_number = self.url_page_num(next_page_url)
        current_page_number = self.url_page_num(href)
        if current_page_number is None:
            current_page_number = 0
        if next_page_number and next_page_number > current_page_number:
            next_url = host_url + next_page_url

    return list_data, next_url