modify some scripts.
This commit is contained in:
@ -30,9 +30,15 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
|
||||
try:
|
||||
if 'javdb.com' not in url.lower():
|
||||
logging.error(f'wrong url format: {url}')
|
||||
return None
|
||||
return None, None
|
||||
|
||||
response = scraper.get(url, headers=headers)
|
||||
|
||||
# 处理 HTTP 状态码
|
||||
if response.status_code == 404:
|
||||
logging.warning(f"Page not found (404): {url}")
|
||||
return None, 404 # 直接返回 404,调用方可以跳过
|
||||
|
||||
response.raise_for_status() # 处理 HTTP 错误
|
||||
|
||||
# 预处理 HTML(如果提供了 preprocessor)
|
||||
@ -40,7 +46,7 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
|
||||
|
||||
soup = BeautifulSoup(html_text, parser)
|
||||
if validator(soup): # 进行自定义页面检查
|
||||
return soup
|
||||
return soup, response.status_code
|
||||
|
||||
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
|
||||
except cloudscraper.exceptions.CloudflareChallengeError as e:
|
||||
@ -51,7 +57,7 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
|
||||
logging.error(f"Unexpected error on {url}: {e}, Retring...")
|
||||
|
||||
logging.error(f'Fetching failed after max retries. {url}')
|
||||
return None # 达到最大重试次数仍然失败
|
||||
return None, None # 达到最大重试次数仍然失败
|
||||
|
||||
# 修复 HTML 结构,去除多余标签并修正 <a> 标签,在获取人种的时候需要
|
||||
def preprocess_html(html):
|
||||
@ -78,6 +84,21 @@ def url_page_num(href):
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
# <span class="avatar" style="background-image: url(https://c0.jdbstatic.com/avatars/md/mdRn.jpg)"></span>
|
||||
def parse_avatar_image(soup):
|
||||
try:
|
||||
span = soup.find("span", class_="avatar")
|
||||
if not span:
|
||||
return "" # 没有找到 <span> 元素,返回空字符串
|
||||
|
||||
style = span.get("style", "")
|
||||
match = re.search(r'url\(["\']?(.*?)["\']?\)', style)
|
||||
return match.group(1) if match else "" # 解析成功返回 URL,否则返回空字符串
|
||||
except Exception as e:
|
||||
return "" # 发生异常时,返回空字符串
|
||||
|
||||
|
||||
# 解析 HTML 内容,提取需要的数据
|
||||
def parse_actors_uncensored(soup, href):
|
||||
div_actors = soup.find("div", id='actors')
|
||||
@ -123,6 +144,29 @@ def parse_actors_uncensored(soup, href):
|
||||
|
||||
# 解析 HTML 内容,提取需要的数据
|
||||
def parse_actor_detail(soup, href):
|
||||
# 先找一下别名
|
||||
alias_list = []
|
||||
|
||||
div_meta = soup.find('span', class_='actor-section-name')
|
||||
if not div_meta:
|
||||
logging.warning(f'warning: no meta data found in page {href}')
|
||||
return None, None
|
||||
alias_div = soup.find('div', class_='column section-title')
|
||||
|
||||
if alias_div:
|
||||
meta_list = alias_div.find_all('span', class_='section-meta')
|
||||
if len(meta_list) > 1:
|
||||
alias_list = meta_list[0].text.strip().split(", ")
|
||||
|
||||
# 头像
|
||||
pic = ''
|
||||
avatar = soup.find("div", class_="column actor-avatar")
|
||||
if avatar:
|
||||
pic = parse_avatar_image(avatar)
|
||||
|
||||
# 返回数据
|
||||
actor = {}
|
||||
|
||||
div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
|
||||
if not div_movies:
|
||||
logging.warning(f"Warning: No movies div found ")
|
||||
@ -157,7 +201,13 @@ def parse_actor_detail(soup, href):
|
||||
if next_page_number and next_page_number > current_page_number :
|
||||
next_url = host_url + next_page_url
|
||||
|
||||
return list_data, next_url
|
||||
actor = {
|
||||
'pic' : pic,
|
||||
'alias' : alias_list,
|
||||
'movies' : list_data
|
||||
}
|
||||
|
||||
return actor, next_url
|
||||
|
||||
|
||||
# 解析 HTML 内容,提取需要的数据
|
||||
@ -257,7 +307,7 @@ def parse_series_detail(soup, href):
|
||||
div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
|
||||
if not div_movies:
|
||||
logging.warning(f"Warning: No movies div found ")
|
||||
return None, None
|
||||
return [], None
|
||||
|
||||
# 解析元素
|
||||
rows = div_movies.find_all('div', class_='item')
|
||||
@ -337,7 +387,7 @@ def parse_maker_detail(soup, href):
|
||||
div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
|
||||
if not div_movies:
|
||||
logging.warning(f"Warning: No movies div found ")
|
||||
return None, None
|
||||
return [], None
|
||||
|
||||
# 解析元素
|
||||
rows = div_movies.find_all('div', class_='item')
|
||||
|
||||
Reference in New Issue
Block a user