modify scripts
This commit is contained in:
@ -364,10 +364,31 @@ class JavbusCrawler(GenericCrawler):
|
|||||||
# 提取前两个元素作为工作室和角色
|
# 提取前两个元素作为工作室和角色
|
||||||
studio = parts[video_index - 2]
|
studio = parts[video_index - 2]
|
||||||
role = parts[video_index - 1]
|
role = parts[video_index - 1]
|
||||||
result['meta'] = {'title': studio, 'role': role}
|
result['meta']['title'] = studio
|
||||||
|
result['meta']['role'] = role
|
||||||
else:
|
else:
|
||||||
logging.debug(f"无法按规则解析: {' - '.join(parts)}")
|
logging.debug(f"无法按规则解析: {' - '.join(parts)}")
|
||||||
|
|
||||||
|
# 提取全部影片和已有磁力的数量
|
||||||
|
# 查找a标签
|
||||||
|
a_tags = soup.select('.alert.alert-success.alert-common a.mypointer')
|
||||||
|
if not a_tags:
|
||||||
|
logging.warning(f'found no movie cnt. href: {href}')
|
||||||
|
else:
|
||||||
|
for a in a_tags:
|
||||||
|
text = a.get_text(strip=True)
|
||||||
|
# 提取全部影片数量
|
||||||
|
if '全部影片' in text:
|
||||||
|
match = re.search(r'全部影片\s*(\d+)\s*', text)
|
||||||
|
if match:
|
||||||
|
result['meta']['movies_cnt'] = int(match.group(1))
|
||||||
|
|
||||||
|
# 提取已有磁力数量
|
||||||
|
if '已有磁力' in text:
|
||||||
|
match = re.search(r'已有磁力\s*(\d+)\s*', text)
|
||||||
|
if match:
|
||||||
|
result['meta']['magnet_cnt'] = int(match.group(1))
|
||||||
|
|
||||||
div_waterfall = soup.find('div', id='waterfall')
|
div_waterfall = soup.find('div', id='waterfall')
|
||||||
if not div_waterfall:
|
if not div_waterfall:
|
||||||
logging.warning(f"found no records. href: {href}")
|
logging.warning(f"found no records. href: {href}")
|
||||||
@ -415,19 +436,19 @@ class JavbusCrawler(GenericCrawler):
|
|||||||
# 提取标题
|
# 提取标题
|
||||||
div_container = soup.find('div', class_='container')
|
div_container = soup.find('div', class_='container')
|
||||||
if not div_container:
|
if not div_container:
|
||||||
logging.warning(f"found no container tag.")
|
logging.warning(f"found no container tag. href: {href}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
title_element = div_container.find('h3')
|
title_element = div_container.find('h3')
|
||||||
if title_element:
|
if title_element:
|
||||||
result['title'] = title_element.get_text(strip=True)
|
result['title'] = title_element.get_text(strip=True)
|
||||||
else:
|
else:
|
||||||
logging.debug("未找到影片标题")
|
logging.debug(f"no title found. href: {href}")
|
||||||
|
|
||||||
# 提取基本信息(识别码、发行日期等)
|
# 提取基本信息(识别码、发行日期等)
|
||||||
info_div = div_container.find('div', class_='info')
|
info_div = div_container.find('div', class_='info')
|
||||||
if not info_div:
|
if not info_div:
|
||||||
logging.warning(f"found no div info tag.")
|
logging.warning(f"found no div info tag. href: {href}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# 定义字段映射关系(多种语言支持)
|
# 定义字段映射关系(多种语言支持)
|
||||||
@ -511,11 +532,11 @@ class JavbusCrawler(GenericCrawler):
|
|||||||
else:
|
else:
|
||||||
logging.debug(f"actors not found.")
|
logging.debug(f"actors not found.")
|
||||||
else:
|
else:
|
||||||
logging.warning("未找到演员列表区域")
|
logging.debug(f"no star-name area. href: {href}")
|
||||||
else:
|
else:
|
||||||
logging.warning("未找到演员标题")
|
logging.debug(f"no star-show area. href: {href}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"解析影片详情时发生错误: {str(e)}", exc_info=True)
|
logging.warning(f"parse movie detail error. href: {href}, error: {str(e)}", exc_info=True)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|||||||
@ -82,7 +82,7 @@ def fetch_actor_list():
|
|||||||
# 从studio/label/series中获取影片
|
# 从studio/label/series中获取影片
|
||||||
def fetch_movies_common(tbl):
|
def fetch_movies_common(tbl):
|
||||||
if debug:
|
if debug:
|
||||||
url_list = db_tools.query_list_common(tbl=tbl)
|
url_list = db_tools.query_list_common(tbl=tbl, limit=2)
|
||||||
else:
|
else:
|
||||||
if g_uncensored==1:
|
if g_uncensored==1:
|
||||||
url_list = db_tools.query_list_common(tbl=tbl, uncensored=1)
|
url_list = db_tools.query_list_common(tbl=tbl, uncensored=1)
|
||||||
@ -101,6 +101,8 @@ def fetch_movies_common(tbl):
|
|||||||
if not utils.is_valid_url(url):
|
if not utils.is_valid_url(url):
|
||||||
logging.info(f'invalid url ({url}) in {tbl}, row id: {row_id}. skipping...')
|
logging.info(f'invalid url ({url}) in {tbl}, row id: {row_id}. skipping...')
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
meta_data = None
|
||||||
# 去掉可下载的标志(如果有)
|
# 去掉可下载的标志(如果有)
|
||||||
next_url = url
|
next_url = url
|
||||||
while next_url:
|
while next_url:
|
||||||
@ -109,6 +111,10 @@ def fetch_movies_common(tbl):
|
|||||||
if soup:
|
if soup:
|
||||||
list_data, next_url = scraper.parse_studios_labels_series_detail(soup, next_url)
|
list_data, next_url = scraper.parse_studios_labels_series_detail(soup, next_url)
|
||||||
if list_data:
|
if list_data:
|
||||||
|
# 更新metadata
|
||||||
|
if meta_data is None:
|
||||||
|
meta_data = list_data.get('meta', {})
|
||||||
|
|
||||||
# 根据tbl的值动态构建额外参数
|
# 根据tbl的值动态构建额外参数
|
||||||
extra_kwargs = {}
|
extra_kwargs = {}
|
||||||
if tbl == 'studio':
|
if tbl == 'studio':
|
||||||
@ -138,7 +144,17 @@ def fetch_movies_common(tbl):
|
|||||||
|
|
||||||
# 调试增加break
|
# 调试增加break
|
||||||
if debug:
|
if debug:
|
||||||
return True
|
break
|
||||||
|
|
||||||
|
# 更新metadata
|
||||||
|
if meta_data and meta_data.get('movies_cnt') is not None:
|
||||||
|
meta_data['href'] = url
|
||||||
|
tmp_id = db_tools.update_pubs_multilang(meta_data, tbl)
|
||||||
|
if tmp_id:
|
||||||
|
logging.debug(f'update pubs multi lang. data: {meta_data}')
|
||||||
|
else:
|
||||||
|
logging.warning(f'update pubs multi lang failed. data: {meta_data}')
|
||||||
|
|
||||||
|
|
||||||
# 更新makers列表中的影片信息
|
# 更新makers列表中的影片信息
|
||||||
def fetch_movies_by_studio():
|
def fetch_movies_by_studio():
|
||||||
@ -484,7 +500,9 @@ def main(cmd, args):
|
|||||||
db_tools.finalize_task_log(task_id)
|
db_tools.finalize_task_log(task_id)
|
||||||
|
|
||||||
# TODO:
|
# TODO:
|
||||||
# 1, tags 和 studio / label / series 的多语言
|
# 1, tags 和 studio / label / series 的多语言 ---done
|
||||||
|
# 2, studio / label / series 表增加字段: movies_cnt magnet_cnt
|
||||||
|
# 3, movie 页面保存磁力链接
|
||||||
|
|
||||||
# 设置环境变量
|
# 设置环境变量
|
||||||
def set_env(args):
|
def set_env(args):
|
||||||
|
|||||||
Reference in New Issue
Block a user