modify scripts

This commit is contained in:
oscarz
2025-06-26 15:59:56 +08:00
parent 6fce587aad
commit d42e9b0456
2 changed files with 51 additions and 12 deletions

View File

@ -364,10 +364,31 @@ class JavbusCrawler(GenericCrawler):
# 提取前两个元素作为工作室和角色
studio = parts[video_index - 2]
role = parts[video_index - 1]
result['meta'] = {'title': studio, 'role': role}
result['meta']['title'] = studio
result['meta']['role'] = role
else:
logging.debug(f"无法按规则解析: {' - '.join(parts)}")
# 提取全部影片和已有磁力的数量
# 查找a标签
a_tags = soup.select('.alert.alert-success.alert-common a.mypointer')
if not a_tags:
logging.warning(f'found no movie cnt. href: {href}')
else:
for a in a_tags:
text = a.get_text(strip=True)
# 提取全部影片数量
if '全部影片' in text:
match = re.search(r'全部影片\s*(\d+)\s*', text)
if match:
result['meta']['movies_cnt'] = int(match.group(1))
# 提取已有磁力数量
if '已有磁力' in text:
match = re.search(r'已有磁力\s*(\d+)\s*', text)
if match:
result['meta']['magnet_cnt'] = int(match.group(1))
div_waterfall = soup.find('div', id='waterfall')
if not div_waterfall:
logging.warning(f"found no records. href: {href}")
@ -415,19 +436,19 @@ class JavbusCrawler(GenericCrawler):
# 提取标题
div_container = soup.find('div', class_='container')
if not div_container:
logging.warning(f"found no container tag.")
logging.warning(f"found no container tag. href: {href}")
return None
title_element = div_container.find('h3')
if title_element:
result['title'] = title_element.get_text(strip=True)
else:
logging.debug("未找到影片标题")
logging.debug("no title found. href: {href}")
# 提取基本信息(识别码、发行日期等)
info_div = div_container.find('div', class_='info')
if not info_div:
logging.warning(f"found no div info tag.")
logging.warning(f"found no div info tag. href: {href}")
return None
# 定义字段映射关系(多种语言支持)
@ -511,11 +532,11 @@ class JavbusCrawler(GenericCrawler):
else:
logging.debug(f"actors not found.")
else:
logging.warning("未找到演员列表区域")
logging.debug("no star-name area. href: {href}")
else:
logging.warning("未找到演员标题")
logging.debug("no star-show area. href: {href}")
except Exception as e:
logging.error(f"解析影片详情时发生错误: {str(e)}", exc_info=True)
logging.warning(f"parse movie detail error. href: {href}, error: {str(e)}", exc_info=True)
return result

View File

@ -82,7 +82,7 @@ def fetch_actor_list():
# 从studio/label/series中获取影片
def fetch_movies_common(tbl):
if debug:
url_list = db_tools.query_list_common(tbl=tbl)
url_list = db_tools.query_list_common(tbl=tbl, limit=2)
else:
if g_uncensored==1:
url_list = db_tools.query_list_common(tbl=tbl, uncensored=1)
@ -101,6 +101,8 @@ def fetch_movies_common(tbl):
if not utils.is_valid_url(url):
logging.info(f'invalid url ({url}) in {tbl}, row id: {row_id}. skipping...')
continue
meta_data = None
# 去掉可下载的标志(如果有)
next_url = url
while next_url:
@ -108,7 +110,11 @@ def fetch_movies_common(tbl):
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_studios_labels_series_detail(soup, next_url)
if list_data:
if list_data:
# 更新metadata
if meta_data is None:
meta_data = list_data.get('meta', {})
# 根据tbl的值动态构建额外参数
extra_kwargs = {}
if tbl == 'studio':
@ -138,7 +144,17 @@ def fetch_movies_common(tbl):
# 调试增加break
if debug:
return True
break
# 更新metadata
if meta_data and meta_data.get('movies_cnt') is not None:
meta_data['href'] = url
tmp_id = db_tools.update_pubs_multilang(meta_data, tbl)
if tmp_id:
logging.debug(f'update pubs multi lang. data: {meta_data}')
else:
logging.warning(f'update pubs multi lang failed. data: {meta_data}')
# 更新makers列表中的影片信息
def fetch_movies_by_studio():
@ -484,7 +500,9 @@ def main(cmd, args):
db_tools.finalize_task_log(task_id)
# TODO:
# 1, tags 和 studio / label / series 的多语言
# 1, tags 和 studio / label / series 的多语言 ---done
# 2, studio / label / series 表增加字段: movies_cnt magnet_cnt
# 3, movie 页面保存磁力链接
# 设置环境变量
def set_env(args):