modify scripts
This commit is contained in:
@ -364,10 +364,31 @@ class JavbusCrawler(GenericCrawler):
|
||||
# 提取前两个元素作为工作室和角色
|
||||
studio = parts[video_index - 2]
|
||||
role = parts[video_index - 1]
|
||||
result['meta'] = {'title': studio, 'role': role}
|
||||
result['meta']['title'] = studio
|
||||
result['meta']['role'] = role
|
||||
else:
|
||||
logging.debug(f"无法按规则解析: {' - '.join(parts)}")
|
||||
|
||||
|
||||
# 提取全部影片和已有磁力的数量
|
||||
# 查找a标签
|
||||
a_tags = soup.select('.alert.alert-success.alert-common a.mypointer')
|
||||
if not a_tags:
|
||||
logging.warning(f'found no movie cnt. href: {href}')
|
||||
else:
|
||||
for a in a_tags:
|
||||
text = a.get_text(strip=True)
|
||||
# 提取全部影片数量
|
||||
if '全部影片' in text:
|
||||
match = re.search(r'全部影片\s*(\d+)\s*', text)
|
||||
if match:
|
||||
result['meta']['movies_cnt'] = int(match.group(1))
|
||||
|
||||
# 提取已有磁力数量
|
||||
if '已有磁力' in text:
|
||||
match = re.search(r'已有磁力\s*(\d+)\s*', text)
|
||||
if match:
|
||||
result['meta']['magnet_cnt'] = int(match.group(1))
|
||||
|
||||
div_waterfall = soup.find('div', id='waterfall')
|
||||
if not div_waterfall:
|
||||
logging.warning(f"found no records. href: {href}")
|
||||
@ -415,19 +436,19 @@ class JavbusCrawler(GenericCrawler):
|
||||
# 提取标题
|
||||
div_container = soup.find('div', class_='container')
|
||||
if not div_container:
|
||||
logging.warning(f"found no container tag.")
|
||||
logging.warning(f"found no container tag. href: {href}")
|
||||
return None
|
||||
|
||||
title_element = div_container.find('h3')
|
||||
if title_element:
|
||||
result['title'] = title_element.get_text(strip=True)
|
||||
else:
|
||||
logging.debug("未找到影片标题")
|
||||
logging.debug("no title found. href: {href}")
|
||||
|
||||
# 提取基本信息(识别码、发行日期等)
|
||||
info_div = div_container.find('div', class_='info')
|
||||
if not info_div:
|
||||
logging.warning(f"found no div info tag.")
|
||||
logging.warning(f"found no div info tag. href: {href}")
|
||||
return None
|
||||
|
||||
# 定义字段映射关系(多种语言支持)
|
||||
@ -511,11 +532,11 @@ class JavbusCrawler(GenericCrawler):
|
||||
else:
|
||||
logging.debug(f"actors not found.")
|
||||
else:
|
||||
logging.warning("未找到演员列表区域")
|
||||
logging.debug("no star-name area. href: {href}")
|
||||
else:
|
||||
logging.warning("未找到演员标题")
|
||||
logging.debug("no star-show area. href: {href}")
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"解析影片详情时发生错误: {str(e)}", exc_info=True)
|
||||
logging.warning(f"parse movie detail error. href: {href}, error: {str(e)}", exc_info=True)
|
||||
|
||||
return result
|
||||
|
||||
@ -82,7 +82,7 @@ def fetch_actor_list():
|
||||
# 从studio/label/series中获取影片
|
||||
def fetch_movies_common(tbl):
|
||||
if debug:
|
||||
url_list = db_tools.query_list_common(tbl=tbl)
|
||||
url_list = db_tools.query_list_common(tbl=tbl, limit=2)
|
||||
else:
|
||||
if g_uncensored==1:
|
||||
url_list = db_tools.query_list_common(tbl=tbl, uncensored=1)
|
||||
@ -101,6 +101,8 @@ def fetch_movies_common(tbl):
|
||||
if not utils.is_valid_url(url):
|
||||
logging.info(f'invalid url ({url}) in {tbl}, row id: {row_id}. skipping...')
|
||||
continue
|
||||
|
||||
meta_data = None
|
||||
# 去掉可下载的标志(如果有)
|
||||
next_url = url
|
||||
while next_url:
|
||||
@ -108,7 +110,11 @@ def fetch_movies_common(tbl):
|
||||
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
|
||||
if soup:
|
||||
list_data, next_url = scraper.parse_studios_labels_series_detail(soup, next_url)
|
||||
if list_data:
|
||||
if list_data:
|
||||
# 更新metadata
|
||||
if meta_data is None:
|
||||
meta_data = list_data.get('meta', {})
|
||||
|
||||
# 根据tbl的值动态构建额外参数
|
||||
extra_kwargs = {}
|
||||
if tbl == 'studio':
|
||||
@ -138,7 +144,17 @@ def fetch_movies_common(tbl):
|
||||
|
||||
# 调试增加break
|
||||
if debug:
|
||||
return True
|
||||
break
|
||||
|
||||
# 更新metadata
|
||||
if meta_data and meta_data.get('movies_cnt') is not None:
|
||||
meta_data['href'] = url
|
||||
tmp_id = db_tools.update_pubs_multilang(meta_data, tbl)
|
||||
if tmp_id:
|
||||
logging.debug(f'update pubs multi lang. data: {meta_data}')
|
||||
else:
|
||||
logging.warning(f'update pubs multi lang failed. data: {meta_data}')
|
||||
|
||||
|
||||
# 更新makers列表中的影片信息
|
||||
def fetch_movies_by_studio():
|
||||
@ -484,7 +500,9 @@ def main(cmd, args):
|
||||
db_tools.finalize_task_log(task_id)
|
||||
|
||||
# TODO:
|
||||
# 1, tags 和 studio / label / series 的多语言
|
||||
# 1, tags 和 studio / label / series 的多语言 ---done
|
||||
# 2, studio / label / series 表增加字段: movies_cnt magnet_cnt
|
||||
# 3, movie 页面保存磁力链接
|
||||
|
||||
# 设置环境变量
|
||||
def set_env(args):
|
||||
|
||||
Reference in New Issue
Block a user