diff --git a/src/crawling/craw.py b/src/crawling/craw.py
index 0dabdd0..88f9c8c 100644
--- a/src/crawling/craw.py
+++ b/src/crawling/craw.py
@@ -364,10 +364,31 @@ class JavbusCrawler(GenericCrawler):
                     # take the first two elements as studio and role
                     studio = parts[video_index - 2]
                     role = parts[video_index - 1]
-                    result['meta'] = {'title': studio, 'role': role}
+                    result['meta']['title'] = studio
+                    result['meta']['role'] = role
                 else:
                     logging.debug(f"无法按规则解析: {' - '.join(parts)}")
-
+
+        # extract the total-movie and available-magnet counts
+        # from the <a> tags in the alert bar
+        a_tags = soup.select('.alert.alert-success.alert-common a.mypointer')
+        if not a_tags:
+            logging.warning(f'found no movie cnt. href: {href}')
+        else:
+            for a in a_tags:
+                text = a.get_text(strip=True)
+                # total movie count
+                if '全部影片' in text:
+                    match = re.search(r'全部影片\s*(\d+)\s*', text)
+                    if match:
+                        result['meta']['movies_cnt'] = int(match.group(1))
+
+                # available magnet count
+                if '已有磁力' in text:
+                    match = re.search(r'已有磁力\s*(\d+)\s*', text)
+                    if match:
+                        result['meta']['magnet_cnt'] = int(match.group(1))
+
         div_waterfall = soup.find('div', id='waterfall')
         if not div_waterfall:
             logging.warning(f"found no records. href: {href}")
@@ -415,19 +436,19 @@ class JavbusCrawler(GenericCrawler):
         # extract the title
         div_container = soup.find('div', class_='container')
         if not div_container:
-            logging.warning(f"found no container tag.")
+            logging.warning(f"found no container tag. href: {href}")
             return None

         title_element = div_container.find('h3')
         if title_element:
             result['title'] = title_element.get_text(strip=True)
         else:
-            logging.debug("未找到影片标题")
+            logging.debug(f"no title found. href: {href}")

         # extract the basic info (ID code, release date, etc.)
         info_div = div_container.find('div', class_='info')
         if not info_div:
-            logging.warning(f"found no div info tag.")
+            logging.warning(f"found no div info tag. href: {href}")
             return None

         # define the field mapping (multi-language support)
@@ -511,11 +532,11 @@ class JavbusCrawler(GenericCrawler):
                 else:
                     logging.debug(f"actors not found.")
             else:
-                logging.warning("未找到演员列表区域")
+                logging.debug(f"no star-name area. href: {href}")
         else:
-            logging.warning("未找到演员标题")
+            logging.debug(f"no star-show area. href: {href}")
        except Exception as e:
-           logging.error(f"解析影片详情时发生错误: {str(e)}", exc_info=True)
+           logging.warning(f"parse movie detail error. href: {href}, error: {str(e)}", exc_info=True)

        return result
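The count-extraction block above can be exercised standalone. A minimal sketch follows: the selector and regexes are taken from the patch, while SAMPLE_HTML, extract_counts, and the markup shape are hypothetical (the real javbus alert bar may differ).

import re
from bs4 import BeautifulSoup

# Hypothetical markup; only the classes matched by the patch's selector matter.
SAMPLE_HTML = """
<div class="alert alert-success alert-common">
  <a class="mypointer">全部影片 123</a>
  <a class="mypointer">已有磁力 45</a>
</div>
"""

def extract_counts(soup):
    # Mirrors the new block in parse_studios_labels_series_detail.
    meta = {}
    for a in soup.select('.alert.alert-success.alert-common a.mypointer'):
        text = a.get_text(strip=True)
        match = re.search(r'全部影片\s*(\d+)', text)
        if match:
            meta['movies_cnt'] = int(match.group(1))
        match = re.search(r'已有磁力\s*(\d+)', text)
        if match:
            meta['magnet_cnt'] = int(match.group(1))
    return meta

print(extract_counts(BeautifulSoup(SAMPLE_HTML, 'html.parser')))
# -> {'movies_cnt': 123, 'magnet_cnt': 45}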
diff --git a/src/javbus/fetch.py b/src/javbus/fetch.py
index 2684a97..d112053 100644
--- a/src/javbus/fetch.py
+++ b/src/javbus/fetch.py
@@ -82,7 +82,7 @@ def fetch_actor_list():
 # fetch movies from studio/label/series
 def fetch_movies_common(tbl):
     if debug:
-        url_list = db_tools.query_list_common(tbl=tbl)
+        url_list = db_tools.query_list_common(tbl=tbl, limit=2)
     else:
         if g_uncensored==1:
             url_list = db_tools.query_list_common(tbl=tbl, uncensored=1)
@@ -101,6 +101,8 @@ def fetch_movies_common(tbl):
         if not utils.is_valid_url(url):
             logging.info(f'invalid url ({url}) in {tbl}, row id: {row_id}. skipping...')
             continue
+
+        meta_data = None
         # strip the downloadable flag (if any)
         next_url = url
         while next_url:
@@ -108,7 +110,11 @@
             soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="waterfall", attr_type="id"))
             if soup:
                 list_data, next_url = scraper.parse_studios_labels_series_detail(soup, next_url)
-                if list_data:
+                if list_data:
+                    # capture the metadata once, from the first page
+                    if meta_data is None:
+                        meta_data = list_data.get('meta', {})
+
                     # build extra kwargs dynamically based on tbl
                     extra_kwargs = {}
                     if tbl == 'studio':
@@ -138,7 +144,17 @@
         # break early in debug mode
         if debug:
-            return True
+            break
+
+        # persist the collected metadata
+        if meta_data and meta_data.get('movies_cnt') is not None:
+            meta_data['href'] = url
+            tmp_id = db_tools.update_pubs_multilang(meta_data, tbl)
+            if tmp_id:
+                logging.debug(f'update pubs multi lang. data: {meta_data}')
+            else:
+                logging.warning(f'update pubs multi lang failed. data: {meta_data}')
+
 # update movie info for the makers list
 def fetch_movies_by_studio():
@@ -484,7 +500,9 @@ def main(cmd, args):
     db_tools.finalize_task_log(task_id)

 # TODO:
-# 1, multilingual support for tags and studio / label / series
+# 1, multilingual support for tags and studio / label / series ---done
+# 2, add fields to the studio / label / series tables: movies_cnt, magnet_cnt
+# 3, save magnet links from the movie page

 # set environment variables
 def set_env(args):
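TODO item 2 implies a schema change before update_pubs_multilang can persist the new counts. A minimal migration sketch, assuming an SQLite backend — the engine behind db_tools is not visible in this diff, and add_count_columns / db_path are hypothetical names.

import sqlite3

def add_count_columns(db_path):
    # Re-runnable: SQLite raises OperationalError ("duplicate column name")
    # when the column already exists, which we treat as already migrated.
    conn = sqlite3.connect(db_path)
    try:
        for tbl in ('studio', 'label', 'series'):
            for col in ('movies_cnt', 'magnet_cnt'):
                try:
                    conn.execute(f'ALTER TABLE {tbl} ADD COLUMN {col} INTEGER')
                except sqlite3.OperationalError:
                    pass  # column already present
        conn.commit()
    finally:
        conn.close()

Nullable INTEGER columns keep existing rows valid, which matches the fetch path: it only writes when meta_data.get('movies_cnt') is not None.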