From 7e14a5f247ea2bf4613eb4c1f9e569c0565c958a Mon Sep 17 00:00:00 2001
From: oscarz
Date: Tue, 24 Jun 2025 19:03:44 +0800
Subject: [PATCH] Rework Javbus page parsing, extend the SQLite layer, and move pretty_print_json into utils

---
 src/crawling/craw.py      | 469 +++++++++++++++++++++++---------------
 src/db_utils/sqlite_db.py | 250 +++++++++++++++++++-
 src/javbus/fetch.py       |  60 ++---
 src/utils/utils.py        |  57 ++++-
 4 files changed, 610 insertions(+), 226 deletions(-)

diff --git a/src/crawling/craw.py b/src/crawling/craw.py
index a850a4c..24ea5f0 100644
--- a/src/crawling/craw.py
+++ b/src/crawling/craw.py
@@ -1,6 +1,7 @@
 import logging
 import sys
 import requests
+import re
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 import src.utils.utils as utils
@@ -8,6 +9,7 @@ import src.utils.utils as utils
 http_code_404 = 404
 http_code_redirect = 401
 http_code_url = 601
+http_code_local = 99
 
 # Generic crawler class; wraps the low-level network interaction.
 class GenericCrawler:
@@ -166,138 +168,293 @@ class JavbusCrawler(GenericCrawler):
         return list_data, next_url
 
+    # Fetch actor details.
     def parse_actor_detail(self, soup, href):
-        # Look for aliases first.
-        alias_list = []
-
-        div_meta = soup.find('span', class_='actor-section-name')
-        if not div_meta:
-            logging.warning(f'warning: no meta data found in page {href}')
-            return None, None
-        alias_div = soup.find('div', class_='column section-title')
-
-        if alias_div:
-            meta_list = alias_div.find_all('span', class_='section-meta')
-            if len(meta_list) > 1:
-                alias_list = meta_list[0].text.strip().split(", ")
-
-        # Avatar image.
-        pic = ''
-        avatar = soup.find("div", class_="column actor-avatar")
-        if avatar:
-            pic = self.parse_avatar_image(avatar)
-
-        # Data to return.
-        actor = {}
-
-        # Use a regex to find the div element whose class contains 'movie-list h cols-4'.
-        div_movies = soup.find("div", class_=re.compile(r'movie-list h cols-'))
-        # div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
-        # div_movies = soup.find("div", class_='movie-list h cols-4 vcols-8')
-        if not div_movies:
-            logging.warning(f"Warning: No movies div found ")
-            return None, None
-
-        # Parse the list items.
-        rows = div_movies.find_all('div', class_='item')
-
-        list_data = []
-        next_url = None
-        for row in rows:
-            link = row.find('a', class_='box')['href']
-            serial_number = row.find('strong').text.strip()
-            title = row.find('div', class_='video-title').text.strip()
-            release_date = row.find('div', class_='meta').text.strip()
-            list_data.append({
-                'href': host_url + link if link else '',
-                'serial_number': serial_number,
-                'title': title,
-                'release_date': release_date
-            })
-
-        # Look for the "next page" button.
-        next_page_element = soup.find('a', class_='pagination-next')
-        if next_page_element:
-            next_page_url = next_page_element['href']
-            next_page_number = self.url_page_num(next_page_url)
-            current_page_number = self.url_page_num(href)
-            logging.debug(f'current_page: {current_page_number}, next page_num: {next_page_number}')
-            if current_page_number is None:
-                current_page_number = 0
-            if next_page_number and next_page_number > current_page_number:
-                next_url = host_url + next_page_url
-
-        actor = {
-            'pic': pic,
-            'alias': alias_list,
-            'movies': list_data
+        """
+        Parse a Javbus actor page: extract the profile info and the movie list.
+        """
+        result = {
+            'avatar': {},
+            'movies': []
         }
+
+        try:
+            # Parse the actor profile.
+            avatar_box = soup.find('div', class_='avatar-box')
+            if avatar_box:
+                result['avatar'] = self.parse_avatar_info(avatar_box)
+            else:
+                logging.debug(f"avatar-box not found. href: {href}")
href: {href}") + + # 解析影片列表 + movie_boxes = soup.find_all('a', class_='movie-box') + if movie_boxes: + for movie_box in movie_boxes: + movie_info = self.parse_movie_info(movie_box) + if movie_info: + result['movies'].append(movie_info) + else: + logging.debug(f"movie-box not found. href: {href}") + + except Exception as e: + logging.warning(f"parse html error: {str(e)}, href: {href}", exc_info=True) + + # 查找 "下一页" 按钮 + next_url = None + div_link = soup.find("div", class_='text-center hidden-xs') + if div_link: + next_page_element = soup.find('a', id='next') + if next_page_element: + next_page_url = next_page_element['href'] + next_url = urljoin(href, next_page_url) + + return result, next_url - return actor, next_url - - def parse_movie_one(self, soup, keys): - key_strong = soup.find('strong', string=lambda text: text in keys) - if key_strong: - key_span = key_strong.find_next_sibling('span', class_='value') - if key_span: - return key_span.text.strip() - return None - - def parse_movie_val_href(self, soup, keys): - key_strong = soup.find('strong', string=lambda text: text in keys) - if key_strong: - key_span = key_strong.find_next_sibling('span', class_='value') - if key_span: - a_tag = key_span.find('a') - if a_tag: - return a_tag.text.strip(), host_url + a_tag.get('href') + def parse_avatar_info(self, avatar_box): + """ + 解析演员信息 + """ + avatar_info = {} + + # 定义映射关系:包含各种语言的字段名称及其对应的目标键名 + field_mapping = { + 'birth_date': ['生日', 'D.O.B', '生年月日', 'Birthday'], + 'age': ['年齡', 'Age', '年龄'], + 'height': ['身高', 'Height', '身長'], + 'breast_size': ['罩杯', 'Cup', 'ブラのサイズ'], + 'bust': ['胸圍', 'Bust', 'バスト'], + 'waist': ['腰圍', 'Waist', 'ウエスト'], + 'hip': ['臀圍', 'Hips', 'ヒップ'], + 'hobby': ['愛好', 'Hobby', '趣味'] + } + # 提取演员名称 + name_span = avatar_box.find('span', class_='pb10') + if name_span: + avatar_info['name'] = name_span.get_text(strip=True) + else: + logging.debug("未找到演员名称") + + # 提取生日、年龄等信息 + p_tags = avatar_box.find_all('p') + for p in p_tags: + text = p.get_text(strip=True) + # 使用正则表达式匹配冒号前后的内容 + match = re.search(r'^(.*?)[::](.*)$', text) + if match: + key = match.group(1).strip() + value = match.group(2).strip() + + # 查找对应的目标键名 + target_key = next((k for k, v in field_mapping.items() if any(x in key for x in v)), None) + + if target_key: + # 特殊处理数字类型和单位转换 + if target_key in ['age', 'height', 'bust', 'waist', 'hip']: + # 提取数字部分 + num_match = re.search(r'(\d+\.?\d*)', value) + if num_match: + try: + avatar_info[target_key] = float(num_match.group(1)) + # 保留整数(如果是整数) + if avatar_info[target_key].is_integer(): + avatar_info[target_key] = int(avatar_info[target_key]) + except ValueError: + logging.debug(f"转换数字失败: {value}") + avatar_info[target_key] = value + else: + logging.debug(f"未找到数字部分: {value}") + avatar_info[target_key] = value + else: + avatar_info[target_key] = value else: - return key_span.text.strip(), None - return None, None + logging.debug(f"未知的演员信息类型: {key}") + else: + logging.debug(f"无法解析的演员信息: {text}") - def parse_movie_arr(self, soup, keys): - key_strong = soup.find('strong', string=lambda text: text in keys) - if key_strong: - key_span = key_strong.find_next_sibling('span', class_='value') - if key_span: - actors = [] - a_tags = key_span.find_all('a') - for a_tag in a_tags: - actors.append({ - 'name': a_tag.text.strip(), - 'href': host_url + a_tag.get('href') - }) - return actors - return [] + avatar_info['measurements'] = f"{avatar_info.get('bust', '')}-{avatar_info.get('waist', '')}-{avatar_info.get('hip', '') }" + return avatar_info + def parse_movie_info(self, movie_box): + 
""" + 解析影片信息 + """ + movie_info = {} + + try: + # 提取影片链接 + href = movie_box.get('href') + if href: + movie_info['href'] = href + else: + logging.warning("未找到影片链接") + return None + + # 提取图片链接 + img_tag = movie_box.find('img') + if img_tag and 'src' in img_tag.attrs: + movie_info['cover_url'] = img_tag['src'] + movie_info['title'] = img_tag['title'] + else: + logging.warning("未找到影片图片链接") + + # 提取标题、番号和发布日期 + photo_info = movie_box.find('div', class_='photo-info') + if photo_info: + # 提取标题 (span标签中的文本,排除date标签) + span_tag = photo_info.find('span') + if span_tag: + # 获取span下的纯文本内容 (不包含date标签) + title = ''.join(span_tag.find_all(text=True, recursive=False)).strip() + # 移除常见的分隔符模式 + if title.endswith('\n\n /'): + clean_title = title[:-4].strip() + elif title.endswith('\n /'): + clean_title = title[:-3].strip() + else: + clean_title = title + + movie_info['title'] = clean_title + + # 提取番号和日期 (date标签) + date_tags = span_tag.find_all('date') + if len(date_tags) >= 2: + movie_info['serial_number'] = date_tags[0].get_text(strip=True) + movie_info['release_date'] = date_tags[1].get_text(strip=True) + else: + logging.warning(f"date标签数量不足,无法提取番号和日期") + else: + logging.warning("未找到span标签") + else: + logging.warning("未找到影片信息区域") + + except Exception as e: + logging.error(f"解析影片信息时发生错误: {str(e)}", exc_info=True) + return None + + return movie_info + + # 解析Javbus影片详情页内容 def parse_movie_detail(self, soup, href, title): - div_video = soup.find("div", class_='video-meta-panel') - if not div_video: - logging.warning(f"Warning: No movies div found ") - return None, None - - result = {} - result['href'] = href - result['title'] = title - - # 获取封面图片 - cover_img = soup.select_one('.column-video-cover a') - result['cover_url'] = cover_img['href'] if cover_img else None - - # 获取番号 - result['serial_number'] = self.parse_movie_one(soup, ['番號:', 'ID:']) - result['release_date'] = self.parse_movie_one(soup, ['日期:', 'Released Date:']) - result['duration'] = self.parse_movie_one(soup, ['時長:', 'Duration:']) - - # 获取maker,系列 - result['maker_name'], result['maker_link'] = self.parse_movie_val_href(soup, ['片商:', 'Maker:']) - result['series_name'], result['series_link'] = self.parse_movie_val_href(soup, ['系列:', 'Series:']) - result['pub_name'], result['pub_link'] = self.parse_movie_val_href(soup, ['發行:', 'Publisher:']) - - # 获取演员,tags - result['tags'] = self.parse_movie_arr(soup, ['類別:', 'Tags:']) - result['actors'] = self.parse_movie_arr(soup, ['演員:', 'Actor(s):']) + result = { + 'title': title, + 'href': href, + 'serial_number': '', + 'release_date': '', + 'duration': '', + 'studio': {'name': '', 'href': ''}, + 'label': {'name': '', 'href': ''}, + 'series': {'name': '', 'href': ''}, + 'tags': [], + 'actors': [] + } + + try: + # 提取标题 + div_container = soup.find('div', class_='container') + if not div_container: + logging.warning(f"found no container tag.") + return None + + title_element = div_container.find('h3') + if title_element: + result['title'] = title_element.get_text(strip=True) + else: + logging.debug("未找到影片标题") + + # 提取基本信息(识别码、发行日期等) + info_div = div_container.find('div', class_='info') + if not info_div: + logging.warning(f"found no div info tag.") + return None + + # 定义字段映射关系(多种语言支持) + field_mapping = { + 'serial_number': ['識別碼:', '识别码:', 'ID:', '品番:'], + 'release_date': ['發行日期:', '发行日期:', 'Release Date:', '発売日:'], + 'duration': ['長度:', '长度:', 'Length:', '収録時間:'], + 'studio': ['製作商:', '制作商:', 'Studio:', 'メーカー:'], + 'label': ['發行商:', '发行商:', 'Label:', 'レーベル:'], + 'series': ['系列:', 'Series:', 'シリーズ:'] + } + + # 
+            p_tags = info_div.find_all('p')
+            for p in p_tags:
+                # Find the header span.
+                header = p.find('span', class_='header')
+                if header:
+                    header_text = header.get_text(strip=True)
+
+                    # Find the matching target key.
+                    target_key = next((k for k, v in field_mapping.items() if header_text in v), None)
+
+                    if target_key:
+                        # Get the value (text or link).
+                        if target_key in ['studio', 'label', 'series']:
+                            # Fields that carry a link.
+                            a_tag = p.find('a')
+                            if a_tag:
+                                result[target_key]['name'] = a_tag.get_text(strip=True)
+                                result[target_key]['href'] = a_tag.get('href', '')
+                            else:
+                                # No link; take the plain text instead.
+                                value_text = p.get_text(strip=True)
+                                # Remove the header text.
+                                value_text = value_text.replace(header_text, '').strip()
+                                result[target_key]['name'] = value_text
+                                logging.debug(f"{header_text} has no link; extracted the plain text")
+                        else:
+                            # Plain text fields.
+                            value_text = p.get_text(strip=True)
+                            # Remove the header text.
+                            value_text = value_text.replace(header_text, '').strip()
+
+                            # Special case: extract the numeric part of the duration (disabled for now).
+                            if target_key == 'duration' and False:
+                                num_match = re.search(r'(\d+)', value_text)
+                                if num_match:
+                                    result[target_key] = num_match.group(1)
+                                else:
+                                    result[target_key] = value_text
+                            else:
+                                result[target_key] = value_text
+
+            # Handle the genre/tag fields.
+            tag_labels = info_div.find_all('label')
+            for item in tag_labels:
+                link = item.find('a')
+                if link:
+                    genre = {
+                        'name': link.get_text(strip=True),
+                        'href': link.get('href', '')
+                    }
+                    result['tags'].append(genre)
+
+            # Extract the actor info.
+            star_p = info_div.find('p', class_='star-show')
+            if star_p:
+                # Find the actor list.
+                star_list = star_p.find_next('ul')
+                if star_list:
+                    star_items = star_list.find_all('div', class_='star-name')
+                    for item in star_items:
+                        link = item.find('a')
+                        if link:
+                            actor = {
+                                'name': link.get_text(strip=True),
+                                'href': link.get('href', '')
+                            }
+                            result['actors'].append(actor)
+                        else:
+                            logging.debug("Actor link not found.")
+                else:
+                    logging.warning("Actor list section not found.")
+            else:
+                logging.warning("Actor header not found.")
+
+        except Exception as e:
+            logging.error(f"Error while parsing movie detail: {str(e)}", exc_info=True)
 
         return result
 
     def parse_series_uncensored(self, soup, href):
@@ -527,67 +684,3 @@ class JavbusCrawler(GenericCrawler):
             next_url = host_url + next_page_url
 
         return list_data, next_url
-
-    @staticmethod
-    def pretty_print_json(data, n=10, indent=4, sort_keys=False):
-        """
-        Pretty-print the first n elements of a list; the remaining elements are summarized as "...".
-
-        Args:
-        - data: the data to print (expected to be a list)
-        - n: number of elements to show
-        - indent: indent width in spaces
-        - sort_keys: whether to sort dict keys
-        """
-        try:
-            # Handle non-list data.
-            if not isinstance(data, list):
-                print(formatted)
-                return
-
-            # Copy the data so the original list is not modified.
-            data_copy = data.copy()
-
-            # Slice off the first n elements.
-            first_n_elements = data_copy[:n]
-
-            # If the list is longer than n, append a "..." marker.
-            if len(data) > n:
-                result = first_n_elements + ["... ({} more elements)".format(len(data) - n)]
-            else:
-                result = first_n_elements
-
-            # Format and print.
-            formatted = json.dumps(result, indent=indent, ensure_ascii=False, sort_keys=sort_keys)
-            print(formatted)
-
-        except TypeError as e:
-            print(f"Error: unable to format data. Details: {e}")
-        except Exception as e:
-            print(f"Unexpected error while printing: {e}")
-
-    def test_actor_list(self, url='https://www.javbus.com/uncensored/actresses/1'):
-        next_url = url
-        all_data = []
-        while next_url:
-            print(f'fetching page {next_url}')
-            soup, status_code = self.fetch_page(next_url, partial(self.generic_validator, tag="div", identifier="waterfall", attr_type="id"),
-                                                max_retries=1)
-            if soup:
-                list_data, next_url = self.parse_actors_list(soup, next_url)
-                if list_data:
-                    all_data.extend(list_data)
-                    self.pretty_print_json(all_data)
-            else:
-                print('get wrong page.')
-
-            if next_url:
-                print(f"\n\nnext url: {next_url}")
-            else:
-                print(f"wrong request. url: {next_url}, status_code: {status_code}")
-
-            break
-
-    def url_page_num(self, url):
-        # Page-number extraction still needs to be implemented for the real URL scheme.
-        return None
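A note on the parse_avatar_info() change above: profile rows are free-form "label: value" strings in mixed languages, so the parser splits each row on an ASCII or full-width colon and maps the label through field_mapping. A standalone sketch of that mechanism follows; the sample rows are invented for illustration:

    import re

    field_mapping = {
        'birth_date': ['生日', 'D.O.B', '生年月日', 'Birthday'],
        'age': ['年齡', 'Age', '年龄'],
        'height': ['身高', 'Height', '身長'],
    }

    for text in ['生日: 1994-05-01', 'Age: 29', '身高: 158cm']:
        match = re.search(r'^(.*?)[::](.*)$', text)   # split on an ASCII or full-width colon
        key, value = match.group(1).strip(), match.group(2).strip()
        target_key = next((k for k, v in field_mapping.items() if any(x in key for x in v)), None)
        print(target_key, value)
    # -> birth_date 1994-05-01
    # -> age 29
    # -> height 158cm

For the numeric fields the real method then strips units with r'(\d+\.?\d*)', so a value like '158cm' is stored as the integer 158.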
url: {next_url}, status_code: {status_code}") - - break - - def url_page_num(self, url): - # 这里需要根据实际情况实现提取页码的逻辑 - return None diff --git a/src/db_utils/sqlite_db.py b/src/db_utils/sqlite_db.py index f21dca1..8670459 100644 --- a/src/db_utils/sqlite_db.py +++ b/src/db_utils/sqlite_db.py @@ -118,6 +118,11 @@ class DatabaseHandler: logging.error(f"Error inserting or updating data: {e}") return None + def get_id_by_key(self, tbl, uniq_key, val): + self.cursor.execute(f"SELECT id FROM {tbl} WHERE {uniq_key} = ?", (val,)) + row = self.cursor.fetchone() + return row[0] if row else None + def insert_task_log(self): return 1 @@ -137,20 +142,62 @@ class JavbusDBHandler(DatabaseHandler): def __init__(self, db_path=None): super().__init__(db_path) self.tbl_name_actors = 'javbus_actors' + self.tbl_name_movies = 'javbus_movies' + self.tbl_name_studios = 'javbus_studios' + self.tbl_name_labels = 'javbus_labels' + self.tbl_name_series = 'javbus_series' + self.tbl_name_tags = 'javbus_tags' + self.tbl_name_movie_tags = 'javbus_movies_tags' + self.tbl_name_actor_movie = 'javbus_actors_movies' + + + def insert_actor_index(self, data, **kwargs): + fields = ['uncensored', 'from_actor_list', 'from_movie_list'] + # 如果没有传入值,就用原来的值 + for field in fields: + if kwargs.get(field) is not None: + data[field] = kwargs.get(field) - def insert_actor_index(self, data, uncensored=0, from_actor_list=0, from_movie_list=0): - data['uncensored'] = uncensored - if from_actor_list: - data['from_actor_list'] = from_actor_list - if from_movie_list: - data['from_movie_list'] = from_movie_list try: return self.insert_or_update_common(data, self.tbl_name_actors, uniq_key='href') except sqlite3.Error as e: logging.error(f"Error inserting or updating data: {e}") return None - def update_actor_detail(self, data, is_full_data=1): + def insert_movie_index(self, data, **kwargs): + fields = ['uncensored', 'from_actor_list', 'from_movie_studios', 'from_movie_labels', 'from_movie_series'] + # 如果没有传入值,就用原来的值 + for field in fields: + if kwargs.get(field) is not None: + data[field] = kwargs.get(field) + try: + return self.insert_or_update_common(data, self.tbl_name_movies, uniq_key='href') + except sqlite3.Error as e: + logging.error(f"Error inserting or updating data: {e}") + return None + + # 插入演员和电影的关联数据 + def insert_actor_movie(self, performer_id, movie_id, tags=''): + try: + self.cursor.execute(""" + INSERT INTO javbus_actors_movies (actor_id, movie_id, tags, updated_at) + VALUES (?, ?, ?, datetime('now', 'localtime')) + ON CONFLICT(actor_id, movie_id) DO UPDATE SET tags=excluded.tags, updated_at=datetime('now', 'localtime') + """, + (performer_id, movie_id, tags) + ) + self.conn.commit() + + #logging.debug(f'insert one performer_movie, performer_id: {performer_id}, movie_id: {movie_id}') + + return performer_id + + except Exception as e: + self.conn.rollback() + logging.error("Error inserting movie: %s", e) + return None + + def update_actor_detail_404(self, data, is_full_data=1): try: data['is_full_data'] = is_full_data return self.insert_or_update_common(data, self.tbl_name_actors, uniq_key='href') @@ -158,17 +205,49 @@ class JavbusDBHandler(DatabaseHandler): logging.error(f"Error inserting or updating data: {e}") return None + def update_actor_detail(self, data, is_full_data=1): + try: + # 跟新actor表 + if data.get('avatar') is not None: + avatar = data.get('avatar', {}) + avatar['href'] = data['href'] + avatar['is_full_data'] = is_full_data + avatar_id = self.insert_or_update_common(avatar, self.tbl_name_actors, uniq_key='href') + 
logging.debug(f"update actor data. data: {avatar}") + else: + avatar_id = self.get_id_by_key(self.tbl_name_actors, 'href', data.get('href', '')) + if not avatar_id: + logging.warning(f"get actor id error. href: {data['href']}") + return None + + # 更新movies表 + uncensored = data.get('uncensored', 0) + for movie in data.get('credits', []): + movie_id = self.insert_movie_index(movie, from_actor_list=1, uncensored=uncensored) + if movie_id: + logging.debug(f"insert one movie index. data: {movie}") + # 插入关系表 + link_id = self.insert_actor_movie(avatar_id, movie_id) + if link_id: + logging.debug(f"insert one actor_movie record. actor id: {avatar_id}, movie id: {movie_id}") + + return avatar_id + except sqlite3.Error as e: + logging.error(f"Error inserting or updating data: {e}") + return None + def query_actors(self, **filters): try: - sql = f"SELECT url, en_name as name FROM {self.tbl_name_actors} WHERE 1=1" + sql = f"SELECT href, en_name as name, uncensored FROM {self.tbl_name_actors} WHERE 1=1" params = [] conditions = { "id": " AND id = ?", - "url": " AND href = ?", - "en_name": " AND name LIKE ?", + "href": " AND href = ?", + "en_name": " AND en_name LIKE ?", "is_full_data": " AND is_full_data = ?", "start_id": " AND id > ?", + "uncensored": " AND uncensored = ?", } for key, condition in conditions.items(): @@ -197,8 +276,157 @@ class JavbusDBHandler(DatabaseHandler): params.append(filters["limit"]) self.cursor.execute(sql, params) - return [{'url': row[0], 'name': row[1]} for row in self.cursor.fetchall()] + return [{'href': row[0], 'name': row[1], 'uncensored': row[2]} for row in self.cursor.fetchall()] except sqlite3.Error as e: logging.error(f"查询 href 失败: {e}") return None + + def query_movies(self, **filters): + try: + sql = f"SELECT href, title, uncensored, id FROM {self.tbl_name_movies} WHERE 1=1" + params = [] + + conditions = { + "id": " AND id = ?", + "href": " AND href = ?", + "title": " AND title LIKE ?", + "is_full_data": " AND is_full_data = ?", + "start_id": " AND id > ?", + "uncensored": " AND uncensored = ?", + } + + for key, condition in conditions.items(): + if key in filters: + sql += condition + if key == "title": + params.append(f"%{filters[key]}%") + else: + params.append(filters[key]) + + for key in ["is_full_data_in", "is_full_data_not_in"]: + if key in filters: + values = filters[key] + if values: + placeholders = ", ".join(["?"] * len(values)) + operator = "IN" if key == "is_full_data_in" else "NOT IN" + sql += f" AND is_full_data {operator} ({placeholders})" + params.extend(values) + + if "order_by" in filters: + # 注意:这里 order by 后面直接跟字段名,不能用占位符,否则会被当作字符串处理 + sql += f" ORDER BY {filters['order_by']} " + + if 'limit' in filters: + sql += " LIMIT ?" 
+ params.append(filters["limit"]) + + self.cursor.execute(sql, params) + return [{'href': row[0], 'title': row[1], 'uncensored': row[2], 'id':row[3]} for row in self.cursor.fetchall()] + except sqlite3.Error as e: + logging.error(f"查询 href 失败: {e}") + return None + + # 检查记录是否存在,不存在就插入 + def check_and_get_id(self, item, uncensored, tbl, uniq_key='href'): + name = item['name'] + href = item['href'] + row_id = self.get_id_by_key(tbl, uniq_key, href) + if row_id is None: + row_id = self.insert_or_update_common({'name':name, 'href': href, 'uncensored':uncensored, 'from_movie_list':1}, tbl_name=tbl, uniq_key=uniq_key) + + return row_id + + def insert_or_update_tags(self, data, uniq_key='href'): + return self.insert_or_update_common(data, self.tbl_name_tags, uniq_key) + + def insert_movie_tags(self, movie_id, tag_id, tags): + try: + self.cursor.execute(""" + INSERT INTO javbus_movies_tags (movie_id, tag_id, tags, updated_at) + VALUES (?, ?, ?, datetime('now', 'localtime')) + ON CONFLICT(tag_id, movie_id) DO UPDATE SET tags=excluded.tags, updated_at=datetime('now', 'localtime') + """, + (movie_id, tag_id, tags) + ) + self.conn.commit() + + #logging.debug(f'insert one performer_movie, performer_id: {performer_id}, movie_id: {movie_id}') + + return movie_id + + except Exception as e: + self.conn.rollback() + logging.error("Error inserting movie: %s", e) + return None + + def insert_or_update_movie_404(self, data, is_full_data=1): + try: + data['is_full_data'] = is_full_data + return self.insert_or_update_common(data, self.tbl_name_movies, uniq_key='href') + except sqlite3.Error as e: + logging.error(f"Error inserting or updating data: {e}") + return None + # """插入或更新电影数据""" + def insert_or_update_movie(self, movie, is_full_data=1): + try: + # 获取相关 ID + studio_id = self.check_and_get_id(movie.get('studio'), movie.get('uncensored', 0), self.tbl_name_studios) if movie.get('studio') is not None else None + label_id = self.check_and_get_id(movie.get('label'), movie.get('uncensored', 0), self.tbl_name_labels) if movie.get('label') is not None else None + series_id = self.check_and_get_id(movie.get('series'), movie.get('uncensored', 0), self.tbl_name_series) if movie.get('series') is not None else None + + if studio_id: + movie['studio_id'] = studio_id + if label_id: + movie['label_id'] = label_id + if series_id: + movie['series_id'] = series_id + + movie['is_full_data'] = is_full_data + movie['actors_cnt'] = len(movie.get('actors', [])) + + movie_id = self.insert_or_update_common(movie, self.tbl_name_movies, uniq_key='href') + if movie_id is None: + logging.warning(f"insert/update movie error. data:{movie}") + return None + + logging.debug(f"insert one move, id: {movie_id}, title: {movie['title']}, href: {movie['href']}") + + # 插入 performers_movies 关系表 + for performer in movie.get('actors', []): + performer_id = self.get_id_by_key(self.tbl_name_actors, 'href', performer['href']) + # 如果演员不存在,先插入 + if performer_id is None: + performer_id = self.insert_actor_index(performer['name'], performer['href'], from_movie_list=1) + logging.debug(f"insert new perfomer. perfomer_id: {performer_id}, name:{performer['name']}") + if performer_id: + tmp_id = self.insert_actor_movie(performer_id, movie_id) + if tmp_id: + logging.debug(f"insert one perfomer_movie. perfomer_id: {performer_id}, movie_id:{movie_id}") + else: + logging.debug(f"insert perfomer_movie failed. perfomer_id: {performer_id}, movie_id:{movie_id}") + else: + logging.warning(f"insert perfomer failed. 
name: {performer['name']}, href: {performer['href']}") + + # 插入 tags 表 + for tag in movie.get('tags', []): + tag_name = tag.get('name', '') + tag_href = tag.get('href', '') + tag_id = self.insert_or_update_tags({'name':tag_name, 'href':tag_href}, uniq_key='href') + if tag_id: + logging.debug(f"insert one tags. tag_id: {tag_id}, name: {tag_name}") + tmp_id = self.insert_movie_tags(movie_id=movie_id, tag_id=tag_id, tags=tag_name) + if tmp_id: + logging.debug(f"insert one movie_tag. movie_id: {movie_id}, tag_id: {tag_id}, name: {tag_name}") + else: + logging.warning(f"insert one movie_tag error. movie_id: {movie_id}, tag_id: {tag_id}, name: {tag_name}") + else: + logging.warning(f"insert tags error. name:{tag_name}, href: {tag_href}") + + return movie_id + + except Exception as e: + self.conn.rollback() + logging.error("Error inserting movie: %s", e) + return None + diff --git a/src/javbus/fetch.py b/src/javbus/fetch.py index 0fc8252..c71fcdb 100644 --- a/src/javbus/fetch.py +++ b/src/javbus/fetch.py @@ -246,13 +246,13 @@ def fetch_performers_detail(): limit_count = 5 if debug else 100 performers_list = [] last_performer_id = 0 - abnormal_codes = [scraper.http_code_404, scraper.http_code_login] + abnormal_codes = [craw.http_code_404, craw.http_code_redirect] def get_performers(**kwargs): if scan_mode == 1: - kwargs["from_actor_list"] = 1 + kwargs["uncensored"] = 1 elif scan_mode == 0: - kwargs["from_actor_list"] = 0 + kwargs["uncensored"] = 0 else: logging.debug(f"scan all records") kwargs["order_by"] = 'id asc' @@ -278,29 +278,29 @@ def fetch_performers_detail(): for performer in performers_list: url = performer['href'] person = performer['name'] - pic = '' - alias = [] + uncensored = int(performer['uncensored']) + avatar = None next_url = url all_movies = [] need_insert = True while next_url: logging.debug(f"Fetching data for actor ({person}), url {next_url} ...") - soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="span", identifier="actor-section-name", attr_type="class")) + soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="alert alert-success alert-common", attr_type="class")) if soup: data, next_url = scraper.parse_actor_detail(soup, next_url) if data: - pic = data.get('pic', '') - alias = data.get('alias', []) + if not avatar: + avatar = data.get('avatar') all_movies.extend(data.get('movies', [])) - elif status_code and status_code == scraper.http_code_404: - actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=scraper.http_code_404) + elif status_code and status_code == craw.http_code_404: + actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': craw.http_code_404}) logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, Skiping...') need_insert = False break - elif status_code and status_code == scraper.http_code_login: - actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=scraper.http_code_login) + elif status_code and status_code == craw.http_code_redirect: + actor_id = db_tools.update_actor_detail_404({'href': url, 'is_full_data': craw.http_code_redirect}) logging.warning(f'401 page(need login). 
                 need_insert = False
                 break
@@ -311,16 +311,20 @@ def fetch_performers_detail():
         if not need_insert:
             continue
 
+        #utils.pretty_print_json(avatar)
+        #utils.pretty_print_json(all_movies)
+        #continue
+
         # All of this actor's movies have been fetched; start inserting the data.
-        performer_id = db_tools.insert_or_update_actor({
+        performer_id = db_tools.update_actor_detail({
             'href': url,
             'name': person,
-            'pic': pic,
-            'alias': alias,
-            'credits': all_movies
+            'avatar': avatar,
+            'credits': all_movies,
+            'uncensored': uncensored
         })
         if performer_id:
-            logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
+            logging.debug(f'insert/update one person, id: {performer_id}, person: ({person}), url: {url}')
             last_performer_id = performer_id
             succ_rows += 1
         else:
@@ -334,10 +338,10 @@ def fetch_performers_detail():
 
 # Update movie details.
 def fetch_movies_detail():
-    limit_count = 10 if debug else 100
+    limit_count = 2 if debug else 100
     movies_list = []
     last_movie_id = 0
-    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]
+    abnormal_codes = [craw.http_code_404, craw.http_code_redirect]
 
     def get_movies(**kwargs):
         if scan_mode == 1:
@@ -347,7 +351,7 @@ def fetch_movies_detail():
         else:
             logging.debug(f"scan all records.")
         kwargs["order_by"] = 'id asc'
-        return db_tools.query_movie_hrefs(limit=limit_count, **kwargs)
+        return db_tools.query_movies(limit=limit_count, **kwargs)
 
     while True:
         if update_mode == 0:  # only scan new records
@@ -370,10 +374,11 @@ def fetch_movies_detail():
         url = movie['href']
         title = movie['title']
         curr_id = movie['id']
+        uncensored = int(movie['uncensored'])
         logging.debug(f"Fetching data for movie ({title}), url {url} ...")
-        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
+        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="container", attr_type="class"))
         # Pages served from the local cache are skipped.
-        if skip_local and status_code == scraper.http_code_local :
+        if skip_local and status_code == craw.http_code_local :
             last_movie_id = curr_id
             succ_count += 1
             continue
 
         if soup:
             movie_data = scraper.parse_movie_detail(soup, url, title)
             if movie_data :
+                #utils.pretty_print_json(movie_data)
+                #continue
+                movie_data['uncensored'] = uncensored
                 movie_id = db_tools.insert_or_update_movie(movie_data)
                 if movie_id:
                     logging.debug(f'insert one movie, id: {movie_id}, title: ({title}) url: {url}')
@@ -391,11 +399,11 @@ def fetch_movies_detail():
             else:
                 logging.warning(f'parse_page_movie error. url: {url}')
 
-        elif status_code and status_code == scraper.http_code_404:
-            movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_404)
+        elif status_code and status_code == craw.http_code_404:
+            movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_404})
             logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skipping...')
-        elif status_code and status_code == scraper.http_code_login:
-            movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_login)
+        elif status_code and status_code == craw.http_code_redirect:
+            movie_id = db_tools.insert_or_update_movie_404({'href': url, 'is_full_data': craw.http_code_redirect})
             logging.warning(f'401 page(need login). id: {movie_id}, title: ({title}), url: {url}, Skipping...')
         else:
             logging.warning(f'fetch_page error. url: {url}')
 
diff --git a/src/utils/utils.py b/src/utils/utils.py
index 6421b82..ec7b7fa 100644
--- a/src/utils/utils.py
+++ b/src/utils/utils.py
@@ -164,4 +164,59 @@ def normalize_url(url: str) -> str:
     except Exception as e:
         print(f"URL normalization failed: {url}, error: {e}")
 
-    return url  # return the original URL on error
\ No newline at end of file
+    return url  # return the original URL on error
+
+import json
+
+def pretty_print_json(data, n=10, indent=4, sort_keys=False):
+    """
+    Pretty-print the first n elements of a list; the remaining elements are summarized as "...".
+
+    Args:
+    - data: the data to print (expected to be a list)
+    - n: number of elements to show
+    - indent: indent width in spaces
+    - sort_keys: whether to sort dict keys
+    """
+    try:
+        # Handle non-list data.
+        if not isinstance(data, list):
+            formatted = json.dumps(data, indent=indent, ensure_ascii=False, sort_keys=sort_keys)
+            print(formatted)
+            return
+
+        # Copy the data so the original list is not modified.
+        data_copy = data.copy()
+
+        # Slice off the first n elements.
+        first_n_elements = data_copy[:n]
+
+        # If the list is longer than n, append a "..." marker.
+        if len(data) > n:
+            result = first_n_elements + ["... ({} more elements)".format(len(data) - n)]
+        else:
+            result = first_n_elements
+
+        # Format and print.
+        formatted = json.dumps(result, indent=indent, ensure_ascii=False, sort_keys=sort_keys)
+        print(formatted)
+
+    except TypeError as e:
+        print(f"Error: unable to format data. Details: {e}")
+    except Exception as e:
+        print(f"Unexpected error while printing: {e}")
+
+# Usage example
+if __name__ == "__main__":
+    # Long-list example (20 elements).
+    large_array = [{"id": i, "value": f"element {i}"} for i in range(1, 21)]
+
+    print("### First 3 elements:")
+    pretty_print_json(large_array, n=3)
+
+    print("\n### First 10 elements:")
+    pretty_print_json(large_array, n=10)
+
+    # Non-list data example.
+    print("\n### Non-list data (dict):")
+    pretty_print_json({"key1": "value1", "key2": "value2"})
\ No newline at end of file
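For reference, this is the payload contract between fetch_performers_detail() and update_actor_detail() as reconstructed from the code above; the concrete values are invented for illustration:

    payload = {
        'href': 'https://www.javbus.com/star/xxxx',   # hypothetical actor URL
        'name': 'Some Actor',
        'uncensored': 0,
        # 'avatar' comes from parse_avatar_info(); it may be None if no page had an avatar-box
        'avatar': {'name': 'Some Actor', 'height': 158, 'measurements': '88-58-86'},
        # 'credits' accumulates parse_movie_info() results across all paginated pages
        'credits': [
            {
                'href': 'https://www.javbus.com/XXX-000',  # hypothetical movie URL
                'cover_url': 'https://www.javbus.com/pics/thumb/xxxx.jpg',
                'title': 'Example title',
                'serial_number': 'XXX-000',
                'release_date': '2024-01-01',
            },
        ],
    }
    db_tools.update_actor_detail(payload)

update_actor_detail() upserts the avatar into javbus_actors, upserts each credit into javbus_movies via insert_movie_index(), and links the two through insert_actor_movie(), so calling it repeatedly for the same actor is idempotent apart from the updated_at timestamps.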