diff --git a/javdb/src/fetch.py b/javdb/src/fetch.py index bb52cde..da70bf8 100644 --- a/javdb/src/fetch.py +++ b/javdb/src/fetch.py @@ -148,10 +148,15 @@ def fetch_movies_by_series(): # 更新演员信息 def fetch_performers_detail(): + limit_count = 5 if debug else 100 perfomers_list = [] + last_perfomer_id = 0 while True: - # 每次从数据库中取一部分,避免一次全量获取 - perfomers_list = db_tools.query_actors(is_full_data=0, limit=100) + # 每次从数据库中取一部分,避免一次全量获取 + if force: # 从头逐个遍历 + perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count, from_actor_list=1) + else: # 只做更新 + perfomers_list = db_tools.query_actors(is_full_data=0, limit=limit_count) if len(perfomers_list) < 1: logging.info(f'all performers fetched.') break @@ -189,21 +194,27 @@ def fetch_performers_detail(): }) if performer_id: logging.info(f'insert one person, id: {performer_id}, person: ({person}), url: {url}') + last_perfomer_id = performer_id else: logging.warning(f'insert person: ({person}) {url} failed.') + time.sleep(0.5) # 调试break if debug: return True # 更新影片信息 def fetch_movies_detail(): + limit_count = 10 if debug else 100 movies_list = [] - while True: - movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=100) + last_movie_id = 0 + while True: + if force: # 从头逐个遍历 + movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count, from_actor_list=1) + else: # 只做更新 + movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count) if len(movies_list) < 1: logging.info(f'all movies fetched.') break - last_movie_id = 0 succ_count = 0 for movie in movies_list: url = movie['href'] @@ -231,7 +242,7 @@ def fetch_movies_detail(): logging.warning(f'401 page(need login). id: {movie_id}, title: ({title}), url: {url}, Skiping...') else: logging.warning(f'fetch_page error. 
url: {url}') - time.sleep(1) + time.sleep(0.5) logging.info(f'total request: {len(movies_list)}, succ: {succ_count}. last movie id: {last_movie_id}') # 调试增加break if debug: @@ -253,6 +264,9 @@ function_map = { def main(cmd, args_debug, args_force): global debug debug = args_debug + if debug: + logger = logging.getLogger() + #logger.setLevel(logging.DEBUG) global force force = args_force diff --git a/javdb/src/scraper.py b/javdb/src/scraper.py index 8fe772d..f503038 100644 --- a/javdb/src/scraper.py +++ b/javdb/src/scraper.py @@ -11,6 +11,7 @@ from bs4 import BeautifulSoup from requests.exceptions import RequestException from functools import partial import config +import utils # 定义基础 URL 和可变参数 host_url = "https://www.javdb.com" @@ -24,8 +25,22 @@ headers = { } scraper = cloudscraper.create_scraper() +save_raw_html = True +load_from_local = True + #使用 CloudScraper 进行网络请求,并执行页面验证,支持不同解析器和预处理 def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None): + if load_from_local: # 从本地读取的逻辑 + html = utils.read_raw_html(url) + if html: + # 预处理 HTML(如果提供了 preprocessor) + html_text = preprocessor(html) if preprocessor else html + + soup = BeautifulSoup(html_text, parser) + if validator(soup): # 进行自定义页面检查 + logging.info(f"read from local. 
href: {url}") + return soup, 99 # 返回一个小于100的错误码,表明是从本地返回的 + for attempt in range(max_retries): try: if 'javdb.com' not in url.lower(): @@ -50,6 +65,9 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor logging.warning(f"Page redirected to login page on {url}.") return None, 401 + if save_raw_html: + utils.write_raw_html(url, response.text) + # 预处理 HTML(如果提供了 preprocessor) html_text = preprocessor(response.text) if preprocessor else response.text @@ -223,7 +241,7 @@ def parse_actor_detail(soup, href): # 解析 HTML 内容,提取需要的数据 -def parse_movie_detail(soup, href, title): +def parse_movie_detail_old(soup, href, title): div_video = soup.find("div", class_='video-meta-panel') if not div_video: logging.warning(f"Warning: No movies div found ") @@ -272,6 +290,74 @@ def parse_movie_detail(soup, href, title): 'actors': actors } +# 解析单个元素 +def parse_movie_one(soup, keys): + key_strong = soup.find('strong', string=lambda text: text in keys) + if key_strong: + key_span = key_strong.find_next_sibling('span', class_='value') + if key_span: + return key_span.text.strip() + return None + +# 解析值和链接 +def parse_movie_val_href(soup, keys): + key_strong = soup.find('strong', string=lambda text: text in keys) + if key_strong: + key_span = key_strong.find_next_sibling('span', class_='value') + if key_span: + a_tag = key_span.find('a') + if a_tag: + return a_tag.text.strip(), host_url + a_tag.get('href') + else: + return key_span.text.strip(), None + return None, None + +# 解析多个值和链接 +def parse_movie_arr(soup, keys): + key_strong = soup.find('strong', string=lambda text: text in keys) + if key_strong: + key_span = key_strong.find_next_sibling('span', class_='value') + if key_span: + actors = [] + a_tags = key_span.find_all('a') + for a_tag in a_tags: + actors.append({ + 'name': a_tag.text.strip(), + 'href': host_url + a_tag.get('href') + }) + return actors + return [] + +# 解析 HTML 内容,提取需要的数据 +def parse_movie_detail(soup, href, title): + div_video = 
soup.find("div", class_='video-meta-panel') + if not div_video: + logging.warning(f"Warning: No movies div found ") + return None, None + + result = {} + result['href'] = href + result['title'] = title + + # 获取封面图片 + cover_img = soup.select_one('.column-video-cover a') + result['cover_url'] = cover_img['href'] if cover_img else None + + # 获取番号 + result['serial_number'] = parse_movie_one(soup, ['番號:', 'ID:']) + result['release_date'] = parse_movie_one(soup, ['日期:', 'Released Date:']) + result['duration'] = parse_movie_one(soup, ['時長:', 'Duration:']) + + # 获取maker,系列 + result['maker_name'], result['maker_link'] = parse_movie_val_href(soup, ['片商:', 'Maker:']) + result['series_name'], result['series_link'] = parse_movie_val_href(soup, ['系列:', 'Series:']) + + # 获取演员,tags + result['tags'] = parse_movie_arr(soup, ['類別:', 'Tags:']) + result['actors'] = parse_movie_arr(soup, ['演員:', 'Actor(s):']) + + return result + # 解析 HTML 内容,提取需要的数据 def parse_series_uncensored(soup, href): div_series = soup.find("div", id='series') diff --git a/javdb/src/sqlite_utils.py b/javdb/src/sqlite_utils.py index bff2457..baa0749 100644 --- a/javdb/src/sqlite_utils.py +++ b/javdb/src/sqlite_utils.py @@ -49,7 +49,7 @@ def insert_actor_index(name, href, from_actor_list=None, from_movie_list=None): performer_id = get_id_by_href('javdb_actors', href) if performer_id: - logging.debug(f'Inserted/Updated actor index, id: {performer_id}, name: {name}, href: {href}') + logging.debug(f"Inserted/Updated actor index, id: {performer_id}, name: {name}, href: {href}") return performer_id @@ -200,6 +200,33 @@ def query_actors(**filters): if "is_full_data" in filters: sql += " AND is_full_data = ?" params.append(filters["is_full_data"]) + if "from_actor_list" in filters: + sql += " AND from_actor_list = ?" 
+        params.append(filters["from_actor_list"])
+    if "is_full_data_in" in filters:
+        values = filters["is_full_data_in"]
+        if values:
+            placeholders = ", ".join(["?"] * len(values))
+            sql += f" AND is_full_data IN ({placeholders})"
+            params.extend(values)
+    if "is_full_data_not_in" in filters:
+        values = filters["is_full_data_not_in"]
+        if values:
+            placeholders = ", ".join(["?"] * len(values))
+            sql += f" AND is_full_data NOT IN ({placeholders})"
+            params.extend(values)
+    if "before_updated_at" in filters:
+        sql += " AND updated_at <= ?"
+        params.append(filters["before_updated_at"])
+    if "after_updated_at" in filters:
+        sql += " AND updated_at >= ?"
+        params.append(filters["after_updated_at"])
+    if "start_id" in filters:
+        sql += " AND id > ?"
+        params.append(filters["start_id"])
+    if filters.get("order_by") in ("id asc", "id desc", "updated_at asc", "updated_at desc"):
+        # NOTE: identifiers cannot be bound as SQL '?' parameters; interpolate a whitelisted literal
+        sql += f" order by {filters['order_by']}"
     if 'limit' in filters:
         sql += " limit ?"
         params.append(filters["limit"])
@@ -372,13 +399,43 @@ def query_series_hrefs(**filters):
     return None
 
+# 插入或更新类别
+def insert_or_update_tags(name, href):
+    try:
+        cursor.execute("""
+            INSERT INTO javdb_tags (name, href, updated_at)
+            VALUES (?, ? , datetime('now', 'localtime'))
+            ON CONFLICT(href) DO UPDATE SET
+                name = excluded.name,
+                updated_at = datetime('now', 'localtime')
+        """, (name, href))
+        conn.commit()
+
+        # fetch the id of the tag row just upserted
+        cursor.execute("SELECT id FROM javdb_tags WHERE href = ?", (href,))
+        dist_id = cursor.fetchone()[0]
+        if dist_id:
+            logging.debug(f"insert/update tags succ. 
id: {dist_id}, name: {name}") + return dist_id + else: + return None + except sqlite3.Error as e: + conn.rollback() + logging.error(f"数据库错误: {e}") + return None + # """插入或更新电影数据""" def insert_or_update_movie(movie): try: # 获取相关 ID - makers_id = get_id_by_href('javdb_makers', movie['maker_link']) - series_id = get_id_by_href('javdb_series', movie['series_link']) + makers_id = get_id_by_href('javdb_makers', movie['maker_link']) if movie['maker_link'] else None + series_id = get_id_by_href('javdb_series', movie['series_link']) if movie['series_link'] else None + # 如果不存在,插入 + if makers_id is None and movie['maker_link']: + makers_id = insert_or_update_makers({'name' : movie.get('maker_name', ''), 'href' : movie.get('maker_link', '')}) + if series_id is None and movie['series_link']: + series_id = insert_or_update_series({'name' : movie.get('series_name', ''), 'href' : movie.get('series_link', '')}) cursor.execute(""" INSERT INTO javdb_movies (href, title, cover_url, serial_number, release_date, duration, @@ -404,7 +461,7 @@ def insert_or_update_movie(movie): if movie_id is None: return None - logging.debug(f'insert one move, id: {movie_id}, title: {movie['title']}, href: {movie['href']}') + logging.debug(f"insert one move, id: {movie_id}, title: {movie['title']}, href: {movie['href']}") # 插入 performers_movies 关系表 for performer in movie.get('actors', []): @@ -412,14 +469,23 @@ def insert_or_update_movie(movie): # 如果演员不存在,先插入 if performer_id is None: performer_id = insert_actor_index(performer['name'], performer['href'], from_movie_list=1) + logging.debug(f"insert new perfomer. perfomer_id: {performer_id}, name:{performer['name']}") if performer_id: tmp_id = insert_actor_movie(performer_id, movie_id) if tmp_id: logging.debug(f"insert one perfomer_movie. perfomer_id: {performer_id}, movie_id:{movie_id}") else: - logging.debug(f'insert perfomer_movie failed. perfomer_id: {performer_id}, movie_id:{movie_id}') + logging.debug(f"insert perfomer_movie failed. 
perfomer_id: {performer_id}, movie_id:{movie_id}")
             else:
-                logging.warning(f'insert perfomer failed. name: {performer['name']}, href: {performer['href']}')
+                logging.warning(f"insert perfomer failed. name: {performer['name']}, href: {performer['href']}")
+
+        # 插入 tags 表
+        for tag in movie.get('tags', []):
+            tag_name = tag.get('name', '')
+            tag_href = tag.get('href', '')
+            tag_id = insert_or_update_tags(tag_name, tag_href)
+            if tag_id:
+                logging.debug(f"insert one tags. tag_id: {tag_id}, name:{tag_name}")
 
         return movie_id
 
@@ -516,6 +582,33 @@ def query_movie_hrefs(**filters):
     if "is_full_data" in filters:
         sql += " AND is_full_data = ?"
         params.append(filters["is_full_data"])
+    if "from_actor_list" in filters:
+        sql += " AND from_actor_list = ?"
+        params.append(filters["from_actor_list"])
+    if "is_full_data_in" in filters:
+        values = filters["is_full_data_in"]
+        if values:
+            placeholders = ", ".join(["?"] * len(values))
+            sql += f" AND is_full_data IN ({placeholders})"
+            params.extend(values)
+    if "is_full_data_not_in" in filters:
+        values = filters["is_full_data_not_in"]
+        if values:
+            placeholders = ", ".join(["?"] * len(values))
+            sql += f" AND is_full_data NOT IN ({placeholders})"
+            params.extend(values)
+    if "before_updated_at" in filters:
+        sql += " AND updated_at <= ?"
+        params.append(filters["before_updated_at"])
+    if "after_updated_at" in filters:
+        sql += " AND updated_at >= ?"
+        params.append(filters["after_updated_at"])
+    if "start_id" in filters:
+        sql += " AND id > ?"
+        params.append(filters["start_id"])
+    if filters.get("order_by") in ("id asc", "id desc", "updated_at asc", "updated_at desc"):
+        # NOTE: identifiers cannot be bound as SQL '?' parameters; interpolate a whitelisted literal
+        sql += f" order by {filters['order_by']}"
     if 'limit' in filters:
         sql += " limit ?" 
params.append(filters["limit"]) diff --git a/javdb/src/utils.py b/javdb/src/utils.py index d38853c..c8e0ff9 100644 --- a/javdb/src/utils.py +++ b/javdb/src/utils.py @@ -3,8 +3,98 @@ import os import json import time import csv +from datetime import datetime from urllib.parse import urlparse import logging +import config + +update_dir = f'{config.global_host_data_dir}/javdb' + +# 创建目录 +def create_sub_directory(base_dir, str): + # 获取 person 的前两个字母并转为小写 + sub_dir = str[:1].lower() + full_path = os.path.join(base_dir, sub_dir) + if not os.path.exists(full_path): + os.makedirs(full_path) + return full_path + +# 只提取movies url +def extract_id_from_href(href): + # 检查 URL 是否符合要求 + if 'javdb.com/v/' in href: + # 定义正则表达式模式 + pattern = r'javdb.com/v/([^?&]+)' + # 查找匹配项 + match = re.search(pattern, href) + if match: + # 提取匹配的字符串并转换为小写 + result = match.group(1).lower() + return result + return '' + +# 保存抓取到的原始HTML,方便后续核验 +def write_raw_html(href, html_text): + # 获取目录 + id = extract_id_from_href(href) + if 'javdb.com/v/' in href.lower(): + dir_prefix = 'raw_movies' + else: + return + + file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", id) + file_name = f"{id}.html" # 用 - 替换空格 + full_path = os.path.join(file_dir, file_name) + + try: + with open(full_path, 'w', encoding='utf-8') as file: + file.write(html_text) + except FileNotFoundError: + logging.warning(f"错误:指定的路径 {full_path} 不存在。") + except PermissionError: + logging.warning(f"错误:没有权限写入文件 {full_path}。") + except Exception as e: + logging.warning(f"发生未知错误:{e}") + + +# 保存抓取到的原始HTML,方便后续核验 +def read_raw_html(href, expire_date_str="2025-03-01"): + # 获取目录 + id = extract_id_from_href(href) + if 'javdb.com/v/' in href.lower(): + dir_prefix = 'raw_movies' + else: + return + + file_dir = create_sub_directory(f"{update_dir}/{dir_prefix}", id) + file_name = f"{id}.html" # 用 - 替换空格 + full_path = os.path.join(file_dir, file_name) + + try: + if os.path.exists(full_path): + # 获取文件的最后修改时间 + last_modified_timestamp = 
os.path.getmtime(full_path) + # 将时间戳转换为 datetime 对象 + last_modified_date = datetime.fromtimestamp(last_modified_timestamp) + # 检查文件最后修改时间是否晚于给定日期 + expire_date = datetime.strptime(expire_date_str, "%Y-%m-%d") + if last_modified_date > expire_date: + logging.debug(f"find local file on href {href}") + with open(full_path, 'r', encoding='utf-8') as file: + return file.read() + else: + logging.debug(f"expired file {last_modified_date} on href {href}") + return None + else: + return None + except FileNotFoundError: + logging.warning(f"错误:指定的路径 {full_path} 不存在。") + except PermissionError: + logging.warning(f"错误:没有权限读取文件 {full_path}。") + except Exception as e: + logging.warning(f"发生未知错误:{e}") + return None + # 去掉 https://www.javdb.com/makers/16w?f=download 后面的参数