modify scripts
This commit is contained in:
@ -148,10 +148,15 @@ def fetch_movies_by_series():
|
||||
|
||||
# 更新演员信息
|
||||
def fetch_performers_detail():
|
||||
limit_count = 5 if debug else 100
|
||||
perfomers_list = []
|
||||
last_perfomer_id = 0
|
||||
while True:
|
||||
# 每次从数据库中取一部分,避免一次全量获取
|
||||
perfomers_list = db_tools.query_actors(is_full_data=0, limit=100)
|
||||
if force: # 从头逐个遍历
|
||||
perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count, from_actor_list=1)
|
||||
else: # 只做更新
|
||||
perfomers_list = db_tools.query_actors(is_full_data=0, limit=limit_count)
|
||||
if len(perfomers_list) < 1:
|
||||
logging.info(f'all performers fetched.')
|
||||
break
|
||||
@ -189,21 +194,27 @@ def fetch_performers_detail():
|
||||
})
|
||||
if performer_id:
|
||||
logging.info(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
|
||||
last_perfomer_id = performer_id
|
||||
else:
|
||||
logging.warning(f'insert person: ({person}) {url} failed.')
|
||||
time.sleep(0.5)
|
||||
# 调试break
|
||||
if debug:
|
||||
return True
|
||||
|
||||
# 更新影片信息
|
||||
def fetch_movies_detail():
|
||||
limit_count = 10 if debug else 100
|
||||
movies_list = []
|
||||
last_movie_id = 0
|
||||
while True:
|
||||
movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=100)
|
||||
if force: # 从头逐个遍历
|
||||
movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count, from_actor_list=1)
|
||||
else: # 只做更新
|
||||
movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
|
||||
if len(movies_list) < 1:
|
||||
logging.info(f'all movies fetched.')
|
||||
break
|
||||
last_movie_id = 0
|
||||
succ_count = 0
|
||||
for movie in movies_list:
|
||||
url = movie['href']
|
||||
@ -231,7 +242,7 @@ def fetch_movies_detail():
|
||||
logging.warning(f'401 page(need login). id: {movie_id}, title: ({title}), url: {url}, Skiping...')
|
||||
else:
|
||||
logging.warning(f'fetch_page error. url: {url}')
|
||||
time.sleep(1)
|
||||
time.sleep(0.5)
|
||||
logging.info(f'total request: {len(movies_list)}, succ: {succ_count}. last movie id: {last_movie_id}')
|
||||
# 调试增加break
|
||||
if debug:
|
||||
@ -253,6 +264,9 @@ function_map = {
|
||||
def main(cmd, args_debug, args_force):
|
||||
global debug
|
||||
debug = args_debug
|
||||
if debug:
|
||||
logger = logging.getLogger()
|
||||
#logger.setLevel(logging.DEBUG)
|
||||
|
||||
global force
|
||||
force = args_force
|
||||
|
||||
@ -11,6 +11,7 @@ from bs4 import BeautifulSoup
|
||||
from requests.exceptions import RequestException
|
||||
from functools import partial
|
||||
import config
|
||||
import utils
|
||||
|
||||
# 定义基础 URL 和可变参数
|
||||
host_url = "https://www.javdb.com"
|
||||
@ -24,8 +25,22 @@ headers = {
|
||||
}
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
save_raw_html = True
|
||||
load_from_local = True
|
||||
|
||||
#使用 CloudScraper 进行网络请求,并执行页面验证,支持不同解析器和预处理
|
||||
def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor=None):
|
||||
if load_from_local: # 从本地读取的逻辑
|
||||
html = utils.read_raw_html(url)
|
||||
if html:
|
||||
# 预处理 HTML(如果提供了 preprocessor)
|
||||
html_text = preprocessor(html) if preprocessor else html
|
||||
|
||||
soup = BeautifulSoup(html_text, parser)
|
||||
if validator(soup): # 进行自定义页面检查
|
||||
logging.info(f"read from local. href: {url}")
|
||||
return soup, 99 # 返回一个小于100的错误码,表明是从本地返回的
|
||||
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
if 'javdb.com' not in url.lower():
|
||||
@ -50,6 +65,9 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
|
||||
logging.warning(f"Page redirected to login page on {url}.")
|
||||
return None, 401
|
||||
|
||||
if save_raw_html:
|
||||
utils.write_raw_html(url, response.text)
|
||||
|
||||
# 预处理 HTML(如果提供了 preprocessor)
|
||||
html_text = preprocessor(response.text) if preprocessor else response.text
|
||||
|
||||
@ -223,7 +241,7 @@ def parse_actor_detail(soup, href):
|
||||
|
||||
|
||||
# 解析 HTML 内容,提取需要的数据
|
||||
def parse_movie_detail(soup, href, title):
|
||||
def parse_movie_detail_old(soup, href, title):
|
||||
div_video = soup.find("div", class_='video-meta-panel')
|
||||
if not div_video:
|
||||
logging.warning(f"Warning: No movies div found ")
|
||||
@ -272,6 +290,74 @@ def parse_movie_detail(soup, href, title):
|
||||
'actors': actors
|
||||
}
|
||||
|
||||
# Extract a single plain-text field from the movie meta panel.
def parse_movie_one(soup, keys):
    """Return the stripped text of the 'value' span that follows a
    <strong> label whose text is one of *keys*, or None when absent."""
    label = soup.find('strong', string=lambda text: text in keys)
    if not label:
        return None
    value_span = label.find_next_sibling('span', class_='value')
    if not value_span:
        return None
    return value_span.text.strip()
|
||||
|
||||
# Extract a labeled field as (text, absolute link) from the meta panel.
def parse_movie_val_href(soup, keys):
    """Return (text, href) for the 'value' span following a <strong>
    label matching one of *keys*.

    href is host_url-prefixed when the span contains an <a>, otherwise
    None; (None, None) when the label or span is missing."""
    label = soup.find('strong', string=lambda text: text in keys)
    if not label:
        return None, None
    value_span = label.find_next_sibling('span', class_='value')
    if not value_span:
        return None, None
    link = value_span.find('a')
    if not link:
        return value_span.text.strip(), None
    return link.text.strip(), host_url + link.get('href')
|
||||
|
||||
# Extract a labeled multi-value field (e.g. tags, actors) as a list.
def parse_movie_arr(soup, keys):
    """Return [{'name': ..., 'href': ...}, ...] for every <a> inside the
    'value' span following a <strong> label matching one of *keys*;
    [] when the label, span, or links are missing."""
    label = soup.find('strong', string=lambda text: text in keys)
    if not label:
        return []
    value_span = label.find_next_sibling('span', class_='value')
    if not value_span:
        return []
    return [
        {'name': a.text.strip(), 'href': host_url + a.get('href')}
        for a in value_span.find_all('a')
    ]
|
||||
|
||||
# Parse a movie detail page into a flat dict of metadata fields.
def parse_movie_detail(soup, href, title):
    """Extract movie metadata (ids, dates, maker/series, tags, actors)
    from a detail-page soup.

    NOTE(review): the failure path returns a 2-tuple (None, None) while
    the success path returns a single dict — callers must cope with both
    shapes; confirm which arity is actually intended.
    """
    meta_panel = soup.find("div", class_='video-meta-panel')
    if not meta_panel:
        logging.warning(f"Warning: No movies div found ")
        return None, None

    result = {}
    result['href'] = href
    result['title'] = title

    # Cover image: the href of the cover-column anchor, if present.
    cover_anchor = soup.select_one('.column-video-cover a')
    result['cover_url'] = cover_anchor['href'] if cover_anchor else None

    # Simple text fields (labels appear in zh-TW or English).
    result['serial_number'] = parse_movie_one(soup, ['番號:', 'ID:'])
    result['release_date'] = parse_movie_one(soup, ['日期:', 'Released Date:'])
    result['duration'] = parse_movie_one(soup, ['時長:', 'Duration:'])

    # Maker and series: text plus absolute link.
    result['maker_name'], result['maker_link'] = parse_movie_val_href(soup, ['片商:', 'Maker:'])
    result['series_name'], result['series_link'] = parse_movie_val_href(soup, ['系列:', 'Series:'])

    # Multi-valued fields: tags and cast.
    result['tags'] = parse_movie_arr(soup, ['類別:', 'Tags:'])
    result['actors'] = parse_movie_arr(soup, ['演員:', 'Actor(s):'])

    return result
|
||||
|
||||
# 解析 HTML 内容,提取需要的数据
|
||||
def parse_series_uncensored(soup, href):
|
||||
div_series = soup.find("div", id='series')
|
||||
|
||||
@ -49,7 +49,7 @@ def insert_actor_index(name, href, from_actor_list=None, from_movie_list=None):
|
||||
|
||||
performer_id = get_id_by_href('javdb_actors', href)
|
||||
if performer_id:
|
||||
logging.debug(f'Inserted/Updated actor index, id: {performer_id}, name: {name}, href: {href}')
|
||||
logging.debug(f"Inserted/Updated actor index, id: {performer_id}, name: {name}, href: {href}")
|
||||
|
||||
return performer_id
|
||||
|
||||
@ -200,6 +200,33 @@ def query_actors(**filters):
|
||||
if "is_full_data" in filters:
|
||||
sql += " AND is_full_data = ?"
|
||||
params.append(filters["is_full_data"])
|
||||
if "from_actor_list" in filters:
|
||||
sql += " AND from_actor_list = ?"
|
||||
params.append(filters["from_actor_list"])
|
||||
if "is_full_data_in" in filters:
|
||||
values = filters["is_full_data_in"]
|
||||
if values:
|
||||
placeholders = ", ".join(["?"] * len(values))
|
||||
sql += f" AND is_full_data IN ({placeholders})"
|
||||
params.extend(values)
|
||||
if "is_full_data_not_in" in filters:
|
||||
values = filters["is_full_data_not_in"]
|
||||
if values:
|
||||
placeholders = ", ".join(["?"] * len(values))
|
||||
sql += f" AND is_full_data NOT IN ({placeholders})"
|
||||
params.extend(values)
|
||||
if "before_updated_at" in filters:
|
||||
sql += " AND updated_at <= ?"
|
||||
params.append(filters["before_updated_at"])
|
||||
if "after_updated_at" in filters:
|
||||
sql += " AND updated_at >= ?"
|
||||
params.append(filters["after_updated_at"])
|
||||
if "start_id" in filters:
|
||||
sql += " AND id > ?"
|
||||
params.append(filters["start_id"])
|
||||
if "order_by" in filters:
|
||||
sql += " order by ? asc"
|
||||
params.append(filters["order_by"])
|
||||
if 'limit' in filters:
|
||||
sql += " limit ?"
|
||||
params.append(filters["limit"])
|
||||
@ -372,13 +399,43 @@ def query_series_hrefs(**filters):
|
||||
return None
|
||||
|
||||
|
||||
# Insert a tag row or update it in place (upsert keyed on href).
def insert_or_update_tags(name, href):
    """Upsert one row in javdb_tags and return its id, or None on failure.

    Uses the module-level sqlite connection/cursor; commits on success
    and rolls back on sqlite errors.
    """
    try:
        cursor.execute("""
            INSERT INTO javdb_tags (name, href, updated_at)
            VALUES (?, ? , datetime('now', 'localtime'))
            ON CONFLICT(href) DO UPDATE SET
            name = excluded.name,
            updated_at = datetime('now', 'localtime')
        """, (name, href))
        conn.commit()

        # Look up the id of the row we just upserted.
        cursor.execute("SELECT id FROM javdb_tags WHERE href = ?", (href,))
        row = cursor.fetchone()
        # Fix: the old code indexed cursor.fetchone()[0] directly, which
        # raises TypeError (not caught below — only sqlite3.Error is)
        # when no row matches.
        dist_id = row[0] if row else None
        if dist_id:
            logging.debug(f"insert/update tags succ. id: {dist_id}, name: {name}")
            return dist_id
        else:
            return None
    except sqlite3.Error as e:
        conn.rollback()
        logging.error(f"数据库错误: {e}")
        return None
|
||||
|
||||
# """插入或更新电影数据"""
|
||||
def insert_or_update_movie(movie):
|
||||
try:
|
||||
# 获取相关 ID
|
||||
makers_id = get_id_by_href('javdb_makers', movie['maker_link'])
|
||||
series_id = get_id_by_href('javdb_series', movie['series_link'])
|
||||
makers_id = get_id_by_href('javdb_makers', movie['maker_link']) if movie['maker_link'] else None
|
||||
series_id = get_id_by_href('javdb_series', movie['series_link']) if movie['series_link'] else None
|
||||
|
||||
# 如果不存在,插入
|
||||
if makers_id is None and movie['maker_link']:
|
||||
makers_id = insert_or_update_makers({'name' : movie.get('maker_name', ''), 'href' : movie.get('maker_link', '')})
|
||||
if series_id is None and movie['series_link']:
|
||||
series_id = insert_or_update_series({'name' : movie.get('series_name', ''), 'href' : movie.get('series_link', '')})
|
||||
|
||||
cursor.execute("""
|
||||
INSERT INTO javdb_movies (href, title, cover_url, serial_number, release_date, duration,
|
||||
@ -404,7 +461,7 @@ def insert_or_update_movie(movie):
|
||||
if movie_id is None:
|
||||
return None
|
||||
|
||||
logging.debug(f'insert one move, id: {movie_id}, title: {movie['title']}, href: {movie['href']}')
|
||||
logging.debug(f"insert one move, id: {movie_id}, title: {movie['title']}, href: {movie['href']}")
|
||||
|
||||
# 插入 performers_movies 关系表
|
||||
for performer in movie.get('actors', []):
|
||||
@ -412,14 +469,23 @@ def insert_or_update_movie(movie):
|
||||
# 如果演员不存在,先插入
|
||||
if performer_id is None:
|
||||
performer_id = insert_actor_index(performer['name'], performer['href'], from_movie_list=1)
|
||||
logging.debug(f"insert new perfomer. perfomer_id: {performer_id}, name:{performer['name']}")
|
||||
if performer_id:
|
||||
tmp_id = insert_actor_movie(performer_id, movie_id)
|
||||
if tmp_id:
|
||||
logging.debug(f"insert one perfomer_movie. perfomer_id: {performer_id}, movie_id:{movie_id}")
|
||||
else:
|
||||
logging.debug(f'insert perfomer_movie failed. perfomer_id: {performer_id}, movie_id:{movie_id}')
|
||||
logging.debug(f"insert perfomer_movie failed. perfomer_id: {performer_id}, movie_id:{movie_id}")
|
||||
else:
|
||||
logging.warning(f'insert perfomer failed. name: {performer['name']}, href: {performer['href']}')
|
||||
logging.warning(f"insert perfomer failed. name: {performer['name']}, href: {performer['href']}")
|
||||
|
||||
# 插入 tags 表
|
||||
for tag in movie.get('tags', []):
|
||||
tag_name = tag.get('name', '')
|
||||
tag_href = tag.get('href', '')
|
||||
tag_id = insert_or_update_tags(tag_name, tag_href)
|
||||
if tag_id:
|
||||
logging.debug(f"insert one tags. tag_id: {tag_id}, name:{tag_name}")
|
||||
|
||||
return movie_id
|
||||
|
||||
@ -516,6 +582,33 @@ def query_movie_hrefs(**filters):
|
||||
if "is_full_data" in filters:
|
||||
sql += " AND is_full_data = ?"
|
||||
params.append(filters["is_full_data"])
|
||||
if "from_actor_list" in filters:
|
||||
sql += " AND from_actor_list = ?"
|
||||
params.append(filters["from_actor_list"])
|
||||
if "is_full_data_in" in filters:
|
||||
values = filters["is_full_data_in"]
|
||||
if values:
|
||||
placeholders = ", ".join(["?"] * len(values))
|
||||
sql += f" AND is_full_data IN ({placeholders})"
|
||||
params.extend(values)
|
||||
if "is_full_data_not_in" in filters:
|
||||
values = filters["is_full_data_not_in"]
|
||||
if values:
|
||||
placeholders = ", ".join(["?"] * len(values))
|
||||
sql += f" AND is_full_data NOT IN ({placeholders})"
|
||||
params.extend(values)
|
||||
if "before_updated_at" in filters:
|
||||
sql += " AND updated_at <= ?"
|
||||
params.append(filters["before_updated_at"])
|
||||
if "after_updated_at" in filters:
|
||||
sql += " AND updated_at >= ?"
|
||||
params.append(filters["after_updated_at"])
|
||||
if "start_id" in filters:
|
||||
sql += " AND id > ?"
|
||||
params.append(filters["start_id"])
|
||||
if "order_by" in filters:
|
||||
sql += " order by ?"
|
||||
params.append(filters["order_by"])
|
||||
if 'limit' in filters:
|
||||
sql += " limit ?"
|
||||
params.append(filters["limit"])
|
||||
|
||||
@ -3,8 +3,98 @@ import os
|
||||
import json
|
||||
import time
|
||||
import csv
|
||||
from datetime import datetime
|
||||
from urllib.parse import urlparse
|
||||
import logging
|
||||
import config
|
||||
|
||||
update_dir = f'{config.global_host_data_dir}/javdb'
|
||||
|
||||
# Create (if needed) a one-letter bucket directory under base_dir.
def create_sub_directory(base_dir, str):
    """Return base_dir/<first char of *str*, lowercased>, creating it.

    NOTE: the parameter is named ``str`` (shadows the builtin); kept
    unchanged for backward compatibility with existing callers.
    """
    # Bucket by the FIRST character only (the old comment claimed the
    # first two letters, which the code never did).
    sub_dir = str[:1].lower()
    full_path = os.path.join(base_dir, sub_dir)
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists() + os.makedirs() pair.
    os.makedirs(full_path, exist_ok=True)
    return full_path
|
||||
|
||||
# Extract the lowercase video id from a javdb movie-detail URL.
def extract_id_from_href(href):
    """Return the id segment of a 'javdb.com/v/<id>' URL, lowercased,
    or '' for any other URL.

    Matching is case-insensitive — the old code checked the host
    case-sensitively while callers (write_raw_html/read_raw_html) test
    ``href.lower()``, so mixed-case hosts yielded an empty id.
    """
    # Dots are escaped (the old pattern's bare '.' matched any char),
    # and '#' also terminates the id so URL fragments are excluded.
    match = re.search(r'javdb\.com/v/([^?&#]+)', href, re.IGNORECASE)
    return match.group(1).lower() if match else ''
|
||||
|
||||
# Persist a fetched page's raw HTML so it can be re-verified later.
def write_raw_html(href, html_text):
    """Write *html_text* to <update_dir>/raw_movies/<bucket>/<id>.html.

    Only movie-detail URLs (javdb.com/v/...) are cached; anything else
    is silently ignored. Write errors are logged, never raised.
    """
    if 'javdb.com/v/' not in href.lower():
        return  # only movie pages are cached

    movie_id = extract_id_from_href(href)
    # Fix: when id extraction failed the old code built an empty-named
    # '.html' file directly under the base directory; skip instead.
    if not movie_id:
        logging.warning(f"cannot extract id from href {href}, skip caching")
        return

    file_dir = create_sub_directory(f"{update_dir}/raw_movies", movie_id)
    full_path = os.path.join(file_dir, f"{movie_id}.html")

    try:
        with open(full_path, 'w', encoding='utf-8') as file:
            file.write(html_text)
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限写入文件 {full_path}。")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")
|
||||
|
||||
|
||||
# Load a previously cached raw HTML page, honoring an expiry cut-off.
# (The old comment said "save" — copy-pasted from write_raw_html.)
def read_raw_html(href, expire_date_str="2025-03-01"):
    """Return cached HTML for a movie-detail URL, or None.

    None is returned when the URL is not a movie page, the id cannot be
    extracted, the cache file is missing, or its mtime is on/before
    *expire_date_str* (format YYYY-MM-DD). Read errors are logged.
    """
    if 'javdb.com/v/' not in href.lower():
        return None

    movie_id = extract_id_from_href(href)
    # Fix: an empty id previously probed '<base>/.html'; treat it as a
    # cache miss instead.
    if not movie_id:
        return None

    file_dir = create_sub_directory(f"{update_dir}/raw_movies", movie_id)
    full_path = os.path.join(file_dir, f"{movie_id}.html")

    try:
        if not os.path.exists(full_path):
            return None
        # Compare the file's last-modified time against the cut-off;
        # only strictly-newer files are served from cache.
        modified_at = datetime.fromtimestamp(os.path.getmtime(full_path))
        expire_date = datetime.strptime(expire_date_str, "%Y-%m-%d")
        if modified_at <= expire_date:
            logging.debug(f"expired file {modified_at} on href {href}")
            return None
        logging.debug(f"find local file on href {href}")
        with open(full_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        logging.warning(f"错误:指定的路径 {full_path} 不存在。")
    except PermissionError:
        logging.warning(f"错误:没有权限读取文件 {full_path}。")
    except Exception as e:
        logging.warning(f"发生未知错误:{e}")
    return None
|
||||
|
||||
|
||||
|
||||
# 去掉 https://www.javdb.com/makers/16w?f=download 后面的参数
|
||||
|
||||
Reference in New Issue
Block a user