modify some scripts.

2025-03-07 19:11:41 +08:00
parent 6cebf3f8ac
commit f5929811c7
27 changed files with 778 additions and 2724974 deletions

View File

@@ -20,26 +20,29 @@ def fetch_actor_list():
next_url = scraper.actors_uncensored_base_url
while next_url:
logging.info(f'fetching page {next_url}')
soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="actors", attr_type="id"))
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="actors", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_actors_uncensored(soup, next_url)
if list_data:
# write the results to the database
for row in list_data:
actor_id = db_tools.insert_actor_index(name=row['name'], href=row['href'] if row['href'] else '')
actor_id = db_tools.insert_actor_index(name=row['name'], href=row.get('href', ''), from_actor_list=1)
if actor_id:
logging.debug(f'insert performer index to db. performer_id:{actor_id}, name: {row["name"]}, href:{row["href"]}')
else:
logging.warning(f'insert performer index failed. name: {row["name"]}, href:{row["href"]}')
else:
logging.warning(f'fetch actor error. {next_url} ...')
elif status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
break
# fetch the makers list
def fetch_makers_list():
next_url = scraper.makers_uncensored_base_url
while next_url:
logging.info(f'fetching page {next_url}')
soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="makers", attr_type="id"))
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="makers", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_makers_uncensored(soup, next_url)
if list_data:
@@ -53,12 +56,16 @@ def fetch_makers_list():
else:
logging.warning(f'fetch makers error. {next_url} ...')
elif status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
break
# fetch the series list
def fetch_series_list():
next_url = scraper.series_uncensored_base_url
while next_url:
logging.info(f'fetching page {next_url}')
soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="series", attr_type="id"))
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="series", attr_type="id"))
if soup:
list_data, next_url = scraper.parse_series_uncensored(soup, next_url)
if list_data:
@@ -72,6 +79,10 @@ def fetch_series_list():
else:
logging.warning(f'fetch series error. {next_url} ...')
elif status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
break
# update movie info from the makers list
def fetch_movies_by_maker():
@@ -79,21 +90,27 @@ def fetch_movies_by_maker():
if debug:
url_list = db_tools.query_maker_hrefs(name='muramura')
for url in url_list:
next_url = url
while True:
# strip the download flag from the query string (if any)
next_url = utils.remove_url_query(url)
while next_url:
logging.info(f"Fetching data for maker url {next_url} ...")
soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
if soup:
list_data, next_url = scraper.parse_maker_detail(soup, next_url)
if list_data:
for movie in list_data:
tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'])
tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_makers=1)
if tmp_id:
logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
else:
logging.warning(f'insert movie index failed. title: {movie["title"]}, href: {movie["href"]}')
else:
logging.warning(f'parse_maker_detail error. url: {next_url}')
elif status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
break
# break early when debugging
if debug:
return True
@@ -104,21 +121,26 @@ def fetch_movies_by_series():
if debug:
url_list = db_tools.query_series_hrefs(name='10musume')
for url in url_list:
next_url = url
while True:
# strip the download flag from the query string (if any)
next_url = utils.remove_url_query(url)
while next_url:
logging.info(f"Fetching data for series url {next_url} ...")
soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
if soup:
list_data, next_url = scraper.parse_series_detail(soup, next_url)
if list_data:
for movie in list_data:
tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'])
tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_series=1)
if tmp_id:
logging.debug(f'insert one movie index to db. movie_id: {tmp_id}, title: {movie["title"]}, href: {movie["href"]}')
else:
logging.warning(f'insert movie index failed. title: {movie["title"]}, href: {movie["href"]}')
else:
logging.warning(f'parse_series_detail error. url: {next_url}')
elif status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
break
# break early when debugging
if debug:
return True
@@ -129,23 +151,31 @@ def fetch_performers_detail():
performers_list = []
while True:
# fetch a batch at a time instead of everything at once
performers_list = db_tools.query_actors(is_full_data=0, limit=10)
performers_list = db_tools.query_actors(is_full_data=0, limit=100)
if len(performers_list) < 1:
logging.info('all performers fetched.')
break
for performer in performers_list:
url = performer['href']
person = performer['name']
pic = ''
alias = []
next_url = url
all_movies = []
while next_url:
logging.info(f"Fetching data for actor ({person}), url {next_url} ...")
soup = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="movie-list h cols-4 vcols-5", attr_type="class"))
if soup:
data, next_url = scraper.parse_actor_detail(soup, next_url)
if data:
all_movies.extend(data)
pic = data.get('pic', '')
alias = data.get('alias', [])
all_movies.extend(data.get('movies', []))
elif status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}, skipping...')
break
else:
logging.warning(f'fetch_page error. person: ({person}), url: {url}')
@@ -153,8 +183,8 @@ def fetch_performers_detail():
performer_id = db_tools.insert_or_update_actor({
'href': url,
'name': person,
'pic' : '',
'alias' : [],
'pic' : pic,
'alias' : alias,
'credits':all_movies
})
if performer_id:
@@ -169,7 +199,7 @@ def fetch_performers_detail():
def fetch_movies_detail():
movies_list = []
while True:
movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=10)
movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=100)
if len(movies_list) < 1:
logging.info(f'all movies fetched.')
break
@@ -177,7 +207,7 @@ def fetch_movies_detail():
url = movie['href']
title = movie['title']
logging.info(f"Fetching data for movie ({title}), url {url} ...")
soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
if soup:
movie_data = scraper.parse_movie_detail(soup, url, title)
if movie_data:
@@ -188,84 +218,76 @@ def fetch_movies_detail():
logging.warning(f'insert movie {url} failed.')
else:
logging.warning(f'parse_page_movie error. url: {url}')
elif status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}')
break
else:
logging.warning(f'fetch_page error. url: {url}')
# break early when debugging
if debug:
return True
# fetch updates
def check_update():
# map shortcut names to functions
function_map = {
"actor_list": fetch_actor_list,
"maker_list": fetch_makers_list,
"series_list": fetch_series_list,
"makers": fetch_movies_by_maker,
"series" : fetch_movies_by_series,
"movies" : fetch_movies_detail,
"actors" : fetch_performers_detail,
}
# main entry point
def main(cmd, args_debug, args_force):
global debug
debug = args_debug
global force
force = args_force
# start a task log entry
task_id = db_tools.insert_task_log()
if task_id is None:
logging.warning(f'insert task log error.')
return None
logging.info(f'running task. id: {task_id}, debug: {debug}, force: {force}, cmd: {cmd}')
if False:
# refresh the actor list
db_tools.update_task_log(task_id, task_status='fetching actor list')
fetch_actor_list()
# refresh the makers list
db_tools.update_task_log(task_id, task_status='fetching maker list')
fetch_makers_list()
# refresh the series list
db_tools.update_task_log(task_id, task_status='fetching series list')
fetch_series_list()
# refresh the movie list
db_tools.update_task_log(task_id, task_status='fetching movie list by maker')
fetch_movies_by_maker()
db_tools.update_task_log(task_id, task_status='fetching movie list by series')
fetch_movies_by_series()
# update actor details
db_tools.update_task_log(task_id, task_status='fetching performers')
fetch_performers_detail()
# update movie details
db_tools.update_task_log(task_id, task_status='fetching movies')
fetch_movies_detail()
# run the requested functions
if cmd:
function_names = cmd.split(",") # split the comma-separated input
for short_name in function_names:
func = function_map.get(short_name.strip()) # look up the matching function
if callable(func):
db_tools.update_task_log(task_id, task_status=f'Running {func.__name__}')
func()
else:
logging.warning(f" {short_name} is not a valid function shortcut.")
else: # no --cmd given: run everything
for name, func in function_map.items():
if callable(func):
db_tools.update_task_log(task_id, task_status=f'Running {func.__name__}')
func()
else:
logging.warning(f" {short_name} is not a valid function shortcut.")
logging.info('all processing completed!')
db_tools.finalize_task_log(task_id)
# TODO:
# 1,
# process local data
def load_data():
return True
# main entry point
def main(task, args_debug, args_force):
global debug
debug = args_debug
if debug:
logging.info('Debug mode enabled.')
global force
force = args_force
if force:
logging.info('forcing update of all data.')
if task == 'fetch':
check_update()
elif task == 'load':
load_data()
else:
print('unknown command. see --help.')
if __name__ == "__main__":
# command-line argument parsing
parser = argparse.ArgumentParser(description='fetch iafd data.')
parser.add_argument('--task', type=str, default='fetch', help='fetch from iafd.com or load from local data ... (fetch , load)')
keys_str = ",".join(function_map.keys())
parser = argparse.ArgumentParser(description='fetch javdb data.')
parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
parser.add_argument('--force', action='store_true', help='force update (rewrite all records)')
args = parser.parse_args()
main(args.task, args.debug, args.force)
main(args.cmd, args.debug, args.force)
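For reference, a sketch of how the new --cmd dispatch would be invoked (the script filename fetch_javdb.py is an assumption; it does not appear in this diff):

# run selected steps only, using the shortcut names defined in function_map:
#   python fetch_javdb.py --cmd actor_list,maker_list --debug
# omit --cmd to run every step in function_map in order:
#   python fetch_javdb.py --force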

View File

@@ -30,9 +30,15 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
try:
if 'javdb.com' not in url.lower():
logging.error(f'wrong url format: {url}')
return None
return None, None
response = scraper.get(url, headers=headers)
# handle HTTP status codes
if response.status_code == 404:
logging.warning(f"Page not found (404): {url}")
return None, 404 # return 404 right away; the caller can skip this page
response.raise_for_status() # raise on other HTTP errors
# preprocess the HTML if a preprocessor was provided
@@ -40,7 +46,7 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
soup = BeautifulSoup(html_text, parser)
if validator(soup): # run the custom page check
return soup
return soup, response.status_code
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
except cloudscraper.exceptions.CloudflareChallengeError as e:
@@ -51,7 +57,7 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
logging.error(f"Unexpected error on {url}: {e}, Retring...")
logging.error(f'Fetching failed after max retries. {url}')
return None # still failing after the maximum number of retries
return None, None # still failing after the maximum number of retries
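Every caller now unpacks a (soup, status_code) tuple from fetch_page; a minimal sketch of the intended pattern (the validator arguments are placeholders taken from the calls above):

soup, status_code = fetch_page(next_url, partial(generic_validator, tag="div", identifier="actors", attr_type="id"))
if soup:
    pass  # validated page: hand the soup to the matching parse_* function
elif status_code == 404:
    pass  # permanent miss: skip this URL instead of retrying it
else:
    pass  # transient failure after retries: log it and move on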
# fix the HTML structure: strip redundant tags and repair <a> tags; needed when extracting ethnicity
def preprocess_html(html):
@@ -78,6 +84,21 @@ def url_page_num(href):
else:
return None
# <span class="avatar" style="background-image: url(https://c0.jdbstatic.com/avatars/md/mdRn.jpg)"></span>
def parse_avatar_image(soup):
try:
span = soup.find("span", class_="avatar")
if not span:
return "" # no <span class="avatar"> element found; return an empty string
style = span.get("style", "")
match = re.search(r'url\(["\']?(.*?)["\']?\)', style)
return match.group(1) if match else "" # return the URL on success, otherwise an empty string
except Exception:
return "" # on any exception, return an empty string
# parse the HTML content and extract the data we need
def parse_actors_uncensored(soup, href):
div_actors = soup.find("div", id='actors')
@@ -123,6 +144,29 @@ def parse_actors_uncensored(soup, href):
# parse the HTML content and extract the data we need
def parse_actor_detail(soup, href):
# look for any aliases first
alias_list = []
div_meta = soup.find('span', class_='actor-section-name')
if not div_meta:
logging.warning(f'no metadata found in page {href}')
return None, None
alias_div = soup.find('div', class_='column section-title')
if alias_div:
meta_list = alias_div.find_all('span', class_='section-meta')
if len(meta_list) > 1:
alias_list = meta_list[0].text.strip().split(", ")
# avatar
pic = ''
avatar = soup.find("div", class_="column actor-avatar")
if avatar:
pic = parse_avatar_image(avatar)
# assemble the return data
actor = {}
div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
if not div_movies:
logging.warning(f"Warning: No movies div found ")
@ -157,7 +201,13 @@ def parse_actor_detail(soup, href):
if next_page_number and next_page_number > current_page_number :
next_url = host_url + next_page_url
return list_data, next_url
actor = {
'pic' : pic,
'alias' : alias_list,
'movies' : list_data
}
return actor, next_url
# parse the HTML content and extract the data we need
@@ -257,7 +307,7 @@ def parse_series_detail(soup, href):
div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
if not div_movies:
logging.warning(f"Warning: No movies div found ")
return None, None
return [], None
# parse the item elements
rows = div_movies.find_all('div', class_='item')
@@ -337,7 +387,7 @@ def parse_maker_detail(soup, href):
div_movies = soup.find("div", class_='movie-list h cols-4 vcols-5')
if not div_movies:
logging.warning(f"Warning: No movies div found ")
return None, None
return [], None
# parse the item elements
rows = div_movies.find_all('div', class_='item')

View File

@@ -17,21 +17,42 @@ def get_id_by_href(table: str, href: str) -> int:
row = cursor.fetchone()
return row[0] if row else None
# insert an actor index entry (from list-page data)
def insert_actor_index(name, href):
def insert_actor_index(name, href, from_actor_list=None, from_movie_list=None):
try:
cursor.execute("""
INSERT OR IGNORE INTO javdb_actors (href, name) VALUES (?, ?)
""", (
href, name
))
# check whether the actor already exists
cursor.execute("SELECT id, name, from_actor_list, from_movie_list FROM javdb_actors WHERE href = ?", (href,))
existing_actor = cursor.fetchone()
if existing_actor: # the actor already exists
actor_id, existing_name, existing_actor_list, existing_movie_list = existing_actor
# keep the stored values when no new ones are passed in
from_actor_list = from_actor_list if from_actor_list is not None else existing_actor_list
from_movie_list = from_movie_list if from_movie_list is not None else existing_movie_list
cursor.execute("""
UPDATE javdb_actors
SET name = ?,
from_actor_list = ?,
from_movie_list = ?,
updated_at = datetime('now', 'localtime')
WHERE href = ?
""", (name, from_actor_list, from_movie_list, href))
else: # the actor does not exist yet; insert it
cursor.execute("""
INSERT INTO javdb_actors (href, name, from_actor_list, from_movie_list)
VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0))
""", (href, name, from_actor_list, from_movie_list))
conn.commit()
performer_id = get_id_by_href('javdb_actors', href)
if performer_id:
logging.debug(f'insert one actor index, id: {performer_id}, name: {name}, href: {href}')
logging.debug(f'Inserted/Updated actor index, id: {performer_id}, name: {name}, href: {href}')
return performer_id
return performer_id
except sqlite3.Error as e:
conn.rollback()
logging.error(f"数据库错误: {e}")
@ -41,28 +62,49 @@ def insert_actor_index(name, href):
logging.error(f"未知错误: {e}")
return None
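The select-then-update/insert sequence here (and in insert_movie_index below) could also be collapsed into one statement with SQLite's native UPSERT; a sketch, assuming SQLite 3.24+ and a UNIQUE constraint on href:

cursor.execute("""
    INSERT INTO javdb_actors (href, name, from_actor_list, from_movie_list)
    VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0))
    ON CONFLICT(href) DO UPDATE SET
        name = excluded.name,
        from_actor_list = COALESCE(?, javdb_actors.from_actor_list),
        from_movie_list = COALESCE(?, javdb_actors.from_movie_list),
        updated_at = datetime('now', 'localtime')
""", (href, name, from_actor_list, from_movie_list, from_actor_list, from_movie_list))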
# """插入电影索引,来自于列表数据"""
def insert_movie_index(title, href):
def insert_movie_index(title, href, from_actor_list=None, from_movie_makers=None, from_movie_series=None):
try:
# insert or update the movie record
cursor.execute("""
INSERT OR IGNORE INTO javdb_movies (title, href) VALUES (?, ?)
""",
(title, href)
)
# check whether the movie already exists
cursor.execute("SELECT id, from_actor_list, from_movie_makers, from_movie_series FROM javdb_movies WHERE href = ?", (href,))
existing_movie = cursor.fetchone()
if existing_movie: # the movie already exists
movie_id, existing_actor, existing_maker, existing_series = existing_movie
# keep the stored values when no new ones are passed in
from_actor_list = from_actor_list if from_actor_list is not None else existing_actor
from_movie_makers = from_movie_makers if from_movie_makers is not None else existing_maker
from_movie_series = from_movie_series if from_movie_series is not None else existing_series
cursor.execute("""
UPDATE javdb_movies
SET title = ?,
from_actor_list = ?,
from_movie_makers = ?,
from_movie_series = ?,
updated_at = datetime('now', 'localtime')
WHERE href = ?
""", (title, from_actor_list, from_movie_makers, from_movie_series, href))
else: # the movie does not exist yet; insert it
cursor.execute("""
INSERT INTO javdb_movies (title, href, from_actor_list, from_movie_makers, from_movie_series)
VALUES (?, ?, COALESCE(?, 0), COALESCE(?, 0), COALESCE(?, 0))
""", (title, href, from_actor_list, from_movie_makers, from_movie_series))
conn.commit()
movie_id = get_id_by_href('javdb_movies', href)
if movie_id:
logging.debug(f'insert one movie index, id: {movie_id}, title: {title}, href: {href}')
logging.debug(f'Inserted/Updated movie index, id: {movie_id}, title: {title}, href: {href}')
return movie_id
return movie_id
except Exception as e:
conn.rollback()
logging.error("Error inserting movie: %s", e)
logging.error(f"Error inserting/updating movie: {e}")
return None
# insert the actor/movie relation
def insert_actor_movie(performer_id, movie_id, tags=''):
try:
@@ -117,7 +159,7 @@ def insert_or_update_actor(actor):
movie_id = get_id_by_href('javdb_movies', movie['href'])
# the movie does not exist yet; insert it first
if movie_id is None:
movie_id = insert_movie_index(movie['title'], movie['href'])
movie_id = insert_movie_index(movie['title'], movie['href'], from_actor_list=1)
if movie_id:
tmp_id = insert_actor_movie(actor_id, movie_id)
if tmp_id:
@@ -369,7 +411,7 @@ def insert_or_update_movie(movie):
performer_id = get_id_by_href('javdb_actors', performer['href'])
# if the actor does not exist yet, insert it first
if performer_id is None:
performer_id = insert_actor_index(performer['name'], performer['href'])
performer_id = insert_actor_index(performer['name'], performer['href'], from_movie_list=1)
if performer_id:
tmp_id = insert_actor_movie(performer_id, movie_id)
if tmp_id:
@@ -465,7 +507,13 @@ def insert_task_log():
INSERT INTO javdb_task_log (task_status) VALUES ('Start')
""")
conn.commit()
return cursor.lastrowid # return the inserted task_id
task_id = cursor.lastrowid
if task_id is None:
return None
update_task_log(task_id=task_id, task_status='Start')
return task_id # return the inserted task_id
except sqlite3.Error as e:
logging.error(f"插入任务失败: {e}")
return None
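Taken together with update_task_log and finalize_task_log (both used in main above), the task-log lifecycle reads roughly:

task_id = insert_task_log()  # creates the row with status 'Start'
if task_id is not None:
    update_task_log(task_id, task_status='fetching actor list')
    # ... run the fetch steps ...
    finalize_task_log(task_id)  # mark the task as finished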

View File

@@ -0,0 +1,18 @@
import re
import os
import json
import time
import csv
from urllib.parse import urlparse
import logging
# strip the query string from URLs like https://www.javdb.com/makers/16w?f=download
def remove_url_query(url: str) -> str:
try:
parsed_url = urlparse(url)
clean_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
return clean_url
except Exception as e:
logging.warning(f"failed to parse URL: {e}")
return url
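Example, using the URL from the comment above:

print(remove_url_query("https://www.javdb.com/makers/16w?f=download"))
# -> https://www.javdb.com/makers/16w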