resources/javdb/src/fetch.py

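"""Fetch javdb data into the local database (via sqlite_utils): actor/maker/series
index lists; movie indexes per maker, series, and publisher; and full performer
and movie details."""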
import json
import time
import csv
import argparse
import logging
from functools import partial

import config
import sqlite_utils as db_tools
import scraper
import utils

config.setup_logging()

debug = False
skip_local = False
scan_mode = 0
update_mode = 0
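
# The flags above are overwritten from the CLI by set_env():
#   debug       - shrink query batches and stop after the first batch
#   skip_local  - skip database writes for pages served from the local cache
#   scan_mode   - 0: censored records only, 1: uncensored records only, 2: everything (default)
#   update_mode - 0: new records (is_full_data=0, default), 1: complete records,
#                 2: both 0 and 1, 3: abnormal records (404/login), 4: everything
#
# Every fetch_* function below follows the same pattern: scraper.fetch_page()
# returns (soup, status_code), the scraper.generic_validator partial is assumed
# to verify that the fetched page contains its marker element, and each parse_*
# helper returns (rows, next_page_url), so the while loop walks the pagination
# until next_page_url is None.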

# Fetch the actor list
def fetch_actor_list():
    next_url = scraper.actors_uncensored_base_url
    while next_url:
        logging.info(f'fetching page {next_url}')
        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="actors", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_actors_uncensored(soup, next_url)
            if list_data:
                # Write the rows to the database
                for row in list_data:
                    actor_id = db_tools.insert_actor_index(name=row['name'], href=row.get('href', ''), from_actor_list=1)
                    if actor_id:
                        logging.debug(f"insert performer index to db. performer_id: {actor_id}, name: {row['name']}, href: {row['href']}")
                    else:
                        logging.warning(f"insert performer index failed. name: {row['name']}, href: {row['href']}")
            else:
                logging.warning(f'parse_actors_uncensored error. {next_url} ...')
        else:
            # Stop paging on a failed fetch instead of retrying the same URL forever
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break

# Fetch the makers list
def fetch_makers_list():
    next_url = scraper.makers_uncensored_base_url
    while next_url:
        logging.info(f'fetching page {next_url}')
        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="makers", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_makers_uncensored(soup, next_url)
            if list_data:
                # Write the rows to the database
                for row in list_data:
                    maker_id = db_tools.insert_or_update_makers(row, caller='list')
                    if maker_id:
                        logging.debug(f"insert maker to db. maker_id: {maker_id}, name: {row['name']}, href: {row['href']}")
                    else:
                        logging.warning(f"insert maker failed. name: {row['name']}, href: {row['href']}")
            else:
                logging.warning(f'parse_makers_uncensored error. {next_url} ...')
        else:
            # Stop paging on a failed fetch instead of retrying the same URL forever
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break

# Fetch the series list
def fetch_series_list():
    next_url = scraper.series_uncensored_base_url
    while next_url:
        logging.info(f'fetching page {next_url}')
        soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="series", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_series_uncensored(soup, next_url)
            if list_data:
                # Write the rows to the database
                for row in list_data:
                    series_id = db_tools.insert_or_update_series(row, caller='list')
                    if series_id:
                        logging.debug(f"insert series to db. series_id: {series_id}, name: {row['name']}, href: {row['href']}")
                    else:
                        logging.warning(f"insert series failed. name: {row['name']}, href: {row['href']}")
            else:
                logging.warning(f'parse_series_uncensored error. {next_url} ...')
        else:
            # Stop paging on a failed fetch instead of retrying the same URL forever
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
            break

# Fetch movie index entries for makers in the list
def fetch_movies_by_maker():
    if debug:
        url_list = db_tools.query_maker_hrefs(name='muramura')
    else:
        if scan_mode == 1:
            url_list = db_tools.query_maker_hrefs(from_list=1)
        elif scan_mode == 0:
            url_list = db_tools.query_maker_hrefs(from_list=0)
        else:
            url_list = db_tools.query_maker_hrefs()
    for row in url_list:
        url = row['href']
        row_id = row['id']
        uncensored = row['from_list'] if row['from_list'] > 0 else None
        # Strip the 'downloadable' filter from the URL query (if any)
        next_url = utils.remove_url_query(url)
        while next_url:
            logging.info(f"Fetching data for maker url {next_url} ...")
            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
            if soup:
                list_data, next_url = scraper.parse_maker_detail(soup, next_url)
                if list_data:
                    for movie in list_data:
                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_makers=1, maker_id=row_id, uncensored=uncensored)
                        if tmp_id:
                            logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
                        else:
                            logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
                else:
                    logging.warning(f'parse_maker_detail error. url: {next_url}')
            else:
                # Stop paging on a failed fetch instead of retrying the same URL forever
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
                break
        # Return early when debugging
        if debug:
            return True

# Fetch movie index entries for series in the list
def fetch_movies_by_series():
    if debug:
        url_list = db_tools.query_series_hrefs(name='10musume')
    else:
        if scan_mode == 1:
            url_list = db_tools.query_series_hrefs(from_list=1)
        elif scan_mode == 0:
            url_list = db_tools.query_series_hrefs(from_list=0)
        else:
            url_list = db_tools.query_series_hrefs()
    for row in url_list:
        url = row['href']
        row_id = row['id']
        uncensored = row['from_list'] if row['from_list'] > 0 else None
        # Strip the 'downloadable' filter from the URL query (if any)
        next_url = utils.remove_url_query(url)
        while next_url:
            logging.info(f"Fetching data for series url {next_url} ...")
            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="column section-title", attr_type="class"))
            if soup:
                list_data, next_url = scraper.parse_series_detail(soup, next_url)
                if list_data:
                    for movie in list_data:
                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_series=1, series_id=row_id, uncensored=uncensored)
                        if tmp_id:
                            logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
                        else:
                            logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
                else:
                    logging.warning(f'parse_series_detail error. url: {next_url}')
            else:
                # Stop paging on a failed fetch instead of retrying the same URL forever
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
                break
        # Return early when debugging
        if debug:
            return True

# Fetch movie index entries for publishers in the list
def fetch_movies_by_publishers():
    if debug:
        url_list = db_tools.query_publishers_hrefs(limit=1)
    else:
        if scan_mode == 1:
            url_list = db_tools.query_publishers_hrefs(from_list=1)
        elif scan_mode == 0:
            url_list = db_tools.query_publishers_hrefs(from_list=0)
        else:
            url_list = db_tools.query_publishers_hrefs()
    for row in url_list:
        url = row['href']
        row_id = row['id']
        # Strip the 'downloadable' filter from the URL query (if any)
        next_url = utils.remove_url_query(url)
        while next_url:
            logging.info(f"Fetching data for publisher url {next_url} ...")
            soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="modal-card", attr_type="class"))
            if soup:
                list_data, next_url = scraper.parse_publisher_detail(soup, next_url)
                if list_data:
                    for movie in list_data:
                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], from_movie_publishers=1, pub_id=row_id)
                        if tmp_id:
                            logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
                        else:
                            logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
                else:
                    logging.warning(f'parse_publisher_detail error. url: {next_url}')
            else:
                # Stop paging on a failed fetch instead of retrying the same URL forever
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}')
                break
        # Return early when debugging
        if debug:
            return True

# Update performer details
def fetch_performers_detail():
    limit_count = 5 if debug else 100
    performers_list = []
    last_performer_id = 0
    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]
    def get_performers(**kwargs):
        if scan_mode == 1:
            kwargs["from_actor_list"] = 1
        elif scan_mode == 0:
            kwargs["from_actor_list"] = 0
        else:
            logging.debug("scan all records")
        kwargs["order_by"] = 'id asc'
        return db_tools.query_actors(limit=limit_count, **kwargs)
    while True:
        if update_mode == 0:    # only new records
            performers_list = get_performers(start_id=0, is_full_data=0)
        elif update_mode == 1:  # only complete records
            performers_list = get_performers(start_id=last_performer_id, is_full_data=1)
        elif update_mode == 2:  # 0 + 1
            performers_list = get_performers(start_id=last_performer_id, is_full_data_not_in=abnormal_codes)
        elif update_mode == 3:  # abnormal records only
            performers_list = get_performers(start_id=last_performer_id, is_full_data_in=abnormal_codes)
        else:                   # everything
            performers_list = get_performers(start_id=last_performer_id)
        if len(performers_list) < 1:
            logging.info('all performers fetched.')
            break
        succ_rows = 0
        for performer in performers_list:
            url = performer['href']
            person = performer['name']
            pic = ''
            alias = []
            next_url = url
            all_movies = []
            need_insert = True
            while next_url:
                logging.debug(f"Fetching data for actor ({person}), url {next_url} ...")
                soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="span", identifier="actor-section-name", attr_type="class"))
                if soup:
                    data, next_url = scraper.parse_actor_detail(soup, next_url)
                    if data:
                        pic = data.get('pic', '')
                        alias = data.get('alias', [])
                        all_movies.extend(data.get('movies', []))
                elif status_code and status_code == scraper.http_code_404:
                    actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=scraper.http_code_404)
                    logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, Skipping...')
                    need_insert = False
                    break
                elif status_code and status_code == scraper.http_code_login:
                    actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=scraper.http_code_login)
                    logging.warning(f'401 page (need login). id: {actor_id}, name: ({person}), url: {url}, Skipping...')
                    need_insert = False
                    break
                else:
                    # Unexpected failure: stop paging rather than retrying the same URL
                    # forever; the record stays incomplete and is retried on a later run
                    logging.warning(f'fetch_page error. url: {next_url}')
                    need_insert = False
                    break
            # A 404/login/error page was already handled above; skip the insert
            if not need_insert:
                continue
            # All of this performer's movies collected; insert the record
            performer_id = db_tools.insert_or_update_actor({
                'href': url,
                'name': person,
                'pic': pic,
                'alias': alias,
                'credits': all_movies
            })
            if performer_id:
                logging.debug(f'insert one person, id: {performer_id}, person: ({person}), url: {url}')
                last_performer_id = performer_id
                succ_rows += 1
            else:
                logging.warning(f'insert person: ({person}) {url} failed.')
            time.sleep(0.5)
        logging.info(f'total request: {len(performers_list)}, succ: {succ_rows}, last performer id: {last_performer_id}')
        # Return early when debugging
        if debug:
            return True

# Update movie details
def fetch_movies_detail():
    limit_count = 10 if debug else 100
    movies_list = []
    last_movie_id = 0
    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]
    def get_movies(**kwargs):
        if scan_mode == 1:
            kwargs["uncensored"] = 1
        elif scan_mode == 0:
            kwargs["uncensored"] = 0
        else:
            logging.debug("scan all records.")
        kwargs["order_by"] = 'id asc'
        return db_tools.query_movie_hrefs(limit=limit_count, **kwargs)
    while True:
        if update_mode == 0:    # only new records
            movies_list = get_movies(start_id=0, is_full_data=0)
        elif update_mode == 1:  # only complete records
            movies_list = get_movies(start_id=last_movie_id, is_full_data=1)
        elif update_mode == 2:  # 0 + 1
            movies_list = get_movies(start_id=last_movie_id, is_full_data_not_in=abnormal_codes)
        elif update_mode == 3:  # abnormal records only
            movies_list = get_movies(start_id=last_movie_id, is_full_data_in=abnormal_codes)
        else:                   # everything
            movies_list = get_movies(start_id=last_movie_id)
        if len(movies_list) < 1:
            logging.info('all movies fetched.')
            break
        succ_count = 0
        for movie in movies_list:
            url = movie['href']
            title = movie['title']
            curr_id = movie['id']
            logging.debug(f"Fetching data for movie ({title}), url {url} ...")
            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
            # Page was served from the local cache; skip the database write
            if skip_local and status_code == scraper.http_code_local:
                last_movie_id = curr_id
                succ_count += 1
                continue
            # Parse the page and write it to the database
            if soup:
                movie_data = scraper.parse_movie_detail(soup, url, title)
                if movie_data:
                    movie_id = db_tools.insert_or_update_movie(movie_data)
                    if movie_id:
                        logging.debug(f'insert one movie, id: {movie_id}, title: ({title}), url: {url}')
                        last_movie_id = movie_id
                        succ_count += 1
                    else:
                        logging.warning(f'insert movie {url} failed.')
                else:
                    logging.warning(f'parse_movie_detail error. url: {url}')
            elif status_code and status_code == scraper.http_code_404:
                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_404)
                logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skipping...')
            elif status_code and status_code == scraper.http_code_login:
                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_login)
                logging.warning(f'401 page (need login). id: {movie_id}, title: ({title}), url: {url}, Skipping...')
            else:
                logging.warning(f'fetch_page error. url: {url}')
            time.sleep(0.5)
        logging.info(f'total request: {len(movies_list)}, succ: {succ_count}. last movie id: {last_movie_id}')
        # Return early when debugging
        if debug:
            return True

# Map shortcut names to their functions
function_map = {
    "actor_list": fetch_actor_list,
    "maker_list": fetch_makers_list,
    "series_list": fetch_series_list,
    "makers": fetch_movies_by_maker,
    "series": fetch_movies_by_series,
    "pub": fetch_movies_by_publishers,
    "movies": fetch_movies_detail,
    "actors": fetch_performers_detail,
}
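# Example: `python3 fetch.py --cmd=maker_list,makers,movies` runs just those
# steps in that order; without --cmd, main() runs every function in this map.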

# Main entry point
def main(cmd, args):
    # Open a task log entry
    task_id = db_tools.insert_task_log()
    if task_id is None:
        logging.warning('insert task log error.')
        return None
    logging.info(f"running task. id: {task_id}, args: {args}")
    if cmd:
        # Run only the requested functions
        function_names = cmd.split(",")
        for short_name in function_names:
            func = function_map.get(short_name.strip())
            if callable(func):
                db_tools.update_task_log(task_id, task_status=f'Running {short_name}')
                func()
            else:
                logging.warning(f"{short_name} is not a valid function shortcut.")
    else:
        # Run everything
        for name, func in function_map.items():
            if callable(func):
                db_tools.update_task_log(task_id, task_status=f'Running {name}')
                func()
            else:
                logging.warning(f"{name} is not callable, skipping.")
    logging.info('all process completed!')
    db_tools.finalize_task_log(task_id)
# TODO:
# 1,

# Apply CLI arguments to the module-level flags
def set_env(args):
    global debug
    debug = args.debug
    if debug:
        logger = logging.getLogger()
        logger.setLevel(logging.DEBUG)
    global skip_local
    skip_local = args.skip_local
    global scan_mode
    scan_mode = args.scan_mode
    global update_mode
    update_mode = args.update

if __name__ == "__main__":
    # Command-line argument handling
    keys_str = ",".join(function_map.keys())
    parser = argparse.ArgumentParser(description='fetch javdb data.')
    parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
    parser.add_argument('--update', type=int, choices=[0, 1, 2, 3, 4], default=0, help='0: only is_full_data=0 (default), 1: only is_full_data=1, 2: is_full_data<=1, 3: only is_full_data>1 (abnormal records), 4: everything')
    parser.add_argument('--scan_mode', type=int, choices=[0, 1, 2], default=2, help='0: only non-uncensored makers/series/actors/movies, 1: only uncensored ones, 2: everything (default)')
    parser.add_argument('--skip_local', action='store_true', help='skip database writes for pages that were served from the local cache')
    parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
    args = parser.parse_args()
    set_env(args)
    main(args.cmd, args)

'''
python3 fetch.py                            # scan all newly added records
python3 fetch.py --scan_mode=0              # scan newly added censored (non-uncensored) records
python3 fetch.py --scan_mode=1              # scan newly added uncensored records
python3 fetch.py --update=4                 # scan every record
python3 fetch.py --update=4 --scan_mode=0   # scan every censored (non-uncensored) record
python3 fetch.py --update=4 --scan_mode=1   # scan every uncensored record
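python3 fetch.py --cmd=actor_list,actors    # run only the named steps from function_map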
'''