import argparse
import logging
from functools import partial

import config
import sqlite_utils as db_tools
import scraper
import utils

config.setup_logging()

debug = False
force = False
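
# Pipeline overview: "list" fills the books table from the paginated index,
# "toc" walks each book's detail and table-of-contents pages into section and
# empty chapter rows, and "content" backfills chapter text for rows where
# has_content = 0.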


# Fetch the book list from the paginated index
def fetch_book_list():
    url = scraper.list_url_update
    while True:
        logging.info(f'fetching book list. url: {url}')
        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="list_main", attr_type="class"))
        if soup:
            # Parse the books on this page and upsert each one
            list_data, next_url = scraper.parse_book_list(soup, url=url)
            for item in list_data:
                row_id = db_tools.insert_or_update_common(item, db_tools.tbl_name_books)
                if row_id:
                    logging.debug(f"insert one book. row_id: {row_id}, name: {item['name']}")
                else:
                    logging.warning(f"insert book error. name: {item['name']}, href: {item['href']}")
            if next_url is None:
                logging.info('fetched all pages.')
                return True
            else:
                url = next_url
        elif status_code == 404:
            # A 404 will not recover, so stop instead of retrying the same url forever
            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, skipping...')
            return False
        else:
            # Transient error: retry the same url
            logging.warning(f'fetch page error. {url} ...')


# Fetch the actual chapter text
def fetch_real_content(url):
    soup, status_code = scraper.fetch_page(url, scraper.content_validator)
    if soup:
        data = scraper.parse_content_page(soup, url)
        if data:
            return data  # a list of paragraphs
    elif status_code == 404:
        logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, skipping...')
    else:
        logging.warning(f'fetch page error. {url} ...')
    return None


# Fetch a chapter page: its title plus the text behind its content link
def fetch_chapter_content(url):
    chapter_data = {}
    next_url = None

    soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="h1", identifier="chapter_title", attr_type="class"))
    if soup:
        data, next_url = scraper.parse_chapter_page(soup, url)
        if data:
            chapter_data['title'] = data['title']
            contents = fetch_real_content(data['content_url'])
            if contents:
                chapter_data['contents'] = contents
            else:
                logging.warning(f"fetching real content failed. url: {data['content_url']}")
                return None, None
        else:
            logging.warning(f'fetch chapter page no data. url: {url}')
            return None, None
    else:
        logging.warning(f"fetch chapter page error. url: {url}, status_code: {status_code}")
        return None, None

    return chapter_data, next_url


# Fetch a book's detail page to obtain its TOC and first-chapter URLs
def fetch_book_detail(url):
    soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="li", identifier="zuopinxinxi", attr_type="class"))
    if soup:
        detail = scraper.parse_book_detail(soup, url)
        return detail
    else:
        return None


# Fetch a book's table-of-contents page
def fetch_book_toc(url):
    soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="page_main", attr_type="class"))
    if soup:
        listdata = scraper.pase_chapter_list(soup, url)
        return listdata
    else:
        return None


# Fetch each book's table of contents and insert it into the database
def fetch_table_of_contents():
    while True:
        # NOTE: currently restricted to a single book id
        update_list = db_tools.query_books(id=2547, is_latest=0, limit=2 if debug else 100)
        if update_list is None or len(update_list) < 1:
            logging.info('no more data to fetch.')
            return

        for row in update_list:
            name = row['name']
            href = row['href']
            bookid = row['id']
            # Open the detail page first
            logging.info(f'----------fetching book {name}: {href}-------------')
            book_detail = fetch_book_detail(href)
            if book_detail is None:
                logging.warning(f'get book detail failed. url: {href}')
                continue

            # Fetch the TOC page
            toc_url = book_detail['table_of_contents_href']
            if toc_url is None or toc_url == '':
                logging.warning(f'table_of_contents_href is not correct. url: {href}')
                continue

            logging.info(f'fetching page: {toc_url}')
            toc_data = fetch_book_toc(toc_url)

            # Parse the TOC page
            if toc_data is None:
                logging.warning(f'fetch_book_toc error. url: {toc_url}')
                continue

            # Insert every section and its chapter rows
            succ = 1
            for section in toc_data:
                section_title = section['title']
                chapters = section['chapters']
                section_id = db_tools.insert_or_update_book_sections({
                    'book_id': int(bookid),
                    'section': section_title,
                    'bookid_section': f'{bookid}_{section_title}'
                })
                if section_id is None:
                    logging.warning(f'insert section error. url: {toc_url}, section: {section_title}')
                    succ = 0
                    break
                else:
                    logging.debug(f'insert one books_sections record. id: {section_id}, key: {bookid}_{section_title}')

                # Insert the chapter rows; content is backfilled later by fetch_contents
                for chap in chapters:
                    chap_row_id = db_tools.insert_chapter_data({
                        'book_id': bookid,
                        'chapter_id': chap['chapter_id'],
                        'section_id': section_id,
                        'title': chap['title'],
                        'href': chap['href'],
                        'content': '',
                        'has_content': 0
                    })
                    if chap_row_id is None:
                        logging.warning(f'insert_chapter_data error. url: {toc_url}')
                        succ = 0
                        break
                if succ == 0:  # propagate a chapter-insert failure
                    break
            if succ == 0:
                logging.warning(f'fetch_book_toc data error. url: {toc_url}')
                continue

            # Done reading; update the book record
            row_id = db_tools.update_book_detail({
                'href': href,
                **book_detail
            })
            if row_id:
                logging.debug(f'update book succ. id: {row_id}, url: {href}')
            else:
                logging.warning(f'update book failed. url: {href}')
        if debug:
            return
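
# Note: fetch_chapter_content also returns the "next chapter" link. The active
# pipeline below ignores it (chapter urls come from the database); it is only
# consumed by the retired link-walking crawler kept in the comment block
# further down.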


# Fetch chapter contents for chapters that don't have any yet
def fetch_contents():
    while True:
        list_data = db_tools.query_no_content_chapters(limit=10 if debug else 100)
        if list_data is None or len(list_data) < 1:
            logging.info('no more data to fetch.')
            return

        for row in list_data:
            url = row['href']
            logging.info(f"fetching content ({row['title']}) from {url}")
            content, _next_url = fetch_chapter_content(url)
            if content and content['title'] and content['contents']:
                # Write to the chapters table
                db_tools.insert_chapter_data({
                    'book_id': row['book_id'],
                    'chapter_id': row['chapter_id'],
                    'section_id': row['section_id'],
                    'title': row['title'],
                    'href': url,
                    'content': '\n\n'.join(content['contents']),
                    'has_content': 1
                })
            else:
                logging.warning(f'fetch content error. url: {url}')
        if debug:
            return


'''
# Download a complete book by following each chapter's "next" link
def fetch_book_data():
    update_list = db_tools.query_books(need_update=1, limit=1)
    if update_list:
        for row in update_list:
            name = row['name']
            href = row['href']
            bookid = row['id']
            # Open the detail page first
            logging.info(f'----------fetching book {name}: {href}-------------')
            book_detail = fetch_book_detail(href)
            if book_detail:
                # Get the first content page, then loop through the chapters
                chapter_url = book_detail['start_page_href']
                chapter_id = utils.extract_page_num(chapter_url)
                # Resume from the last fetched chapter unless forced
                if not force:
                    last_chapter_url = db_tools.query_last_chapter_by_book(bookid)
                    if last_chapter_url:
                        chapter_url = last_chapter_url
                while chapter_url:
                    logging.info(f'fetching page: {chapter_url}')
                    content, next_url = fetch_chapter_content(chapter_url)
                    if content and content['title'] and content['contents']:
                        # Write to the chapters table
                        db_tools.insert_chapter_data({
                            'book_id': bookid,
                            'chapter_id': chapter_id,
                            'title': content['title'],
                            'href': chapter_url,
                            'content': '\n\n'.join(content['contents']),
                            'has_content': 1
                        })

                        if debug:
                            return
                    else:
                        logging.warning(f'fetch content error. url: {chapter_url}')
                    chapter_url = next_url
                # Done reading; update the book record
                row_id = db_tools.update_book_detail({
                    'href': href,
                    **book_detail
                })
                if row_id:
                    logging.debug(f'update book succ. id: {row_id}, url: {href}')
                else:
                    logging.warning(f'update book failed. url: {href}')
            else:
                logging.warning(f'get book detail failed. url: {href}')
    else:
        logging.warning('get no data needed update.')
'''


# Map command-line shortcuts to functions
function_map = {
    "list": fetch_book_list,
    "toc": fetch_table_of_contents,
    "content": fetch_contents,
}
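
# Example invocations (assuming this file is saved as fetch.py; the actual
# filename is not given in the source):
#   python fetch.py --cmd list          # fetch the paginated book list only
#   python fetch.py --cmd toc,content   # fetch TOCs, then chapter contents
#   python fetch.py --debug             # run all stages with small batch limits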


# Main entry point
def main(cmd, args_debug, args_force):
    global debug
    debug = args_debug

    global force
    force = args_force

    # Run the requested functions
    if cmd:
        function_names = cmd.split(",")  # split the comma-separated input
        for short_name in function_names:
            func = function_map.get(short_name.strip())  # look up the function
            if callable(func):
                func()
            else:
                logging.warning(f"{short_name} is not a valid function shortcut.")
    else:
        # No cmd given: run every stage in order
        for name, func in function_map.items():
            if callable(func):
                func()
            else:
                logging.warning(f"{name} is not a valid function shortcut.")

    logging.info('all process completed!')


# TODO:
# 1,


if __name__ == "__main__":
    # Command-line argument handling
    keys_str = ",".join(function_map.keys())

    parser = argparse.ArgumentParser(description='fetch aabook data.')
    parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
    parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
    parser.add_argument('--force', action='store_true', help='force update (rewrite everything)')
    args = parser.parse_args()

    main(args.cmd, args.debug, args.force)