modify scripts

oscarz
2025-03-18 17:45:20 +08:00
parent d5dc76b87f
commit a4ea79d4db
14 changed files with 1369 additions and 13 deletions

View File

@@ -10,6 +10,7 @@ from datetime import datetime
 from datetime import date
 import config # 日志配置
 from down_list import novel_map
+import utils
 # 日志

@@ -21,7 +22,8 @@ list_url_wordcount = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&ca
 list_url_update = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update'
 curr_novel_pages = 0
-meta_dir = 'meta'
+meta_dir = f'{config.global_host_data_dir}/aabook/meta'
+novel_dir = f'{config.global_host_data_dir}/aabook/data'
 list_file = f'{meta_dir}/list.txt'
 details_file = f'{meta_dir}/details.txt'

@@ -246,7 +248,7 @@ def extract_content_url(soup, base_url, chapid):
 # 判断内容是否被污染
 def check_content(content):
-if '2005-2024 疯情书库' in content:
+if '2005-2024 疯情书库' in content or '2005-2025 疯情书库' in content:
 return False
 return True

@@ -263,13 +265,15 @@ def get_novel_pages():
 return curr_novel_pages
 # 解析章节内容并保存到文件中
-def download_novel(chapid, novel_name, dir_prefix='./aabook'):
+def download_novel(chapid, novel_name, dir_prefix=novel_dir):
 chapter_url = f'{base_url}/read-{chapid}.html'
 novel_file = f'{dir_prefix}/{chapid}_{novel_name}.txt'
 if os.path.exists(novel_file):
 os.remove(novel_file) # 如果存在同名文件,删除重新下载
+# 保存到其他类型的文件
+chapters = []
 reset_novel_pages()
 while chapter_url:
 logging.info(f"Processing: [{novel_name}] [{chapid}] {chapter_url}")

@@ -314,6 +318,7 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
 f.write(chapter_title + '\n\n')
 # 写入每个段落内容到文件
+content = ''
 with open(novel_file, 'a', encoding='utf-8') as f:
 for paragraph in paragraphs:
 #cleaned_part = clean_watermarks(paragraph.get_text().strip())

@@ -321,7 +326,9 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
 #f.write(cleaned_part + '\n\n')
 cleaned_text = process_paragraph(paragraph)
 f.write(cleaned_text + '\n\n')
+content = content + '<p>' + cleaned_text + '</p>' # epub 里面用html标签来分段落
 logging.info(f"Writting content to file. [{novel_name}] [{chapid}] [{chapter_title}]")
+chapters.append((chapter_title, content))
 else:
 logging.info(f"Fetching content error: [{novel_name}] {content_url}, retry...")
 continue

@@ -356,6 +363,8 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
 break
 time.sleep(3)
+# 全部获取完生成epub文件
+utils.generate_epub(novel_name, 'nobody', chapters, dir_prefix)
 # 检查子目录是否存在,不存在则创建

@@ -400,7 +409,7 @@ def download_books(need_down_list_file = details_file, cursor_file = down_list_f
 continue # 已经下载过,跳过
 # 创建分类目录
-down_dir = './data/' + category
+down_dir = f'{novel_dir}/{category}'
 create_directory_if_not_exists(down_dir)
 # 调用下载函数下载书籍

@@ -420,7 +429,7 @@ def download_map():
 # 遍历 novel_map下载所有小说
 for novel_id, novel_name in novel_map.items():
 logging.info(f"Starting download for {novel_name} (ID: {novel_id})")
-download_novel(novel_id, novel_name, './local')
+download_novel(novel_id, novel_name, novel_dir)
 logging.info(f"Completed download for {novel_id}_{novel_name}.\n")
 # 获取更新列表,并下载

@@ -444,6 +453,10 @@ def main():
 print("cmd: get_list, get_detail, get_all, get_update, get_update_all, download, download_map")
 sys.exit(1)
+# 确保目录存在
+create_directory_if_not_exists(meta_dir)
+create_directory_if_not_exists(novel_dir)
 cmd = sys.argv[1]
 if cmd == "get_list":

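The epub step added above expects chapters to be a list of (chapter_title, html_content) tuples, and utils.generate_epub is the helper defined in aabook/utils.py later in this commit. A minimal sketch of the call, using made-up chapter data and an output directory that is assumed to already exist:

import utils  # aabook/utils.py, added in this commit

# hypothetical data in the shape built inside download_novel()
chapters = [
    ('第一章', '<p>第一段。</p><p>第二段。</p>'),
    ('第二章', '<p>只有一段。</p>'),
]
# writes <dir>/示例小说.epub via ebooklib; the target directory must already exist
utils.generate_epub('示例小说', 'nobody', chapters, '/tmp/aabook_demo')
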
View File

@@ -3,13 +3,9 @@ import os
 import inspect
 from datetime import datetime
-# MySQL 配置
-db_config = {
-'host': '172.18.0.3',
-'user': 'root',
-'password': 'mysqlpw',
-'database': 'stockdb'
-}
+home_dir = os.path.expanduser("~")
+global_host_data_dir = f'{home_dir}/hostdir/scripts_data'
+global_share_data_dir = f'{home_dir}/sharedata'
 # 设置日志配置
 def setup_logging(log_filename=None):

View File

@@ -10,7 +10,7 @@ novel_map_new = {
 }
 # 定义小说映射
 novel_map = {
-364489: '诸天之乡村爱情',
+371300: '临时夫妻',
 }

View File

@@ -0,0 +1,12 @@
import json
import time
import sqlite_utils as db_tools
if __name__ == "__main__":
# 命令行参数处理
result = db_tools.get_statics()
print(result)

aabook/src/config.py (new file, +80 lines)

@@ -0,0 +1,80 @@
import logging
import os
import inspect
import time
from datetime import datetime
from logging.handlers import RotatingFileHandler
from collections import defaultdict
home_dir = os.path.expanduser("~")
global_host_data_dir = f'{home_dir}/hostdir/scripts_data'
global_share_data_dir = f'{home_dir}/sharedata'
global_sqlite_path = f'{global_share_data_dir}/sqlite/books.db'
log_dir = '../log'
# 统计日志频率
log_count = defaultdict(int) # 记录日志的次数
last_log_time = defaultdict(float) # 记录上次写入的时间戳
class RateLimitFilter(logging.Filter):
"""
频率限制过滤器:
1. 在 60 秒内,同样的日志最多写入 60 次,超过则忽略
2. 如果日志速率超过 100 条/秒,发出告警
"""
LOG_LIMIT = 60 # 每分钟最多记录相同消息 60 次,超过则丢弃
def filter(self, record):
global log_count, last_log_time
message_key = record.getMessage() # 获取日志内容
# 计算当前时间
now = time.time()
elapsed = now - last_log_time[message_key]
# 限制相同日志的写入频率
if elapsed < 60: # 60 秒内
log_count[message_key] += 1
if log_count[message_key] > self.LOG_LIMIT:
print('reach limit.')
return False # 直接丢弃
else:
log_count[message_key] = 1 # 超过 60 秒,重新计数
last_log_time[message_key] = now
return True # 允许写入日志
def setup_logging(log_filename=None):
if log_filename is None:
caller_frame = inspect.stack()[1]
caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
current_date = datetime.now().strftime('%Y%m%d')
log_filename = f'{log_dir}/{caller_filename}_{current_date}.log'
max_log_size = 100 * 1024 * 1024 # 100 MB
max_log_files = 10 # 最多保留 10 个日志文件
file_handler = RotatingFileHandler(log_filename, maxBytes=max_log_size, backupCount=max_log_files)
file_handler.setFormatter(logging.Formatter(
'%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
))
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter(
'%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
))
# 创建 logger
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [] # 避免重复添加 handler
logger.addHandler(file_handler)
logger.addHandler(console_handler)
# 添加频率限制
rate_limit_filter = RateLimitFilter()
file_handler.addFilter(rate_limit_filter)
console_handler.addFilter(rate_limit_filter)

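A minimal usage sketch for this logging setup, assuming the caller runs from aabook/src and the ../log directory already exists (otherwise RotatingFileHandler cannot open its file):

import logging
import config

config.setup_logging()  # installs file + console handlers, both behind RateLimitFilter

for i in range(200):
    logging.info("duplicate message")  # repeats of the same text are dropped once the shared counter passes LOG_LIMIT
logging.info("a different message")    # distinct text keeps its own counter
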
aabook/src/convert_utils.py (new file, +126 lines)

@@ -0,0 +1,126 @@
from ebooklib import epub
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import Paragraph, Spacer
def generate_epub(data, save_path):
# 创建 EPUB 书籍对象
book = epub.EpubBook()
# 设置书籍元数据
book.set_title(data.get('title', '未知标题'))
book.set_language('zh')
book.add_author(data.get('author', '未知作者'))
# 存储所有章节对象
all_chapters = []
sections = data.get('sections', [])
if len(sections) == 1:
# 如果只有一个 section忽略 section 的 title按一级目录处理
for chapter in sections[0].get('chapters', []):
chapter_title = chapter.get('title', '未知章节')
chapter_content = chapter.get('content', '')
paragraphs = chapter_content.split('\n\n')
html_content = ''.join([f'<p>{para}</p>' for para in paragraphs])
chapter_obj = epub.EpubHtml(title=chapter_title, file_name=f'{chapter_title}.xhtml', lang='zh')
chapter_obj.content = f'<h1>{chapter_title}</h1>{html_content}'
book.add_item(chapter_obj)
all_chapters.append(chapter_obj)
else:
# 如果有多个 section按两级目录处理
for section in sections:
section_title = section.get('title', '未知卷')
section_chapter = epub.EpubHtml(title=section_title, file_name=f'{section_title}.xhtml', lang='zh')
section_chapter.content = f'<h1>{section_title}</h1>'
book.add_item(section_chapter)
all_chapters.append(section_chapter)
for chapter in section.get('chapters', []):
chapter_title = chapter.get('title', '未知章节')
chapter_content = chapter.get('content', '')
paragraphs = chapter_content.split('\n\n')
html_content = ''.join([f'<p>{para}</p>' for para in paragraphs])
chapter_obj = epub.EpubHtml(title=chapter_title, file_name=f'{chapter_title}.xhtml', lang='zh')
chapter_obj.content = f'<h2>{chapter_title}</h2>{html_content}'
book.add_item(chapter_obj)
all_chapters.append(chapter_obj)
# 定义书籍的目录
book.toc = tuple(all_chapters)
# 定义书的结构
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())
# 定义样式
style = 'body { font-family: Times, serif; }'
nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
book.add_item(nav_css)
# 定义书的结构
book.spine = ['nav'] + all_chapters
# 保存 EPUB 文件
epub.write_epub(save_path, book, {})
def generate_pdf(data, save_path):
# 创建 PDF 画布
c = canvas.Canvas(save_path, pagesize=letter)
styles = getSampleStyleSheet()
story = []
# 设置标题
title = data.get('title', '未知标题')
story.append(Paragraph(f'<font size=20>{title}</font>', styles['Title']))
story.append(Spacer(1, 20))
# 设置作者
author = data.get('author', '未知作者')
story.append(Paragraph(f'<font size=14>作者: {author}</font>', styles['Normal']))
story.append(Spacer(1, 40))
sections = data.get('sections', [])
if len(sections) == 1:
# 如果只有一个 section忽略 section 的 title按一级目录处理
for chapter in sections[0].get('chapters', []):
chapter_title = chapter.get('title', '未知章节')
chapter_content = chapter.get('content', '')
story.append(Paragraph(f'<font size=18>{chapter_title}</font>', styles['Heading1']))
story.append(Spacer(1, 10))
paragraphs = chapter_content.split('\n\n')
for para in paragraphs:
story.append(Paragraph(para, styles['Normal']))
story.append(Spacer(1, 10))
story.append(Spacer(1, 20))
else:
# 如果有多个 section按两级目录处理
for section in sections:
section_title = section.get('title', '未知卷')
story.append(Paragraph(f'<font size=20>{section_title}</font>', styles['Heading1']))
story.append(Spacer(1, 15))
for chapter in section.get('chapters', []):
chapter_title = chapter.get('title', '未知章节')
chapter_content = chapter.get('content', '')
story.append(Paragraph(f'<font size=16>{chapter_title}</font>', styles['Heading2']))
story.append(Spacer(1, 10))
paragraphs = chapter_content.split('\n\n')
for para in paragraphs:
story.append(Paragraph(para, styles['Normal']))
story.append(Spacer(1, 10))
story.append(Spacer(1, 15))
# 构建 PDF
for element in story:
element.wrapOn(c, letter[0] - 100, letter[1] - 100)
element.drawOn(c, 50, letter[1] - element.wrapOn(c, letter[0] - 100, letter[1] - 100)[1] - 50)
c.showPage()
# 保存 PDF 文件
c.save()

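Neither function documents its input; judging from the .get() calls, data is a nested dict of sections and chapters whose content uses blank lines as paragraph breaks. A sketch with made-up sample data (output paths are placeholders):

import convert_utils

sample = {
    'title': '示例小说',
    'author': '佚名',
    'sections': [
        {
            'title': '第一卷',
            'chapters': [
                {'title': '第一章', 'content': '第一段。\n\n第二段。'},
                {'title': '第二章', 'content': '只有一段。'},
            ],
        },
    ],
}
# with a single section, the section title is ignored and chapters form a flat TOC
convert_utils.generate_epub(sample, '/tmp/sample.epub')
convert_utils.generate_pdf(sample, '/tmp/sample.pdf')
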
aabook/src/fetch.py (new file, +312 lines)

@@ -0,0 +1,312 @@
import json
import time
import csv
import argparse
import logging
from functools import partial
import config
import sqlite_utils as db_tools
import scraper
import utils
config.setup_logging()
debug = False
force = False
# 获取列表
def fetch_book_list():
url = scraper.list_url_update
while True:
logging.info(f'fetching book list. url: {url}')
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="list_main", attr_type="class"))
if soup:
# 获取书籍列表
list_data, next_url = scraper.parse_book_list(soup, url=url)
for item in list_data:
row_id = db_tools.insert_or_update_common(item, db_tools.tbl_name_books)
if row_id:
logging.debug(f'insert one book. row_id: {row_id}, name: {item['name']}')
else:
logging.warning(f'insert book error. name: {item['name']}, href: {item['href']}')
if next_url is None:
logging.info(f'get all pages.')
return True
else:
url = next_url
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
else:
logging.warning(f'fetch page error. {url} ...')
# 获取详情
def fetch_real_content(url):
soup, status_code = scraper.fetch_page(url, scraper.content_validator)
if soup:
data = scraper.parse_content_page(soup, url)
if data:
return data # 段落的数组
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
else:
logging.warning(f'fetch page error. {url} ...')
return None
# 获取内容页
def fetch_chapter_content(url):
chapter_data = {}
next_url = None
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="h1", identifier="chapter_title", attr_type="class"))
if soup:
data, next_url = scraper.parse_chapter_page(soup, url)
if data:
chapter_data['title'] = data['title']
contents = fetch_real_content(data['content_url'])
if contents:
chapter_data['contents'] = contents
else:
logging.warning(f'fetching real content failed. url: {data['content_url']}')
return None, None
else:
logging.warning(f'fetch chapter page no data. url: {url}')
return None, None
else:
logging.warning(f'fetch chapter page error. url: {url}, status_code: {status_code}')
return None, None
return chapter_data, next_url
# 获取小说详情页,获得首页地址
def fetch_book_detail(url):
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="li", identifier="zuopinxinxi", attr_type="class"))
if soup:
detail = scraper.parse_book_detail(soup, url)
return detail
else:
return None
# 获取某本小说的目录页
def fetch_book_toc(url):
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="page_main", attr_type="class"))
if soup:
listdata = scraper.pase_chapter_list(soup, url)
return listdata
else:
return None
# 获取小说的目录页,并插入到数据库
def fetch_table_of_contents():
while True:
update_list = db_tools.query_books(id=2547, is_latest=0, limit = 2 if debug else 100)
if update_list is None or len(update_list) <1 :
logging.info('no more data needs fetching.')
return
for row in update_list:
name = row['name']
href = row['href']
bookid = row['id']
# 先打开详情页
logging.info(f'----------fetching book {name}: {href}-------------')
book_detail = fetch_book_detail(href)
if book_detail is None:
logging.warning(f'get book detail failed. url: {href}')
continue
# 获取目录页
toc_url = book_detail['table_of_contents_href']
if toc_url is None or toc_url == '':
logging.warning(f'table_of_contents_href is not correct. url: {href}')
continue
logging.info(f'fetching page: {toc_url}')
toc_data = fetch_book_toc(toc_url)
# 解析目录页
if toc_data is None:
logging.warning(f'fetch_book_toc error. url: {toc_url}')
continue
# 插入所有的目录数据
succ = 1
for row in toc_data:
section_title = row['title']
chapters = row['chapters']
section_id = db_tools.insert_or_update_book_sections({
'book_id' : int(bookid),
'section' : section_title,
'bookid_section': f'{bookid}_{section_title}'
})
if section_id is None:
logging.warning(f'insert section error. url: {toc_url}, section: {section_title}')
succ = 0
break
else:
logging.debug(f'insert one books_sections record. id:{section_id}, key: {bookid}_{section_title}')
# 插入目录数据
for chap in chapters:
chap_row_id = db_tools.insert_chapter_data({
'book_id': bookid,
'chapter_id': chap['chapter_id'],
'section_id': section_id,
'title': chap['title'],
'href': chap['href'],
'content': '',
'has_content' : 0
})
if chap_row_id is None:
logging.warning(f'insert_chapter_data error. url: {toc_url}')
succ = 0
break
if succ == 0 :
logging.warning(f'fetch_book_toc data error. url: {toc_url}')
continue
# 读取完毕,更新列表
row_id = db_tools.update_book_detail({
'href' : href,
**book_detail
})
if row_id:
logging.debug(f'update book succ. id: {row_id}, url: {href}')
else:
logging.warning(f'update book failed. url: {href}')
if debug:
return
# 直接获取小说内容
def fetch_contents():
while True:
list_data = db_tools.query_no_content_chapters(limit = 10 if debug else 100)
if list_data is None or len(list_data) <1 :
logging.info('no more data needs fetching.')
return
for row in list_data:
url = row['href']
logging.info(f'fetching content ({row['title']}) from {url}')
content, next_url = fetch_chapter_content(url)
if content and content['title'] and content['contents']:
# 写入到数据表里
db_tools.insert_chapter_data({
'book_id': row['book_id'],
'chapter_id': row['chapter_id'],
'section_id': row['section_id'],
'title': row['title'],
'href': url,
'content': '\n\n'.join(content['contents']),
'has_content': 1
})
else:
logging.warning(f'fetch content error. url: {url}')
if debug:
return
'''
# 下载完整的小说
def fetch_book_data():
update_list = db_tools.query_books(need_update=1, limit = 1)
if update_list:
for row in update_list:
name = row['name']
href = row['href']
bookid = row['id']
# 先打开详情页
logging.info(f'----------fetching book {name}: {href}-------------')
book_detail = fetch_book_detail(href)
if book_detail:
# 获取内容页,然后循环读取内容
chapter_url = book_detail['start_page_href']
chapter_id = utils.extract_page_num(chapter_url)
# 断点续传,从上次拉取的最后一页开始
if not force:
last_chapter_url = db_tools.query_last_chapter_by_book(bookid)
if last_chapter_url:
chapter_url = last_chapter_url
while chapter_url:
logging.info(f'fetching page: {chapter_url}')
content, next_url = fetch_chapter_content(chapter_url)
if content and content['title'] and content['contents']:
# 写入到数据表里
db_tools.insert_chapter_data({
'book_id': bookid,
'chapter_id': chapter_id,
'title': content['title'],
'href': chapter_url,
'content': '\n\n'.join(content['contents']),
'has_content': 1
})
if debug:
return
else:
logging.warning(f'fetch content error. url: {chapter_url}')
chapter_url = next_url
# 读取完毕,更新列表
row_id = db_tools.update_book_detail({
'href' : href,
**book_detail
})
if row_id:
logging.debug(f'update book succ. id: {row_id}, url: {href}')
else:
logging.warning(f'update book failed. url: {href}')
else:
logging.warning(f'get book detail failed. url: {href}')
else:
logging.warning(f'get no data needed update.')
'''
# 建立缩写到函数的映射
function_map = {
"list": fetch_book_list,
"toc" : fetch_table_of_contents,
"content": fetch_contents,
}
# 主函数
def main(cmd, args_debug, args_force):
global debug
debug = args_debug
global force
force = args_force
# 执行指定的函数
if cmd:
function_names = cmd.split(",") # 拆分输入
for short_name in function_names:
func = function_map.get(short_name.strip()) # 从映射中获取对应的函数
if callable(func):
func()
else:
logging.warning(f" {short_name} is not a valid function shortcut.")
else: # 全量执行
for name, func in function_map.items():
if callable(func):
func()
else:
logging.warning(f" {short_name} is not a valid function shortcut.")
logging.info(f'all process completed!')
# TODO:
# 1,
if __name__ == "__main__":
# 命令行参数处理
keys_str = ",".join(function_map.keys())
parser = argparse.ArgumentParser(description='fetch aabook data.')
parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
args = parser.parse_args()
main(args.cmd, args.debug, args.force)

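The entry point splits --cmd on commas and dispatches through function_map; without --cmd every step runs in order. A hedged sketch of how it is presumably invoked (requires network access and the SQLite database path from config to exist):

# command line (assumed typical usage):
#   python fetch.py --cmd list            # refresh the book index only
#   python fetch.py --cmd toc,content     # fetch TOCs, then chapter contents
#   python fetch.py --debug               # run all steps with small query limits
# programmatic equivalent:
import fetch
fetch.main('list,toc', args_debug=True, args_force=False)
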
aabook/src/scraper.py (new file, +364 lines)

@@ -0,0 +1,364 @@
import time
import json
import csv
import logging
import signal
import sys
import os
import re
import requests
import random
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
import config
import utils
# 定义基础 URL 和可变参数
host_url = 'https://aabook.xyz'
list_url_update = f'{host_url}/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update'
#list_url_wordcount = 'https://aabook.xyz/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
# User-Agent 列表
user_agents = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0",
"Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36"
]
#使用 CloudScraper 进行网络请求,并执行页面验证,支持不同解析器和预处理
def fetch_page(url, validator, parser="html.parser", preprocessor=None, max_retries=3, sleep_time=5, default_timeout=10):
for attempt in range(max_retries):
try:
if 'aabook.xyz' not in url.lower():
logging.error(f'wrong url format: {url}')
return None, None
# 随机选择一个 User-Agent
headers = {
'User-Agent': random.choice(user_agents)
}
response = requests.get(url, headers=headers, timeout=default_timeout, stream=True)
# 处理 HTTP 状态码
if response.status_code == 404:
logging.warning(f"Page not found (404): {url}")
return None, 404 # 直接返回 404调用方可以跳过
response.raise_for_status() # 处理 HTTP 错误
# 预处理 HTML如果提供了 preprocessor
html_text = preprocessor(response.text) if preprocessor else response.text
soup = BeautifulSoup(html_text, parser)
if validator(soup): # 进行自定义页面检查
return soup, response.status_code
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
except requests.RequestException as e:
logging.info(f"Warn fetching page {url}: {e}. Retrying ...")
time.sleep(sleep_time) # 休眠指定的时间,然后重试
logging.error(f'Fetching failed after max retries. {url}')
return None, None # 达到最大重试次数仍然失败
# 解析列表页
def parse_book_list(soup, url):
# 查找书籍列表
list_main = soup.find('div', class_='list_main')
if not list_main:
logging.warning(f"No list_main Found in {url}")
return None, None
tbody = list_main.find('tbody')
if not tbody:
logging.warning(f"No tbody found in {url}")
return None, None
list_data = []
next_url = None
# 获取每本书的基础信息:排名、分类、书名、作者、月票、更新时间(按字数排序时是总字数,按日期排序时是最后更新日期)
for tr in tbody.find_all('tr'):
tds = tr.find_all('td')
if len(tds) < 6:
logging.info("Invalid tr format.")
ranking = tds[0].text.strip()
category = utils.remove_brackets_regex(tds[1].text.strip())
book_link_tag = tds[2].find('a')
book_name = book_link_tag.text.strip()
book_link = host_url + '/' + book_link_tag['href'] if book_link_tag.get('href') else ''
book_num = utils.extract_book_num(book_link_tag['href'])
author = tds[3].text.strip()
monthly_tickets = tds[4].text.strip()
update_time = tds[5].text.strip() #实际上是字数(按字数排序时是总字数,按日期排序时是最后更新日期)
list_data.append({
'rank': ranking,
'category': category,
'name': book_name,
'href': book_link,
'num': book_num,
'author': author,
'tickets': monthly_tickets,
'update_time': update_time
})
# 查找下一页链接
next_page_tag = soup.find('a', title='下一页')
if next_page_tag:
next_url = host_url + next_page_tag['href']
return list_data, next_url
# 解析详情页
def parse_book_detail(soup, url):
# 解析书籍详细信息
book_info_tag = soup.find('li', class_='zuopinxinxi')
if not book_info_tag:
logging.warning(f"No details found in {url}")
return None
table_of_contents_href = ''
table_of_contents_href_tag = soup.find('li', class_='xiaoshuomulu')
if table_of_contents_href_tag:
table_of_contents_href = host_url + table_of_contents_href_tag.find('a')['href']
book_info_lis = book_info_tag.find_all('li')
if len(book_info_lis) < 4:
logging.info(f"invalid book info in {url}")
return None
book_category = book_info_lis[0].find('span').text.strip()
book_status = book_info_lis[1].find('span').text.strip()
# 去掉后面的汉字,只要数字
total_word_count = book_info_lis[2].find('span').text.strip()
total_word_count = int(re.search(r'\d+', total_word_count).group())
total_clicks = book_info_lis[3].find('span').text.strip()
month_clicks = book_info_lis[4].find('span').text.strip() if len(book_info_lis) >4 else '0'
week_clicks = book_info_lis[5].find('span').text.strip() if len(book_info_lis) >5 else '0'
total_recommend = book_info_lis[6].find('span').text.strip() if len(book_info_lis) >6 else '0'
month_recommend = book_info_lis[7].find('span').text.strip() if len(book_info_lis) >7 else '0'
week_recommend = book_info_lis[8].find('span').text.strip() if len(book_info_lis) >8 else '0'
# 读取创建时间
creation_time_tag = soup.find('li', class_='update_time')
created_time = utils.extract_create_time(creation_time_tag.text.strip() if creation_time_tag else '')
# 获取起始页链接和编号
start_page_tag = soup.find('ul', class_='gezhonganniu').find_all('li')[0].find('a')
start_page_link = host_url + '/' + start_page_tag['href']
start_page_number = start_page_link.split('-')[-1].replace('.html', '')
return {
'category': book_category,
'status' : book_status,
'total_words' : total_word_count,
'total_clicks': total_clicks,
'month_clicks': month_clicks,
'week_clicks': week_clicks,
'total_recommend': total_recommend,
'month_recommend': month_recommend,
'week_recommend': week_recommend,
'created_time': created_time,
'start_page_href': start_page_link,
'start_page_num': start_page_number,
'table_of_contents_href': table_of_contents_href
}
# 解析书籍的目录页
def pase_chapter_list(soup, url):
# 获取小说的目录
table_of_contents = []
div_table_of_contents = soup.find('div', class_='page_main')
if not div_table_of_contents:
return None
section_titles = div_table_of_contents.find_all('p', class_='section_title')
sections = div_table_of_contents.find_all('ul', class_='section_list')
if len(sections) > len(section_titles): # 一般是 后者比前者多1个最后一个是广告
logging.warning(f'sections not matched titles, url: {url}, titles: {len(section_titles)}, sections: {len(sections)}')
return None
else:
for i in range(len(sections)):
section_title = section_titles[i].get_text().strip()
chap_list = sections[i].find_all("a")
chap_data = []
for chap in chap_list:
chap_title = chap.get_text().strip()
chap_link = f'{host_url}/{chap['href']}'
chap_id = utils.extract_page_num(chap_link)
chap_data.append({'href': chap_link, 'title': chap_title, 'chapter_id': chap_id})
table_of_contents.append({'title': section_title, 'chapters': chap_data})
return table_of_contents
# 解析书籍的章节页
def parse_chapter_page(soup, url):
# 获取章节标题
chapter_title_tag = soup.find('h1', class_='chapter_title')
if chapter_title_tag is None:
logging.warning(f'Chapter title not found in {url}')
return None, None
title = chapter_title_tag.get_text().strip()
content_url = None
next_url = None
chapid = utils.extract_page_num(url)
# 遍历每一个 <script> 标签,查找内容页的链接
script_tags = soup.find_all('script')
for script_tag in script_tags:
script_content = script_tag.string
if script_content and re.search(r'\.get\("./_getcontent\.php', script_content):
# 匹配到特定内容,提取出 _getcontent.php 的 URL 模板
match = re.search(r'\.get\("\./_getcontent\.php\?id="\+\s*chapid\s*\+\s*"&v=([^"]+)"', script_content)
if match:
# 从匹配中提取 v 参数值
v_value = match.group(1)
# 构建完整的 content_url
content_url = f"{host_url}/_getcontent.php?id={chapid}&v={v_value}"
break
if content_url is None:
logging.warning(f'Content url not found in {url}')
return None, None
# 获取小说的目录
table_of_contents = []
div_table_of_contents = soup.find('div', class_='mulu_con')
if div_table_of_contents or False: # 考虑要不要加上这个
section_titles = div_table_of_contents.find_all('p')
sections = div_table_of_contents.find_all('ul')
if len(sections) != len(section_titles):
logging.warning(f'sections not matched titles')
else:
for i in range(len(sections)):
section_title = section_titles[i].get_text().strip()
chap_list = sections[i].find_all("a")
chap_data = []
for chap in chap_list:
chap_title = chap.get_text().strip()
chap_link = chap['href']
chap_data.append({'href': chap_link, 'title': chap_title})
table_of_contents.append({'title': section_title, 'chapters': chap_data})
# 查找下一章的链接
next_div = soup.find('div', class_='next_arrow')
if next_div:
next_page_tag = next_div.find('a', href=True, title=re.compile(r'下一章'))
if next_page_tag:
next_url = f'{host_url}/{next_page_tag['href']}' if next_page_tag['href'] else ''
data = {'title': title, 'content_url': content_url, 'table_of_contents': table_of_contents}
return data, next_url
def process_paragraph(paragraph):
# 获取完整的 HTML 结构,而不是 get_text()
paragraph_html = str(paragraph)
# 移除水印标签
cleaned_html = re.sub(r'<[^>]+class="[^"]+">.*?</[^>]+>', '', paragraph_html, flags=re.DOTALL)
# 使用 BeautifulSoup 解析移除水印标签后的 HTML 并提取文本
soup = BeautifulSoup(cleaned_html, 'html.parser')
cleaned_text = soup.get_text().strip()
return cleaned_text
# 解析内容页
def parse_content_page(soup, url):
content = []
paragraphs = soup.find_all('p')
if paragraphs:
for paragraph in paragraphs:
cleaned_text = process_paragraph(paragraph)
content.append(cleaned_text)
return content
# 通用的 HTML 结构验证器
def generic_validator(soup, tag, identifier, attr_type="id"):
if attr_type == "id":
return soup.find(tag, id=identifier) is not None
elif attr_type == "class":
return bool(soup.find_all(tag, class_=identifier))
elif attr_type == "name":
return bool(soup.find('select', {'name': identifier}))
return False
# 对内容是否被污染的判断
def content_validator(soup):
text = str(soup)
dirty_words = ['2005-2024 疯情书库', '2005-2025 疯情书库', '2025 疯情书库', '2026 疯情书库', '2027 疯情书库']
for word in dirty_words:
if word in text:
return False
return True
def test_content_page(url):
soup, status_code = fetch_page(url, content_validator)
if soup:
data = parse_content_page(soup, url)
if data:
return data
else :
return []
def test_chapter_page(url):
soup, status_code = fetch_page(url, partial(generic_validator, tag="h1", identifier="chapter_title", attr_type="class"))
if soup:
data, next_url = parse_chapter_page(soup, url)
if data:
return data
else :
return None
def test_book_detail(url):
soup, status_code = fetch_page(url, partial(generic_validator, tag="li", identifier="zuopinxinxi", attr_type="class"))
if soup:
detail = parse_book_detail(soup, url)
return detail
def test_book_list():
for num in range(5):
url = list_url_update.format(num)
soup, status_code = fetch_page(url, partial(generic_validator, tag="div", identifier="list_main", attr_type="class"))
if soup:
# 获取书籍列表
list_data, next_url = parse_book_list(soup, url=url)
for item in list_data:
# 获取详情页
detail = test_book_detail(item['href'])
if detail:
print({
**item,
**detail
})
# 获取内容页
page_data = test_chapter_page(detail['start_page_href'])
if page_data:
print(page_data)
# 获取内容
contents = test_content_page(page_data['content_url'])
if contents and len(contents)>0:
print (contents[0])
else:
print('get detail error.')
return
if __name__ == "__main__":
test_book_list()

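The fetch_page / validator pattern used throughout: a validator is bound with functools.partial and decides whether the fetched page is usable. A minimal sketch mirroring test_book_detail, hitting the live site (the book id is only an example):

from functools import partial
import scraper

url = 'https://aabook.xyz/book-5549.html'  # example id from the comments above
soup, status = scraper.fetch_page(
    url,
    partial(scraper.generic_validator, tag='li', identifier='zuopinxinxi', attr_type='class'),
)
if soup:
    print(scraper.parse_book_detail(soup, url))
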
aabook/src/sqlite_utils.py (new file, +278 lines)

@@ -0,0 +1,278 @@
import sqlite3
import json
import config
import utils
import logging
import sys
from datetime import datetime
# 连接 SQLite 数据库
DB_PATH = config.global_sqlite_path # 替换为你的数据库文件
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
tbl_name_books = 'books'
tbl_name_chapters_prefix = 'chapters'
tbl_name_section = 'books_sections'
# 获取表的列名和默认值
def get_table_columns_and_defaults(tbl_name):
try:
cursor.execute(f"PRAGMA table_info({tbl_name})")
columns = cursor.fetchall()
column_info = {}
for col in columns:
col_name = col[1]
default_value = col[4]
column_info[col_name] = default_value
return column_info
except sqlite3.Error as e:
logging.error(f"Error getting table columns: {e}")
return None
# 检查并处理数据
def check_and_process_data(data, tbl_name):
column_info = get_table_columns_and_defaults(tbl_name=tbl_name)
if column_info is None:
return None
processed_data = {}
for col, default in column_info.items():
if col == 'id': # 自增主键,不需要用户提供
continue
if col == 'created_at' or col == 'updated_at': # 日期函数,用户自己指定即可
continue
elif col in data:
processed_data[col] = data[col]
else:
if default is not None:
processed_data[col] = default
else:
processed_data[col] = None
return processed_data
# 插入或更新数据
def insert_or_update_common(data, tbl_name, uniq_key='href'):
try:
processed_data = check_and_process_data(data, tbl_name)
if processed_data is None:
return None
columns = ', '.join(processed_data.keys())
values = list(processed_data.values())
placeholders = ', '.join(['?' for _ in values])
update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key]) + ', updated_at=datetime(\'now\', \'localtime\')'
sql = f'''
INSERT INTO {tbl_name} ({columns}, updated_at)
VALUES ({placeholders}, datetime('now', 'localtime'))
ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}
'''
cursor.execute(sql, values)
conn.commit()
# 获取插入或更新后的 report_id
cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
report_id = cursor.fetchone()[0]
return report_id
except sqlite3.Error as e:
logging.error(f"Error inserting or updating data: {e}")
return None
# 插入books表并判断是否需要更新
def insert_books_index(data):
try:
# 查询是否存在以及是否需要更新
cursor.execute(f"SELECT id FROM books WHERE href = ? and update_time >= ?", (data['href'], data['update_time'], ))
existing_book = cursor.fetchone()
if existing_book: # 已存在且无需更新,直接返回 id
return existing_book[0]
# 不存在,或者需要更新
data['is_latest'] = 0
return insert_or_update_common(data, tbl_name_books)
except sqlite3.Error as e:
logging.error(f"Error inserting or updating data: {e}")
return None
# 更新详细信息
def update_book_detail(data):
try:
data['is_latest'] = 1
# 排除不更新的字段只更新data中含有的字段
fields_to_update = [field for field in data if field not in ['id', 'href', 'created_at']]
# 构建更新语句
set_clause = ', '.join([f"{field} = ?" for field in fields_to_update])
sql = f"UPDATE {tbl_name_books} SET {set_clause}, updated_at = datetime('now', 'localtime') WHERE href = ?"
# 准备参数
values = [data[field] for field in fields_to_update]
values.append(data['href'])
cursor.execute(sql, values)
conn.commit()
# 获取插入或更新后的 report_id
cursor.execute(f"SELECT id FROM {tbl_name_books} WHERE href = ?", (data['href'],))
report_id = cursor.fetchone()[0]
return report_id
except sqlite3.Error as e:
logging.error(f"Error inserting or updating data: {e}")
return None
# 按条件查询 href 列表
def query_books(**filters):
try:
sql = f"SELECT href, name, id FROM {tbl_name_books} WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "href" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "name" in filters:
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
if "is_latest" in filters:
sql += " AND is_latest = ?"
params.append(filters["is_latest"])
if 'limit' in filters:
sql += " limit ?"
params.append(filters["limit"])
cursor.execute(sql, params)
return [{'href': row[0], 'name': row[1], 'id': row[2]} for row in cursor.fetchall()]
except sqlite3.Error as e:
logging.error(f"查询 href 失败: {e}")
return None
# 检查表是否存在,不存在就创建
def check_and_create_chapters_table(book_number):
table_name = f"{tbl_name_chapters_prefix}_{book_number}"
try:
create_table_query = f'''
CREATE TABLE if not exists {table_name} (
id INTEGER PRIMARY KEY AUTOINCREMENT,
book_id INTEGER,
chapter_id INTEGER,
section_id INTEGER,
title TEXT,
href TEXT UNIQUE,
content TEXT,
has_content INTEGER default 0,
created_at TEXT DEFAULT (datetime('now', 'localtime')),
updated_at TEXT DEFAULT (datetime('now', 'localtime')),
FOREIGN KEY(book_id) REFERENCES books(id) ON DELETE CASCADE
);
'''
cursor.execute(create_table_query)
conn.commit()
return table_name
except sqlite3.Error as e:
logging.error(f"create table failed: {e}")
return None
# 插入到数据表中
def insert_chapter_data(data):
tbl_num = int(data['book_id']) % 100
tbl_name = check_and_create_chapters_table(tbl_num)
if tbl_name :
return insert_or_update_common(data, tbl_name)
else:
return None
# 查询某本书最后的获取页码
def query_last_chapter_by_book(bookid):
tbl_num = int(bookid) % 100
tbl_name = check_and_create_chapters_table(tbl_num)
if tbl_name is None:
return None
try:
sql = f"SELECT href FROM {tbl_name} WHERE book_id={bookid} order by id desc limit 1"
cursor.execute(sql)
row = cursor.fetchone()
if row: # 已有抓取记录,返回最后一章的链接
return row[0]
except sqlite3.Error as e:
logging.error(f"查询 href 失败: {e}")
return None
# 获取没有内容的章节链接
def query_no_content_chapters(limit = 100):
# 用于存储所有结果的列表
all_results = []
# 循环遍历 0 到 100 的数字
for i in range(100):
table_name = f'{tbl_name_chapters_prefix}_{i}'
try:
# 计算还需要多少条数据
remaining_count = limit - len(all_results)
if remaining_count <= 0:
break
# 执行 SQL 查询,从每个表中获取 has_content = 0 的数据,数量不超过剩余所需数量
query = f"SELECT href, title, book_id, chapter_id, section_id FROM {table_name} WHERE has_content = 0 LIMIT {remaining_count}"
cursor.execute(query)
results = [{'href': row[0], 'title': row[1], 'book_id': row[2], 'chapter_id': row[3], 'section_id': row[4]} for row in cursor.fetchall()]
all_results.extend(results)
except sqlite3.Error as e:
print(f"Error querying table {table_name}: {e}")
return all_results
# 插入书本的卷信息
def insert_or_update_book_sections(data):
return insert_or_update_common(data, tbl_name_section, uniq_key='bookid_section')
# 统计信息
def get_statics():
result = {}
try:
# 获取 books 表的统计行数
cursor.execute(f"SELECT COUNT(*) FROM {tbl_name_books} ")
result['all_books'] = cursor.fetchone()[0]
cursor.execute(f"SELECT COUNT(*) FROM {tbl_name_books} where is_latest=1")
result['all_books_latest'] = cursor.fetchone()[0]
except sqlite3.Error as e:
logging.error(f"query error: {e}")
all_chapters = 0
all_chapters_has_contents = 0
# 循环遍历 0 到 100 的数字
for i in range(100):
table_name = f'{tbl_name_chapters_prefix}_{i}'
try:
cursor.execute(f"SELECT COUNT(*) FROM {table_name} ")
all_chapters += cursor.fetchone()[0]
cursor.execute(f"SELECT COUNT(*) FROM {table_name} where has_content=1")
all_chapters_has_contents += cursor.fetchone()[0]
except sqlite3.Error as e:
logging.debug(f"Error querying table {table_name}: {e}")
result['all_chapters'] = all_chapters
result['all_chapters_has_contents'] = all_chapters_has_contents
return result

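Chapter rows are sharded across chapters_0 … chapters_99 by book_id % 100, and each shard table is created on first use. A minimal sketch with made-up values:

import sqlite_utils as db_tools

row_id = db_tools.insert_chapter_data({
    'book_id': 2547,                 # lands in chapters_47 (2547 % 100)
    'chapter_id': 374864,
    'section_id': 1,
    'title': '第一章',
    'href': 'https://aabook.xyz/read-374864.html',  # unique key used for upserts
    'content': '',
    'has_content': 0,
})
pending = db_tools.query_no_content_chapters(limit=10)  # scans all 100 shards
print(row_id, len(pending))
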
aabook/src/utils.py (new file, +53 lines)

@@ -0,0 +1,53 @@
import requests
import re
import os
import json
import time
import csv
import logging
from datetime import datetime
import config
# 从"创建时间 2025-03-08 13:57:00" 中提取时间
def extract_create_time(input_str):
pattern = r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'
match = re.search(pattern, input_str)
if match:
datetime_str = match.group(0)
return datetime_str
else:
return input_str
# 从 "read-374864.html" 中获取数字编号
def extract_page_num(page_str, default_num = 0):
# 定义正则表达式模式
pattern = r'read-(\d+)\.html'
# 使用 re.search 查找匹配项
match = re.search(pattern, page_str)
if match:
number = match.group(1)
return number
else:
return default_num
# 从 "book-5549.html" 中获取数字编号
def extract_book_num(page_str, default_num = 0):
# 定义正则表达式模式
pattern = r'book-(\d+)\.html'
# 使用 re.search 查找匹配项
match = re.search(pattern, page_str)
if match:
number = match.group(1)
return number
else:
return default_num
# 处理 [都市] 的方括号
def remove_brackets_regex(input_str):
pattern = r'\[(.*?)\]'
match = re.match(pattern, input_str)
if match:
return match.group(1)
return input_str

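Expected behaviour of these helpers, shown with the example strings from the comments above:

import utils  # aabook/src/utils.py

print(utils.extract_create_time('创建时间 2025-03-08 13:57:00'))  # '2025-03-08 13:57:00'
print(utils.extract_page_num('read-374864.html'))                  # '374864' (returned as a string)
print(utils.extract_book_num('book-5549.html'))                    # '5549'
print(utils.remove_brackets_regex('[都市]'))                        # '都市'
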
aabook/utils.py (new file, +122 lines)

@@ -0,0 +1,122 @@
import requests
from bs4 import BeautifulSoup
from ebooklib import epub
import re
import os
import json
import time
import csv
import logging
from datetime import datetime
import config
# 从"创建时间 2025-03-08 13:57:00" 中提取时间
def extract_create_time(input_str):
pattern = r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'
match = re.search(pattern, input_str)
if match:
datetime_str = match.group(0)
return datetime_str
else:
return input_str
# 从 "read-374864.html" 中获取数字编号
def extract_page_num(page_str, default_num = 0):
# 定义正则表达式模式
pattern = r'read-(\d+)\.html'
# 使用 re.search 查找匹配项
match = re.search(pattern, page_str)
if match:
number = match.group(1)
return number
else:
return default_num
# 从 "book-5549.html" 中获取数字编号
def extract_book_num(page_str, default_num = 0):
# 定义正则表达式模式
pattern = r'book-(\d+)\.html'
# 使用 re.search 查找匹配项
match = re.search(pattern, page_str)
if match:
number = match.group(1)
return number
else:
return default_num
# 处理 [都市] 的方括号
def remove_brackets_regex(input_str):
pattern = r'\[(.*?)\]'
match = re.match(pattern, input_str)
if match:
return match.group(1)
return input_str
# 定义函数来抓取小说章节内容
def fetch_chapter(url):
try:
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# 这里需要根据实际网页结构修改选择器
chapter_content = soup.find('div', class_='chapter-content').get_text()
return chapter_content
except requests.RequestException as e:
print(f"请求出错: {e}")
return None
# 定义函数来生成 EPUB 文件
def generate_epub(title, author, chapters, path):
book = epub.EpubBook()
book.set_title(title)
book.set_language('zh')
book.add_author(author)
epub_chapters = []
for chapter_title, chapter_content in chapters:
c = epub.EpubHtml(title=chapter_title, file_name=f'{chapter_title}.xhtml', lang='zh')
c.content = f'<h1>{chapter_title}</h1><p>{chapter_content}</p>'
book.add_item(c)
epub_chapters.append(c)
# 定义书的结构
book.toc = tuple(epub_chapters)
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())
# 定义样式
style = 'body { font-family: Times, serif; }'
nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
book.add_item(nav_css)
# 定义书的结构
book.spine = ['nav'] + epub_chapters
# 保存 EPUB 文件
epub.write_epub(f'{path}/{title}.epub', book, {})
# 示例使用
if __name__ == "__main__":
# 这里需要替换为实际的小说章节链接
chapter_info = [
('第一章', 'https://example.com/chapter1'),
('第二章', 'https://example.com/chapter2')
]
title = '小说标题'
author = '小说作者'
chapters = []
for chapter_title, url in chapter_info:
content = fetch_chapter(url)
if content:
chapters.append((chapter_title, content))
if chapters:
generate_epub(title, author, chapters, '.') # path 为必填参数,这里输出到当前目录
print(f'{title}.epub 文件生成成功。')
else:
print('未获取到有效章节内容,无法生成 EPUB 文件。')