modify scripts
@@ -10,6 +10,7 @@ from datetime import datetime
from datetime import date
import config  # logging configuration
from down_list import novel_map
import utils


# logging
@@ -21,7 +22,8 @@ list_url_wordcount = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&ca
list_url_update = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update'
curr_novel_pages = 0

meta_dir = 'meta'
meta_dir = f'{config.global_host_data_dir}/aabook/meta'
novel_dir = f'{config.global_host_data_dir}/aabook/data'

list_file = f'{meta_dir}/list.txt'
details_file = f'{meta_dir}/details.txt'
@@ -246,7 +248,7 @@ def extract_content_url(soup, base_url, chapid):

# Check whether the content has been polluted (watermarked)
def check_content(content):
    if '2005-2024 疯情书库' in content:
    if '2005-2024 疯情书库' in content or '2005-2025 疯情书库' in content:
        return False

    return True
@@ -263,13 +265,15 @@ def get_novel_pages():
    return curr_novel_pages

# Parse the chapter contents and save them to a file
def download_novel(chapid, novel_name, dir_prefix='./aabook'):
def download_novel(chapid, novel_name, dir_prefix=novel_dir):
    chapter_url = f'{base_url}/read-{chapid}.html'

    novel_file = f'{dir_prefix}/{chapid}_{novel_name}.txt'
    if os.path.exists(novel_file):
        os.remove(novel_file)  # a file with the same name exists: delete it and download again

    # also save to other file formats
    chapters = []
    reset_novel_pages()
    while chapter_url:
        logging.info(f"Processing: [{novel_name}] [{chapid}] {chapter_url}")
@@ -314,6 +318,7 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
            f.write(chapter_title + '\n\n')

            # write each paragraph to the file
            content = ''
            with open(novel_file, 'a', encoding='utf-8') as f:
                for paragraph in paragraphs:
                    #cleaned_part = clean_watermarks(paragraph.get_text().strip())
@@ -321,7 +326,9 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
                    #f.write(cleaned_part + '\n\n')
                    cleaned_text = process_paragraph(paragraph)
                    f.write(cleaned_text + '\n\n')
                    content = content + '<p>' + cleaned_text + '</p>'  # in the epub, paragraphs are separated with HTML tags
            logging.info(f"Writing content to file. [{novel_name}] [{chapid}] [{chapter_title}]")
            chapters.append((chapter_title, content))
        else:
            logging.info(f"Fetching content error: [{novel_name}] {content_url}, retry...")
            continue
@@ -356,6 +363,8 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
            break

        time.sleep(3)
    # everything fetched, generate the epub file
    utils.generate_epub(novel_name, 'nobody', chapters, dir_prefix)


# Check whether the sub-directory exists and create it if not
@@ -400,7 +409,7 @@ def download_books(need_down_list_file = details_file, cursor_file = down_list_f
            continue  # already downloaded, skip

        # create the category directory
        down_dir = './data/' + category
        down_dir = f'{novel_dir}/{category}'
        create_directory_if_not_exists(down_dir)

        # call the download function to download the book
@@ -420,7 +429,7 @@ def download_map():
    # iterate over novel_map and download every novel
    for novel_id, novel_name in novel_map.items():
        logging.info(f"Starting download for {novel_name} (ID: {novel_id})")
        download_novel(novel_id, novel_name, './local')
        download_novel(novel_id, novel_name, novel_dir)
        logging.info(f"Completed download for {novel_id}_{novel_name}.\n")

    # fetch the update list and download from it
@@ -444,6 +453,10 @@ def main():
        print("cmd: get_list, get_detail, get_all, get_update, get_update_all, download, download_map")
        sys.exit(1)

    # make sure the directories exist
    create_directory_if_not_exists(meta_dir)
    create_directory_if_not_exists(novel_dir)

    cmd = sys.argv[1]

    if cmd == "get_list":

@@ -3,13 +3,9 @@ import os
import inspect
from datetime import datetime

# MySQL configuration
db_config = {
    'host': '172.18.0.3',
    'user': 'root',
    'password': 'mysqlpw',
    'database': 'stockdb'
}
home_dir = os.path.expanduser("~")
global_host_data_dir = f'{home_dir}/hostdir/scripts_data'
global_share_data_dir = f'{home_dir}/sharedata'

# set up the logging configuration
def setup_logging(log_filename=None):

@@ -10,7 +10,7 @@ novel_map_new = {
}
# novel id -> name mapping
novel_map = {
    364489: '诸天之乡村爱情',
    371300: '临时夫妻',
}
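For orientation, a hedged sketch of what the new config-driven paths resolve to (the home directory below is only an example; the real value comes from os.path.expanduser in config.py):

# illustration only, assuming home_dir = '/home/user'
# config.global_host_data_dir -> '/home/user/hostdir/scripts_data'
# meta_dir                    -> '/home/user/hostdir/scripts_data/aabook/meta'
# novel_dir                   -> '/home/user/hostdir/scripts_data/aabook/data'
# download_novel() and download_books() now write below novel_dir instead of ./aabook and ./data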
12  aabook/src/check_status.py  Normal file
@@ -0,0 +1,12 @@
import json
import time
import sqlite_utils as db_tools


if __name__ == "__main__":
    # command-line entry point
    result = db_tools.get_statics()
    print(result)
80  aabook/src/config.py  Normal file
@@ -0,0 +1,80 @@
import logging
import os
import inspect
import time
from datetime import datetime
from logging.handlers import RotatingFileHandler
from collections import defaultdict

home_dir = os.path.expanduser("~")
global_host_data_dir = f'{home_dir}/hostdir/scripts_data'
global_share_data_dir = f'{home_dir}/sharedata'
global_sqlite_path = f'{global_share_data_dir}/sqlite/books.db'

log_dir = '../log'
# track logging frequency
log_count = defaultdict(int)        # how many times each message has been logged
last_log_time = defaultdict(float)  # timestamp of the last write for each message

class RateLimitFilter(logging.Filter):
    """
    Rate-limiting filter:
    1. Within a 60-second window, the same message is written at most LOG_LIMIT times; extra copies are dropped.
    2. If the overall log rate exceeds 100 messages per second, raise an alert (not implemented in this filter yet).
    """
    LOG_LIMIT = 60  # at most 60 identical messages per minute

    def filter(self, record):
        global log_count, last_log_time
        message_key = record.getMessage()  # the log message text

        # current time
        now = time.time()
        elapsed = now - last_log_time[message_key]

        # limit how often the same message is written
        if elapsed < 60:  # within 60 seconds
            log_count[message_key] += 1
            if log_count[message_key] > self.LOG_LIMIT:
                print('reach limit.')
                return False  # drop the record
        else:
            log_count[message_key] = 1  # more than 60 seconds passed, restart the count

        last_log_time[message_key] = now

        return True  # allow the record to be written


def setup_logging(log_filename=None):
    if log_filename is None:
        caller_frame = inspect.stack()[1]
        caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
        current_date = datetime.now().strftime('%Y%m%d')
        log_filename = f'{log_dir}/{caller_filename}_{current_date}.log'

    max_log_size = 100 * 1024 * 1024  # 100 MB
    max_log_files = 10  # keep at most 10 rotated log files

    file_handler = RotatingFileHandler(log_filename, maxBytes=max_log_size, backupCount=max_log_files)
    file_handler.setFormatter(logging.Formatter(
        '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
    ))

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(
        '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
    ))

    # create the root logger
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logger.handlers = []  # avoid adding duplicate handlers
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    # attach the rate limiter
    rate_limit_filter = RateLimitFilter()
    file_handler.addFilter(rate_limit_filter)
    console_handler.addFilter(rate_limit_filter)
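A minimal usage sketch of this module, mirroring how fetch.py below initialises it (it assumes the caller runs from a directory where the relative '../log' path exists):

import logging
import config

config.setup_logging()              # log file defaults to ../log/<caller>_<YYYYMMDD>.log
logging.info("logger initialised")  # repeated identical messages are throttled by RateLimitFilter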
126  aabook/src/convert_utils.py  Normal file
@@ -0,0 +1,126 @@
|
||||
from ebooklib import epub
|
||||
from reportlab.lib.pagesizes import letter
|
||||
from reportlab.pdfgen import canvas
|
||||
from reportlab.lib.styles import getSampleStyleSheet
|
||||
from reportlab.platypus import Paragraph, Spacer
|
||||
|
||||
|
||||
def generate_epub(data, save_path):
|
||||
# 创建 EPUB 书籍对象
|
||||
book = epub.EpubBook()
|
||||
|
||||
# 设置书籍元数据
|
||||
book.set_title(data.get('title', '未知标题'))
|
||||
book.set_language('zh')
|
||||
book.add_author(data.get('author', '未知作者'))
|
||||
|
||||
# 存储所有章节对象
|
||||
all_chapters = []
|
||||
|
||||
sections = data.get('sections', [])
|
||||
|
||||
if len(sections) == 1:
|
||||
# 如果只有一个 section,忽略 section 的 title,按一级目录处理
|
||||
for chapter in sections[0].get('chapters', []):
|
||||
chapter_title = chapter.get('title', '未知章节')
|
||||
chapter_content = chapter.get('content', '')
|
||||
paragraphs = chapter_content.split('\n\n')
|
||||
html_content = ''.join([f'<p>{para}</p>' for para in paragraphs])
|
||||
chapter_obj = epub.EpubHtml(title=chapter_title, file_name=f'{chapter_title}.xhtml', lang='zh')
|
||||
chapter_obj.content = f'<h1>{chapter_title}</h1>{html_content}'
|
||||
book.add_item(chapter_obj)
|
||||
all_chapters.append(chapter_obj)
|
||||
else:
|
||||
# 如果有多个 section,按两级目录处理
|
||||
for section in sections:
|
||||
section_title = section.get('title', '未知卷')
|
||||
section_chapter = epub.EpubHtml(title=section_title, file_name=f'{section_title}.xhtml', lang='zh')
|
||||
section_chapter.content = f'<h1>{section_title}</h1>'
|
||||
book.add_item(section_chapter)
|
||||
all_chapters.append(section_chapter)
|
||||
|
||||
for chapter in section.get('chapters', []):
|
||||
chapter_title = chapter.get('title', '未知章节')
|
||||
chapter_content = chapter.get('content', '')
|
||||
paragraphs = chapter_content.split('\n\n')
|
||||
html_content = ''.join([f'<p>{para}</p>' for para in paragraphs])
|
||||
chapter_obj = epub.EpubHtml(title=chapter_title, file_name=f'{chapter_title}.xhtml', lang='zh')
|
||||
chapter_obj.content = f'<h2>{chapter_title}</h2>{html_content}'
|
||||
book.add_item(chapter_obj)
|
||||
all_chapters.append(chapter_obj)
|
||||
|
||||
# 定义书籍的目录
|
||||
book.toc = tuple(all_chapters)
|
||||
|
||||
# 定义书的结构
|
||||
book.add_item(epub.EpubNcx())
|
||||
book.add_item(epub.EpubNav())
|
||||
|
||||
# 定义样式
|
||||
style = 'body { font-family: Times, serif; }'
|
||||
nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
|
||||
book.add_item(nav_css)
|
||||
|
||||
# 定义书的结构
|
||||
book.spine = ['nav'] + all_chapters
|
||||
|
||||
# 保存 EPUB 文件
|
||||
epub.write_epub(save_path, book, {})
|
||||
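The data dict that generate_epub (and generate_pdf further down) consumes is not documented in the file; the shape below is inferred from the .get() calls above, and the values are placeholders:

sample = {
    'title': '示例小说',
    'author': 'nobody',
    'sections': [
        {'title': '第一卷',
         'chapters': [
             {'title': '第一章', 'content': '第一段\n\n第二段'},  # paragraphs are split on blank lines
         ]},
    ],
}
generate_epub(sample, 'sample.epub')  # a single section is flattened into a one-level table of contents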
|
||||
|
||||
def generate_pdf(data, save_path):
|
||||
# 创建 PDF 画布
|
||||
c = canvas.Canvas(save_path, pagesize=letter)
|
||||
styles = getSampleStyleSheet()
|
||||
story = []
|
||||
|
||||
# 设置标题
|
||||
title = data.get('title', '未知标题')
|
||||
story.append(Paragraph(f'<font size=20>{title}</font>', styles['Title']))
|
||||
story.append(Spacer(1, 20))
|
||||
|
||||
# 设置作者
|
||||
author = data.get('author', '未知作者')
|
||||
story.append(Paragraph(f'<font size=14>作者: {author}</font>', styles['Normal']))
|
||||
story.append(Spacer(1, 40))
|
||||
|
||||
sections = data.get('sections', [])
|
||||
|
||||
if len(sections) == 1:
|
||||
# 如果只有一个 section,忽略 section 的 title,按一级目录处理
|
||||
for chapter in sections[0].get('chapters', []):
|
||||
chapter_title = chapter.get('title', '未知章节')
|
||||
chapter_content = chapter.get('content', '')
|
||||
story.append(Paragraph(f'<font size=18>{chapter_title}</font>', styles['Heading1']))
|
||||
story.append(Spacer(1, 10))
|
||||
paragraphs = chapter_content.split('\n\n')
|
||||
for para in paragraphs:
|
||||
story.append(Paragraph(para, styles['Normal']))
|
||||
story.append(Spacer(1, 10))
|
||||
story.append(Spacer(1, 20))
|
||||
else:
|
||||
# 如果有多个 section,按两级目录处理
|
||||
for section in sections:
|
||||
section_title = section.get('title', '未知卷')
|
||||
story.append(Paragraph(f'<font size=20>{section_title}</font>', styles['Heading1']))
|
||||
story.append(Spacer(1, 15))
|
||||
for chapter in section.get('chapters', []):
|
||||
chapter_title = chapter.get('title', '未知章节')
|
||||
chapter_content = chapter.get('content', '')
|
||||
story.append(Paragraph(f'<font size=16>{chapter_title}</font>', styles['Heading2']))
|
||||
story.append(Spacer(1, 10))
|
||||
paragraphs = chapter_content.split('\n\n')
|
||||
for para in paragraphs:
|
||||
story.append(Paragraph(para, styles['Normal']))
|
||||
story.append(Spacer(1, 10))
|
||||
story.append(Spacer(1, 15))
|
||||
|
||||
# 构建 PDF
|
||||
for element in story:
|
||||
element.wrapOn(c, letter[0] - 100, letter[1] - 100)
|
||||
element.drawOn(c, 50, letter[1] - element.wrapOn(c, letter[0] - 100, letter[1] - 100)[1] - 50)
|
||||
c.showPage()
|
||||
|
||||
# 保存 PDF 文件
|
||||
c.save()
|
||||
|
||||
312  aabook/src/fetch.py  Normal file
@@ -0,0 +1,312 @@
|
||||
import json
|
||||
import time
|
||||
import csv
|
||||
import argparse
|
||||
import logging
|
||||
from functools import partial
|
||||
import config
|
||||
import sqlite_utils as db_tools
|
||||
import scraper
|
||||
import utils
|
||||
|
||||
config.setup_logging()
|
||||
|
||||
debug = False
|
||||
force = False
|
||||
|
||||
# 获取列表
|
||||
def fetch_book_list():
|
||||
url = scraper.list_url_update
|
||||
while True:
|
||||
logging.info(f'fetching book list. url: {url}')
|
||||
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="list_main", attr_type="class"))
|
||||
if soup:
|
||||
# 获取书籍列表
|
||||
list_data, next_url = scraper.parse_book_list(soup, url=url)
|
||||
for item in list_data:
|
||||
row_id = db_tools.insert_or_update_common(item, db_tools.tbl_name_books)
|
||||
if row_id:
|
||||
logging.debug(f"insert one book. row_id: {row_id}, name: {item['name']}")
|
||||
else:
|
||||
logging.warning(f"insert book error. name: {item['name']}, href: {item['href']}")
|
||||
if next_url is None:
|
||||
logging.info(f'get all pages.')
|
||||
return True
|
||||
else:
|
||||
url = next_url
|
||||
elif status_code and status_code == 404:
|
||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
|
||||
else:
|
||||
logging.warning(f'fetch page error. {url} ...')
|
||||
|
||||
|
||||
# 获取详情
|
||||
def fetch_real_content(url):
|
||||
soup, status_code = scraper.fetch_page(url, scraper.content_validator)
|
||||
if soup:
|
||||
data = scraper.parse_content_page(soup, url)
|
||||
if data:
|
||||
return data # 段落的数组
|
||||
elif status_code and status_code == 404:
|
||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
|
||||
else:
|
||||
logging.warning(f'fetch page error. {url} ...')
|
||||
return None
|
||||
|
||||
|
||||
# 获取内容页
|
||||
def fetch_chapter_content(url):
|
||||
chapter_data = {}
|
||||
next_url = None
|
||||
|
||||
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="h1", identifier="chapter_title", attr_type="class"))
|
||||
if soup:
|
||||
data, next_url = scraper.parse_chapter_page(soup, url)
|
||||
if data:
|
||||
chapter_data['title'] = data['title']
|
||||
contents = fetch_real_content(data['content_url'])
|
||||
if contents:
|
||||
chapter_data['contents'] = contents
|
||||
else:
|
||||
logging.warning(f"fetching real content failed. url: {data['content_url']}")
|
||||
return None, None
|
||||
else:
|
||||
logging.warning(f'fetch chapter page no data. url: {url}')
|
||||
return None, None
|
||||
else:
|
||||
logging.warning(f'fetch chapter page error. url: {url}, status_code: {status_code}')
|
||||
return None, None
|
||||
|
||||
return chapter_data, next_url
|
||||
|
||||
# 获取小说详情页,获得首页地址
|
||||
def fetch_book_detail(url):
|
||||
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="li", identifier="zuopinxinxi", attr_type="class"))
|
||||
if soup:
|
||||
detail = scraper.parse_book_detail(soup, url)
|
||||
return detail
|
||||
else:
|
||||
return None
|
||||
|
||||
# 获取某本小说的目录页
|
||||
def fetch_book_toc(url):
|
||||
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="page_main", attr_type="class"))
|
||||
if soup:
|
||||
listdata = scraper.pase_chapter_list(soup, url)
|
||||
return listdata
|
||||
else:
|
||||
return None
|
||||
|
||||
# 获取小说的目录页,并插入到数据库
|
||||
def fetch_table_of_contents():
|
||||
while True:
|
||||
update_list = db_tools.query_books(id=2547, is_latest=0, limit = 2 if debug else 100)
|
||||
if update_list is None or len(update_list) <1 :
|
||||
logging.info('no more data to fetch.')
|
||||
return
|
||||
|
||||
for row in update_list:
|
||||
name = row['name']
|
||||
href = row['href']
|
||||
bookid = row['id']
|
||||
# 先打开详情页
|
||||
logging.info(f'----------fetching book {name}: {href}-------------')
|
||||
book_detail = fetch_book_detail(href)
|
||||
if book_detail is None:
|
||||
logging.warning(f'get book detail failed. url: {href}')
|
||||
continue
|
||||
|
||||
# 获取目录页
|
||||
toc_url = book_detail['table_of_contents_href']
|
||||
if toc_url is None or toc_url == '':
|
||||
logging.warning(f'table_of_contents_href is not correct. url: {href}')
|
||||
continue
|
||||
|
||||
logging.info(f'fetching page: {toc_url}')
|
||||
toc_data = fetch_book_toc(toc_url)
|
||||
|
||||
# 解析目录页
|
||||
if toc_data is None:
|
||||
logging.warning(f'fetch_book_toc error. url: {toc_url}')
|
||||
continue
|
||||
|
||||
# 插入所有的目录数据
|
||||
succ = 1
|
||||
for row in toc_data:
|
||||
section_title = row['title']
|
||||
chapters = row['chapters']
|
||||
section_id = db_tools.insert_or_update_book_sections({
|
||||
'book_id' : int(bookid),
|
||||
'section' : section_title,
|
||||
'bookid_section': f'{bookid}_{section_title}'
|
||||
})
|
||||
if section_id is None:
|
||||
logging.warning(f'insert section error. url: {toc_url}, section: {section_title}')
|
||||
succ = 0
|
||||
break
|
||||
else:
|
||||
logging.debug(f'insert one books_sections record. id:{section_id}, key: {bookid}_{section_title}')
|
||||
|
||||
# 插入目录数据
|
||||
for chap in chapters:
|
||||
chap_row_id = db_tools.insert_chapter_data({
|
||||
'book_id': bookid,
|
||||
'chapter_id': chap['chapter_id'],
|
||||
'section_id': section_id,
|
||||
'title': chap['title'],
|
||||
'href': chap['href'],
|
||||
'content': '',
|
||||
'has_content' : 0
|
||||
})
|
||||
if chap_row_id is None:
|
||||
logging.warning(f'insert_chapter_data error. url: {toc_url}')
|
||||
succ = 0
|
||||
break
|
||||
if succ == 0 :
|
||||
logging.warning(f'fetch_book_toc data error. url: {toc_url}')
|
||||
continue
|
||||
|
||||
# 读取完毕,更新列表
|
||||
row_id = db_tools.update_book_detail({
|
||||
'href' : href,
|
||||
**book_detail
|
||||
})
|
||||
if row_id:
|
||||
logging.debug(f'update book succ. id: {row_id}, url: {href}')
|
||||
else:
|
||||
logging.warning(f'update book failed. url: {href}')
|
||||
if debug:
|
||||
return
|
||||
|
||||
# 直接获取小说内容
|
||||
def fetch_contents():
|
||||
while True:
|
||||
list_data = db_tools.query_no_content_chapters(limit = 10 if debug else 100)
|
||||
if list_data is None or len(list_data) <1 :
|
||||
logging.info('no more data to fetch.')
|
||||
return
|
||||
|
||||
for row in list_data:
|
||||
url = row['href']
|
||||
logging.info(f"fetching content ({row['title']}) from {url}")
|
||||
content, next_url = fetch_chapter_content(url)
|
||||
if content and content['title'] and content['contents']:
|
||||
# 写入到数据表里
|
||||
db_tools.insert_chapter_data({
|
||||
'book_id': row['book_id'],
|
||||
'chapter_id': row['chapter_id'],
|
||||
'section_id': row['section_id'],
|
||||
'title': row['title'],
|
||||
'href': url,
|
||||
'content': '\n\n'.join(content['contents']),
|
||||
'has_content': 1
|
||||
})
|
||||
else:
|
||||
logging.warning(f'fetch content error. url: {url}')
|
||||
if debug:
|
||||
return
|
||||
|
||||
|
||||
'''
|
||||
# 下载完整的小说
|
||||
def fetch_book_data():
|
||||
update_list = db_tools.query_books(need_update=1, limit = 1)
|
||||
if update_list:
|
||||
for row in update_list:
|
||||
name = row['name']
|
||||
href = row['href']
|
||||
bookid = row['id']
|
||||
# 先打开详情页
|
||||
logging.info(f'----------fetching book {name}: {href}-------------')
|
||||
book_detail = fetch_book_detail(href)
|
||||
if book_detail:
|
||||
# 获取内容页,然后循环读取内容
|
||||
chapter_url = book_detail['start_page_href']
|
||||
chapter_id = utils.extract_page_num(chapter_url)
|
||||
# 断点续传,从上次拉取的最后一页开始
|
||||
if not force:
|
||||
last_chapter_url = db_tools.query_last_chapter_by_book(bookid)
|
||||
if last_chapter_url:
|
||||
chapter_url = last_chapter_url
|
||||
while chapter_url:
|
||||
logging.info(f'fetching page: {chapter_url}')
|
||||
content, next_url = fetch_chapter_content(chapter_url)
|
||||
if content and content['title'] and content['contents']:
|
||||
# 写入到数据表里
|
||||
db_tools.insert_chapter_data({
|
||||
'book_id': bookid,
|
||||
'chapter_id': chapter_id,
|
||||
'title': content['title'],
|
||||
'href': chapter_url,
|
||||
'content': '\n\n'.join(content['contents']),
|
||||
'has_content': 1
|
||||
})
|
||||
|
||||
if debug:
|
||||
return
|
||||
else:
|
||||
logging.warning(f'fetch content error. url: {chapter_url}')
|
||||
chapter_url = next_url
|
||||
# 读取完毕,更新列表
|
||||
row_id = db_tools.update_book_detail({
|
||||
'href' : href,
|
||||
**book_detail
|
||||
})
|
||||
if row_id:
|
||||
logging.debug(f'update book succ. id: {row_id}, url: {href}')
|
||||
else:
|
||||
logging.warning(f'update book failed. url: {href}')
|
||||
else:
|
||||
logging.warning(f'get book detail failed. url: {href}')
|
||||
else:
|
||||
logging.warning(f'get no data needed update.')
|
||||
'''
|
||||
|
||||
# 建立缩写到函数的映射
|
||||
function_map = {
|
||||
"list": fetch_book_list,
|
||||
"toc" : fetch_table_of_contents,
|
||||
"content": fetch_contents,
|
||||
}
|
||||
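The argparse block at the bottom of the file maps these shortcuts onto --cmd; illustrative invocations (the script path comes from the file header above, the options from the parser below):

# fetch the book list and the tables of contents, limiting record counts
#   python fetch.py --cmd list,toc --debug
# run every step in function_map and force a full re-fetch
#   python fetch.py --force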
|
||||
# 主函数
|
||||
def main(cmd, args_debug, args_force):
|
||||
global debug
|
||||
debug = args_debug
|
||||
|
||||
global force
|
||||
force = args_force
|
||||
|
||||
# 执行指定的函数
|
||||
if cmd:
|
||||
function_names = cmd.split(",")  # split the comma-separated shortcuts
|
||||
for short_name in function_names:
|
||||
func = function_map.get(short_name.strip()) # 从映射中获取对应的函数
|
||||
if callable(func):
|
||||
func()
|
||||
else:
|
||||
logging.warning(f" {short_name} is not a valid function shortcut.")
|
||||
else: # 全量执行
|
||||
for name, func in function_map.items():
|
||||
if callable(func):
|
||||
func()
|
||||
else:
|
||||
logging.warning(f" {name} is not a valid function shortcut.")
|
||||
|
||||
logging.info(f'all process completed!')
|
||||
|
||||
# TODO:
|
||||
# 1,
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 命令行参数处理
|
||||
keys_str = ",".join(function_map.keys())
|
||||
|
||||
parser = argparse.ArgumentParser(description='fetch aabook data.')
|
||||
parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
|
||||
parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
|
||||
parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
|
||||
args = parser.parse_args()
|
||||
|
||||
main(args.cmd, args.debug, args.force)
|
||||
364  aabook/src/scraper.py  Normal file
@@ -0,0 +1,364 @@
|
||||
import time
|
||||
import json
|
||||
import csv
|
||||
import logging
|
||||
import signal
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import requests
|
||||
import random
|
||||
from bs4 import BeautifulSoup
|
||||
from requests.exceptions import RequestException
|
||||
from functools import partial
|
||||
import config
|
||||
import utils
|
||||
|
||||
# 定义基础 URL 和可变参数
|
||||
host_url = 'https://aabook.xyz'
|
||||
list_url_update = f'{host_url}/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update'
|
||||
#list_url_wordcount = 'https://aabook.xyz/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
|
||||
|
||||
# User-Agent 列表
|
||||
user_agents = [
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67",
|
||||
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0",
|
||||
"Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36"
|
||||
]
|
||||
|
||||
# Fetch a page with requests, run a custom page validator, and support different parsers and optional preprocessing
|
||||
def fetch_page(url, validator, parser="html.parser", preprocessor=None, max_retries=3, sleep_time=5, default_timeout=10):
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
if 'aabook.xyz' not in url.lower():
|
||||
logging.error(f'wrong url format: {url}')
|
||||
return None, None
|
||||
|
||||
# 随机选择一个 User-Agent
|
||||
headers = {
|
||||
'User-Agent': random.choice(user_agents)
|
||||
}
|
||||
response = requests.get(url, headers=headers, timeout=default_timeout, stream=True)
|
||||
|
||||
# 处理 HTTP 状态码
|
||||
if response.status_code == 404:
|
||||
logging.warning(f"Page not found (404): {url}")
|
||||
return None, 404 # 直接返回 404,调用方可以跳过
|
||||
|
||||
response.raise_for_status() # 处理 HTTP 错误
|
||||
|
||||
# 预处理 HTML(如果提供了 preprocessor)
|
||||
html_text = preprocessor(response.text) if preprocessor else response.text
|
||||
|
||||
soup = BeautifulSoup(html_text, parser)
|
||||
if validator(soup): # 进行自定义页面检查
|
||||
return soup, response.status_code
|
||||
|
||||
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
|
||||
except requests.RequestException as e:
|
||||
logging.info(f"Warn fetching page {url}: {e}. Retrying ...")
|
||||
time.sleep(sleep_time) # 休眠指定的时间,然后重试
|
||||
|
||||
logging.error(f'Fetching failed after max retries. {url}')
|
||||
return None, None # 达到最大重试次数仍然失败
|
||||
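A hedged usage sketch of fetch_page, mirroring the calls in fetch.py and the test helpers at the bottom of this file:

from functools import partial

# fetch the first list page and accept it only if the 'list_main' div is present
soup, status = fetch_page(
    list_url_update,
    partial(generic_validator, tag="div", identifier="list_main", attr_type="class"),
)
if soup:
    books, next_url = parse_book_list(soup, url=list_url_update)
elif status == 404:
    pass  # the caller is expected to skip missing pages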
|
||||
|
||||
# 解析列表页
|
||||
def parse_book_list(soup, url):
|
||||
# 查找书籍列表
|
||||
list_main = soup.find('div', class_='list_main')
|
||||
if not list_main:
|
||||
logging.warning(f"No list_main Found in {url}")
|
||||
return None, None
|
||||
|
||||
tbody = list_main.find('tbody')
|
||||
if not tbody:
|
||||
logging.warning(f"No tbody found in {url}")
|
||||
return None, None
|
||||
|
||||
list_data = []
|
||||
next_url = None
|
||||
# 获取每本书的基础信息:排名、分类、书名、作者、月票、更新时间(按字数排序时是总字数,按日期排序时是最后更新日期)
|
||||
for tr in tbody.find_all('tr'):
|
||||
tds = tr.find_all('td')
|
||||
if len(tds) < 6:
|
||||
logging.info("Invalid tr format.")
continue  # skip rows that do not have the expected columns
|
||||
ranking = tds[0].text.strip()
|
||||
category = utils.remove_brackets_regex(tds[1].text.strip())
|
||||
book_link_tag = tds[2].find('a')
|
||||
book_name = book_link_tag.text.strip()
|
||||
book_link = host_url + '/' + book_link_tag['href'] if book_link_tag.get('href') else ''
|
||||
book_num = utils.extract_book_num(book_link_tag['href'])
|
||||
author = tds[3].text.strip()
|
||||
monthly_tickets = tds[4].text.strip()
|
||||
update_time = tds[5].text.strip() #实际上是字数(按字数排序时是总字数,按日期排序时是最后更新日期)
|
||||
|
||||
list_data.append({
|
||||
'rank': ranking,
|
||||
'category': category,
|
||||
'name': book_name,
|
||||
'href': book_link,
|
||||
'num': book_num,
|
||||
'author': author,
|
||||
'tickets': monthly_tickets,
|
||||
'update_time': update_time
|
||||
})
|
||||
|
||||
# 查找下一页链接
|
||||
next_page_tag = soup.find('a', title='下一页')
|
||||
if next_page_tag:
|
||||
next_url = host_url + next_page_tag['href']
|
||||
|
||||
return list_data, next_url
|
||||
|
||||
# 解析详情页
|
||||
def parse_book_detail(soup, url):
|
||||
# 解析书籍详细信息
|
||||
book_info_tag = soup.find('li', class_='zuopinxinxi')
|
||||
if not book_info_tag:
|
||||
logging.warning(f"No details found in {url}")
|
||||
return None
|
||||
|
||||
table_of_contents_href = ''
|
||||
table_of_contents_href_tag = soup.find('li', class_='xiaoshuomulu')
|
||||
if table_of_contents_href_tag:
|
||||
table_of_contents_href = host_url + table_of_contents_href_tag.find('a')['href']
|
||||
|
||||
book_info_lis = book_info_tag.find_all('li')
|
||||
if len(book_info_lis) < 4:
|
||||
logging.info(f"invalid book info in {url}")
|
||||
return None
|
||||
|
||||
book_category = book_info_lis[0].find('span').text.strip()
|
||||
book_status = book_info_lis[1].find('span').text.strip()
|
||||
# 去掉后面的汉字,只要数字
|
||||
total_word_count = book_info_lis[2].find('span').text.strip()
|
||||
total_word_count = int(re.search(r'\d+', total_word_count).group())
|
||||
|
||||
total_clicks = book_info_lis[3].find('span').text.strip()
|
||||
month_clicks = book_info_lis[4].find('span').text.strip() if len(book_info_lis) >4 else '0'
|
||||
week_clicks = book_info_lis[5].find('span').text.strip() if len(book_info_lis) >5 else '0'
|
||||
total_recommend = book_info_lis[6].find('span').text.strip() if len(book_info_lis) >6 else '0'
|
||||
month_recommend = book_info_lis[7].find('span').text.strip() if len(book_info_lis) >7 else '0'
|
||||
week_recommend = book_info_lis[8].find('span').text.strip() if len(book_info_lis) >8 else '0'
|
||||
|
||||
# 读取创建时间
|
||||
creation_time_tag = soup.find('li', class_='update_time')
|
||||
created_time = utils.extract_create_time(creation_time_tag.text.strip() if creation_time_tag else '')
|
||||
|
||||
# 获取起始页链接和编号
|
||||
start_page_tag = soup.find('ul', class_='gezhonganniu').find_all('li')[0].find('a')
|
||||
start_page_link = host_url + '/' + start_page_tag['href']
|
||||
start_page_number = start_page_link.split('-')[-1].replace('.html', '')
|
||||
|
||||
return {
|
||||
'category': book_category,
|
||||
'status' : book_status,
|
||||
'total_words' : total_word_count,
|
||||
'total_clicks': total_clicks,
|
||||
'month_clicks': month_clicks,
|
||||
'week_clicks': week_clicks,
|
||||
'total_recommend': total_recommend,
|
||||
'month_recommend': month_recommend,
|
||||
'week_recommend': week_recommend,
|
||||
'created_time': created_time,
|
||||
'start_page_href': start_page_link,
|
||||
'start_page_num': start_page_number,
|
||||
'table_of_contents_href': table_of_contents_href
|
||||
}
|
||||
|
||||
# 解析书籍的目录页
|
||||
def pase_chapter_list(soup, url):
|
||||
# 获取小说的目录
|
||||
table_of_contents = []
|
||||
div_table_of_contents = soup.find('div', class_='page_main')
|
||||
if not div_table_of_contents:
|
||||
return None
|
||||
|
||||
section_titles = div_table_of_contents.find_all('p', class_='section_title')
|
||||
sections = div_table_of_contents.find_all('ul', class_='section_list')
|
||||
if len(sections) > len(section_titles): # 一般是 后者比前者多1个,最后一个是广告
|
||||
logging.warning(f'sections not matched titles, url: {url}, titles: {len(section_titles)}, sections: {len(sections)}')
|
||||
return None
|
||||
else:
|
||||
for i in range(len(sections)):
|
||||
section_title = section_titles[i].get_text().strip()
|
||||
chap_list = sections[i].find_all("a")
|
||||
chap_data = []
|
||||
for chap in chap_list:
|
||||
chap_title = chap.get_text().strip()
|
||||
chap_link = f"{host_url}/{chap['href']}"
|
||||
chap_id = utils.extract_page_num(chap_link)
|
||||
chap_data.append({'href': chap_link, 'title': chap_title, 'chapter_id': chap_id})
|
||||
table_of_contents.append({'title': section_title, 'chapters': chap_data})
|
||||
|
||||
return table_of_contents
|
||||
|
||||
# 解析书籍的章节页
|
||||
def parse_chapter_page(soup, url):
|
||||
# 获取章节标题
|
||||
chapter_title_tag = soup.find('h1', class_='chapter_title')
|
||||
if chapter_title_tag is None:
|
||||
logging.warning(f'Chapter title not found in {url}')
|
||||
return None, None
|
||||
|
||||
title = chapter_title_tag.get_text().strip()
|
||||
content_url = None
|
||||
next_url = None
|
||||
chapid = utils.extract_page_num(url)
|
||||
|
||||
# 遍历每一个 <script> 标签,查找内容页的链接
|
||||
script_tags = soup.find_all('script')
|
||||
for script_tag in script_tags:
|
||||
script_content = script_tag.string
|
||||
if script_content and re.search(r'\.get\("./_getcontent\.php', script_content):
|
||||
# 匹配到特定内容,提取出 _getcontent.php 的 URL 模板
|
||||
match = re.search(r'\.get\("\./_getcontent\.php\?id="\+\s*chapid\s*\+\s*"&v=([^"]+)"', script_content)
|
||||
if match:
|
||||
# 从匹配中提取 v 参数值
|
||||
v_value = match.group(1)
|
||||
# 构建完整的 content_url
|
||||
content_url = f"{host_url}/_getcontent.php?id={chapid}&v={v_value}"
|
||||
break
|
||||
if content_url is None:
|
||||
logging.warning(f'Content url not found in {url}')
|
||||
return None, None
|
||||
|
||||
# 获取小说的目录
|
||||
table_of_contents = []
|
||||
div_table_of_contents = soup.find('div', class_='mulu_con')
|
||||
if div_table_of_contents or False: # 考虑要不要加上这个
|
||||
section_titles = div_table_of_contents.find_all('p')
|
||||
sections = div_table_of_contents.find_all('ul')
|
||||
if len(sections) != len(section_titles):
|
||||
logging.warning(f'sections not matched titles')
|
||||
else:
|
||||
for i in range(len(sections)):
|
||||
section_title = section_titles[i].get_text().strip()
|
||||
chap_list = sections[i].find_all("a")
|
||||
chap_data = []
|
||||
for chap in chap_list:
|
||||
chap_title = chap.get_text().strip()
|
||||
chap_link = chap['href']
|
||||
chap_data.append({'href': chap_link, 'title': chap_title})
|
||||
table_of_contents.append({'title': section_title, 'chapters': chap_data})
|
||||
|
||||
# 查找下一章的链接
|
||||
next_div = soup.find('div', class_='next_arrow')
|
||||
if next_div:
|
||||
next_page_tag = next_div.find('a', href=True, title=re.compile(r'下一章'))
|
||||
if next_page_tag:
|
||||
next_url = f"{host_url}/{next_page_tag['href']}" if next_page_tag['href'] else ''
|
||||
|
||||
data = {'title': title, 'content_url': content_url, 'table_of_contents': table_of_contents}
|
||||
return data, next_url
|
||||
|
||||
|
||||
def process_paragraph(paragraph):
|
||||
# 获取完整的 HTML 结构,而不是 get_text()
|
||||
paragraph_html = str(paragraph)
|
||||
|
||||
# 移除水印标签
|
||||
cleaned_html = re.sub(r'<[^>]+class="[^"]+">.*?</[^>]+>', '', paragraph_html, flags=re.DOTALL)
|
||||
|
||||
# 使用 BeautifulSoup 解析移除水印标签后的 HTML 并提取文本
|
||||
soup = BeautifulSoup(cleaned_html, 'html.parser')
|
||||
cleaned_text = soup.get_text().strip()
|
||||
|
||||
return cleaned_text
|
||||
|
||||
# 解析内容页
|
||||
def parse_content_page(soup, url):
|
||||
content = []
|
||||
paragraphs = soup.find_all('p')
|
||||
if paragraphs:
|
||||
for paragraph in paragraphs:
|
||||
cleaned_text = process_paragraph(paragraph)
|
||||
content.append(cleaned_text)
|
||||
|
||||
return content
|
||||
|
||||
# 通用的 HTML 结构验证器
|
||||
def generic_validator(soup, tag, identifier, attr_type="id"):
|
||||
if attr_type == "id":
|
||||
return soup.find(tag, id=identifier) is not None
|
||||
elif attr_type == "class":
|
||||
return bool(soup.find_all(tag, class_=identifier))
|
||||
elif attr_type == "name":
|
||||
return bool(soup.find('select', {'name': identifier}))
|
||||
return False
|
||||
|
||||
# 对内容是否被污染的判断
|
||||
def content_validator(soup):
|
||||
text = str(soup)
|
||||
dirty_words = ['2005-2024 疯情书库', '2005-2025 疯情书库', '2025 疯情书库', '2026 疯情书库', '2027 疯情书库']
|
||||
for word in dirty_words:
|
||||
if word in text:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def test_content_page(url):
|
||||
soup, status_code = fetch_page(url, content_validator)
|
||||
if soup:
|
||||
data = parse_content_page(soup, url)
|
||||
if data:
|
||||
return data
|
||||
else :
|
||||
return []
|
||||
|
||||
|
||||
|
||||
def test_chapter_page(url):
|
||||
soup, status_code = fetch_page(url, partial(generic_validator, tag="h1", identifier="chapter_title", attr_type="class"))
|
||||
if soup:
|
||||
data, next_url = parse_chapter_page(soup, url)
|
||||
if data:
|
||||
return data
|
||||
else :
|
||||
return None
|
||||
|
||||
def test_book_detail(url):
|
||||
soup, status_code = fetch_page(url, partial(generic_validator, tag="li", identifier="zuopinxinxi", attr_type="class"))
|
||||
if soup:
|
||||
detail = parse_book_detail(soup, url)
|
||||
return detail
|
||||
|
||||
|
||||
def test_book_list():
|
||||
for num in range(5):
|
||||
url = list_url_update.format(num)
|
||||
soup, status_code = fetch_page(url, partial(generic_validator, tag="div", identifier="list_main", attr_type="class"))
|
||||
if soup:
|
||||
# 获取书籍列表
|
||||
list_data, next_url = parse_book_list(soup, url=url)
|
||||
for item in list_data:
|
||||
# 获取详情页
|
||||
detail = test_book_detail(item['href'])
|
||||
if detail:
|
||||
print({
|
||||
**item,
|
||||
**detail
|
||||
})
|
||||
|
||||
# 获取内容页
|
||||
page_data = test_chapter_page(detail['start_page_href'])
|
||||
if page_data:
|
||||
print(page_data)
|
||||
# 获取内容
|
||||
contents = test_content_page(page_data['content_url'])
|
||||
if contents and len(contents)>0:
|
||||
print (contents[0])
|
||||
|
||||
else:
|
||||
print('get detail error.')
|
||||
return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_book_list()
|
||||
|
||||
|
||||
278  aabook/src/sqlite_utils.py  Normal file
@@ -0,0 +1,278 @@
|
||||
import sqlite3
|
||||
import json
|
||||
import config
|
||||
import utils
|
||||
import logging
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
# 连接 SQLite 数据库
|
||||
DB_PATH = config.global_sqlite_path # 替换为你的数据库文件
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
tbl_name_books = 'books'
|
||||
tbl_name_chapters_prefix = 'chapters'
|
||||
tbl_name_section = 'books_sections'
|
||||
|
||||
# 获取表的列名和默认值
|
||||
def get_table_columns_and_defaults(tbl_name):
|
||||
try:
|
||||
cursor.execute(f"PRAGMA table_info({tbl_name})")
|
||||
columns = cursor.fetchall()
|
||||
column_info = {}
|
||||
for col in columns:
|
||||
col_name = col[1]
|
||||
default_value = col[4]
|
||||
column_info[col_name] = default_value
|
||||
return column_info
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"Error getting table columns: {e}")
|
||||
return None
|
||||
|
||||
# 检查并处理数据
|
||||
def check_and_process_data(data, tbl_name):
|
||||
column_info = get_table_columns_and_defaults(tbl_name=tbl_name)
|
||||
if column_info is None:
|
||||
return None
|
||||
processed_data = {}
|
||||
for col, default in column_info.items():
|
||||
if col == 'id': # 自增主键,不需要用户提供
|
||||
continue
|
||||
if col == 'created_at' or col == 'updated_at': # 日期函数,用户自己指定即可
|
||||
continue
|
||||
elif col in data:
|
||||
processed_data[col] = data[col]
|
||||
else:
|
||||
if default is not None:
|
||||
processed_data[col] = default
|
||||
else:
|
||||
processed_data[col] = None
|
||||
return processed_data
|
||||
|
||||
|
||||
# 插入或更新数据
|
||||
def insert_or_update_common(data, tbl_name, uniq_key='href'):
|
||||
try:
|
||||
processed_data = check_and_process_data(data, tbl_name)
|
||||
if processed_data is None:
|
||||
return None
|
||||
|
||||
columns = ', '.join(processed_data.keys())
|
||||
values = list(processed_data.values())
|
||||
placeholders = ', '.join(['?' for _ in values])
|
||||
update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key]) + ', updated_at=datetime(\'now\', \'localtime\')'
|
||||
|
||||
sql = f'''
|
||||
INSERT INTO {tbl_name} ({columns}, updated_at)
|
||||
VALUES ({placeholders}, datetime('now', 'localtime'))
|
||||
ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}
|
||||
'''
|
||||
cursor.execute(sql, values)
|
||||
conn.commit()
|
||||
|
||||
# 获取插入或更新后的 report_id
|
||||
cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
|
||||
report_id = cursor.fetchone()[0]
|
||||
return report_id
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"Error inserting or updating data: {e}")
|
||||
return None
|
||||
|
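A minimal usage sketch of the upsert helper; the column names follow the book dicts built in scraper.parse_book_list, and the values are placeholders:

row_id = insert_or_update_common(
    {
        'name': '示例书名',
        'href': 'https://aabook.xyz/book-5549.html',  # uniq_key, so re-running updates the same row
        'num': '5549',
        'author': 'nobody',
    },
    tbl_name_books,
)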
||||
|
||||
# 插入books表,并判断是否需要更新
|
||||
def insert_books_index(data):
|
||||
try:
|
||||
# 查询是否存在以及是否需要更新
|
||||
cursor.execute(f"SELECT id FROM books WHERE href = ? and update_time >= ?", (data['href'], data['update_time'], ))
|
||||
existing_book = cursor.fetchone()
|
||||
|
||||
if existing_book:  # the book already exists and is up to date
|
||||
return existing_book[0]
|
||||
|
||||
# 不存在,或者需要更新
|
||||
data['is_latest'] = 0
|
||||
return insert_or_update_common(data, tbl_name_books)
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"Error inserting or updating data: {e}")
|
||||
return None
|
||||
|
||||
# 更新详细信息
|
||||
def update_book_detail(data):
|
||||
try:
|
||||
data['is_latest'] = 1
|
||||
|
||||
# 排除不更新的字段,只更新data中含有的字段
|
||||
fields_to_update = [field for field in data if field not in ['id', 'href', 'created_at']]
|
||||
|
||||
# 构建更新语句
|
||||
set_clause = ', '.join([f"{field} = ?" for field in fields_to_update])
|
||||
sql = f"UPDATE {tbl_name_books} SET {set_clause}, updated_at = datetime('now', 'localtime') WHERE href = ?"
|
||||
|
||||
# 准备参数
|
||||
values = [data[field] for field in fields_to_update]
|
||||
values.append(data['href'])
|
||||
|
||||
cursor.execute(sql, values)
|
||||
conn.commit()
|
||||
|
||||
# 获取插入或更新后的 report_id
|
||||
cursor.execute(f"SELECT id FROM {tbl_name_books} WHERE href = ?", (data['href'],))
|
||||
report_id = cursor.fetchone()[0]
|
||||
return report_id
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"Error inserting or updating data: {e}")
|
||||
return None
|
||||
|
||||
# 按条件查询 href 列表
|
||||
def query_books(**filters):
|
||||
try:
|
||||
sql = f"SELECT href, name, id FROM {tbl_name_books} WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
sql += " AND id = ?"
|
||||
params.append(filters["id"])
|
||||
if "href" in filters:
|
||||
sql += " AND href = ?"
|
||||
params.append(filters["href"])
|
||||
if "name" in filters:
|
||||
sql += " AND name LIKE ?"
|
||||
params.append(f"%{filters['name']}%")
|
||||
if "is_latest" in filters:
|
||||
sql += " AND is_latest = ?"
|
||||
params.append(filters["is_latest"])
|
||||
if 'limit' in filters:
|
||||
sql += " limit ?"
|
||||
params.append(filters["limit"])
|
||||
|
||||
|
||||
cursor.execute(sql, params)
|
||||
return [{'href': row[0], 'name': row[1], 'id': row[2]} for row in cursor.fetchall()]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return None
|
||||
|
||||
# 检查表是否存在,不存在就创建
|
||||
def check_and_create_chapters_table(book_number):
|
||||
table_name = f"{tbl_name_chapters_prefix}_{book_number}"
|
||||
|
||||
try:
|
||||
create_table_query = f'''
|
||||
CREATE TABLE if not exists {table_name} (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
book_id INTEGER,
|
||||
chapter_id INTEGER,
|
||||
section_id INTEGER,
|
||||
title TEXT,
|
||||
href TEXT UNIQUE,
|
||||
content TEXT,
|
||||
has_content INTEGER default 0,
|
||||
created_at TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
updated_at TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
FOREIGN KEY(book_id) REFERENCES books(id) ON DELETE CASCADE
|
||||
);
|
||||
'''
|
||||
cursor.execute(create_table_query)
|
||||
conn.commit()
|
||||
return table_name
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"create table failed: {e}")
|
||||
return None
|
||||
|
||||
|
||||
# 插入到数据表中
|
||||
def insert_chapter_data(data):
|
||||
tbl_num = int(data['book_id']) % 100
|
||||
tbl_name = check_and_create_chapters_table(tbl_num)
|
||||
if tbl_name :
|
||||
return insert_or_update_common(data, tbl_name)
|
||||
else:
|
||||
return None
|
||||
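Chapter rows are sharded into chapters_0 … chapters_99 by book_id % 100; a hedged example of the call (the values are placeholders):

# book_id 2547 -> table chapters_47
insert_chapter_data({
    'book_id': 2547,
    'chapter_id': 374864,
    'section_id': 1,
    'title': '第一章',
    'href': 'https://aabook.xyz/read-374864.html',
    'content': '',
    'has_content': 0,
})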
|
||||
# 查询某本书最后的获取页码
|
||||
def query_last_chapter_by_book(bookid):
|
||||
tbl_num = int(bookid) % 100
|
||||
tbl_name = check_and_create_chapters_table(tbl_num)
|
||||
if tbl_name is None:
|
||||
return None
|
||||
|
||||
try:
|
||||
sql = f"SELECT href FROM {tbl_name} WHERE book_id={bookid} order by id desc limit 1"
|
||||
cursor.execute(sql)
|
||||
|
||||
row = cursor.fetchone()
|
||||
if row:  # a previously fetched chapter exists, resume from it
|
||||
return row[0]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return None
|
||||
|
||||
# 获取没有内容的章节链接
|
||||
def query_no_content_chapters(limit = 100):
|
||||
# 用于存储所有结果的列表
|
||||
all_results = []
|
||||
|
||||
# 循环遍历 0 到 100 的数字
|
||||
for i in range(100):
|
||||
table_name = f'{tbl_name_chapters_prefix}_{i}'
|
||||
try:
|
||||
# 计算还需要多少条数据
|
||||
remaining_count = limit - len(all_results)
|
||||
if remaining_count <= 0:
|
||||
break
|
||||
# 执行 SQL 查询,从每个表中获取 has_content = 0 的数据,数量不超过剩余所需数量
|
||||
query = f"SELECT href, title, book_id, chapter_id, section_id FROM {table_name} WHERE has_content = 0 LIMIT {remaining_count}"
|
||||
cursor.execute(query)
|
||||
|
||||
results = [{'href': row[0], 'title': row[1], 'book_id': row[2], 'chapter_id': row[3], 'section_id': row[4]} for row in cursor.fetchall()]
|
||||
all_results.extend(results)
|
||||
except sqlite3.Error as e:
|
||||
print(f"Error querying table {table_name}: {e}")
|
||||
|
||||
return all_results
|
||||
|
||||
# 插入书本的卷信息
|
||||
def insert_or_update_book_sections(data):
|
||||
return insert_or_update_common(data, tbl_name_section, uniq_key='bookid_section')
|
||||
|
||||
|
||||
# 统计信息
|
||||
def get_statics():
|
||||
result = {}
|
||||
try:
|
||||
# row counts for the books table
|
||||
cursor.execute(f"SELECT COUNT(*) FROM {tbl_name_books} ")
|
||||
result['all_books'] = cursor.fetchone()[0]
|
||||
|
||||
cursor.execute(f"SELECT COUNT(*) FROM {tbl_name_books} where is_latest=1")
|
||||
result['all_books_latest'] = cursor.fetchone()[0]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"query error: {e}")
|
||||
|
||||
all_chapters = 0
|
||||
all_chapters_has_contents = 0
|
||||
|
||||
# 循环遍历 0 到 100 的数字
|
||||
for i in range(100):
|
||||
table_name = f'{tbl_name_chapters_prefix}_{i}'
|
||||
try:
|
||||
cursor.execute(f"SELECT COUNT(*) FROM {table_name} ")
|
||||
all_chapters += cursor.fetchone()[0]
|
||||
|
||||
cursor.execute(f"SELECT COUNT(*) FROM {table_name} where has_content=1")
|
||||
all_chapters_has_contents += cursor.fetchone()[0]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.debug(f"Error querying table {table_name}: {e}")
|
||||
|
||||
result['all_chapters'] = all_chapters
|
||||
result['all_chapters_has_contents'] = all_chapters_has_contents
|
||||
|
||||
return result
|
||||
|
||||
|
||||
53  aabook/src/utils.py  Normal file
@@ -0,0 +1,53 @@
import requests
import re
import os
import json
import time
import csv
import logging
from datetime import datetime
import config

# Extract the timestamp from a string such as "创建时间 2025-03-08 13:57:00"
def extract_create_time(input_str):
    pattern = r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'
    match = re.search(pattern, input_str)
    if match:
        datetime_str = match.group(0)
        return datetime_str
    else:
        return input_str

# Extract the numeric id from a string such as "read-374864.html"
def extract_page_num(page_str, default_num=0):
    # regular-expression pattern
    pattern = r'read-(\d+)\.html'
    # look for a match with re.search
    match = re.search(pattern, page_str)
    if match:
        number = match.group(1)
        return number
    else:
        return default_num

# Extract the numeric id from a string such as "book-5549.html"
def extract_book_num(page_str, default_num=0):
    # regular-expression pattern
    pattern = r'book-(\d+)\.html'
    # look for a match with re.search
    match = re.search(pattern, page_str)
    if match:
        number = match.group(1)
        return number
    else:
        return default_num

# Strip the square brackets from a category label such as "[都市]"
def remove_brackets_regex(input_str):
    pattern = r'\[(.*?)\]'
    match = re.match(pattern, input_str)
    if match:
        return match.group(1)
    return input_str
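A quick sanity sketch of the extraction helpers above (the example inputs come from the URL patterns used elsewhere in this commit; note that the ids come back as strings):

import utils

assert utils.extract_page_num('read-374864.html') == '374864'
assert utils.extract_book_num('book-5549.html') == '5549'
assert utils.extract_page_num('index.html') == 0        # falls back to default_num
assert utils.remove_brackets_regex('[都市]') == '都市'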
122  aabook/utils.py  Normal file
@@ -0,0 +1,122 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from ebooklib import epub
|
||||
import re
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import csv
|
||||
import logging
|
||||
from datetime import datetime
|
||||
import config
|
||||
|
||||
|
||||
# 从"创建时间 2025-03-08 13:57:00" 中提取时间
|
||||
def extract_create_time(input_str):
|
||||
pattern = r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'
|
||||
match = re.search(pattern, input_str)
|
||||
if match:
|
||||
datetime_str = match.group(0)
|
||||
return datetime_str
|
||||
else:
|
||||
return input_str
|
||||
|
||||
# 从 "read-374864.html" 中获取数字编号
|
||||
def extract_page_num(page_str, default_num = 0):
|
||||
# 定义正则表达式模式
|
||||
pattern = r'read-(\d+)\.html'
|
||||
# 使用 re.search 查找匹配项
|
||||
match = re.search(pattern, page_str)
|
||||
if match:
|
||||
number = match.group(1)
|
||||
return number
|
||||
else:
|
||||
return default_num
|
||||
|
||||
# 从 "book-5549.html" 中获取数字编号
|
||||
def extract_book_num(page_str, default_num = 0):
|
||||
# 定义正则表达式模式
|
||||
pattern = r'book-(\d+)\.html'
|
||||
# 使用 re.search 查找匹配项
|
||||
match = re.search(pattern, page_str)
|
||||
if match:
|
||||
number = match.group(1)
|
||||
return number
|
||||
else:
|
||||
return default_num
|
||||
|
||||
# 处理 [都市] 的方括号
|
||||
def remove_brackets_regex(input_str):
|
||||
pattern = r'\[(.*?)\]'
|
||||
match = re.match(pattern, input_str)
|
||||
if match:
|
||||
return match.group(1)
|
||||
return input_str
|
||||
|
||||
# 定义函数来抓取小说章节内容
|
||||
def fetch_chapter(url):
|
||||
try:
|
||||
response = requests.get(url)
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
# 这里需要根据实际网页结构修改选择器
|
||||
chapter_content = soup.find('div', class_='chapter-content').get_text()
|
||||
return chapter_content
|
||||
except requests.RequestException as e:
|
||||
print(f"请求出错: {e}")
|
||||
return None
|
||||
|
||||
|
||||
# 定义函数来生成 EPUB 文件
|
||||
def generate_epub(title, author, chapters, path):
|
||||
book = epub.EpubBook()
|
||||
book.set_title(title)
|
||||
book.set_language('zh')
|
||||
book.add_author(author)
|
||||
|
||||
epub_chapters = []
|
||||
for chapter_title, chapter_content in chapters:
|
||||
c = epub.EpubHtml(title=chapter_title, file_name=f'{chapter_title}.xhtml', lang='zh')
|
||||
c.content = f'<h1>{chapter_title}</h1><p>{chapter_content}</p>'
|
||||
book.add_item(c)
|
||||
epub_chapters.append(c)
|
||||
|
||||
# 定义书的结构
|
||||
book.toc = tuple(epub_chapters)
|
||||
book.add_item(epub.EpubNcx())
|
||||
book.add_item(epub.EpubNav())
|
||||
|
||||
# 定义样式
|
||||
style = 'body { font-family: Times, serif; }'
|
||||
nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
|
||||
book.add_item(nav_css)
|
||||
|
||||
# 定义书的结构
|
||||
book.spine = ['nav'] + epub_chapters
|
||||
|
||||
# 保存 EPUB 文件
|
||||
epub.write_epub(f'{path}/{title}.epub', book, {})
|
||||
|
||||
|
||||
# 示例使用
|
||||
if __name__ == "__main__":
|
||||
# 这里需要替换为实际的小说章节链接
|
||||
chapter_info = [
|
||||
('第一章', 'https://example.com/chapter1'),
|
||||
('第二章', 'https://example.com/chapter2')
|
||||
]
|
||||
title = '小说标题'
|
||||
author = '小说作者'
|
||||
|
||||
chapters = []
|
||||
for chapter_title, url in chapter_info:
|
||||
content = fetch_chapter(url)
|
||||
if content:
|
||||
chapters.append((chapter_title, content))
|
||||
|
||||
if chapters:
|
||||
generate_epub(title, author, chapters, '.')  # generate_epub requires a target path; '.' writes to the current directory
|
||||
print(f'{title}.epub 文件生成成功。')
|
||||
else:
|
||||
print('未获取到有效章节内容,无法生成 EPUB 文件。')
|
||||
|
||||