modify scripts

2025-08-12 11:53:40 +08:00
parent c2e94e043a
commit 265504632c
4 changed files with 121 additions and 80 deletions
--- a/aabook/src/dump_book.py
+++ b/aabook/src/dump_book.py
@ -9,7 +9,7 @@ import sqlite_utils as db_tools
 import scraper
 import utils 
 import config
-import convert_utils 
+from epub_tools import generate_epub
 config.setup_logging()
@ -41,16 +41,42 @@ map_books_2 = {
    5240    : "幸福的借种经历",
    5171    : "我的师娘和师妹",
 }
 map_books_3 = {
    67      : "山村野香",
 }
-map_books = map_books_2
+map_books = map_books_3
 # 使用示例
 if __name__ == "__main__":
-    #books = [2689, 3727, 4698, 5446]
+    # 命令行参数处理
-    #books = [3167, 2985, 3098]
+    parser = argparse.ArgumentParser(description='Dump book data to epub files')
-    #books = [2783]
+    parser.add_argument('--list', type=str, default='', help='booids to dump, e.g. 2689,3727,4698')
-    for book, name in map_books.items():
+    parser.add_argument('--url', type=str, default='', help='URL to fetch book data')
    args = parser.parse_args()
    # 从URL获取书籍数据
    if args.url:
        data = db_tools.get_contents_by_href(args.url)
        if data:
            title = data['title']
            href = data.get('href', '')
            book_file = f"{books_dir}/{title}.epub"
            generate_epub(data, book_file)
            print(f"dump {title} - {href} suss! file: {book_file}")
        else: 
            print(f"Failed to fetch data from {args.url}")
        exit(0)
    if args.list:
        book_ids = [int(bid) for bid in args.list.split(',')]
    else:
        book_ids = map_books.keys()
    for book in book_ids:
        data = db_tools.get_contents_by_book(book)
        if data:
            title = data['title']
-            convert_utils.generate_epub(data, f"{books_dir}/{title}.epub")
+            href = data.get('href', '')
-            print(f"dump {title} suss!")
+            book_file = f"{books_dir}/{title}.epub"
            generate_epub(data, book_file)
            print(f"dump {book} - {title} - {href} suss! file: {book_file}")
--- a/aabook/src/epub_tools.py
+++ b/aabook/src/epub_tools.py
@ -0,0 +1,68 @@
 from ebooklib import epub
 import os
 def generate_epub(data, save_path):
    """Build an EPUB file from a book dictionary and write it to ``save_path``.

    Args:
        data: Mapping that may contain ``title``, ``author`` and ``sections``.
            ``sections`` is a list of mappings with ``title`` and ``chapters``;
            each chapter is a mapping with ``title`` and ``content``. Chapter
            content is treated as plain text whose paragraphs are separated by
            blank lines (``\\n\\n``).
        save_path: Destination path of the generated ``.epub`` file.

    With exactly one section, the section title is ignored and its chapters
    are emitted directly. Otherwise each section gets its own title page
    followed by its chapters. In both cases the table of contents is a flat
    list in reading order.
    """
    import html

    book = epub.EpubBook()

    # Metadata. ``or`` also covers keys that are present but hold None/''.
    book.set_title(data.get('title') or '未知标题')
    book.set_language('zh')
    book.add_author(data.get('author') or '未知作者')

    # Every document in reading order; reused for both the TOC and the spine.
    all_chapters = []
    sections = data.get('sections') or []

    if len(sections) == 1:
        # Single section: skip the section title page, chapters use <h1>.
        for index, chapter in enumerate(sections[0].get('chapters') or [], start=1):
            all_chapters.append(_add_epub_chapter(book, chapter, str(index), 'h1'))
    else:
        for section_index, section in enumerate(sections, start=1):
            section_title = section.get('title') or '未知卷'
            # The numeric prefix keeps file names unique across sections.
            section_page = epub.EpubHtml(
                title=section_title,
                file_name=f'{section_index}_{_epub_safe_name(section_title)}.xhtml',
                lang='zh',
            )
            section_page.content = f'<h1>{html.escape(str(section_title), quote=False)}</h1>'
            book.add_item(section_page)
            all_chapters.append(section_page)

            for chapter_index, chapter in enumerate(section.get('chapters') or [], start=1):
                # Section index + chapter index keep chapter file names unique.
                prefix = f'{section_index}_{chapter_index}'
                all_chapters.append(_add_epub_chapter(book, chapter, prefix, 'h2'))

    # Table of contents and the navigation documents.
    book.toc = tuple(all_chapters)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # Stylesheet item shipped inside the archive.
    style = 'body { font-family: Times, serif; }'
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
    book.add_item(nav_css)

    # Reading order: navigation page first, then every document.
    book.spine = ['nav'] + all_chapters

    epub.write_epub(save_path, book, {})


 def _epub_safe_name(title):
    """Return ``title`` made safe for use inside an archive file name.

    Path separators, whitespace and characters that are commonly illegal in
    file names are collapsed to ``_``; ``untitled`` is returned if nothing
    usable remains.
    """
    import re

    return re.sub(r'[\\/:*?"<>|\s]+', '_', str(title)).strip('_') or 'untitled'


 def _add_epub_chapter(book, chapter, prefix, heading_tag):
    """Create one chapter document, register it with ``book`` and return it.

    ``prefix`` is prepended to the file name to keep it unique, and
    ``heading_tag`` (e.g. ``h1``/``h2``) wraps the chapter title.
    """
    import html

    title = chapter.get('title') or '未知章节'
    text = chapter.get('content') or ''
    # Escape the plain text so characters such as '<' or '&' cannot break the
    # XHTML; blank paragraphs are dropped instead of emitting empty <p></p>.
    paragraphs = (part.strip() for part in text.split('\n\n'))
    body = ''.join(f'<p>{html.escape(part, quote=False)}</p>' for part in paragraphs if part)

    page = epub.EpubHtml(
        title=title,
        file_name=f'{prefix}_{_epub_safe_name(title)}.xhtml',
        lang='zh',
    )
    page.content = f'<{heading_tag}>{html.escape(str(title), quote=False)}</{heading_tag}>{body}'
    book.add_item(page)
    return page
--- a/aabook/src/sqlite_utils.py
+++ b/aabook/src/sqlite_utils.py
@ -397,7 +397,7 @@ def get_contents_by_book(id):
            return None
        # Check whether the book exists
-        cursor.execute(f"SELECT id, name, author, category, status FROM {tbl_name_books} WHERE id= {id}")
+        cursor.execute(f"SELECT id, name, author, category, status, href FROM {tbl_name_books} WHERE id= {id}")
        existing_book = cursor.fetchone()
        if not existing_book:  # 书不存在
            logging.warning(f"book {id} have no meta data.")
@ -408,6 +408,7 @@ def get_contents_by_book(id):
        book_data['author'] = existing_book[2]
        book_data['category'] = existing_book[3]
        book_data['status'] = existing_book[4]
        book_data['href'] = existing_book[5]
        book_data['sections'] = []
        # 组装section信息
@ -438,6 +439,23 @@ def get_contents_by_book(id):
        logging.error(f"查询 href 失败: {e}")
        return None
 # 获取完整的小说内容
 def get_contents_by_href(href):
    """Return the full content of the book whose ``href`` column equals ``href``.

    Looks up the book id in the books table and delegates to
    ``get_contents_by_book`` for the actual content assembly.

    Args:
        href: Value to match against the ``href`` column of the books table.

    Returns:
        Whatever ``get_contents_by_book`` returns for the matching id, or
        ``None`` when no row matches or the query raises ``sqlite3.Error``.
    """
    try:
        # Bind href as a query parameter instead of interpolating it into the
        # SQL text, so a quote inside a URL can neither break the statement
        # nor inject SQL. (A table name cannot be bound, hence the f-string.)
        cursor.execute(f"SELECT id FROM {tbl_name_books} WHERE href = ?", (href,))
        existing_book = cursor.fetchone()
        if not existing_book:  # no book with this href
            logging.warning(f"book {href} have no meta data.")
            return None
        return get_contents_by_book(existing_book[0])
    except sqlite3.Error as e:
        logging.error(f"查询 href 失败: {e}")
        return None
 # 统计信息
 def get_statics():
--- a/aabook/src/text_to_epub.py
+++ b/aabook/src/text_to_epub.py
@ -1,71 +0,0 @@
 import re
 from ebooklib import epub
 import os
 import config
 config.setup_logging()
 books_dir = f"{config.global_host_data_dir}/aabook/data"
 def txt_to_epub(txt_path, epub_path):
    """Convert a GB18030-encoded TXT novel into an EPUB file.

    The text is split at every occurrence of a ``第<digits>章`` heading; the
    span from one heading up to the next (or to the end of the text) becomes
    one chapter. Text before the first heading is not included.

    Args:
        txt_path: Path of the source ``.txt`` file.
        epub_path: Destination path of the generated ``.epub`` file.
    """
    import html

    # Read the TXT file using the GB18030 encoding.
    with open(txt_path, 'r', encoding='gb18030') as file:
        content = file.read()

    # Split the text into (heading, chapter text) pairs.
    chapter_pattern = re.compile(r'第\d+章')
    matches = list(chapter_pattern.finditer(content))
    boundaries = [match.start() for match in matches] + [len(content)]
    chapters = [
        (match.group(), content[begin:end])
        for match, begin, end in zip(matches, boundaries, boundaries[1:])
    ]

    # The book title is the file name without its extension.
    book = epub.EpubBook()
    book.set_title(os.path.splitext(os.path.basename(txt_path))[0])
    book.set_language('zh')

    epub_chapters = []
    for index, (title, body) in enumerate(chapters, start=1):
        # Prefix the file name with the chapter index: the same heading text
        # can occur more than once, which would otherwise produce colliding
        # file names inside the archive.
        chapter = epub.EpubHtml(title=title, file_name=f'{index}_{title}.xhtml', lang='zh')
        # Drop the heading from the body, escape the plain text, then turn
        # line breaks into <br> tags.
        body = body.replace(title, "", 1).strip()
        body = html.escape(body, quote=False)
        body = body.replace('\r\n', '<br>').replace('\n', '<br>')
        chapter.content = f'<h1>{title}</h1><p>{body}</p>'
        book.add_item(chapter)
        epub_chapters.append(chapter)

    # Table of contents and the navigation documents.
    book.toc = tuple(epub_chapters)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # Stylesheet item shipped inside the archive.
    style = 'body { font-family: Times, serif; }'
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
    book.add_item(nav_css)

    # Reading order: navigation page first, then every chapter.
    book.spine = ['nav'] + epub_chapters

    epub.write_epub(epub_path, book, {})
 # Usage example: convert two TXT novels located under books_dir into EPUB
 # files with the same base names.
 txt_file1 = f"{books_dir}/我的青年岁月（加章节）.txt"
 epub_file1 = f"{books_dir}/我的青年岁月（加章节）.epub"
 txt_file2 = f"{books_dir}/废都（海外版）贾平凹-加章节.txt"
 epub_file2 = f"{books_dir}/废都（海外版）贾平凹-加章节.epub"
 for _source, _target in ((txt_file1, epub_file1), (txt_file2, epub_file2)):
    txt_to_epub(_source, _target)