modify scripts

2025-08-12 11:53:40 +08:00
parent c2e94e043a
commit 265504632c
4 changed files with 121 additions and 80 deletions
--- a/aabook/src/dump_book.py
+++ b/aabook/src/dump_book.py
@ -9,7 +9,7 @@ import sqlite_utils as db_tools
 import scraper
 import utils 
 import config
-import convert_utils 
+from epub_tools import generate_epub

 config.setup_logging()

@ -41,16 +41,42 @@ map_books_2 = {
    5240    : "幸福的借种经历",
    5171    : "我的师娘和师妹",
 }
+# Book id -> title. While map_books points at this dict, its keys are the
+# ids dumped when --list is not passed on the command line.
+map_books_3 = {
+    67      : "山村野香",
+}

-map_books = map_books_2
+map_books = map_books_3
 # 使用示例
 if __name__ == "__main__":
-    #books = [2689, 3727, 4698, 5446]
-    #books = [3167, 2985, 3098]
-    #books = [2783]
-    for book, name in map_books.items():
+    # Command-line argument handling.
+    # argparse is imported here because no top-level import of it is visible
+    # in this change; a repeated import is harmless if one already exists.
+    import argparse
+    parser = argparse.ArgumentParser(description='Dump book data to epub files')
+    parser.add_argument('--list', type=str, default='', help='booids to dump, e.g. 2689,3727,4698')
+    parser.add_argument('--url', type=str, default='', help='URL to fetch book data')
+    args = parser.parse_args()
+
+    # --url: dump the single book stored under that href, then stop.
+    if args.url:
+        data = db_tools.get_contents_by_href(args.url)
+        if data:
+            title = data['title']
+            href = data.get('href', '')
+            book_file = f"{books_dir}/{title}.epub"
+            generate_epub(data, book_file)
+            print(f"dump {title} - {href} suss! file: {book_file}")
+        else:
+            print(f"Failed to fetch data from {args.url}")
+        # SystemExit instead of exit(): exit() is injected by the site module
+        # for interactive use and is not guaranteed to exist (e.g. python -S).
+        raise SystemExit(0)
+
+    # --list: comma-separated book ids. Empty pieces (e.g. from a trailing
+    # comma) are skipped instead of crashing int(). Without --list, every id
+    # in map_books is dumped.
+    if args.list:
+        book_ids = [int(bid) for bid in args.list.split(',') if bid.strip()]
+    else:
+        book_ids = list(map_books)
+
+    for book in book_ids:
        data = db_tools.get_contents_by_book(book)
        if data:
            title = data['title']
-            convert_utils.generate_epub(data, f"{books_dir}/{title}.epub")
-            print(f"dump {title} suss!")
+            href = data.get('href', '')
+            book_file = f"{books_dir}/{title}.epub"
+            generate_epub(data, book_file)
+            print(f"dump {book} - {title} - {href} suss! file: {book_file}")
--- a/aabook/src/epub_tools.py
+++ b/aabook/src/epub_tools.py
@ -0,0 +1,68 @@
+from ebooklib import epub
+import os
+
+def generate_epub(data, save_path):
+    """Build an EPUB from a book dict and write it to ``save_path``.
+
+    ``data`` is read with ``.get``: 'title' and 'author' feed the metadata,
+    and 'sections' is a list of dicts with a 'title' and a 'chapters' list;
+    each chapter dict supplies 'title' and 'content'. Missing keys fall back
+    to placeholder strings or empty values.
+
+    A book with exactly one section is flattened (the section title is
+    dropped, chapters are headed with <h1>); otherwise every section gets
+    its own title page followed by its chapters, headed with <h2>.
+    """
+    ebook = epub.EpubBook()
+    ebook.set_title(data.get('title', '未知标题'))
+    ebook.set_language('zh')
+    ebook.add_author(data.get('author', '未知作者'))
+
+    # Every page in reading order; reused for both the TOC and the spine.
+    pages = []
+
+    def add_page(title, file_name, body):
+        # Register one XHTML document with the book and remember its position.
+        page = epub.EpubHtml(title=title, file_name=file_name, lang='zh')
+        page.content = body
+        ebook.add_item(page)
+        pages.append(page)
+
+    def add_chapter(chapter, name_prefix, tag):
+        # Text separated by blank lines becomes one <p> element per paragraph.
+        title = chapter.get('title', '未知章节')
+        text = chapter.get('content', '')
+        body = ''.join(f'<p>{para}</p>' for para in text.split('\n\n'))
+        # The numeric prefix keeps file names unique when titles repeat.
+        add_page(title, f'{name_prefix}_{title}.xhtml', f'<{tag}>{title}</{tag}>{body}')
+
+    sections = data.get('sections', [])
+    if len(sections) == 1:
+        # Single section: flat chapter list.
+        for number, chapter in enumerate(sections[0].get('chapters', []), start=1):
+            add_chapter(chapter, number, 'h1')
+    else:
+        # Several sections (or none): section title page, then its chapters.
+        for sec_no, section in enumerate(sections, start=1):
+            sec_title = section.get('title', '未知卷')
+            add_page(sec_title, f'{sec_no}_{sec_title}.xhtml', f'<h1>{sec_title}</h1>')
+            for chap_no, chapter in enumerate(section.get('chapters', []), start=1):
+                add_chapter(chapter, f'{sec_no}_{chap_no}', 'h2')
+
+    # Table of contents mirrors the page order.
+    ebook.toc = tuple(pages)
+
+    # Navigation documents (NCX and nav).
+    ebook.add_item(epub.EpubNcx())
+    ebook.add_item(epub.EpubNav())
+
+    # Stylesheet item.
+    css_text = 'body { font-family: Times, serif; }'
+    sheet = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=css_text)
+    ebook.add_item(sheet)
+
+    # Reading order: the nav page first, then every content page.
+    ebook.spine = ['nav'] + pages
+
+    epub.write_epub(save_path, ebook, {})
+
--- a/aabook/src/sqlite_utils.py
+++ b/aabook/src/sqlite_utils.py
@ -397,7 +397,7 @@ def get_contents_by_book(id):
            return None
        
        # 查询是书否存在
-        cursor.execute(f"SELECT id, name, author, category, status FROM {tbl_name_books} WHERE id= {id}")
+        cursor.execute(f"SELECT id, name, author, category, status, href FROM {tbl_name_books} WHERE id= {id}")
        existing_book = cursor.fetchone()
        if not existing_book:  # 书不存在
            logging.warning(f"book {id} have no meta data.")
@ -408,6 +408,7 @@ def get_contents_by_book(id):
        book_data['author'] = existing_book[2]
        book_data['category'] = existing_book[3]
        book_data['status'] = existing_book[4]
+        book_data['href'] = existing_book[5]
        book_data['sections'] = []

        # 组装section信息
@ -438,6 +439,23 @@ def get_contents_by_book(id):
        logging.error(f"查询 href 失败: {e}")
        return None

+# Fetch the complete novel contents for the book stored under `href`.
+def get_contents_by_href(href):
+    """Return get_contents_by_book(<id>) for href; None if absent or on sqlite3.Error."""
+    try:
+        # Bind href as a query parameter: it may be caller-supplied text
+        # (e.g. a URL typed on the command line), and quoting it by hand is
+        # injectable and breaks on any href that contains a quote.
+        cursor.execute(f"SELECT id FROM {tbl_name_books} WHERE href = ?", (href,))
+        existing_book = cursor.fetchone()
+        if not existing_book:  # no book row for this href
+            logging.warning(f"book {href} have no meta data.")
+            return None
+        return get_contents_by_book(existing_book[0])
+    except sqlite3.Error as e:
+        logging.error(f"查询 href 失败: {e}")
+        return None
+

 # 统计信息
 def get_statics():
--- a/aabook/src/text_to_epub.py
+++ b/aabook/src/text_to_epub.py
@ -1,71 +0,0 @@
-import re
-from ebooklib import epub
-import os
-import config
-
-config.setup_logging()
-
-books_dir = f"{config.global_host_data_dir}/aabook/data"
-
-
-def txt_to_epub(txt_path, epub_path):
-    # 读取 TXT 文件，使用 GB18030 编码
-    with open(txt_path, 'r', encoding='gb18030') as file:
-        content = file.read()
-
-    # 按章节分割内容
-    chapter_pattern = re.compile(r'第\d+章')
-    chapter_matches = list(chapter_pattern.finditer(content))
-    chapters = []
-    for i in range(len(chapter_matches)):
-        start = chapter_matches[i].start()
-        if i < len(chapter_matches) - 1:
-            end = chapter_matches[i + 1].start()
-        else:
-            end = len(content)
-        chapter_content = content[start:end]
-        chapter_title = chapter_pattern.search(chapter_content).group()
-        chapters.append((chapter_title, chapter_content))
-
-    # 创建 EPUB 书籍
-    book = epub.EpubBook()
-    book.set_title(os.path.basename(txt_path).replace('.txt', ''))
-    book.set_language('zh')
-
-    # 添加章节到 EPUB
-    epub_chapters = []
-    for title, content in chapters:
-        chapter = epub.EpubHtml(title=title, file_name=f'{title}.xhtml', lang='zh')
-        # 处理换行符，将换行符替换为 HTML 的 <br> 标签
-        content = content.replace(title, "", 1).strip()
-        content = content.replace('\r\n', '<br>')
-        content = content.replace('\n', '<br>')
-        chapter.content = f'<h1>{title}</h1><p>{content}</p>'
-        book.add_item(chapter)
-        epub_chapters.append(chapter)
-
-    # 定义书籍结构
-    book.toc = tuple(epub_chapters)
-    book.add_item(epub.EpubNcx())
-    book.add_item(epub.EpubNav())
-
-    # 定义样式
-    style = 'body { font-family: Times, serif; }'
-    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
-    book.add_item(nav_css)
-
-    # 设置书籍的页面布局
-    book.spine = ['nav'] + epub_chapters
-
-    # 保存 EPUB 文件
-    epub.write_epub(epub_path, book, {})
-
-
-# 使用示例
-txt_file1 = f"{books_dir}/我的青年岁月（加章节）.txt"
-txt_file2 = f"{books_dir}/废都（海外版）贾平凹-加章节.txt"
-epub_file1 = f"{books_dir}/我的青年岁月（加章节）.epub"
-epub_file2 = f"{books_dir}/废都（海外版）贾平凹-加章节.epub"
-
-txt_to_epub(txt_file1, epub_file1)
-txt_to_epub(txt_file2, epub_file2)