From 265504632cc583033083043fa5e86930aeff29dc Mon Sep 17 00:00:00 2001 From: sophon Date: Tue, 12 Aug 2025 11:53:40 +0800 Subject: [PATCH] modify scripts --- aabook/src/dump_book.py | 42 +++++++++++++++++----- aabook/src/epub_tools.py | 68 ++++++++++++++++++++++++++++++++++++ aabook/src/sqlite_utils.py | 20 ++++++++++- aabook/src/text_to_epub.py | 71 -------------------------------------- 4 files changed, 121 insertions(+), 80 deletions(-) create mode 100644 aabook/src/epub_tools.py delete mode 100644 aabook/src/text_to_epub.py diff --git a/aabook/src/dump_book.py b/aabook/src/dump_book.py index 6c88dd5..7725963 100644 --- a/aabook/src/dump_book.py +++ b/aabook/src/dump_book.py @@ -9,7 +9,7 @@ import sqlite_utils as db_tools import scraper import utils import config -import convert_utils +from epub_tools import generate_epub config.setup_logging() @@ -41,16 +41,42 @@ map_books_2 = { 5240 : "幸福的借种经历", 5171 : "我的师娘和师妹", } +map_books_3 = { + 67 : "山村野香", +} -map_books = map_books_2 +map_books = map_books_3 # 使用示例 if __name__ == "__main__": - #books = [2689, 3727, 4698, 5446] - #books = [3167, 2985, 3098] - #books = [2783] - for book, name in map_books.items(): + # 命令行参数处理 + parser = argparse.ArgumentParser(description='Dump book data to epub files') + parser.add_argument('--list', type=str, default='', help='booids to dump, e.g. 2689,3727,4698') + parser.add_argument('--url', type=str, default='', help='URL to fetch book data') + args = parser.parse_args() + + # 从URL获取书籍数据 + if args.url: + data = db_tools.get_contents_by_href(args.url) + if data: + title = data['title'] + href = data.get('href', '') + book_file = f"{books_dir}/{title}.epub" + generate_epub(data, book_file) + print(f"dump {title} - {href} suss! file: {book_file}") + else: + print(f"Failed to fetch data from {args.url}") + exit(0) + + if args.list: + book_ids = [int(bid) for bid in args.list.split(',')] + else: + book_ids = map_books.keys() + + for book in book_ids: data = db_tools.get_contents_by_book(book) if data: title = data['title'] - convert_utils.generate_epub(data, f"{books_dir}/{title}.epub") - print(f"dump {title} suss!") \ No newline at end of file + href = data.get('href', '') + book_file = f"{books_dir}/{title}.epub" + generate_epub(data, book_file) + print(f"dump {book} - {title} - {href} suss! file: {book_file}") \ No newline at end of file diff --git a/aabook/src/epub_tools.py b/aabook/src/epub_tools.py new file mode 100644 index 0000000..bdeaef9 --- /dev/null +++ b/aabook/src/epub_tools.py @@ -0,0 +1,68 @@ +from ebooklib import epub +import os + +def generate_epub(data, save_path): + # 创建 EPUB 书籍对象 + book = epub.EpubBook() + + # 设置书籍元数据 + book.set_title(data.get('title', '未知标题')) + book.set_language('zh') + book.add_author(data.get('author', '未知作者')) + + # 存储所有章节对象 + all_chapters = [] + + sections = data.get('sections', []) + + if len(sections) == 1: + # 如果只有一个 section,忽略 section 的 title,按一级目录处理 + for index, chapter in enumerate(sections[0].get('chapters', []), start=1): + chapter_title = chapter.get('title', '未知章节') + chapter_content = chapter.get('content', '') + paragraphs = chapter_content.split('\n\n') + html_content = ''.join([f'

{para}

' for para in paragraphs]) + # 为文件名添加序号,避免冲突 + chapter_obj = epub.EpubHtml(title=chapter_title, file_name=f'{index}_{chapter_title}.xhtml', lang='zh') + chapter_obj.content = f'

{chapter_title}

{html_content}' + book.add_item(chapter_obj) + all_chapters.append(chapter_obj) + else: + # 如果有多个 section,按两级目录处理 + for section_index, section in enumerate(sections, start=1): + section_title = section.get('title', '未知卷') + # 为 section 的文件名添加序号,避免冲突 + section_chapter = epub.EpubHtml(title=section_title, file_name=f'{section_index}_{section_title}.xhtml', lang='zh') + section_chapter.content = f'

{section_title}

' + book.add_item(section_chapter) + all_chapters.append(section_chapter) + + for chapter_index, chapter in enumerate(section.get('chapters', []), start=1): + chapter_title = chapter.get('title', '未知章节') + chapter_content = chapter.get('content', '') + paragraphs = chapter_content.split('\n\n') + html_content = ''.join([f'

{para}

' for para in paragraphs]) + # 为 chapter 的文件名添加 section 序号和 chapter 序号,避免冲突 + chapter_obj = epub.EpubHtml(title=chapter_title, file_name=f'{section_index}_{chapter_index}_{chapter_title}.xhtml', lang='zh') + chapter_obj.content = f'

{chapter_title}

{html_content}' + book.add_item(chapter_obj) + all_chapters.append(chapter_obj) + + # 定义书籍的目录 + book.toc = tuple(all_chapters) + + # 定义书的结构 + book.add_item(epub.EpubNcx()) + book.add_item(epub.EpubNav()) + + # 定义样式 + style = 'body { font-family: Times, serif; }' + nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style) + book.add_item(nav_css) + + # 定义书的结构 + book.spine = ['nav'] + all_chapters + + # 保存 EPUB 文件 + epub.write_epub(save_path, book, {}) + diff --git a/aabook/src/sqlite_utils.py b/aabook/src/sqlite_utils.py index f82b96d..5f9f480 100644 --- a/aabook/src/sqlite_utils.py +++ b/aabook/src/sqlite_utils.py @@ -397,7 +397,7 @@ def get_contents_by_book(id): return None # 查询是书否存在 - cursor.execute(f"SELECT id, name, author, category, status FROM {tbl_name_books} WHERE id= {id}") + cursor.execute(f"SELECT id, name, author, category, status, href FROM {tbl_name_books} WHERE id= {id}") existing_book = cursor.fetchone() if not existing_book: # 书不存在 logging.warning(f"book {id} have no meta data.") @@ -408,6 +408,7 @@ def get_contents_by_book(id): book_data['author'] = existing_book[2] book_data['category'] = existing_book[3] book_data['status'] = existing_book[4] + book_data['href'] = existing_book[5] book_data['sections'] = [] # 组装section信息 @@ -438,6 +439,23 @@ def get_contents_by_book(id): logging.error(f"查询 href 失败: {e}") return None +# 获取完整的小说内容 +def get_contents_by_href(href): + try: + # 查询内容表是否存在 + # 查询是书否存在 + cursor.execute(f"SELECT id FROM {tbl_name_books} WHERE href= '{href}'") + existing_book = cursor.fetchone() + if not existing_book: # 书不存在 + logging.warning(f"book {id} have no meta data.") + return None + + return get_contents_by_book(existing_book[0]) + + except sqlite3.Error as e: + logging.error(f"查询 href 失败: {e}") + return None + # 统计信息 def get_statics(): diff --git a/aabook/src/text_to_epub.py b/aabook/src/text_to_epub.py deleted file mode 100644 index 65709f6..0000000 --- a/aabook/src/text_to_epub.py +++ /dev/null @@ -1,71 +0,0 @@ -import re -from ebooklib import epub -import os -import config - -config.setup_logging() - -books_dir = f"{config.global_host_data_dir}/aabook/data" - - -def txt_to_epub(txt_path, epub_path): - # 读取 TXT 文件,使用 GB18030 编码 - with open(txt_path, 'r', encoding='gb18030') as file: - content = file.read() - - # 按章节分割内容 - chapter_pattern = re.compile(r'第\d+章') - chapter_matches = list(chapter_pattern.finditer(content)) - chapters = [] - for i in range(len(chapter_matches)): - start = chapter_matches[i].start() - if i < len(chapter_matches) - 1: - end = chapter_matches[i + 1].start() - else: - end = len(content) - chapter_content = content[start:end] - chapter_title = chapter_pattern.search(chapter_content).group() - chapters.append((chapter_title, chapter_content)) - - # 创建 EPUB 书籍 - book = epub.EpubBook() - book.set_title(os.path.basename(txt_path).replace('.txt', '')) - book.set_language('zh') - - # 添加章节到 EPUB - epub_chapters = [] - for title, content in chapters: - chapter = epub.EpubHtml(title=title, file_name=f'{title}.xhtml', lang='zh') - # 处理换行符,将换行符替换为 HTML 的
标签 - content = content.replace(title, "", 1).strip() - content = content.replace('\r\n', '
') - content = content.replace('\n', '
') - chapter.content = f'

{title}

{content}

' - book.add_item(chapter) - epub_chapters.append(chapter) - - # 定义书籍结构 - book.toc = tuple(epub_chapters) - book.add_item(epub.EpubNcx()) - book.add_item(epub.EpubNav()) - - # 定义样式 - style = 'body { font-family: Times, serif; }' - nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style) - book.add_item(nav_css) - - # 设置书籍的页面布局 - book.spine = ['nav'] + epub_chapters - - # 保存 EPUB 文件 - epub.write_epub(epub_path, book, {}) - - -# 使用示例 -txt_file1 = f"{books_dir}/我的青年岁月(加章节).txt" -txt_file2 = f"{books_dir}/废都(海外版)贾平凹-加章节.txt" -epub_file1 = f"{books_dir}/我的青年岁月(加章节).epub" -epub_file2 = f"{books_dir}/废都(海外版)贾平凹-加章节.epub" - -txt_to_epub(txt_file1, epub_file1) -txt_to_epub(txt_file2, epub_file2)