diff --git a/aabook/src/convert_utils.py b/aabook/src/convert_utils.py index bca87ad..14f610f 100644 --- a/aabook/src/convert_utils.py +++ b/aabook/src/convert_utils.py @@ -1,9 +1,18 @@ from ebooklib import epub +from fpdf import FPDF +import os from reportlab.lib.pagesizes import letter from reportlab.pdfgen import canvas from reportlab.lib.styles import getSampleStyleSheet from reportlab.platypus import Paragraph, Spacer +# pdf 会报错,各种字体的问题。。。不折腾了 + +# 需要安装 +# apt-get install fonts-wqy-microhei +# apt-get install fontforge +# fontforge -lang=ff -c 'Open($1); Generate($2)' /usr/share/fonts/truetype/wqy/wqy-microhei.ttc /usr/share/fonts/truetype/wqy/wqy-microhei.ttf +font_path = "/usr/share/fonts/truetype/wqy/wqy-microhei.ttf" def generate_epub(data, save_path): # 创建 EPUB 书籍对象 @@ -21,30 +30,33 @@ def generate_epub(data, save_path): if len(sections) == 1: # 如果只有一个 section,忽略 section 的 title,按一级目录处理 - for chapter in sections[0].get('chapters', []): + for index, chapter in enumerate(sections[0].get('chapters', []), start=1): chapter_title = chapter.get('title', '未知章节') chapter_content = chapter.get('content', '') paragraphs = chapter_content.split('\n\n') html_content = ''.join([f'

{para}

' for para in paragraphs]) - chapter_obj = epub.EpubHtml(title=chapter_title, file_name=f'{chapter_title}.xhtml', lang='zh') + # 为文件名添加序号,避免冲突 + chapter_obj = epub.EpubHtml(title=chapter_title, file_name=f'{index}_{chapter_title}.xhtml', lang='zh') chapter_obj.content = f'

{chapter_title}

{html_content}' book.add_item(chapter_obj) all_chapters.append(chapter_obj) else: # 如果有多个 section,按两级目录处理 - for section in sections: + for section_index, section in enumerate(sections, start=1): section_title = section.get('title', '未知卷') - section_chapter = epub.EpubHtml(title=section_title, file_name=f'{section_title}.xhtml', lang='zh') + # 为 section 的文件名添加序号,避免冲突 + section_chapter = epub.EpubHtml(title=section_title, file_name=f'{section_index}_{section_title}.xhtml', lang='zh') section_chapter.content = f'

{section_title}

' book.add_item(section_chapter) all_chapters.append(section_chapter) - for chapter in section.get('chapters', []): + for chapter_index, chapter in enumerate(section.get('chapters', []), start=1): chapter_title = chapter.get('title', '未知章节') chapter_content = chapter.get('content', '') paragraphs = chapter_content.split('\n\n') html_content = ''.join([f'

{para}

' for para in paragraphs]) - chapter_obj = epub.EpubHtml(title=chapter_title, file_name=f'{chapter_title}.xhtml', lang='zh') + # 为 chapter 的文件名添加 section 序号和 chapter 序号,避免冲突 + chapter_obj = epub.EpubHtml(title=chapter_title, file_name=f'{section_index}_{chapter_index}_{chapter_title}.xhtml', lang='zh') chapter_obj.content = f'

{chapter_title}

def _validate_pdf_data(data):
    """Raise ValueError unless *data* has the structure generate_pdf renders."""
    if not isinstance(data, dict) or 'title' not in data or 'sections' not in data:
        raise ValueError("Invalid data format. 'title' and 'sections' are required.")

    sections = data['sections']
    if not isinstance(sections, list) or not sections:
        raise ValueError("'sections' must be a non-empty list.")

    for section in sections:
        if not isinstance(section, dict):
            raise ValueError("Each section must be a dict with a 'chapters' list.")
        chapters = section.get('chapters')
        if not isinstance(chapters, list):
            raise ValueError(
                f"Section '{section.get('title', 'Unknown')}' is missing valid 'chapters'."
            )
        for chapter in chapters:
            if not isinstance(chapter, dict) or 'title' not in chapter or 'content' not in chapter:
                raise ValueError("Each chapter must have a 'title' and 'content'.")


def generate_pdf(data, save_path):
    """Render a book dictionary to a PDF file with FPDF.

    Args:
        data: dict with a 'title' and a non-empty 'sections' list. Every
            section is a dict holding a 'chapters' list (its 'title' is only
            printed when the book has more than one section); every chapter
            is a dict with 'title' and 'content'.
        save_path: path of the PDF file to write.

    Raises:
        ValueError: if *data* does not have the structure described above.
            The whole structure is checked before any rendering starts.
        FileNotFoundError: if the TrueType font at ``font_path`` is missing.
        IOError: if FPDF fails to write the output file.
    """
    # Validate everything up front so malformed input never produces a
    # half-rendered document.
    _validate_pdf_data(data)

    # A font with CJK glyphs is required for Chinese text.
    if not os.path.exists(font_path):
        raise FileNotFoundError(
            f"Font not found at {font_path}. "
            "Please install fonts-wqy-microhei and convert the .ttc to .ttf."
        )

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    # "NotoSans" is only the alias the font is registered under.
    # FPDF looks fonts up by family + style, so the bold style has to be
    # registered as well; without it set_font(style='B') fails with an
    # undefined-font error. Only one font file is available, so "bold" text
    # is drawn with the regular glyphs.
    font_family = "NotoSans"
    pdf.add_font(font_family, "", font_path, uni=True)
    pdf.add_font(font_family, "B", font_path, uni=True)

    # Book title. Width 0 stretches the cell to the right margin, which keeps
    # the centred title inside the printable area.
    pdf.set_font(font_family, size=16)
    pdf.cell(0, 10, txt=data['title'], ln=True, align='C')
    pdf.ln(10)

    # Section headings are only shown for multi-section books.
    multiple_sections = len(data['sections']) > 1

    for section in data['sections']:
        if multiple_sections:
            pdf.set_font(font_family, style='B', size=14)
            # Same fallback title as generate_epub uses for unnamed sections.
            pdf.cell(0, 10, txt=section.get('title', '未知卷'), ln=True)
            pdf.ln(5)

        for chapter in section['chapters']:
            pdf.set_font(font_family, style='B', size=12)
            pdf.cell(0, 8, txt=chapter['title'], ln=True)
            pdf.set_font(font_family, size=10)
            # Content may be None when it comes straight from a NULL column.
            pdf.multi_cell(0, 8, txt=chapter['content'] or '')
            pdf.ln(5)

    try:
        pdf.output(save_path)
        print(f"PDF saved to {save_path}")
    except Exception as e:
        # Keep the original exception attached for debugging.
        raise IOError(f"Failed to save PDF: {e}") from e
# Check whether a table exists
def check_table_exists(table_name):
    """Return whether a table called *table_name* exists in the database.

    Returns:
        True or False for a completed lookup, or None when the lookup itself
        failed with a sqlite3 error (the error is logged).
    """
    try:
        cursor.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (table_name,),
        )
        # Any returned row means the table is present.
        return cursor.fetchone() is not None
    except sqlite3.Error as e:
        logging.error(f"查询表 {table_name} 是否存在失败: {e}")
        return None


# Fetch the complete contents of a book
def get_contents_by_book(id):
    """Load a book's metadata, sections and chapters into one dictionary.

    Args:
        id: book id, as an int or anything ``int()`` accepts.

    Returns:
        A dict with 'title', 'author', 'category', 'status' and 'sections',
        where 'sections' is a list of {'title', 'chapters'} dicts and each
        chapter is a {'title', 'content'} dict. Sections and chapters are
        ordered by their ids. Returns None when the id is not numeric, the
        chapter table or the book row is missing, or a sqlite3 error occurs.
    """
    try:
        book_id = int(id)
    except (TypeError, ValueError):
        logging.warning(f"book {id} is not a valid id.")
        return None

    # Chapters are spread over tables selected by the book id modulo 100.
    chap_tbl_name = f'{tbl_name_chapters_prefix}_{book_id % 100}'

    try:
        # check_table_exists returns None on error, which is treated the same
        # as a missing table.
        if not check_table_exists(chap_tbl_name):
            logging.warning(f"book {id} have no data.")
            return None

        # Table names cannot be bound as parameters, but every value is, so
        # the id never becomes part of the SQL text.
        cursor.execute(
            f"SELECT id, name, author, category, status FROM {tbl_name_books} WHERE id = ?",
            (book_id,),
        )
        book_row = cursor.fetchone()
        if not book_row:
            logging.warning(f"book {id} have no meta data.")
            return None

        book_data = {
            'title': book_row[1],
            'author': book_row[2],
            'category': book_row[3],
            'status': book_row[4],
            'sections': [],
        }

        # Read all sections first: the cursor is reused for the chapter
        # queries below.
        cursor.execute(
            f"SELECT id, section FROM {tbl_name_section} WHERE book_id = ? ORDER BY id ASC",
            (book_id,),
        )
        sections = [(row[0], row[1]) for row in cursor.fetchall()]

        for section_id, section_title in sections:
            cursor.execute(
                f"SELECT id, title, content FROM {chap_tbl_name} "
                "WHERE book_id = ? AND section_id = ? ORDER BY id ASC",
                (book_id, section_id),
            )
            chapters = [
                {'title': chap_row[1], 'content': chap_row[2]}
                for chap_row in cursor.fetchall()
            ]
            book_data['sections'].append({
                'title': section_title,
                'chapters': chapters,
            })

        return book_data

    except sqlite3.Error as e:
        logging.error(f"查询 book {id} 内容失败: {e}")
        return None