modify scripts

This commit is contained in:
2025-08-12 11:53:40 +08:00
parent c2e94e043a
commit 265504632c
4 changed files with 121 additions and 80 deletions

View File

@ -9,7 +9,7 @@ import sqlite_utils as db_tools
import scraper import scraper
import utils import utils
import config import config
import convert_utils from epub_tools import generate_epub
config.setup_logging() config.setup_logging()
@ -41,16 +41,42 @@ map_books_2 = {
5240 : "幸福的借种经历", 5240 : "幸福的借种经历",
5171 : "我的师娘和师妹", 5171 : "我的师娘和师妹",
} }
map_books_3 = {
67 : "山村野香",
}
map_books = map_books_2 map_books = map_books_3
# 使用示例 # 使用示例
if __name__ == "__main__": if __name__ == "__main__":
#books = [2689, 3727, 4698, 5446] # 命令行参数处理
#books = [3167, 2985, 3098] parser = argparse.ArgumentParser(description='Dump book data to epub files')
#books = [2783] parser.add_argument('--list', type=str, default='', help='booids to dump, e.g. 2689,3727,4698')
for book, name in map_books.items(): parser.add_argument('--url', type=str, default='', help='URL to fetch book data')
args = parser.parse_args()
# 从URL获取书籍数据
if args.url:
data = db_tools.get_contents_by_href(args.url)
if data:
title = data['title']
href = data.get('href', '')
book_file = f"{books_dir}/{title}.epub"
generate_epub(data, book_file)
print(f"dump {title} - {href} suss! file: {book_file}")
else:
print(f"Failed to fetch data from {args.url}")
exit(0)
if args.list:
book_ids = [int(bid) for bid in args.list.split(',')]
else:
book_ids = map_books.keys()
for book in book_ids:
data = db_tools.get_contents_by_book(book) data = db_tools.get_contents_by_book(book)
if data: if data:
title = data['title'] title = data['title']
convert_utils.generate_epub(data, f"{books_dir}/{title}.epub") href = data.get('href', '')
print(f"dump {title} suss!") book_file = f"{books_dir}/{title}.epub"
generate_epub(data, book_file)
print(f"dump {book} - {title} - {href} suss! file: {book_file}")

68
aabook/src/epub_tools.py Normal file
View File

@ -0,0 +1,68 @@
from ebooklib import epub
import os
def generate_epub(data, save_path):
    """Build an EPUB from a book dictionary and write it to ``save_path``.

    Args:
        data: Mapping describing the book. Keys read here: ``title`` and
            ``author`` (placeholders are used when missing or empty) and
            ``sections``, a list of mappings each holding a ``title`` and a
            ``chapters`` list. Every chapter is a mapping with ``title`` and
            ``content``; paragraphs inside ``content`` are separated by a
            blank line.
        save_path: Destination path handed to ``epub.write_epub``.

    A book with exactly one section ignores that section's title and lists
    its chapters at the top level of the table of contents. Otherwise every
    section gets its own title page and its chapters are nested beneath it.
    """
    import html
    import re

    # Characters that cannot safely appear in a file name inside the archive
    # (path separators, reserved punctuation, whitespace).
    unsafe_in_name = re.compile(r'[\\/:*?"<>|\s]+')

    def text_or(value, fallback):
        # dict.get() only applies its default to a *missing* key; a stored
        # None or empty string still needs the placeholder.
        return str(value) if value else fallback

    book = epub.EpubBook()
    book.set_title(text_or(data.get('title'), '未知标题'))
    book.set_language('zh')
    book.add_author(text_or(data.get('author'), '未知作者'))

    def add_page(file_stem, title, heading_tag, body_text=''):
        """Create one XHTML page, register it with the book and return it."""
        paragraphs = (part.strip() for part in (body_text or '').split('\n\n'))
        # Escape the text so '<' or '&' in a chapter cannot corrupt the markup.
        body_html = ''.join(
            f'<p>{html.escape(part, quote=False)}</p>' for part in paragraphs if part
        )
        # The numeric stem keeps names unique; the title is kept for readability.
        page = epub.EpubHtml(
            title=title,
            file_name=f"{file_stem}_{unsafe_in_name.sub('_', title)}.xhtml",
            lang='zh',
        )
        heading = html.escape(title, quote=False)
        page.content = f'<{heading_tag}>{heading}</{heading_tag}>{body_html}'
        book.add_item(page)
        return page

    spine_pages = []  # every page, in reading order
    toc_entries = []  # table-of-contents structure (flat or nested)
    sections = data.get('sections') or []

    if len(sections) == 1:
        # Single section: its title is ignored and chapters form a flat TOC.
        for index, chapter in enumerate(sections[0].get('chapters') or [], start=1):
            page = add_page(
                str(index),
                text_or(chapter.get('title'), '未知章节'),
                'h1',
                chapter.get('content'),
            )
            spine_pages.append(page)
            toc_entries.append(page)
    else:
        # Several sections: a title page per section, chapters nested under it.
        for section_index, section in enumerate(sections, start=1):
            section_title = text_or(section.get('title'), '未知卷')
            section_page = add_page(str(section_index), section_title, 'h1')
            spine_pages.append(section_page)

            chapter_pages = [
                add_page(
                    f'{section_index}_{chapter_index}',
                    text_or(chapter.get('title'), '未知章节'),
                    'h2',
                    chapter.get('content'),
                )
                for chapter_index, chapter in enumerate(section.get('chapters') or [], start=1)
            ]
            spine_pages.extend(chapter_pages)

            if chapter_pages:
                # (Section, children) is how ebooklib expresses a nested TOC
                # node; the href makes the section heading itself clickable.
                toc_entries.append(
                    (epub.Section(section_title, href=section_page.file_name), tuple(chapter_pages))
                )
            else:
                toc_entries.append(section_page)

    book.toc = tuple(toc_entries)

    # Navigation documents (NCX for EPUB 2 readers, nav for EPUB 3).
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # Stylesheet item shipped with the book.
    style = 'body { font-family: Times, serif; }'
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
    book.add_item(nav_css)

    # Reading order: the navigation page first, then every content page.
    book.spine = ['nav'] + spine_pages

    epub.write_epub(save_path, book, {})

View File

@ -397,7 +397,7 @@ def get_contents_by_book(id):
return None return None
# 查询是书否存在 # 查询是书否存在
cursor.execute(f"SELECT id, name, author, category, status FROM {tbl_name_books} WHERE id= {id}") cursor.execute(f"SELECT id, name, author, category, status, href FROM {tbl_name_books} WHERE id= {id}")
existing_book = cursor.fetchone() existing_book = cursor.fetchone()
if not existing_book: # 书不存在 if not existing_book: # 书不存在
logging.warning(f"book {id} have no meta data.") logging.warning(f"book {id} have no meta data.")
@ -408,6 +408,7 @@ def get_contents_by_book(id):
book_data['author'] = existing_book[2] book_data['author'] = existing_book[2]
book_data['category'] = existing_book[3] book_data['category'] = existing_book[3]
book_data['status'] = existing_book[4] book_data['status'] = existing_book[4]
book_data['href'] = existing_book[5]
book_data['sections'] = [] book_data['sections'] = []
# 组装section信息 # 组装section信息
@ -438,6 +439,23 @@ def get_contents_by_book(id):
logging.error(f"查询 href 失败: {e}") logging.error(f"查询 href 失败: {e}")
return None return None
# 获取完整的小说内容
def get_contents_by_href(href):
    """Return the full contents of the book whose ``href`` column equals ``href``.

    The book id is looked up in the books table and the actual assembly is
    delegated to ``get_contents_by_book``.

    Args:
        href: Book URL, matched exactly against the ``href`` column.

    Returns:
        Whatever ``get_contents_by_book`` returns for the matching id, or
        ``None`` when no row matches or the lookup raises ``sqlite3.Error``.
    """
    try:
        # href comes from the caller (e.g. a command-line argument), so it is
        # bound as a parameter instead of being interpolated into the SQL.
        # Only the table name is formatted in, since identifiers cannot be bound.
        cursor.execute(f"SELECT id FROM {tbl_name_books} WHERE href = ?", (href,))
        existing_book = cursor.fetchone()
        if not existing_book:  # no book with this href
            logging.warning(f"book {href} have no meta data.")
            return None

        return get_contents_by_book(existing_book[0])

    except sqlite3.Error as e:
        logging.error(f"查询 href 失败: {e}")
        return None
# 统计信息 # 统计信息
def get_statics(): def get_statics():

View File

@ -1,71 +0,0 @@
import re
from ebooklib import epub
import os
import config
# Set up logging through the project's config module as soon as this module loads.
config.setup_logging()
# Directory that holds the source .txt files and receives the generated .epub files.
books_dir = f"{config.global_host_data_dir}/aabook/data"
def txt_to_epub(txt_path, epub_path):
    """Convert a GB18030-encoded text file into an EPUB, one page per chapter.

    A chapter starts at every occurrence of one or more digits followed by
    ``章`` and runs up to the next occurrence (or the end of the file). The
    matched marker becomes the chapter title. Text located before the first
    marker is not included in the book.

    Args:
        txt_path: Path of the source text file; its base name without the
            extension becomes the book title.
        epub_path: Destination path handed to ``epub.write_epub``.
    """
    import html

    with open(txt_path, 'r', encoding='gb18030') as file:
        content = file.read()

    # Locate every chapter marker; each chapter ends where the next one begins.
    chapter_pattern = re.compile(r'\d+章')
    markers = list(chapter_pattern.finditer(content))
    chapter_ends = [marker.start() for marker in markers[1:]] + [len(content)]

    book = epub.EpubBook()
    # splitext removes only the real extension, not a '.txt' inside the name.
    book.set_title(os.path.splitext(os.path.basename(txt_path))[0])
    book.set_language('zh')

    epub_chapters = []
    for index, (marker, end) in enumerate(zip(markers, chapter_ends), start=1):
        title = marker.group()
        # The body is everything after the marker itself.
        body = content[marker.end():end].strip()
        # Escape first so '<' or '&' in the text cannot corrupt the markup,
        # then turn line breaks into explicit <br/> elements.
        body = html.escape(body, quote=False)
        body = body.replace('\r\n', '<br/>').replace('\n', '<br/>')

        # The same marker can occur more than once (e.g. numbering that
        # restarts), so the running index keeps file names unique.
        chapter = epub.EpubHtml(title=title, file_name=f'{index}_{title}.xhtml', lang='zh')
        chapter.content = f'<h1>{title}</h1><p>{body}</p>'
        book.add_item(chapter)
        epub_chapters.append(chapter)

    # Table of contents and navigation documents.
    book.toc = tuple(epub_chapters)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # Stylesheet item shipped with the book.
    style = 'body { font-family: Times, serif; }'
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
    book.add_item(nav_css)

    # Reading order: the navigation page first, then every chapter.
    book.spine = ['nav'] + epub_chapters

    epub.write_epub(epub_path, book, {})
# Usage example. NOTE: these statements sit at module level, so both
# conversions run whenever this module is executed or imported.
txt_file1 = f"{books_dir}/我的青年岁月(加章节).txt"
txt_file2 = f"{books_dir}/废都(海外版)贾平凹-加章节.txt"
epub_file1 = f"{books_dir}/我的青年岁月(加章节).epub"
epub_file2 = f"{books_dir}/废都(海外版)贾平凹-加章节.epub"

# Convert each text file to its matching EPUB, in the original order.
for source, target in ((txt_file1, epub_file1), (txt_file2, epub_file2)):
    txt_to_epub(source, target)