modify scripts
This commit is contained in:
@ -9,7 +9,7 @@ import sqlite_utils as db_tools
|
||||
import scraper
|
||||
import utils
|
||||
import config
|
||||
import convert_utils
|
||||
from epub_tools import generate_epub
|
||||
|
||||
config.setup_logging()
|
||||
|
||||
@ -41,16 +41,42 @@ map_books_2 = {
|
||||
5240 : "幸福的借种经历",
|
||||
5171 : "我的师娘和师妹",
|
||||
}
|
||||
map_books_3 = {
|
||||
67 : "山村野香",
|
||||
}
|
||||
|
||||
map_books = map_books_2
|
||||
map_books = map_books_3
|
||||
# 使用示例
|
||||
if __name__ == "__main__":
|
||||
#books = [2689, 3727, 4698, 5446]
|
||||
#books = [3167, 2985, 3098]
|
||||
#books = [2783]
|
||||
for book, name in map_books.items():
|
||||
# 命令行参数处理
|
||||
parser = argparse.ArgumentParser(description='Dump book data to epub files')
|
||||
parser.add_argument('--list', type=str, default='', help='booids to dump, e.g. 2689,3727,4698')
|
||||
parser.add_argument('--url', type=str, default='', help='URL to fetch book data')
|
||||
args = parser.parse_args()
|
||||
|
||||
# 从URL获取书籍数据
|
||||
if args.url:
|
||||
data = db_tools.get_contents_by_href(args.url)
|
||||
if data:
|
||||
title = data['title']
|
||||
href = data.get('href', '')
|
||||
book_file = f"{books_dir}/{title}.epub"
|
||||
generate_epub(data, book_file)
|
||||
print(f"dump {title} - {href} suss! file: {book_file}")
|
||||
else:
|
||||
print(f"Failed to fetch data from {args.url}")
|
||||
exit(0)
|
||||
|
||||
if args.list:
|
||||
book_ids = [int(bid) for bid in args.list.split(',')]
|
||||
else:
|
||||
book_ids = map_books.keys()
|
||||
|
||||
for book in book_ids:
|
||||
data = db_tools.get_contents_by_book(book)
|
||||
if data:
|
||||
title = data['title']
|
||||
convert_utils.generate_epub(data, f"{books_dir}/{title}.epub")
|
||||
print(f"dump {title} suss!")
|
||||
href = data.get('href', '')
|
||||
book_file = f"{books_dir}/{title}.epub"
|
||||
generate_epub(data, book_file)
|
||||
print(f"dump {book} - {title} - {href} suss! file: {book_file}")
|
||||
68
aabook/src/epub_tools.py
Normal file
68
aabook/src/epub_tools.py
Normal file
@ -0,0 +1,68 @@
|
||||
from ebooklib import epub
|
||||
import os
|
||||
|
||||
def _safe_file_stem(text):
    """Return ``text`` with characters that are unsafe in archive paths replaced by ``_``."""
    import re

    return re.sub(r'[\\/:*?"<>|\s]+', '_', text).strip('_') or 'untitled'


def _paragraphs_to_html(text):
    """Turn plain text into escaped ``<p>`` elements, one per blank-line-separated paragraph."""
    import html

    pieces = (para.strip() for para in text.split('\n\n'))
    return ''.join(
        f'<p>{html.escape(piece, quote=False)}</p>' for piece in pieces if piece
    )


def generate_epub(data, save_path):
    """Build an EPUB book from ``data`` and write it to ``save_path``.

    Args:
        data: Mapping read with ``.get``. Keys used: ``title``, ``author``
            and ``sections``. Each section is a mapping with ``title`` and
            ``chapters``; each chapter is a mapping with ``title`` and
            ``content``. Chapter content is treated as plain text whose
            paragraphs are separated by blank lines, and is HTML-escaped.
        save_path: Destination path passed to ``epub.write_epub``.

    A book with exactly one section gets a flat table of contents (the
    section title is ignored). Otherwise every section gets its own title
    page and its chapters are nested under it in the table of contents.
    """
    import html  # local import so the block does not depend on file-level imports

    book = epub.EpubBook()

    # Book metadata; fall back to placeholders when a value is missing or empty.
    book.set_title(data.get('title') or '未知标题')
    book.set_language('zh')
    book.add_author(data.get('author') or '未知作者')

    spine_pages = []  # every page, in reading order
    toc_entries = []  # table of contents (flat or two-level)

    def add_page(title, file_name, heading_tag, body_html=''):
        # Create one XHTML page, register it with the book and the spine.
        page = epub.EpubHtml(title=title, file_name=file_name, lang='zh')
        heading = html.escape(title, quote=False)
        page.content = f'<{heading_tag}>{heading}</{heading_tag}>{body_html}'
        book.add_item(page)
        spine_pages.append(page)
        return page

    def add_chapter(chapter, prefix, heading_tag):
        # The numeric prefix keeps file names unique when titles repeat.
        title = chapter.get('title') or '未知章节'
        return add_page(
            title,
            f'{prefix}_{_safe_file_stem(title)}.xhtml',
            heading_tag,
            _paragraphs_to_html(chapter.get('content') or ''),
        )

    sections = data.get('sections') or []

    if len(sections) == 1:
        # Single section: ignore its title and list chapters at the top level.
        chapters = sections[0].get('chapters') or []
        for index, chapter in enumerate(chapters, start=1):
            toc_entries.append(add_chapter(chapter, index, 'h1'))
    else:
        # Several sections: section title page first, then its chapters.
        for section_index, section in enumerate(sections, start=1):
            section_title = section.get('title') or '未知卷'
            section_page = add_page(
                section_title,
                f'{section_index}_{_safe_file_stem(section_title)}.xhtml',
                'h1',
            )

            chapter_pages = []
            chapters = section.get('chapters') or []
            for chapter_index, chapter in enumerate(chapters, start=1):
                chapter_pages.append(
                    add_chapter(chapter, f'{section_index}_{chapter_index}', 'h2')
                )

            if chapter_pages:
                # (parent, children) produces a nested TOC entry.
                toc_entries.append((section_page, tuple(chapter_pages)))
            else:
                # Avoid emitting an empty nested list for a chapterless section.
                toc_entries.append(section_page)

    book.toc = tuple(toc_entries)

    # Navigation documents.
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # Stylesheet.
    style = 'body { font-family: Times, serif; }'
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
    book.add_item(nav_css)

    # Reading order: navigation page first, then every content page.
    book.spine = ['nav'] + spine_pages

    epub.write_epub(save_path, book, {})
|
||||
|
||||
@ -397,7 +397,7 @@ def get_contents_by_book(id):
|
||||
return None
|
||||
|
||||
# 查询是书否存在
|
||||
cursor.execute(f"SELECT id, name, author, category, status FROM {tbl_name_books} WHERE id= {id}")
|
||||
cursor.execute(f"SELECT id, name, author, category, status, href FROM {tbl_name_books} WHERE id= {id}")
|
||||
existing_book = cursor.fetchone()
|
||||
if not existing_book: # 书不存在
|
||||
logging.warning(f"book {id} have no meta data.")
|
||||
@ -408,6 +408,7 @@ def get_contents_by_book(id):
|
||||
book_data['author'] = existing_book[2]
|
||||
book_data['category'] = existing_book[3]
|
||||
book_data['status'] = existing_book[4]
|
||||
book_data['href'] = existing_book[5]
|
||||
book_data['sections'] = []
|
||||
|
||||
# 组装section信息
|
||||
@ -438,6 +439,23 @@ def get_contents_by_book(id):
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return None
|
||||
|
||||
# 获取完整的小说内容
|
||||
def get_contents_by_href(href):
    """Look up a book by its ``href`` and return its full contents.

    Args:
        href: Value matched against the ``href`` column of the books table.

    Returns:
        Whatever ``get_contents_by_book`` returns for the matching book id,
        or ``None`` when no row matches or a ``sqlite3.Error`` is raised.
    """
    try:
        # The table name cannot be a bound parameter; the user-supplied href
        # is bound with a placeholder instead of being pasted into the SQL.
        cursor.execute(
            f"SELECT id FROM {tbl_name_books} WHERE href = ?",
            (href,),
        )
        existing_book = cursor.fetchone()
        if not existing_book:  # no book with this href
            logging.warning(f"book {href} have no meta data.")
            return None

        return get_contents_by_book(existing_book[0])

    except sqlite3.Error as e:
        logging.error(f"查询 href 失败: {e}")
        return None
|
||||
|
||||
|
||||
# 统计信息
|
||||
def get_statics():
|
||||
|
||||
@ -1,71 +0,0 @@
|
||||
import re
|
||||
from ebooklib import epub
|
||||
import os
|
||||
import config
|
||||
|
||||
config.setup_logging()
|
||||
|
||||
books_dir = f"{config.global_host_data_dir}/aabook/data"
|
||||
|
||||
|
||||
def txt_to_epub(txt_path, epub_path):
    """Convert a GB18030-encoded TXT novel into an EPUB file.

    The text is split at every heading matching ``第<digits>章``; each
    chapter runs from its heading to the next heading (or the end of the
    file). Text before the first heading is not included in the book.

    Args:
        txt_path: Path of the source text file; its base name (without a
            trailing ``.txt``) becomes the book title.
        epub_path: Destination path passed to ``epub.write_epub``.
    """
    import html  # local import so the block does not depend on file-level imports

    with open(txt_path, 'r', encoding='gb18030') as file:
        content = file.read()

    # Locate the chapter headings and slice the text between them.
    chapter_pattern = re.compile(r'第\d+章')
    headings = list(chapter_pattern.finditer(content))
    boundaries = [match.start() for match in headings[1:]] + [len(content)]
    chapters = [
        (match.group(), content[match.start():end])
        for match, end in zip(headings, boundaries)
    ]

    # Only strip a trailing ".txt", never an occurrence inside the name.
    book_title = os.path.basename(txt_path)
    if book_title.endswith('.txt'):
        book_title = book_title[:-len('.txt')]

    book = epub.EpubBook()
    book.set_title(book_title)
    book.set_language('zh')

    epub_chapters = []
    for index, (title, text) in enumerate(chapters, start=1):
        # The index prefix keeps file names unique when a heading repeats.
        chapter = epub.EpubHtml(title=title, file_name=f'{index}_{title}.xhtml', lang='zh')
        # Drop the heading from the body, escape markup characters, then
        # turn line breaks into <br> tags.
        body = text.replace(title, "", 1).strip()
        body = html.escape(body, quote=False)
        body = body.replace('\r\n', '<br>').replace('\n', '<br>')
        chapter.content = f'<h1>{title}</h1><p>{body}</p>'
        book.add_item(chapter)
        epub_chapters.append(chapter)

    # Table of contents and navigation documents.
    book.toc = tuple(epub_chapters)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # Stylesheet.
    style = 'body { font-family: Times, serif; }'
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
    book.add_item(nav_css)

    # Reading order: navigation page first, then the chapters.
    book.spine = ['nav'] + epub_chapters

    epub.write_epub(epub_path, book, {})
|
||||
|
||||
|
||||
# Example usage: convert the two sample TXT books under ``books_dir`` to EPUB.
epub_file1 = f"{books_dir}/我的青年岁月(加章节).epub"
epub_file2 = f"{books_dir}/废都(海外版)贾平凹-加章节.epub"
txt_file1 = f"{books_dir}/我的青年岁月(加章节).txt"
txt_file2 = f"{books_dir}/废都(海外版)贾平凹-加章节.txt"

# Same two conversions, in the same order, expressed as a loop.
for source_path, target_path in ((txt_file1, epub_file1), (txt_file2, epub_file2)):
    txt_to_epub(source_path, target_path)
|
||||
Reference in New Issue
Block a user