122 lines
3.4 KiB
Python
122 lines
3.4 KiB
Python
import requests
|
|
from bs4 import BeautifulSoup
|
|
from ebooklib import epub
|
|
import re
|
|
import os
|
|
import json
|
|
import time
|
|
import csv
|
|
import logging
|
|
from datetime import datetime
|
|
import config
|
|
|
|
|
|
# 从"创建时间 2025-03-08 13:57:00" 中提取时间
|
|
def extract_create_time(input_str):
    """Pull the first ``YYYY-MM-DD HH:MM:SS`` timestamp out of a string.

    Args:
        input_str: Text that may embed a timestamp, for example a label
            followed by ``2025-03-08 13:57:00``.

    Returns:
        The first matching timestamp substring, or ``input_str`` unchanged
        when no timestamp is found.
    """
    found = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', input_str)
    return found.group(0) if found else input_str
|
|
|
|
# 从 "read-374864.html" 中获取数字编号
|
|
def extract_page_num(page_str, default_num=0):
    """Extract the numeric id from a ``read-<digits>.html`` page name.

    Args:
        page_str: Text containing a page name such as ``read-374864.html``.
        default_num: Value handed back untouched when the pattern is absent.

    Returns:
        The captured digits as a ``str`` when the pattern is found anywhere
        in ``page_str``; otherwise ``default_num``.
    """
    hit = re.search(r'read-(\d+)\.html', page_str)
    if hit is None:
        return default_num
    return hit.group(1)
|
|
|
|
# 从 "book-5549.html" 中获取数字编号
|
|
def extract_book_num(page_str, default_num=0):
    """Extract the numeric id from a ``book-<digits>.html`` page name.

    Args:
        page_str: Text containing a page name such as ``book-5549.html``.
        default_num: Fallback returned as-is when nothing matches.

    Returns:
        The captured digits as a ``str`` if the pattern occurs in
        ``page_str``, else ``default_num``.
    """
    book_match = re.search(r'book-(\d+)\.html', page_str)
    return book_match.group(1) if book_match else default_num
|
|
|
|
# 处理 [都市] 的方括号
|
|
def remove_brackets_regex(input_str):
    """Return the text inside a leading ``[...]`` group.

    The pattern is anchored at the start of the string, so only a bracketed
    group at the very beginning is recognised; anything after the closing
    bracket is dropped.

    Args:
        input_str: Text such as ``"[tag]"``.

    Returns:
        The (non-greedy) bracket contents when ``input_str`` starts with a
        bracketed group, otherwise ``input_str`` unchanged.
    """
    leading = re.match(r'\[(.*?)\]', input_str)
    if leading is None:
        return input_str
    return leading.group(1)
|
|
|
|
# 定义函数来抓取小说章节内容
|
|
def fetch_chapter(url, timeout=10):
    """Download a chapter page and return its text content.

    Args:
        url: Address of the chapter page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The text of the ``<div class="chapter-content">`` element, or
        ``None`` when the request fails or the element is not on the page.
    """
    # Keep the try body limited to the calls that can raise RequestException.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"请求出错: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    # NOTE: the selector below must be adapted to the real page structure.
    content_node = soup.find('div', class_='chapter-content')
    if content_node is None:
        # find() returns None when the element is missing; calling
        # get_text() on it would raise an uncaught AttributeError.
        print(f"请求出错: 未找到章节内容 ({url})")
        return None
    return content_node.get_text()
|
|
|
|
|
|
# 定义函数来生成 EPUB 文件
|
|
def generate_epub(title, author, chapters, path='.'):
    """Build an EPUB file from a list of chapters and write it to disk.

    Args:
        title: Book title; also used (sanitised) as the output file name.
        author: Author name stored in the book metadata.
        chapters: Iterable of ``(chapter_title, chapter_content)`` pairs of
            plain-text strings.
        path: Directory in which ``<title>.epub`` is written. Created if it
            does not exist. Defaults to the current directory.

    Returns:
        The path of the written EPUB file.
    """
    import html  # local import: escapes plain text before embedding in XHTML

    book = epub.EpubBook()
    book.set_title(title)
    book.set_language('zh')
    book.add_author(author)

    epub_chapters = []
    for index, (chapter_title, chapter_content) in enumerate(chapters, start=1):
        # Index-based file names stay valid and unique even when chapter
        # titles contain path separators or repeat.
        c = epub.EpubHtml(
            title=chapter_title,
            file_name=f'chapter_{index:04d}.xhtml',
            lang='zh',
        )
        # Escape the scraped text so '<' or '&' cannot break the markup, and
        # emit one <p> per non-blank line so paragraph breaks survive.
        paragraphs = ''.join(
            f'<p>{html.escape(line.strip())}</p>'
            for line in chapter_content.splitlines()
            if line.strip()
        )
        c.content = f'<h1>{html.escape(chapter_title)}</h1>{paragraphs}'
        book.add_item(c)
        epub_chapters.append(c)

    # Table of contents and navigation documents.
    book.toc = tuple(epub_chapters)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # Stylesheet.
    style = 'body { font-family: Times, serif; }'
    nav_css = epub.EpubItem(
        uid="style_nav",
        file_name="style/nav.css",
        media_type="text/css",
        content=style,
    )
    book.add_item(nav_css)

    # Reading order.
    book.spine = ['nav'] + epub_chapters

    # Replace characters that are illegal in file names on common platforms.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
    if path:
        os.makedirs(path, exist_ok=True)
    output_file = os.path.join(path, f'{safe_title}.epub')
    epub.write_epub(output_file, book, {})
    return output_file
|
|
|
|
|
|
# 示例使用
|
|
if __name__ == "__main__":
    # Example usage: replace these with real chapter links.
    chapter_info = [
        ('第一章', 'https://example.com/chapter1'),
        ('第二章', 'https://example.com/chapter2'),
    ]
    title = '小说标题'
    author = '小说作者'
    # generate_epub needs an output directory; omitting it raised TypeError.
    output_dir = '.'

    # Keep only chapters whose download produced non-empty content.
    chapters = []
    for chapter_title, url in chapter_info:
        content = fetch_chapter(url)
        if content:
            chapters.append((chapter_title, content))

    if chapters:
        generate_epub(title, author, chapters, output_dir)
        print(f'{title}.epub 文件生成成功。')
    else:
        print('未获取到有效章节内容,无法生成 EPUB 文件。')
|
|
|