modify scripts
@@ -10,6 +10,7 @@ from datetime import datetime
from datetime import date
import config  # logging configuration
from down_list import novel_map
import utils


# logging
@@ -21,7 +22,8 @@ list_url_wordcount = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&ca
list_url_update = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update'
curr_novel_pages = 0

meta_dir = 'meta'
meta_dir = f'{config.global_host_data_dir}/aabook/meta'
novel_dir = f'{config.global_host_data_dir}/aabook/data'

list_file = f'{meta_dir}/list.txt'
details_file = f'{meta_dir}/details.txt'
@@ -246,7 +248,7 @@ def extract_content_url(soup, base_url, chapid):

# Check whether the content has been polluted (watermarked)
def check_content(content):
    if '2005-2024 疯情书库' in content:
    if '2005-2024 疯情书库' in content or '2005-2025 疯情书库' in content:
        return False

    return True
@@ -263,13 +265,15 @@ def get_novel_pages():
    return curr_novel_pages

# Parse the chapter contents and save them to a file
def download_novel(chapid, novel_name, dir_prefix='./aabook'):
def download_novel(chapid, novel_name, dir_prefix=novel_dir):
    chapter_url = f'{base_url}/read-{chapid}.html'

    novel_file = f'{dir_prefix}/{chapid}_{novel_name}.txt'
    if os.path.exists(novel_file):
        os.remove(novel_file)  # a file with the same name exists: delete it and download again

    # also save to other file formats
    chapters = []
    reset_novel_pages()
    while chapter_url:
        logging.info(f"Processing: [{novel_name}] [{chapid}] {chapter_url}")
@@ -314,6 +318,7 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
            f.write(chapter_title + '\n\n')

            # write each paragraph to the file
            content = ''
            with open(novel_file, 'a', encoding='utf-8') as f:
                for paragraph in paragraphs:
                    #cleaned_part = clean_watermarks(paragraph.get_text().strip())
@@ -321,7 +326,9 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
                    #f.write(cleaned_part + '\n\n')
                    cleaned_text = process_paragraph(paragraph)
                    f.write(cleaned_text + '\n\n')
                    content = content + '<p>' + cleaned_text + '</p>'  # in the epub, paragraphs are separated with HTML tags
            logging.info(f"Writing content to file. [{novel_name}] [{chapid}] [{chapter_title}]")
            chapters.append((chapter_title, content))
        else:
            logging.info(f"Fetching content error: [{novel_name}] {content_url}, retry...")
            continue
@@ -356,6 +363,8 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
            break

        time.sleep(3)
    # everything fetched, generate the epub file
    utils.generate_epub(novel_name, 'nobody', chapters, dir_prefix)


# Check whether the sub-directory exists and create it if not
@@ -400,7 +409,7 @@ def download_books(need_down_list_file = details_file, cursor_file = down_list_f
            continue  # already downloaded, skip

        # create the category directory
        down_dir = './data/' + category
        down_dir = f'{novel_dir}/{category}'
        create_directory_if_not_exists(down_dir)

        # call the download function to download the book
@@ -420,7 +429,7 @@ def download_map():
    # iterate over novel_map and download every novel
    for novel_id, novel_name in novel_map.items():
        logging.info(f"Starting download for {novel_name} (ID: {novel_id})")
        download_novel(novel_id, novel_name, './local')
        download_novel(novel_id, novel_name, novel_dir)
        logging.info(f"Completed download for {novel_id}_{novel_name}.\n")

    # fetch the update list and download from it
@@ -444,6 +453,10 @@ def main():
        print("cmd: get_list, get_detail, get_all, get_update, get_update_all, download, download_map")
        sys.exit(1)

    # make sure the directories exist
    create_directory_if_not_exists(meta_dir)
    create_directory_if_not_exists(novel_dir)

    cmd = sys.argv[1]

    if cmd == "get_list":

@@ -3,13 +3,9 @@ import os
import inspect
from datetime import datetime

# MySQL configuration
db_config = {
    'host': '172.18.0.3',
    'user': 'root',
    'password': 'mysqlpw',
    'database': 'stockdb'
}
home_dir = os.path.expanduser("~")
global_host_data_dir = f'{home_dir}/hostdir/scripts_data'
global_share_data_dir = f'{home_dir}/sharedata'

# set up the logging configuration
def setup_logging(log_filename=None):

@@ -10,7 +10,7 @@ novel_map_new = {
}
# novel id -> name mapping
novel_map = {
    364489: '诸天之乡村爱情',
    371300: '临时夫妻',
}
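For orientation, a hedged sketch of what the new config-driven paths resolve to (the home directory below is only an example; the real value comes from os.path.expanduser in config.py):

# illustration only, assuming home_dir = '/home/user'
# config.global_host_data_dir -> '/home/user/hostdir/scripts_data'
# meta_dir                    -> '/home/user/hostdir/scripts_data/aabook/meta'
# novel_dir                   -> '/home/user/hostdir/scripts_data/aabook/data'
# download_novel() and download_books() now write below novel_dir instead of ./aabook and ./data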
12  aabook/src/check_status.py  Normal file
@@ -0,0 +1,12 @@
import json
import time
import sqlite_utils as db_tools


if __name__ == "__main__":
    # command-line entry point
    result = db_tools.get_statics()
    print(result)
80  aabook/src/config.py  Normal file
@@ -0,0 +1,80 @@
import logging
import os
import inspect
import time
from datetime import datetime
from logging.handlers import RotatingFileHandler
from collections import defaultdict

home_dir = os.path.expanduser("~")
global_host_data_dir = f'{home_dir}/hostdir/scripts_data'
global_share_data_dir = f'{home_dir}/sharedata'
global_sqlite_path = f'{global_share_data_dir}/sqlite/books.db'

log_dir = '../log'
# track logging frequency
log_count = defaultdict(int)        # how many times each message has been logged
last_log_time = defaultdict(float)  # timestamp of the last write for each message

class RateLimitFilter(logging.Filter):
    """
    Rate-limiting filter:
    1. Within a 60-second window, the same message is written at most LOG_LIMIT times; extra copies are dropped.
    2. If the overall log rate exceeds 100 messages per second, raise an alert (not implemented in this filter yet).
    """
    LOG_LIMIT = 60  # at most 60 identical messages per minute

    def filter(self, record):
        global log_count, last_log_time
        message_key = record.getMessage()  # the log message text

        # current time
        now = time.time()
        elapsed = now - last_log_time[message_key]

        # limit how often the same message is written
        if elapsed < 60:  # within 60 seconds
            log_count[message_key] += 1
            if log_count[message_key] > self.LOG_LIMIT:
                print('reach limit.')
                return False  # drop the record
        else:
            log_count[message_key] = 1  # more than 60 seconds passed, restart the count

        last_log_time[message_key] = now

        return True  # allow the record to be written


def setup_logging(log_filename=None):
    if log_filename is None:
        caller_frame = inspect.stack()[1]
        caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
        current_date = datetime.now().strftime('%Y%m%d')
        log_filename = f'{log_dir}/{caller_filename}_{current_date}.log'

    max_log_size = 100 * 1024 * 1024  # 100 MB
    max_log_files = 10  # keep at most 10 rotated log files

    file_handler = RotatingFileHandler(log_filename, maxBytes=max_log_size, backupCount=max_log_files)
    file_handler.setFormatter(logging.Formatter(
        '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
    ))

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(
        '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
    ))

    # create the root logger
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logger.handlers = []  # avoid adding duplicate handlers
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    # attach the rate limiter
    rate_limit_filter = RateLimitFilter()
    file_handler.addFilter(rate_limit_filter)
    console_handler.addFilter(rate_limit_filter)
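A minimal usage sketch of this module, mirroring how fetch.py below initialises it (it assumes the caller runs from a directory where the relative '../log' path exists):

import logging
import config

config.setup_logging()              # log file defaults to ../log/<caller>_<YYYYMMDD>.log
logging.info("logger initialised")  # repeated identical messages are throttled by RateLimitFilter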
126  aabook/src/convert_utils.py  Normal file
@@ -0,0 +1,126 @@
|
||||
from ebooklib import epub
|
||||
from reportlab.lib.pagesizes import letter
|
||||
from reportlab.pdfgen import canvas
|
||||
from reportlab.lib.styles import getSampleStyleSheet
|
||||
from reportlab.platypus import Paragraph, Spacer
|
||||
|
||||
|
||||
def generate_epub(data, save_path):
|
||||
# 创建 EPUB 书籍对象
|
||||
book = epub.EpubBook()
|
||||
|
||||
# 设置书籍元数据
|
||||
book.set_title(data.get('title', '未知标题'))
|
||||
book.set_language('zh')
|
||||
book.add_author(data.get('author', '未知作者'))
|
||||
|
||||
# 存储所有章节对象
|
||||
all_chapters = []
|
||||
|
||||
sections = data.get('sections', [])
|
||||
|
||||
if len(sections) == 1:
|
||||
# 如果只有一个 section,忽略 section 的 title,按一级目录处理
|
||||
for chapter in sections[0].get('chapters', []):
|
||||
chapter_title = chapter.get('title', '未知章节')
|
||||
chapter_content = chapter.get('content', '')
|
||||
paragraphs = chapter_content.split('\n\n')
|
||||
html_content = ''.join([f'<p>{para}</p>' for para in paragraphs])
|
||||
chapter_obj = epub.EpubHtml(title=chapter_title, file_name=f'{chapter_title}.xhtml', lang='zh')
|
||||
chapter_obj.content = f'<h1>{chapter_title}</h1>{html_content}'
|
||||
book.add_item(chapter_obj)
|
||||
all_chapters.append(chapter_obj)
|
||||
else:
|
||||
# 如果有多个 section,按两级目录处理
|
||||
for section in sections:
|
||||
section_title = section.get('title', '未知卷')
|
||||
section_chapter = epub.EpubHtml(title=section_title, file_name=f'{section_title}.xhtml', lang='zh')
|
||||
section_chapter.content = f'<h1>{section_title}</h1>'
|
||||
book.add_item(section_chapter)
|
||||
all_chapters.append(section_chapter)
|
||||
|
||||
for chapter in section.get('chapters', []):
|
||||
chapter_title = chapter.get('title', '未知章节')
|
||||
chapter_content = chapter.get('content', '')
|
||||
paragraphs = chapter_content.split('\n\n')
|
||||
html_content = ''.join([f'<p>{para}</p>' for para in paragraphs])
|
||||
chapter_obj = epub.EpubHtml(title=chapter_title, file_name=f'{chapter_title}.xhtml', lang='zh')
|
||||
chapter_obj.content = f'<h2>{chapter_title}</h2>{html_content}'
|
||||
book.add_item(chapter_obj)
|
||||
all_chapters.append(chapter_obj)
|
||||
|
||||
# 定义书籍的目录
|
||||
book.toc = tuple(all_chapters)
|
||||
|
||||
# 定义书的结构
|
||||
book.add_item(epub.EpubNcx())
|
||||
book.add_item(epub.EpubNav())
|
||||
|
||||
# 定义样式
|
||||
style = 'body { font-family: Times, serif; }'
|
||||
nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
|
||||
book.add_item(nav_css)
|
||||
|
||||
# 定义书的结构
|
||||
book.spine = ['nav'] + all_chapters
|
||||
|
||||
# 保存 EPUB 文件
|
||||
epub.write_epub(save_path, book, {})
|
||||
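The data dict that generate_epub (and generate_pdf further down) consumes is not documented in the file; the shape below is inferred from the .get() calls above, and the values are placeholders:

sample = {
    'title': '示例小说',
    'author': 'nobody',
    'sections': [
        {'title': '第一卷',
         'chapters': [
             {'title': '第一章', 'content': '第一段\n\n第二段'},  # paragraphs are split on blank lines
         ]},
    ],
}
generate_epub(sample, 'sample.epub')  # a single section is flattened into a one-level table of contents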
|
||||
|
||||
def generate_pdf(data, save_path):
|
||||
# 创建 PDF 画布
|
||||
c = canvas.Canvas(save_path, pagesize=letter)
|
||||
styles = getSampleStyleSheet()
|
||||
story = []
|
||||
|
||||
# 设置标题
|
||||
title = data.get('title', '未知标题')
|
||||
story.append(Paragraph(f'<font size=20>{title}</font>', styles['Title']))
|
||||
story.append(Spacer(1, 20))
|
||||
|
||||
# 设置作者
|
||||
author = data.get('author', '未知作者')
|
||||
story.append(Paragraph(f'<font size=14>作者: {author}</font>', styles['Normal']))
|
||||
story.append(Spacer(1, 40))
|
||||
|
||||
sections = data.get('sections', [])
|
||||
|
||||
if len(sections) == 1:
|
||||
# 如果只有一个 section,忽略 section 的 title,按一级目录处理
|
||||
for chapter in sections[0].get('chapters', []):
|
||||
chapter_title = chapter.get('title', '未知章节')
|
||||
chapter_content = chapter.get('content', '')
|
||||
story.append(Paragraph(f'<font size=18>{chapter_title}</font>', styles['Heading1']))
|
||||
story.append(Spacer(1, 10))
|
||||
paragraphs = chapter_content.split('\n\n')
|
||||
for para in paragraphs:
|
||||
story.append(Paragraph(para, styles['Normal']))
|
||||
story.append(Spacer(1, 10))
|
||||
story.append(Spacer(1, 20))
|
||||
else:
|
||||
# 如果有多个 section,按两级目录处理
|
||||
for section in sections:
|
||||
section_title = section.get('title', '未知卷')
|
||||
story.append(Paragraph(f'<font size=20>{section_title}</font>', styles['Heading1']))
|
||||
story.append(Spacer(1, 15))
|
||||
for chapter in section.get('chapters', []):
|
||||
chapter_title = chapter.get('title', '未知章节')
|
||||
chapter_content = chapter.get('content', '')
|
||||
story.append(Paragraph(f'<font size=16>{chapter_title}</font>', styles['Heading2']))
|
||||
story.append(Spacer(1, 10))
|
||||
paragraphs = chapter_content.split('\n\n')
|
||||
for para in paragraphs:
|
||||
story.append(Paragraph(para, styles['Normal']))
|
||||
story.append(Spacer(1, 10))
|
||||
story.append(Spacer(1, 15))
|
||||
|
||||
# 构建 PDF
|
||||
for element in story:
|
||||
element.wrapOn(c, letter[0] - 100, letter[1] - 100)
|
||||
element.drawOn(c, 50, letter[1] - element.wrapOn(c, letter[0] - 100, letter[1] - 100)[1] - 50)
|
||||
c.showPage()
|
||||
|
||||
# 保存 PDF 文件
|
||||
c.save()
|
||||
|
||||
312  aabook/src/fetch.py  Normal file
@@ -0,0 +1,312 @@
|
||||
import json
|
||||
import time
|
||||
import csv
|
||||
import argparse
|
||||
import logging
|
||||
from functools import partial
|
||||
import config
|
||||
import sqlite_utils as db_tools
|
||||
import scraper
|
||||
import utils
|
||||
|
||||
config.setup_logging()
|
||||
|
||||
debug = False
|
||||
force = False
|
||||
|
||||
# 获取列表
|
||||
def fetch_book_list():
|
||||
url = scraper.list_url_update
|
||||
while True:
|
||||
logging.info(f'fetching book list. url: {url}')
|
||||
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="list_main", attr_type="class"))
|
||||
if soup:
|
||||
# 获取书籍列表
|
||||
list_data, next_url = scraper.parse_book_list(soup, url=url)
|
||||
for item in list_data:
|
||||
row_id = db_tools.insert_or_update_common(item, db_tools.tbl_name_books)
|
||||
if row_id:
|
||||
logging.debug(f"insert one book. row_id: {row_id}, name: {item['name']}")
|
||||
else:
|
||||
logging.warning(f"insert book error. name: {item['name']}, href: {item['href']}")
|
||||
if next_url is None:
|
||||
logging.info(f'get all pages.')
|
||||
return True
|
||||
else:
|
||||
url = next_url
|
||||
elif status_code and status_code == 404:
|
||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
|
||||
else:
|
||||
logging.warning(f'fetch page error. {url} ...')
|
||||
|
||||
|
||||
# 获取详情
|
||||
def fetch_real_content(url):
|
||||
soup, status_code = scraper.fetch_page(url, scraper.content_validator)
|
||||
if soup:
|
||||
data = scraper.parse_content_page(soup, url)
|
||||
if data:
|
||||
return data # 段落的数组
|
||||
elif status_code and status_code == 404:
|
||||
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
|
||||
else:
|
||||
logging.warning(f'fetch page error. {url} ...')
|
||||
return None
|
||||
|
||||
|
||||
# 获取内容页
|
||||
def fetch_chapter_content(url):
|
||||
chapter_data = {}
|
||||
next_url = None
|
||||
|
||||
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="h1", identifier="chapter_title", attr_type="class"))
|
||||
if soup:
|
||||
data, next_url = scraper.parse_chapter_page(soup, url)
|
||||
if data:
|
||||
chapter_data['title'] = data['title']
|
||||
contents = fetch_real_content(data['content_url'])
|
||||
if contents:
|
||||
chapter_data['contents'] = contents
|
||||
else:
|
||||
logging.warning(f"fetching real content failed. url: {data['content_url']}")
|
||||
return None, None
|
||||
else:
|
||||
logging.warning(f'fetch chapter page no data. url: {url}')
|
||||
return None, None
|
||||
else:
|
||||
logging.warning(f'fetch chapter page error. url: {url}, status_code: {status_code}')
|
||||
return None, None
|
||||
|
||||
return chapter_data, next_url
|
||||
|
||||
# 获取小说详情页,获得首页地址
|
||||
def fetch_book_detail(url):
|
||||
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="li", identifier="zuopinxinxi", attr_type="class"))
|
||||
if soup:
|
||||
detail = scraper.parse_book_detail(soup, url)
|
||||
return detail
|
||||
else:
|
||||
return None
|
||||
|
||||
# 获取某本小说的目录页
|
||||
def fetch_book_toc(url):
|
||||
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="page_main", attr_type="class"))
|
||||
if soup:
|
||||
listdata = scraper.pase_chapter_list(soup, url)
|
||||
return listdata
|
||||
else:
|
||||
return None
|
||||
|
||||
# 获取小说的目录页,并插入到数据库
|
||||
def fetch_table_of_contents():
|
||||
while True:
|
||||
update_list = db_tools.query_books(id=2547, is_latest=0, limit = 2 if debug else 100)
|
||||
if update_list is None or len(update_list) <1 :
|
||||
logging.info('no more data to fetch.')
|
||||
return
|
||||
|
||||
for row in update_list:
|
||||
name = row['name']
|
||||
href = row['href']
|
||||
bookid = row['id']
|
||||
# 先打开详情页
|
||||
logging.info(f'----------fetching book {name}: {href}-------------')
|
||||
book_detail = fetch_book_detail(href)
|
||||
if book_detail is None:
|
||||
logging.warning(f'get book detail failed. url: {href}')
|
||||
continue
|
||||
|
||||
# 获取目录页
|
||||
toc_url = book_detail['table_of_contents_href']
|
||||
if toc_url is None or toc_url == '':
|
||||
logging.warning(f'table_of_contents_href is not correct. url: {href}')
|
||||
continue
|
||||
|
||||
logging.info(f'fetching page: {toc_url}')
|
||||
toc_data = fetch_book_toc(toc_url)
|
||||
|
||||
# 解析目录页
|
||||
if toc_data is None:
|
||||
logging.warning(f'fetch_book_toc error. url: {toc_url}')
|
||||
continue
|
||||
|
||||
# 插入所有的目录数据
|
||||
succ = 1
|
||||
for row in toc_data:
|
||||
section_title = row['title']
|
||||
chapters = row['chapters']
|
||||
section_id = db_tools.insert_or_update_book_sections({
|
||||
'book_id' : int(bookid),
|
||||
'section' : section_title,
|
||||
'bookid_section': f'{bookid}_{section_title}'
|
||||
})
|
||||
if section_id is None:
|
||||
logging.warning(f'insert section error. url: {toc_url}, section: {section_title}')
|
||||
succ = 0
|
||||
break
|
||||
else:
|
||||
logging.debug(f'insert one books_sections record. id:{section_id}, key: {bookid}_{section_title}')
|
||||
|
||||
# 插入目录数据
|
||||
for chap in chapters:
|
||||
chap_row_id = db_tools.insert_chapter_data({
|
||||
'book_id': bookid,
|
||||
'chapter_id': chap['chapter_id'],
|
||||
'section_id': section_id,
|
||||
'title': chap['title'],
|
||||
'href': chap['href'],
|
||||
'content': '',
|
||||
'has_content' : 0
|
||||
})
|
||||
if chap_row_id is None:
|
||||
logging.warning(f'insert_chapter_data error. url: {toc_url}')
|
||||
succ = 0
|
||||
break
|
||||
if succ == 0 :
|
||||
logging.warning(f'fetch_book_toc data error. url: {toc_url}')
|
||||
continue
|
||||
|
||||
# 读取完毕,更新列表
|
||||
row_id = db_tools.update_book_detail({
|
||||
'href' : href,
|
||||
**book_detail
|
||||
})
|
||||
if row_id:
|
||||
logging.debug(f'update book succ. id: {row_id}, url: {href}')
|
||||
else:
|
||||
logging.warning(f'update book failed. url: {href}')
|
||||
if debug:
|
||||
return
|
||||
|
||||
# 直接获取小说内容
|
||||
def fetch_contents():
|
||||
while True:
|
||||
list_data = db_tools.query_no_content_chapters(limit = 10 if debug else 100)
|
||||
if list_data is None or len(list_data) <1 :
|
||||
logging.info('no more data to fetch.')
|
||||
return
|
||||
|
||||
for row in list_data:
|
||||
url = row['href']
|
||||
logging.info(f"fetching content ({row['title']}) from {url}")
|
||||
content, next_url = fetch_chapter_content(url)
|
||||
if content and content['title'] and content['contents']:
|
||||
# 写入到数据表里
|
||||
db_tools.insert_chapter_data({
|
||||
'book_id': row['book_id'],
|
||||
'chapter_id': row['chapter_id'],
|
||||
'section_id': row['section_id'],
|
||||
'title': row['title'],
|
||||
'href': url,
|
||||
'content': '\n\n'.join(content['contents']),
|
||||
'has_content': 1
|
||||
})
|
||||
else:
|
||||
logging.warning(f'fetch content error. url: {url}')
|
||||
if debug:
|
||||
return
|
||||
|
||||
|
||||
'''
|
||||
# 下载完整的小说
|
||||
def fetch_book_data():
|
||||
update_list = db_tools.query_books(need_update=1, limit = 1)
|
||||
if update_list:
|
||||
for row in update_list:
|
||||
name = row['name']
|
||||
href = row['href']
|
||||
bookid = row['id']
|
||||
# 先打开详情页
|
||||
logging.info(f'----------fetching book {name}: {href}-------------')
|
||||
book_detail = fetch_book_detail(href)
|
||||
if book_detail:
|
||||
# 获取内容页,然后循环读取内容
|
||||
chapter_url = book_detail['start_page_href']
|
||||
chapter_id = utils.extract_page_num(chapter_url)
|
||||
# 断点续传,从上次拉取的最后一页开始
|
||||
if not force:
|
||||
last_chapter_url = db_tools.query_last_chapter_by_book(bookid)
|
||||
if last_chapter_url:
|
||||
chapter_url = last_chapter_url
|
||||
while chapter_url:
|
||||
logging.info(f'fetching page: {chapter_url}')
|
||||
content, next_url = fetch_chapter_content(chapter_url)
|
||||
if content and content['title'] and content['contents']:
|
||||
# 写入到数据表里
|
||||
db_tools.insert_chapter_data({
|
||||
'book_id': bookid,
|
||||
'chapter_id': chapter_id,
|
||||
'title': content['title'],
|
||||
'href': chapter_url,
|
||||
'content': '\n\n'.join(content['contents']),
|
||||
'has_content': 1
|
||||
})
|
||||
|
||||
if debug:
|
||||
return
|
||||
else:
|
||||
logging.warning(f'fetch content error. url: {chapter_url}')
|
||||
chapter_url = next_url
|
||||
# 读取完毕,更新列表
|
||||
row_id = db_tools.update_book_detail({
|
||||
'href' : href,
|
||||
**book_detail
|
||||
})
|
||||
if row_id:
|
||||
logging.debug(f'update book succ. id: {row_id}, url: {href}')
|
||||
else:
|
||||
logging.warning(f'update book failed. url: {href}')
|
||||
else:
|
||||
logging.warning(f'get book detail failed. url: {href}')
|
||||
else:
|
||||
logging.warning(f'get no data needed update.')
|
||||
'''
|
||||
|
||||
# 建立缩写到函数的映射
|
||||
function_map = {
|
||||
"list": fetch_book_list,
|
||||
"toc" : fetch_table_of_contents,
|
||||
"content": fetch_contents,
|
||||
}
|
||||
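The argparse block at the bottom of the file maps these shortcuts onto --cmd; illustrative invocations (the script path comes from the file header above, the options from the parser below):

# fetch the book list and the tables of contents, limiting record counts
#   python fetch.py --cmd list,toc --debug
# run every step in function_map and force a full re-fetch
#   python fetch.py --force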
|
||||
# 主函数
|
||||
def main(cmd, args_debug, args_force):
|
||||
global debug
|
||||
debug = args_debug
|
||||
|
||||
global force
|
||||
force = args_force
|
||||
|
||||
# 执行指定的函数
|
||||
if cmd:
|
||||
function_names = cmd.split(",")  # split the comma-separated shortcuts
|
||||
for short_name in function_names:
|
||||
func = function_map.get(short_name.strip()) # 从映射中获取对应的函数
|
||||
if callable(func):
|
||||
func()
|
||||
else:
|
||||
logging.warning(f" {short_name} is not a valid function shortcut.")
|
||||
else: # 全量执行
|
||||
for name, func in function_map.items():
|
||||
if callable(func):
|
||||
func()
|
||||
else:
|
||||
logging.warning(f" {name} is not a valid function shortcut.")
|
||||
|
||||
logging.info(f'all process completed!')
|
||||
|
||||
# TODO:
|
||||
# 1,
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 命令行参数处理
|
||||
keys_str = ",".join(function_map.keys())
|
||||
|
||||
parser = argparse.ArgumentParser(description='fetch aabook data.')
|
||||
parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
|
||||
parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
|
||||
parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
|
||||
args = parser.parse_args()
|
||||
|
||||
main(args.cmd, args.debug, args.force)
|
||||
364  aabook/src/scraper.py  Normal file
@@ -0,0 +1,364 @@
|
||||
import time
|
||||
import json
|
||||
import csv
|
||||
import logging
|
||||
import signal
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import requests
|
||||
import random
|
||||
from bs4 import BeautifulSoup
|
||||
from requests.exceptions import RequestException
|
||||
from functools import partial
|
||||
import config
|
||||
import utils
|
||||
|
||||
# 定义基础 URL 和可变参数
|
||||
host_url = 'https://aabook.xyz'
|
||||
list_url_update = f'{host_url}/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update'
|
||||
#list_url_wordcount = 'https://aabook.xyz/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
|
||||
|
||||
# User-Agent 列表
|
||||
user_agents = [
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67",
|
||||
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0",
|
||||
"Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36"
|
||||
]
|
||||
|
||||
# Fetch a page with requests, run a custom page validator, and support different parsers and optional preprocessing
|
||||
def fetch_page(url, validator, parser="html.parser", preprocessor=None, max_retries=3, sleep_time=5, default_timeout=10):
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
if 'aabook.xyz' not in url.lower():
|
||||
logging.error(f'wrong url format: {url}')
|
||||
return None, None
|
||||
|
||||
# 随机选择一个 User-Agent
|
||||
headers = {
|
||||
'User-Agent': random.choice(user_agents)
|
||||
}
|
||||
response = requests.get(url, headers=headers, timeout=default_timeout, stream=True)
|
||||
|
||||
# 处理 HTTP 状态码
|
||||
if response.status_code == 404:
|
||||
logging.warning(f"Page not found (404): {url}")
|
||||
return None, 404 # 直接返回 404,调用方可以跳过
|
||||
|
||||
response.raise_for_status() # 处理 HTTP 错误
|
||||
|
||||
# 预处理 HTML(如果提供了 preprocessor)
|
||||
html_text = preprocessor(response.text) if preprocessor else response.text
|
||||
|
||||
soup = BeautifulSoup(html_text, parser)
|
||||
if validator(soup): # 进行自定义页面检查
|
||||
return soup, response.status_code
|
||||
|
||||
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
|
||||
except requests.RequestException as e:
|
||||
logging.info(f"Warn fetching page {url}: {e}. Retrying ...")
|
||||
time.sleep(sleep_time) # 休眠指定的时间,然后重试
|
||||
|
||||
logging.error(f'Fetching failed after max retries. {url}')
|
||||
return None, None # 达到最大重试次数仍然失败
|
||||
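A hedged usage sketch of fetch_page, mirroring the calls in fetch.py and the test helpers at the bottom of this file:

from functools import partial

# fetch the first list page and accept it only if the 'list_main' div is present
soup, status = fetch_page(
    list_url_update,
    partial(generic_validator, tag="div", identifier="list_main", attr_type="class"),
)
if soup:
    books, next_url = parse_book_list(soup, url=list_url_update)
elif status == 404:
    pass  # the caller is expected to skip missing pages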
|
||||
|
||||
# 解析列表页
|
||||
def parse_book_list(soup, url):
|
||||
# 查找书籍列表
|
||||
list_main = soup.find('div', class_='list_main')
|
||||
if not list_main:
|
||||
logging.warning(f"No list_main Found in {url}")
|
||||
return None, None
|
||||
|
||||
tbody = list_main.find('tbody')
|
||||
if not tbody:
|
||||
logging.warning(f"No tbody found in {url}")
|
||||
return None, None
|
||||
|
||||
list_data = []
|
||||
next_url = None
|
||||
# 获取每本书的基础信息:排名、分类、书名、作者、月票、更新时间(按字数排序时是总字数,按日期排序时是最后更新日期)
|
||||
for tr in tbody.find_all('tr'):
|
||||
tds = tr.find_all('td')
|
||||
if len(tds) < 6:
|
||||
logging.info("Invalid tr format.")
continue  # skip rows that do not have the expected columns
|
||||
ranking = tds[0].text.strip()
|
||||
category = utils.remove_brackets_regex(tds[1].text.strip())
|
||||
book_link_tag = tds[2].find('a')
|
||||
book_name = book_link_tag.text.strip()
|
||||
book_link = host_url + '/' + book_link_tag['href'] if book_link_tag.get('href') else ''
|
||||
book_num = utils.extract_book_num(book_link_tag['href'])
|
||||
author = tds[3].text.strip()
|
||||
monthly_tickets = tds[4].text.strip()
|
||||
update_time = tds[5].text.strip() #实际上是字数(按字数排序时是总字数,按日期排序时是最后更新日期)
|
||||
|
||||
list_data.append({
|
||||
'rank': ranking,
|
||||
'category': category,
|
||||
'name': book_name,
|
||||
'href': book_link,
|
||||
'num': book_num,
|
||||
'author': author,
|
||||
'tickets': monthly_tickets,
|
||||
'update_time': update_time
|
||||
})
|
||||
|
||||
# 查找下一页链接
|
||||
next_page_tag = soup.find('a', title='下一页')
|
||||
if next_page_tag:
|
||||
next_url = host_url + next_page_tag['href']
|
||||
|
||||
return list_data, next_url
|
||||
|
||||
# 解析详情页
|
||||
def parse_book_detail(soup, url):
|
||||
# 解析书籍详细信息
|
||||
book_info_tag = soup.find('li', class_='zuopinxinxi')
|
||||
if not book_info_tag:
|
||||
logging.warning(f"No details found in {url}")
|
||||
return None
|
||||
|
||||
table_of_contents_href = ''
|
||||
table_of_contents_href_tag = soup.find('li', class_='xiaoshuomulu')
|
||||
if table_of_contents_href_tag:
|
||||
table_of_contents_href = host_url + table_of_contents_href_tag.find('a')['href']
|
||||
|
||||
book_info_lis = book_info_tag.find_all('li')
|
||||
if len(book_info_lis) < 4:
|
||||
logging.info(f"invalid book info in {url}")
|
||||
return None
|
||||
|
||||
book_category = book_info_lis[0].find('span').text.strip()
|
||||
book_status = book_info_lis[1].find('span').text.strip()
|
||||
# 去掉后面的汉字,只要数字
|
||||
total_word_count = book_info_lis[2].find('span').text.strip()
|
||||
total_word_count = int(re.search(r'\d+', total_word_count).group())
|
||||
|
||||
total_clicks = book_info_lis[3].find('span').text.strip()
|
||||
month_clicks = book_info_lis[4].find('span').text.strip() if len(book_info_lis) >4 else '0'
|
||||
week_clicks = book_info_lis[5].find('span').text.strip() if len(book_info_lis) >5 else '0'
|
||||
total_recommend = book_info_lis[6].find('span').text.strip() if len(book_info_lis) >6 else '0'
|
||||
month_recommend = book_info_lis[7].find('span').text.strip() if len(book_info_lis) >7 else '0'
|
||||
week_recommend = book_info_lis[8].find('span').text.strip() if len(book_info_lis) >8 else '0'
|
||||
|
||||
# 读取创建时间
|
||||
creation_time_tag = soup.find('li', class_='update_time')
|
||||
created_time = utils.extract_create_time(creation_time_tag.text.strip() if creation_time_tag else '')
|
||||
|
||||
# 获取起始页链接和编号
|
||||
start_page_tag = soup.find('ul', class_='gezhonganniu').find_all('li')[0].find('a')
|
||||
start_page_link = host_url + '/' + start_page_tag['href']
|
||||
start_page_number = start_page_link.split('-')[-1].replace('.html', '')
|
||||
|
||||
return {
|
||||
'category': book_category,
|
||||
'status' : book_status,
|
||||
'total_words' : total_word_count,
|
||||
'total_clicks': total_clicks,
|
||||
'month_clicks': month_clicks,
|
||||
'week_clicks': week_clicks,
|
||||
'total_recommend': total_recommend,
|
||||
'month_recommend': month_recommend,
|
||||
'week_recommend': week_recommend,
|
||||
'created_time': created_time,
|
||||
'start_page_href': start_page_link,
|
||||
'start_page_num': start_page_number,
|
||||
'table_of_contents_href': table_of_contents_href
|
||||
}
|
||||
|
||||
# 解析书籍的目录页
|
||||
def pase_chapter_list(soup, url):
|
||||
# 获取小说的目录
|
||||
table_of_contents = []
|
||||
div_table_of_contents = soup.find('div', class_='page_main')
|
||||
if not div_table_of_contents:
|
||||
return None
|
||||
|
||||
section_titles = div_table_of_contents.find_all('p', class_='section_title')
|
||||
sections = div_table_of_contents.find_all('ul', class_='section_list')
|
||||
if len(sections) > len(section_titles): # 一般是 后者比前者多1个,最后一个是广告
|
||||
logging.warning(f'sections not matched titles, url: {url}, titles: {len(section_titles)}, sections: {len(sections)}')
|
||||
return None
|
||||
else:
|
||||
for i in range(len(sections)):
|
||||
section_title = section_titles[i].get_text().strip()
|
||||
chap_list = sections[i].find_all("a")
|
||||
chap_data = []
|
||||
for chap in chap_list:
|
||||
chap_title = chap.get_text().strip()
|
||||
chap_link = f"{host_url}/{chap['href']}"
|
||||
chap_id = utils.extract_page_num(chap_link)
|
||||
chap_data.append({'href': chap_link, 'title': chap_title, 'chapter_id': chap_id})
|
||||
table_of_contents.append({'title': section_title, 'chapters': chap_data})
|
||||
|
||||
return table_of_contents
|
||||
|
||||
# 解析书籍的章节页
|
||||
def parse_chapter_page(soup, url):
|
||||
# 获取章节标题
|
||||
chapter_title_tag = soup.find('h1', class_='chapter_title')
|
||||
if chapter_title_tag is None:
|
||||
logging.warning(f'Chapter title not found in {url}')
|
||||
return None, None
|
||||
|
||||
title = chapter_title_tag.get_text().strip()
|
||||
content_url = None
|
||||
next_url = None
|
||||
chapid = utils.extract_page_num(url)
|
||||
|
||||
# 遍历每一个 <script> 标签,查找内容页的链接
|
||||
script_tags = soup.find_all('script')
|
||||
for script_tag in script_tags:
|
||||
script_content = script_tag.string
|
||||
if script_content and re.search(r'\.get\("./_getcontent\.php', script_content):
|
||||
# 匹配到特定内容,提取出 _getcontent.php 的 URL 模板
|
||||
match = re.search(r'\.get\("\./_getcontent\.php\?id="\+\s*chapid\s*\+\s*"&v=([^"]+)"', script_content)
|
||||
if match:
|
||||
# 从匹配中提取 v 参数值
|
||||
v_value = match.group(1)
|
||||
# 构建完整的 content_url
|
||||
content_url = f"{host_url}/_getcontent.php?id={chapid}&v={v_value}"
|
||||
break
|
||||
if content_url is None:
|
||||
logging.warning(f'Content url not found in {url}')
|
||||
return None, None
|
||||
|
||||
# 获取小说的目录
|
||||
table_of_contents = []
|
||||
div_table_of_contents = soup.find('div', class_='mulu_con')
|
||||
if div_table_of_contents or False: # 考虑要不要加上这个
|
||||
section_titles = div_table_of_contents.find_all('p')
|
||||
sections = div_table_of_contents.find_all('ul')
|
||||
if len(sections) != len(section_titles):
|
||||
logging.warning(f'sections not matched titles')
|
||||
else:
|
||||
for i in range(len(sections)):
|
||||
section_title = section_titles[i].get_text().strip()
|
||||
chap_list = sections[i].find_all("a")
|
||||
chap_data = []
|
||||
for chap in chap_list:
|
||||
chap_title = chap.get_text().strip()
|
||||
chap_link = chap['href']
|
||||
chap_data.append({'href': chap_link, 'title': chap_title})
|
||||
table_of_contents.append({'title': section_title, 'chapters': chap_data})
|
||||
|
||||
# 查找下一章的链接
|
||||
next_div = soup.find('div', class_='next_arrow')
|
||||
if next_div:
|
||||
next_page_tag = next_div.find('a', href=True, title=re.compile(r'下一章'))
|
||||
if next_page_tag:
|
||||
next_url = f"{host_url}/{next_page_tag['href']}" if next_page_tag['href'] else ''
|
||||
|
||||
data = {'title': title, 'content_url': content_url, 'table_of_contents': table_of_contents}
|
||||
return data, next_url
|
||||
|
||||
|
||||
def process_paragraph(paragraph):
|
||||
# 获取完整的 HTML 结构,而不是 get_text()
|
||||
paragraph_html = str(paragraph)
|
||||
|
||||
# 移除水印标签
|
||||
cleaned_html = re.sub(r'<[^>]+class="[^"]+">.*?</[^>]+>', '', paragraph_html, flags=re.DOTALL)
|
||||
|
||||
# 使用 BeautifulSoup 解析移除水印标签后的 HTML 并提取文本
|
||||
soup = BeautifulSoup(cleaned_html, 'html.parser')
|
||||
cleaned_text = soup.get_text().strip()
|
||||
|
||||
return cleaned_text
|
||||
|
||||
# 解析内容页
|
||||
def parse_content_page(soup, url):
|
||||
content = []
|
||||
paragraphs = soup.find_all('p')
|
||||
if paragraphs:
|
||||
for paragraph in paragraphs:
|
||||
cleaned_text = process_paragraph(paragraph)
|
||||
content.append(cleaned_text)
|
||||
|
||||
return content
|
||||
|
||||
# 通用的 HTML 结构验证器
|
||||
def generic_validator(soup, tag, identifier, attr_type="id"):
|
||||
if attr_type == "id":
|
||||
return soup.find(tag, id=identifier) is not None
|
||||
elif attr_type == "class":
|
||||
return bool(soup.find_all(tag, class_=identifier))
|
||||
elif attr_type == "name":
|
||||
return bool(soup.find('select', {'name': identifier}))
|
||||
return False
|
||||
|
||||
# 对内容是否被污染的判断
|
||||
def content_validator(soup):
|
||||
text = str(soup)
|
||||
dirty_words = ['2005-2024 疯情书库', '2005-2025 疯情书库', '2025 疯情书库', '2026 疯情书库', '2027 疯情书库']
|
||||
for word in dirty_words:
|
||||
if word in text:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def test_content_page(url):
|
||||
soup, status_code = fetch_page(url, content_validator)
|
||||
if soup:
|
||||
data = parse_content_page(soup, url)
|
||||
if data:
|
||||
return data
|
||||
else :
|
||||
return []
|
||||
|
||||
|
||||
|
||||
def test_chapter_page(url):
|
||||
soup, status_code = fetch_page(url, partial(generic_validator, tag="h1", identifier="chapter_title", attr_type="class"))
|
||||
if soup:
|
||||
data, next_url = parse_chapter_page(soup, url)
|
||||
if data:
|
||||
return data
|
||||
else :
|
||||
return None
|
||||
|
||||
def test_book_detail(url):
|
||||
soup, status_code = fetch_page(url, partial(generic_validator, tag="li", identifier="zuopinxinxi", attr_type="class"))
|
||||
if soup:
|
||||
detail = parse_book_detail(soup, url)
|
||||
return detail
|
||||
|
||||
|
||||
def test_book_list():
|
||||
for num in range(5):
|
||||
url = list_url_update.format(num)
|
||||
soup, status_code = fetch_page(url, partial(generic_validator, tag="div", identifier="list_main", attr_type="class"))
|
||||
if soup:
|
||||
# 获取书籍列表
|
||||
list_data, next_url = parse_book_list(soup, url=url)
|
||||
for item in list_data:
|
||||
# 获取详情页
|
||||
detail = test_book_detail(item['href'])
|
||||
if detail:
|
||||
print({
|
||||
**item,
|
||||
**detail
|
||||
})
|
||||
|
||||
# 获取内容页
|
||||
page_data = test_chapter_page(detail['start_page_href'])
|
||||
if page_data:
|
||||
print(page_data)
|
||||
# 获取内容
|
||||
contents = test_content_page(page_data['content_url'])
|
||||
if contents and len(contents)>0:
|
||||
print (contents[0])
|
||||
|
||||
else:
|
||||
print('get detail error.')
|
||||
return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_book_list()
|
||||
|
||||
|
||||
278  aabook/src/sqlite_utils.py  Normal file
@@ -0,0 +1,278 @@
|
||||
import sqlite3
|
||||
import json
|
||||
import config
|
||||
import utils
|
||||
import logging
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
# 连接 SQLite 数据库
|
||||
DB_PATH = config.global_sqlite_path # 替换为你的数据库文件
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
tbl_name_books = 'books'
|
||||
tbl_name_chapters_prefix = 'chapters'
|
||||
tbl_name_section = 'books_sections'
|
||||
|
||||
# 获取表的列名和默认值
|
||||
def get_table_columns_and_defaults(tbl_name):
|
||||
try:
|
||||
cursor.execute(f"PRAGMA table_info({tbl_name})")
|
||||
columns = cursor.fetchall()
|
||||
column_info = {}
|
||||
for col in columns:
|
||||
col_name = col[1]
|
||||
default_value = col[4]
|
||||
column_info[col_name] = default_value
|
||||
return column_info
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"Error getting table columns: {e}")
|
||||
return None
|
||||
|
||||
# 检查并处理数据
|
||||
def check_and_process_data(data, tbl_name):
|
||||
column_info = get_table_columns_and_defaults(tbl_name=tbl_name)
|
||||
if column_info is None:
|
||||
return None
|
||||
processed_data = {}
|
||||
for col, default in column_info.items():
|
||||
if col == 'id': # 自增主键,不需要用户提供
|
||||
continue
|
||||
if col == 'created_at' or col == 'updated_at': # 日期函数,用户自己指定即可
|
||||
continue
|
||||
elif col in data:
|
||||
processed_data[col] = data[col]
|
||||
else:
|
||||
if default is not None:
|
||||
processed_data[col] = default
|
||||
else:
|
||||
processed_data[col] = None
|
||||
return processed_data
|
||||
|
||||
|
||||
# 插入或更新数据
|
||||
def insert_or_update_common(data, tbl_name, uniq_key='href'):
|
||||
try:
|
||||
processed_data = check_and_process_data(data, tbl_name)
|
||||
if processed_data is None:
|
||||
return None
|
||||
|
||||
columns = ', '.join(processed_data.keys())
|
||||
values = list(processed_data.values())
|
||||
placeholders = ', '.join(['?' for _ in values])
|
||||
update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key]) + ', updated_at=datetime(\'now\', \'localtime\')'
|
||||
|
||||
sql = f'''
|
||||
INSERT INTO {tbl_name} ({columns}, updated_at)
|
||||
VALUES ({placeholders}, datetime('now', 'localtime'))
|
||||
ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}
|
||||
'''
|
||||
cursor.execute(sql, values)
|
||||
conn.commit()
|
||||
|
||||
# 获取插入或更新后的 report_id
|
||||
cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
|
||||
report_id = cursor.fetchone()[0]
|
||||
return report_id
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"Error inserting or updating data: {e}")
|
||||
return None
|
||||
|
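A minimal usage sketch of the upsert helper; the column names follow the book dicts built in scraper.parse_book_list, and the values are placeholders:

row_id = insert_or_update_common(
    {
        'name': '示例书名',
        'href': 'https://aabook.xyz/book-5549.html',  # uniq_key, so re-running updates the same row
        'num': '5549',
        'author': 'nobody',
    },
    tbl_name_books,
)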
||||
|
||||
# 插入books表,并判断是否需要更新
|
||||
def insert_books_index(data):
|
||||
try:
|
||||
# 查询是否存在以及是否需要更新
|
||||
cursor.execute(f"SELECT id FROM books WHERE href = ? and update_time >= ?", (data['href'], data['update_time'], ))
|
||||
existing_book = cursor.fetchone()
|
||||
|
||||
if existing_book:  # the book already exists and is up to date
|
||||
return existing_book[0]
|
||||
|
||||
# 不存在,或者需要更新
|
||||
data['is_latest'] = 0
|
||||
return insert_or_update_common(data, tbl_name_books)
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"Error inserting or updating data: {e}")
|
||||
return None
|
||||
|
||||
# 更新详细信息
|
||||
def update_book_detail(data):
|
||||
try:
|
||||
data['is_latest'] = 1
|
||||
|
||||
# 排除不更新的字段,只更新data中含有的字段
|
||||
fields_to_update = [field for field in data if field not in ['id', 'href', 'created_at']]
|
||||
|
||||
# 构建更新语句
|
||||
set_clause = ', '.join([f"{field} = ?" for field in fields_to_update])
|
||||
sql = f"UPDATE {tbl_name_books} SET {set_clause}, updated_at = datetime('now', 'localtime') WHERE href = ?"
|
||||
|
||||
# 准备参数
|
||||
values = [data[field] for field in fields_to_update]
|
||||
values.append(data['href'])
|
||||
|
||||
cursor.execute(sql, values)
|
||||
conn.commit()
|
||||
|
||||
# 获取插入或更新后的 report_id
|
||||
cursor.execute(f"SELECT id FROM {tbl_name_books} WHERE href = ?", (data['href'],))
|
||||
report_id = cursor.fetchone()[0]
|
||||
return report_id
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"Error inserting or updating data: {e}")
|
||||
return None
|
||||
|
||||
# 按条件查询 href 列表
|
||||
def query_books(**filters):
|
||||
try:
|
||||
sql = f"SELECT href, name, id FROM {tbl_name_books} WHERE 1=1"
|
||||
params = []
|
||||
|
||||
if "id" in filters:
|
||||
sql += " AND id = ?"
|
||||
params.append(filters["id"])
|
||||
if "href" in filters:
|
||||
sql += " AND href = ?"
|
||||
params.append(filters["href"])
|
||||
if "name" in filters:
|
||||
sql += " AND name LIKE ?"
|
||||
params.append(f"%{filters['name']}%")
|
||||
if "is_latest" in filters:
|
||||
sql += " AND is_latest = ?"
|
||||
params.append(filters["is_latest"])
|
||||
if 'limit' in filters:
|
||||
sql += " limit ?"
|
||||
params.append(filters["limit"])
|
||||
|
||||
|
||||
cursor.execute(sql, params)
|
||||
return [{'href': row[0], 'name': row[1], 'id': row[2]} for row in cursor.fetchall()]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return None
|
||||
|
||||
# 检查表是否存在,不存在就创建
|
||||
def check_and_create_chapters_table(book_number):
|
||||
table_name = f"{tbl_name_chapters_prefix}_{book_number}"
|
||||
|
||||
try:
|
||||
create_table_query = f'''
|
||||
CREATE TABLE if not exists {table_name} (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
book_id INTEGER,
|
||||
chapter_id INTEGER,
|
||||
section_id INTEGER,
|
||||
title TEXT,
|
||||
href TEXT UNIQUE,
|
||||
content TEXT,
|
||||
has_content INTEGER default 0,
|
||||
created_at TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
updated_at TEXT DEFAULT (datetime('now', 'localtime')),
|
||||
FOREIGN KEY(book_id) REFERENCES books(id) ON DELETE CASCADE
|
||||
);
|
||||
'''
|
||||
cursor.execute(create_table_query)
|
||||
conn.commit()
|
||||
return table_name
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"create table failed: {e}")
|
||||
return None
|
||||
|
||||
|
||||
# 插入到数据表中
|
||||
def insert_chapter_data(data):
|
||||
tbl_num = int(data['book_id']) % 100
|
||||
tbl_name = check_and_create_chapters_table(tbl_num)
|
||||
if tbl_name :
|
||||
return insert_or_update_common(data, tbl_name)
|
||||
else:
|
||||
return None
|
||||
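Chapter rows are sharded into chapters_0 … chapters_99 by book_id % 100; a hedged example of the call (the values are placeholders):

# book_id 2547 -> table chapters_47
insert_chapter_data({
    'book_id': 2547,
    'chapter_id': 374864,
    'section_id': 1,
    'title': '第一章',
    'href': 'https://aabook.xyz/read-374864.html',
    'content': '',
    'has_content': 0,
})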
|
||||
# 查询某本书最后的获取页码
|
||||
def query_last_chapter_by_book(bookid):
|
||||
tbl_num = int(bookid) % 100
|
||||
tbl_name = check_and_create_chapters_table(tbl_num)
|
||||
if tbl_name is None:
|
||||
return None
|
||||
|
||||
try:
|
||||
sql = f"SELECT href FROM {tbl_name} WHERE book_id={bookid} order by id desc limit 1"
|
||||
cursor.execute(sql)
|
||||
|
||||
row = cursor.fetchone()
|
||||
if row:  # a previously fetched chapter exists, resume from it
|
||||
return row[0]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"查询 href 失败: {e}")
|
||||
return None
|
||||
|
||||
# 获取没有内容的章节链接
|
||||
def query_no_content_chapters(limit = 100):
|
||||
# 用于存储所有结果的列表
|
||||
all_results = []
|
||||
|
||||
# 循环遍历 0 到 100 的数字
|
||||
for i in range(100):
|
||||
table_name = f'{tbl_name_chapters_prefix}_{i}'
|
||||
try:
|
||||
# 计算还需要多少条数据
|
||||
remaining_count = limit - len(all_results)
|
||||
if remaining_count <= 0:
|
||||
break
|
||||
# 执行 SQL 查询,从每个表中获取 has_content = 0 的数据,数量不超过剩余所需数量
|
||||
query = f"SELECT href, title, book_id, chapter_id, section_id FROM {table_name} WHERE has_content = 0 LIMIT {remaining_count}"
|
||||
cursor.execute(query)
|
||||
|
||||
results = [{'href': row[0], 'title': row[1], 'book_id': row[2], 'chapter_id': row[3], 'section_id': row[4]} for row in cursor.fetchall()]
|
||||
all_results.extend(results)
|
||||
except sqlite3.Error as e:
|
||||
print(f"Error querying table {table_name}: {e}")
|
||||
|
||||
return all_results
|
||||
|
||||
# 插入书本的卷信息
|
||||
def insert_or_update_book_sections(data):
|
||||
return insert_or_update_common(data, tbl_name_section, uniq_key='bookid_section')
|
||||
|
||||
|
||||
# 统计信息
|
||||
def get_statics():
|
||||
result = {}
|
||||
try:
|
||||
# row counts for the books table
|
||||
cursor.execute(f"SELECT COUNT(*) FROM {tbl_name_books} ")
|
||||
result['all_books'] = cursor.fetchone()[0]
|
||||
|
||||
cursor.execute(f"SELECT COUNT(*) FROM {tbl_name_books} where is_latest=1")
|
||||
result['all_books_latest'] = cursor.fetchone()[0]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.error(f"query error: {e}")
|
||||
|
||||
all_chapters = 0
|
||||
all_chapters_has_contents = 0
|
||||
|
||||
# 循环遍历 0 到 100 的数字
|
||||
for i in range(100):
|
||||
table_name = f'{tbl_name_chapters_prefix}_{i}'
|
||||
try:
|
||||
cursor.execute(f"SELECT COUNT(*) FROM {table_name} ")
|
||||
all_chapters += cursor.fetchone()[0]
|
||||
|
||||
cursor.execute(f"SELECT COUNT(*) FROM {table_name} where has_content=1")
|
||||
all_chapters_has_contents += cursor.fetchone()[0]
|
||||
|
||||
except sqlite3.Error as e:
|
||||
logging.debug(f"Error querying table {table_name}: {e}")
|
||||
|
||||
result['all_chapters'] = all_chapters
|
||||
result['all_chapters_has_contents'] = all_chapters_has_contents
|
||||
|
||||
return result
|
||||
|
||||
|
||||
53  aabook/src/utils.py  Normal file
@@ -0,0 +1,53 @@
import requests
import re
import os
import json
import time
import csv
import logging
from datetime import datetime
import config

# Extract the timestamp from a string such as "创建时间 2025-03-08 13:57:00"
def extract_create_time(input_str):
    pattern = r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'
    match = re.search(pattern, input_str)
    if match:
        datetime_str = match.group(0)
        return datetime_str
    else:
        return input_str

# Extract the numeric id from a string such as "read-374864.html"
def extract_page_num(page_str, default_num=0):
    # regular-expression pattern
    pattern = r'read-(\d+)\.html'
    # look for a match with re.search
    match = re.search(pattern, page_str)
    if match:
        number = match.group(1)
        return number
    else:
        return default_num

# Extract the numeric id from a string such as "book-5549.html"
def extract_book_num(page_str, default_num=0):
    # regular-expression pattern
    pattern = r'book-(\d+)\.html'
    # look for a match with re.search
    match = re.search(pattern, page_str)
    if match:
        number = match.group(1)
        return number
    else:
        return default_num

# Strip the square brackets from a category label such as "[都市]"
def remove_brackets_regex(input_str):
    pattern = r'\[(.*?)\]'
    match = re.match(pattern, input_str)
    if match:
        return match.group(1)
    return input_str
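A quick sanity sketch of the extraction helpers above (the example inputs come from the URL patterns used elsewhere in this commit; note that the ids come back as strings):

import utils

assert utils.extract_page_num('read-374864.html') == '374864'
assert utils.extract_book_num('book-5549.html') == '5549'
assert utils.extract_page_num('index.html') == 0        # falls back to default_num
assert utils.remove_brackets_regex('[都市]') == '都市'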
122  aabook/utils.py  Normal file
@@ -0,0 +1,122 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from ebooklib import epub
|
||||
import re
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import csv
|
||||
import logging
|
||||
from datetime import datetime
|
||||
import config
|
||||
|
||||
|
||||
# 从"创建时间 2025-03-08 13:57:00" 中提取时间
|
||||
def extract_create_time(input_str):
|
||||
pattern = r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'
|
||||
match = re.search(pattern, input_str)
|
||||
if match:
|
||||
datetime_str = match.group(0)
|
||||
return datetime_str
|
||||
else:
|
||||
return input_str
|
||||
|
||||
# 从 "read-374864.html" 中获取数字编号
|
||||
def extract_page_num(page_str, default_num = 0):
|
||||
# 定义正则表达式模式
|
||||
pattern = r'read-(\d+)\.html'
|
||||
# 使用 re.search 查找匹配项
|
||||
match = re.search(pattern, page_str)
|
||||
if match:
|
||||
number = match.group(1)
|
||||
return number
|
||||
else:
|
||||
return default_num
|
||||
|
||||
# 从 "book-5549.html" 中获取数字编号
|
||||
def extract_book_num(page_str, default_num = 0):
|
||||
# 定义正则表达式模式
|
||||
pattern = r'book-(\d+)\.html'
|
||||
# 使用 re.search 查找匹配项
|
||||
match = re.search(pattern, page_str)
|
||||
if match:
|
||||
number = match.group(1)
|
||||
return number
|
||||
else:
|
||||
return default_num
|
||||
|
||||
# 处理 [都市] 的方括号
|
||||
def remove_brackets_regex(input_str):
|
||||
pattern = r'\[(.*?)\]'
|
||||
match = re.match(pattern, input_str)
|
||||
if match:
|
||||
return match.group(1)
|
||||
return input_str
|
||||
|
||||
# 定义函数来抓取小说章节内容
|
||||
def fetch_chapter(url):
|
||||
try:
|
||||
response = requests.get(url)
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
# 这里需要根据实际网页结构修改选择器
|
||||
chapter_content = soup.find('div', class_='chapter-content').get_text()
|
||||
return chapter_content
|
||||
except requests.RequestException as e:
|
||||
print(f"请求出错: {e}")
|
||||
return None
|
||||
|
||||
|
||||
# 定义函数来生成 EPUB 文件
|
||||
def generate_epub(title, author, chapters, path):
|
||||
book = epub.EpubBook()
|
||||
book.set_title(title)
|
||||
book.set_language('zh')
|
||||
book.add_author(author)
|
||||
|
||||
epub_chapters = []
|
||||
for chapter_title, chapter_content in chapters:
|
||||
c = epub.EpubHtml(title=chapter_title, file_name=f'{chapter_title}.xhtml', lang='zh')
|
||||
c.content = f'<h1>{chapter_title}</h1><p>{chapter_content}</p>'
|
||||
book.add_item(c)
|
||||
epub_chapters.append(c)
|
||||
|
||||
# 定义书的结构
|
||||
book.toc = tuple(epub_chapters)
|
||||
book.add_item(epub.EpubNcx())
|
||||
book.add_item(epub.EpubNav())
|
||||
|
||||
# 定义样式
|
||||
style = 'body { font-family: Times, serif; }'
|
||||
nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
|
||||
book.add_item(nav_css)
|
||||
|
||||
# 定义书的结构
|
||||
book.spine = ['nav'] + epub_chapters
|
||||
|
||||
# 保存 EPUB 文件
|
||||
epub.write_epub(f'{path}/{title}.epub', book, {})
|
||||
|
||||
|
||||
# 示例使用
|
||||
if __name__ == "__main__":
|
||||
# 这里需要替换为实际的小说章节链接
|
||||
chapter_info = [
|
||||
('第一章', 'https://example.com/chapter1'),
|
||||
('第二章', 'https://example.com/chapter2')
|
||||
]
|
||||
title = '小说标题'
|
||||
author = '小说作者'
|
||||
|
||||
chapters = []
|
||||
for chapter_title, url in chapter_info:
|
||||
content = fetch_chapter(url)
|
||||
if content:
|
||||
chapters.append((chapter_title, content))
|
||||
|
||||
if chapters:
|
||||
generate_epub(title, author, chapters, '.')  # generate_epub requires a target path; '.' writes to the current directory
|
||||
print(f'{title}.epub 文件生成成功。')
|
||||
else:
|
||||
print('未获取到有效章节内容,无法生成 EPUB 文件。')
|
||||
|
||||