modify scripts

oscarz
2025-03-18 17:45:20 +08:00
parent d5dc76b87f
commit a4ea79d4db
14 changed files with 1369 additions and 13 deletions

View File

@@ -10,6 +10,7 @@ from datetime import datetime
 from datetime import date
 import config # 日志配置
 from down_list import novel_map
+import utils
 # 日志

@@ -21,7 +22,8 @@ list_url_wordcount = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&ca
 list_url_update = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update'
 curr_novel_pages = 0
-meta_dir = 'meta'
+meta_dir = f'{config.global_host_data_dir}/aabook/meta'
+novel_dir = f'{config.global_host_data_dir}/aabook/data'
 list_file = f'{meta_dir}/list.txt'
 details_file = f'{meta_dir}/details.txt'

@@ -246,7 +248,7 @@ def extract_content_url(soup, base_url, chapid):
 # 判断内容是否被污染
 def check_content(content):
-if '2005-2024 疯情书库' in content:
+if '2005-2024 疯情书库' in content or '2005-2025 疯情书库' in content:
 return False
 return True

@@ -263,13 +265,15 @@ def get_novel_pages():
 return curr_novel_pages
 # 解析章节内容并保存到文件中
-def download_novel(chapid, novel_name, dir_prefix='./aabook'):
+def download_novel(chapid, novel_name, dir_prefix=novel_dir):
 chapter_url = f'{base_url}/read-{chapid}.html'
 novel_file = f'{dir_prefix}/{chapid}_{novel_name}.txt'
 if os.path.exists(novel_file):
 os.remove(novel_file) # 如果存在同名文件,删除重新下载
+# 保存到其他类型的文件
+chapters = []
 reset_novel_pages()
 while chapter_url:
 logging.info(f"Processing: [{novel_name}] [{chapid}] {chapter_url}")

@@ -314,6 +318,7 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
 f.write(chapter_title + '\n\n')
 # 写入每个段落内容到文件
+content = ''
 with open(novel_file, 'a', encoding='utf-8') as f:
 for paragraph in paragraphs:
 #cleaned_part = clean_watermarks(paragraph.get_text().strip())

@@ -321,7 +326,9 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
 #f.write(cleaned_part + '\n\n')
 cleaned_text = process_paragraph(paragraph)
 f.write(cleaned_text + '\n\n')
+content = content + '<p>' + cleaned_text + '</p>' # epub 里面用html标签来分段落
 logging.info(f"Writting content to file. [{novel_name}] [{chapid}] [{chapter_title}]")
+chapters.append((chapter_title, content))
 else:
 logging.info(f"Fetching content error: [{novel_name}] {content_url}, retry...")
 continue

@@ -356,6 +363,8 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
 break
 time.sleep(3)
+# 全部获取完生成epub文件
+utils.generate_epub(novel_name, 'nobody', chapters, dir_prefix)
 # 检查子目录是否存在,不存在则创建

@@ -400,7 +409,7 @@ def download_books(need_down_list_file = details_file, cursor_file = down_list_f
 continue # 已经下载过,跳过
 # 创建分类目录
-down_dir = './data/' + category
+down_dir = f'{novel_dir}/{category}'
 create_directory_if_not_exists(down_dir)
 # 调用下载函数下载书籍

@@ -420,7 +429,7 @@ def download_map():
 # 遍历 novel_map下载所有小说
 for novel_id, novel_name in novel_map.items():
 logging.info(f"Starting download for {novel_name} (ID: {novel_id})")
-download_novel(novel_id, novel_name, './local')
+download_novel(novel_id, novel_name, novel_dir)
 logging.info(f"Completed download for {novel_id}_{novel_name}.\n")
 # 获取更新列表,并下载

@@ -444,6 +453,10 @@ def main():
 print("cmd: get_list, get_detail, get_all, get_update, get_update_all, download, download_map")
 sys.exit(1)
+# 确保目录存在
+create_directory_if_not_exists(meta_dir)
+create_directory_if_not_exists(novel_dir)
 cmd = sys.argv[1]
 if cmd == "get_list":

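The epub step added above expects chapters to be a list of (chapter_title, html_content) tuples, and utils.generate_epub is the helper defined in aabook/utils.py later in this commit. A minimal sketch of the call, using made-up chapter data and an output directory that is assumed to already exist:

import utils  # aabook/utils.py, added in this commit

# hypothetical data in the shape built inside download_novel()
chapters = [
    ('第一章', '<p>第一段。</p><p>第二段。</p>'),
    ('第二章', '<p>只有一段。</p>'),
]
# writes <dir>/示例小说.epub via ebooklib; the target directory must already exist
utils.generate_epub('示例小说', 'nobody', chapters, '/tmp/aabook_demo')
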
View File

@@ -3,13 +3,9 @@ import os
 import inspect
 from datetime import datetime
-# MySQL 配置
-db_config = {
-'host': '172.18.0.3',
-'user': 'root',
-'password': 'mysqlpw',
-'database': 'stockdb'
-}
+home_dir = os.path.expanduser("~")
+global_host_data_dir = f'{home_dir}/hostdir/scripts_data'
+global_share_data_dir = f'{home_dir}/sharedata'
 # 设置日志配置
 def setup_logging(log_filename=None):

View File

@@ -10,7 +10,7 @@ novel_map_new = {
 }
 # 定义小说映射
 novel_map = {
-364489: '诸天之乡村爱情',
+371300: '临时夫妻',
 }

View File

@@ -0,0 +1,12 @@
import json
import time
import sqlite_utils as db_tools
if __name__ == "__main__":
# 命令行参数处理
result = db_tools.get_statics()
print(result)

aabook/src/config.py (new file, +80 lines)

@@ -0,0 +1,80 @@
import logging
import os
import inspect
import time
from datetime import datetime
from logging.handlers import RotatingFileHandler
from collections import defaultdict
home_dir = os.path.expanduser("~")
global_host_data_dir = f'{home_dir}/hostdir/scripts_data'
global_share_data_dir = f'{home_dir}/sharedata'
global_sqlite_path = f'{global_share_data_dir}/sqlite/books.db'
log_dir = '../log'
# 统计日志频率
log_count = defaultdict(int) # 记录日志的次数
last_log_time = defaultdict(float) # 记录上次写入的时间戳
class RateLimitFilter(logging.Filter):
"""
频率限制过滤器:
1. 在 60 秒内,同样的日志最多写入 60 次,超过则忽略
2. 如果日志速率超过 100 条/秒,发出告警
"""
LOG_LIMIT = 60 # 每分钟最多记录相同消息 60 次,超过则丢弃
def filter(self, record):
global log_count, last_log_time
message_key = record.getMessage() # 获取日志内容
# 计算当前时间
now = time.time()
elapsed = now - last_log_time[message_key]
# 限制相同日志的写入频率
if elapsed < 60: # 60 秒内
log_count[message_key] += 1
if log_count[message_key] > self.LOG_LIMIT:
print('reach limit.')
return False # 直接丢弃
else:
log_count[message_key] = 1 # 超过 60 秒,重新计数
last_log_time[message_key] = now
return True # 允许写入日志
def setup_logging(log_filename=None):
if log_filename is None:
caller_frame = inspect.stack()[1]
caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
current_date = datetime.now().strftime('%Y%m%d')
log_filename = f'{log_dir}/{caller_filename}_{current_date}.log'
max_log_size = 100 * 1024 * 1024 # 100 MB
max_log_files = 10 # 最多保留 10 个日志文件
file_handler = RotatingFileHandler(log_filename, maxBytes=max_log_size, backupCount=max_log_files)
file_handler.setFormatter(logging.Formatter(
'%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
))
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter(
'%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
))
# 创建 logger
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [] # 避免重复添加 handler
logger.addHandler(file_handler)
logger.addHandler(console_handler)
# 添加频率限制
rate_limit_filter = RateLimitFilter()
file_handler.addFilter(rate_limit_filter)
console_handler.addFilter(rate_limit_filter)

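A minimal usage sketch for this logging setup, assuming the caller runs from aabook/src and the ../log directory already exists (otherwise RotatingFileHandler cannot open its file):

import logging
import config

config.setup_logging()  # installs file + console handlers, both behind RateLimitFilter

for i in range(200):
    logging.info("duplicate message")  # repeats of the same text are dropped once the shared counter passes LOG_LIMIT
logging.info("a different message")    # distinct text keeps its own counter
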
aabook/src/convert_utils.py (new file, +126 lines)

@@ -0,0 +1,126 @@
from ebooklib import epub
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import Paragraph, Spacer
def generate_epub(data, save_path):
# 创建 EPUB 书籍对象
book = epub.EpubBook()
# 设置书籍元数据
book.set_title(data.get('title', '未知标题'))
book.set_language('zh')
book.add_author(data.get('author', '未知作者'))
# 存储所有章节对象
all_chapters = []
sections = data.get('sections', [])
if len(sections) == 1:
# 如果只有一个 section忽略 section 的 title按一级目录处理
for chapter in sections[0].get('chapters', []):
chapter_title = chapter.get('title', '未知章节')
chapter_content = chapter.get('content', '')
paragraphs = chapter_content.split('\n\n')
html_content = ''.join([f'<p>{para}</p>' for para in paragraphs])
chapter_obj = epub.EpubHtml(title=chapter_title, file_name=f'{chapter_title}.xhtml', lang='zh')
chapter_obj.content = f'<h1>{chapter_title}</h1>{html_content}'
book.add_item(chapter_obj)
all_chapters.append(chapter_obj)
else:
# 如果有多个 section按两级目录处理
for section in sections:
section_title = section.get('title', '未知卷')
section_chapter = epub.EpubHtml(title=section_title, file_name=f'{section_title}.xhtml', lang='zh')
section_chapter.content = f'<h1>{section_title}</h1>'
book.add_item(section_chapter)
all_chapters.append(section_chapter)
for chapter in section.get('chapters', []):
chapter_title = chapter.get('title', '未知章节')
chapter_content = chapter.get('content', '')
paragraphs = chapter_content.split('\n\n')
html_content = ''.join([f'<p>{para}</p>' for para in paragraphs])
chapter_obj = epub.EpubHtml(title=chapter_title, file_name=f'{chapter_title}.xhtml', lang='zh')
chapter_obj.content = f'<h2>{chapter_title}</h2>{html_content}'
book.add_item(chapter_obj)
all_chapters.append(chapter_obj)
# 定义书籍的目录
book.toc = tuple(all_chapters)
# 定义书的结构
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())
# 定义样式
style = 'body { font-family: Times, serif; }'
nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
book.add_item(nav_css)
# 定义书的结构
book.spine = ['nav'] + all_chapters
# 保存 EPUB 文件
epub.write_epub(save_path, book, {})
def generate_pdf(data, save_path):
# 创建 PDF 画布
c = canvas.Canvas(save_path, pagesize=letter)
styles = getSampleStyleSheet()
story = []
# 设置标题
title = data.get('title', '未知标题')
story.append(Paragraph(f'<font size=20>{title}</font>', styles['Title']))
story.append(Spacer(1, 20))
# 设置作者
author = data.get('author', '未知作者')
story.append(Paragraph(f'<font size=14>作者: {author}</font>', styles['Normal']))
story.append(Spacer(1, 40))
sections = data.get('sections', [])
if len(sections) == 1:
# 如果只有一个 section忽略 section 的 title按一级目录处理
for chapter in sections[0].get('chapters', []):
chapter_title = chapter.get('title', '未知章节')
chapter_content = chapter.get('content', '')
story.append(Paragraph(f'<font size=18>{chapter_title}</font>', styles['Heading1']))
story.append(Spacer(1, 10))
paragraphs = chapter_content.split('\n\n')
for para in paragraphs:
story.append(Paragraph(para, styles['Normal']))
story.append(Spacer(1, 10))
story.append(Spacer(1, 20))
else:
# 如果有多个 section按两级目录处理
for section in sections:
section_title = section.get('title', '未知卷')
story.append(Paragraph(f'<font size=20>{section_title}</font>', styles['Heading1']))
story.append(Spacer(1, 15))
for chapter in section.get('chapters', []):
chapter_title = chapter.get('title', '未知章节')
chapter_content = chapter.get('content', '')
story.append(Paragraph(f'<font size=16>{chapter_title}</font>', styles['Heading2']))
story.append(Spacer(1, 10))
paragraphs = chapter_content.split('\n\n')
for para in paragraphs:
story.append(Paragraph(para, styles['Normal']))
story.append(Spacer(1, 10))
story.append(Spacer(1, 15))
# 构建 PDF
for element in story:
element.wrapOn(c, letter[0] - 100, letter[1] - 100)
element.drawOn(c, 50, letter[1] - element.wrapOn(c, letter[0] - 100, letter[1] - 100)[1] - 50)
c.showPage()
# 保存 PDF 文件
c.save()

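Neither function documents its input; judging from the .get() calls, data is a nested dict of sections and chapters whose content uses blank lines as paragraph breaks. A sketch with made-up sample data (output paths are placeholders):

import convert_utils

sample = {
    'title': '示例小说',
    'author': '佚名',
    'sections': [
        {
            'title': '第一卷',
            'chapters': [
                {'title': '第一章', 'content': '第一段。\n\n第二段。'},
                {'title': '第二章', 'content': '只有一段。'},
            ],
        },
    ],
}
# with a single section, the section title is ignored and chapters form a flat TOC
convert_utils.generate_epub(sample, '/tmp/sample.epub')
convert_utils.generate_pdf(sample, '/tmp/sample.pdf')
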
aabook/src/fetch.py (new file, +312 lines)

@@ -0,0 +1,312 @@
import json
import time
import csv
import argparse
import logging
from functools import partial
import config
import sqlite_utils as db_tools
import scraper
import utils
config.setup_logging()
debug = False
force = False
# 获取列表
def fetch_book_list():
url = scraper.list_url_update
while True:
logging.info(f'fetching book list. url: {url}')
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="list_main", attr_type="class"))
if soup:
# 获取书籍列表
list_data, next_url = scraper.parse_book_list(soup, url=url)
for item in list_data:
row_id = db_tools.insert_or_update_common(item, db_tools.tbl_name_books)
if row_id:
logging.debug(f'insert one book. row_id: {row_id}, name: {item['name']}')
else:
logging.warning(f'insert book error. name: {item['name']}, href: {item['href']}')
if next_url is None:
logging.info(f'get all pages.')
return True
else:
url = next_url
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
else:
logging.warning(f'fetch page error. {url} ...')
# 获取详情
def fetch_real_content(url):
soup, status_code = scraper.fetch_page(url, scraper.content_validator)
if soup:
data = scraper.parse_content_page(soup, url)
if data:
return data # 段落的数组
elif status_code and status_code == 404:
logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
else:
logging.warning(f'fetch page error. {url} ...')
return None
# 获取内容页
def fetch_chapter_content(url):
chapter_data = {}
next_url = None
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="h1", identifier="chapter_title", attr_type="class"))
if soup:
data, next_url = scraper.parse_chapter_page(soup, url)
if data:
chapter_data['title'] = data['title']
contents = fetch_real_content(data['content_url'])
if contents:
chapter_data['contents'] = contents
else:
logging.warning(f'fetching real content failed. url: {data['content_url']}')
return None, None
else:
logging.warning(f'fetch chapter page no data. url: {url}')
return None, None
else:
logging.warning(f'fetch chapter page error. url: {url}, status_code: {status_code}')
return None, None
return chapter_data, next_url
# 获取小说详情页,获得首页地址
def fetch_book_detail(url):
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="li", identifier="zuopinxinxi", attr_type="class"))
if soup:
detail = scraper.parse_book_detail(soup, url)
return detail
else:
return None
# 获取某本小说的目录页
def fetch_book_toc(url):
soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="page_main", attr_type="class"))
if soup:
listdata = scraper.pase_chapter_list(soup, url)
return listdata
else:
return None
# 获取小说的目录页,并插入到数据库
def fetch_table_of_contents():
while True:
update_list = db_tools.query_books(id=2547, is_latest=0, limit = 2 if debug else 100)
if update_list is None or len(update_list) <1 :
logging.info('no more data needs fetching.')
return
for row in update_list:
name = row['name']
href = row['href']
bookid = row['id']
# 先打开详情页
logging.info(f'----------fetching book {name}: {href}-------------')
book_detail = fetch_book_detail(href)
if book_detail is None:
logging.warning(f'get book detail failed. url: {href}')
continue
# 获取目录页
toc_url = book_detail['table_of_contents_href']
if toc_url is None or toc_url == '':
logging.warning(f'table_of_contents_href is not correct. url: {href}')
continue
logging.info(f'fetching page: {toc_url}')
toc_data = fetch_book_toc(toc_url)
# 解析目录页
if toc_data is None:
logging.warning(f'fetch_book_toc error. url: {toc_url}')
continue
# 插入所有的目录数据
succ = 1
for row in toc_data:
section_title = row['title']
chapters = row['chapters']
section_id = db_tools.insert_or_update_book_sections({
'book_id' : int(bookid),
'section' : section_title,
'bookid_section': f'{bookid}_{section_title}'
})
if section_id is None:
logging.warning(f'insert section error. url: {toc_url}, section: {section_title}')
succ = 0
break
else:
logging.debug(f'insert one books_sections record. id:{section_id}, key: {bookid}_{section_title}')
# 插入目录数据
for chap in chapters:
chap_row_id = db_tools.insert_chapter_data({
'book_id': bookid,
'chapter_id': chap['chapter_id'],
'section_id': section_id,
'title': chap['title'],
'href': chap['href'],
'content': '',
'has_content' : 0
})
if chap_row_id is None:
logging.warning(f'insert_chapter_data error. url: {toc_url}')
succ = 0
break
if succ == 0 :
logging.warning(f'fetch_book_toc data error. url: {toc_url}')
continue
# 读取完毕,更新列表
row_id = db_tools.update_book_detail({
'href' : href,
**book_detail
})
if row_id:
logging.debug(f'update book succ. id: {row_id}, url: {href}')
else:
logging.warning(f'update book failed. url: {href}')
if debug:
return
# 直接获取小说内容
def fetch_contents():
while True:
list_data = db_tools.query_no_content_chapters(limit = 10 if debug else 100)
if list_data is None or len(list_data) <1 :
logging.info('no more data needs fetching.')
return
for row in list_data:
url = row['href']
logging.info(f'fetching content ({row['title']}) from {url}')
content, next_url = fetch_chapter_content(url)
if content and content['title'] and content['contents']:
# 写入到数据表里
db_tools.insert_chapter_data({
'book_id': row['book_id'],
'chapter_id': row['chapter_id'],
'section_id': row['section_id'],
'title': row['title'],
'href': url,
'content': '\n\n'.join(content['contents']),
'has_content': 1
})
else:
logging.warning(f'fetch content error. url: {url}')
if debug:
return
'''
# 下载完整的小说
def fetch_book_data():
update_list = db_tools.query_books(need_update=1, limit = 1)
if update_list:
for row in update_list:
name = row['name']
href = row['href']
bookid = row['id']
# 先打开详情页
logging.info(f'----------fetching book {name}: {href}-------------')
book_detail = fetch_book_detail(href)
if book_detail:
# 获取内容页,然后循环读取内容
chapter_url = book_detail['start_page_href']
chapter_id = utils.extract_page_num(chapter_url)
# 断点续传,从上次拉取的最后一页开始
if not force:
last_chapter_url = db_tools.query_last_chapter_by_book(bookid)
if last_chapter_url:
chapter_url = last_chapter_url
while chapter_url:
logging.info(f'fetching page: {chapter_url}')
content, next_url = fetch_chapter_content(chapter_url)
if content and content['title'] and content['contents']:
# 写入到数据表里
db_tools.insert_chapter_data({
'book_id': bookid,
'chapter_id': chapter_id,
'title': content['title'],
'href': chapter_url,
'content': '\n\n'.join(content['contents']),
'has_content': 1
})
if debug:
return
else:
logging.warning(f'fetch content error. url: {chapter_url}')
chapter_url = next_url
# 读取完毕,更新列表
row_id = db_tools.update_book_detail({
'href' : href,
**book_detail
})
if row_id:
logging.debug(f'update book succ. id: {row_id}, url: {href}')
else:
logging.warning(f'update book failed. url: {href}')
else:
logging.warning(f'get book detail failed. url: {href}')
else:
logging.warning(f'get no data needed update.')
'''
# 建立缩写到函数的映射
function_map = {
"list": fetch_book_list,
"toc" : fetch_table_of_contents,
"content": fetch_contents,
}
# 主函数
def main(cmd, args_debug, args_force):
global debug
debug = args_debug
global force
force = args_force
# 执行指定的函数
if cmd:
function_names = cmd.split(",") # 拆分输入
for short_name in function_names:
func = function_map.get(short_name.strip()) # 从映射中获取对应的函数
if callable(func):
func()
else:
logging.warning(f" {short_name} is not a valid function shortcut.")
else: # 全量执行
for name, func in function_map.items():
if callable(func):
func()
else:
logging.warning(f" {short_name} is not a valid function shortcut.")
logging.info(f'all process completed!')
# TODO:
# 1,
if __name__ == "__main__":
# 命令行参数处理
keys_str = ",".join(function_map.keys())
parser = argparse.ArgumentParser(description='fetch aabook data.')
parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
args = parser.parse_args()
main(args.cmd, args.debug, args.force)

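The entry point splits --cmd on commas and dispatches through function_map; without --cmd every step runs in order. A hedged sketch of how it is presumably invoked (requires network access and the SQLite database path from config to exist):

# command line (assumed typical usage):
#   python fetch.py --cmd list            # refresh the book index only
#   python fetch.py --cmd toc,content     # fetch TOCs, then chapter contents
#   python fetch.py --debug               # run all steps with small query limits
# programmatic equivalent:
import fetch
fetch.main('list,toc', args_debug=True, args_force=False)
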
aabook/src/scraper.py (new file, +364 lines)

@@ -0,0 +1,364 @@
import time
import json
import csv
import logging
import signal
import sys
import os
import re
import requests
import random
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
import config
import utils
# 定义基础 URL 和可变参数
host_url = 'https://aabook.xyz'
list_url_update = f'{host_url}/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update'
#list_url_wordcount = 'https://aabook.xyz/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
# User-Agent 列表
user_agents = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0",
"Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36"
]
#使用 CloudScraper 进行网络请求,并执行页面验证,支持不同解析器和预处理
def fetch_page(url, validator, parser="html.parser", preprocessor=None, max_retries=3, sleep_time=5, default_timeout=10):
for attempt in range(max_retries):
try:
if 'aabook.xyz' not in url.lower():
logging.error(f'wrong url format: {url}')
return None, None
# 随机选择一个 User-Agent
headers = {
'User-Agent': random.choice(user_agents)
}
response = requests.get(url, headers=headers, timeout=default_timeout, stream=True)
# 处理 HTTP 状态码
if response.status_code == 404:
logging.warning(f"Page not found (404): {url}")
return None, 404 # 直接返回 404调用方可以跳过
response.raise_for_status() # 处理 HTTP 错误
# 预处理 HTML如果提供了 preprocessor
html_text = preprocessor(response.text) if preprocessor else response.text
soup = BeautifulSoup(html_text, parser)
if validator(soup): # 进行自定义页面检查
return soup, response.status_code
logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
except requests.RequestException as e:
logging.info(f"Warn fetching page {url}: {e}. Retrying ...")
time.sleep(sleep_time) # 休眠指定的时间,然后重试
logging.error(f'Fetching failed after max retries. {url}')
return None, None # 达到最大重试次数仍然失败
# 解析列表页
def parse_book_list(soup, url):
# 查找书籍列表
list_main = soup.find('div', class_='list_main')
if not list_main:
logging.warning(f"No list_main Found in {url}")
return None, None
tbody = list_main.find('tbody')
if not tbody:
logging.warning(f"No tbody found in {url}")
return None, None
list_data = []
next_url = None
# 获取每本书的基础信息:排名、分类、书名、作者、月票、更新时间(按字数排序时是总字数,按日期排序时是最后更新日期)
for tr in tbody.find_all('tr'):
tds = tr.find_all('td')
if len(tds) < 6:
logging.info("Invalid tr format.")
ranking = tds[0].text.strip()
category = utils.remove_brackets_regex(tds[1].text.strip())
book_link_tag = tds[2].find('a')
book_name = book_link_tag.text.strip()
book_link = host_url + '/' + book_link_tag['href'] if book_link_tag.get('href') else ''
book_num = utils.extract_book_num(book_link_tag['href'])
author = tds[3].text.strip()
monthly_tickets = tds[4].text.strip()
update_time = tds[5].text.strip() #实际上是字数(按字数排序时是总字数,按日期排序时是最后更新日期)
list_data.append({
'rank': ranking,
'category': category,
'name': book_name,
'href': book_link,
'num': book_num,
'author': author,
'tickets': monthly_tickets,
'update_time': update_time
})
# 查找下一页链接
next_page_tag = soup.find('a', title='下一页')
if next_page_tag:
next_url = host_url + next_page_tag['href']
return list_data, next_url
# 解析详情页
def parse_book_detail(soup, url):
# 解析书籍详细信息
book_info_tag = soup.find('li', class_='zuopinxinxi')
if not book_info_tag:
logging.warning(f"No details found in {url}")
return None
table_of_contents_href = ''
table_of_contents_href_tag = soup.find('li', class_='xiaoshuomulu')
if table_of_contents_href_tag:
table_of_contents_href = host_url + table_of_contents_href_tag.find('a')['href']
book_info_lis = book_info_tag.find_all('li')
if len(book_info_lis) < 4:
logging.info(f"invalid book info in {url}")
return None
book_category = book_info_lis[0].find('span').text.strip()
book_status = book_info_lis[1].find('span').text.strip()
# 去掉后面的汉字,只要数字
total_word_count = book_info_lis[2].find('span').text.strip()
total_word_count = int(re.search(r'\d+', total_word_count).group())
total_clicks = book_info_lis[3].find('span').text.strip()
month_clicks = book_info_lis[4].find('span').text.strip() if len(book_info_lis) >4 else '0'
week_clicks = book_info_lis[5].find('span').text.strip() if len(book_info_lis) >5 else '0'
total_recommend = book_info_lis[6].find('span').text.strip() if len(book_info_lis) >6 else '0'
month_recommend = book_info_lis[7].find('span').text.strip() if len(book_info_lis) >7 else '0'
week_recommend = book_info_lis[8].find('span').text.strip() if len(book_info_lis) >8 else '0'
# 读取创建时间
creation_time_tag = soup.find('li', class_='update_time')
created_time = utils.extract_create_time(creation_time_tag.text.strip() if creation_time_tag else '')
# 获取起始页链接和编号
start_page_tag = soup.find('ul', class_='gezhonganniu').find_all('li')[0].find('a')
start_page_link = host_url + '/' + start_page_tag['href']
start_page_number = start_page_link.split('-')[-1].replace('.html', '')
return {
'category': book_category,
'status' : book_status,
'total_words' : total_word_count,
'total_clicks': total_clicks,
'month_clicks': month_clicks,
'week_clicks': week_clicks,
'total_recommend': total_recommend,
'month_recommend': month_recommend,
'week_recommend': week_recommend,
'created_time': created_time,
'start_page_href': start_page_link,
'start_page_num': start_page_number,
'table_of_contents_href': table_of_contents_href
}
# 解析书籍的目录页
def pase_chapter_list(soup, url):
# 获取小说的目录
table_of_contents = []
div_table_of_contents = soup.find('div', class_='page_main')
if not div_table_of_contents:
return None
section_titles = div_table_of_contents.find_all('p', class_='section_title')
sections = div_table_of_contents.find_all('ul', class_='section_list')
if len(sections) > len(section_titles): # 一般是 后者比前者多1个最后一个是广告
logging.warning(f'sections not matched titles, url: {url}, titles: {len(section_titles)}, sections: {len(sections)}')
return None
else:
for i in range(len(sections)):
section_title = section_titles[i].get_text().strip()
chap_list = sections[i].find_all("a")
chap_data = []
for chap in chap_list:
chap_title = chap.get_text().strip()
chap_link = f'{host_url}/{chap['href']}'
chap_id = utils.extract_page_num(chap_link)
chap_data.append({'href': chap_link, 'title': chap_title, 'chapter_id': chap_id})
table_of_contents.append({'title': section_title, 'chapters': chap_data})
return table_of_contents
# 解析书籍的章节页
def parse_chapter_page(soup, url):
# 获取章节标题
chapter_title_tag = soup.find('h1', class_='chapter_title')
if chapter_title_tag is None:
logging.warning(f'Chapter title not found in {url}')
return None, None
title = chapter_title_tag.get_text().strip()
content_url = None
next_url = None
chapid = utils.extract_page_num(url)
# 遍历每一个 <script> 标签,查找内容页的链接
script_tags = soup.find_all('script')
for script_tag in script_tags:
script_content = script_tag.string
if script_content and re.search(r'\.get\("./_getcontent\.php', script_content):
# 匹配到特定内容,提取出 _getcontent.php 的 URL 模板
match = re.search(r'\.get\("\./_getcontent\.php\?id="\+\s*chapid\s*\+\s*"&v=([^"]+)"', script_content)
if match:
# 从匹配中提取 v 参数值
v_value = match.group(1)
# 构建完整的 content_url
content_url = f"{host_url}/_getcontent.php?id={chapid}&v={v_value}"
break
if content_url is None:
logging.warning(f'Content url not found in {url}')
return None, None
# 获取小说的目录
table_of_contents = []
div_table_of_contents = soup.find('div', class_='mulu_con')
if div_table_of_contents or False: # 考虑要不要加上这个
section_titles = div_table_of_contents.find_all('p')
sections = div_table_of_contents.find_all('ul')
if len(sections) != len(section_titles):
logging.warning(f'sections not matched titles')
else:
for i in range(len(sections)):
section_title = section_titles[i].get_text().strip()
chap_list = sections[i].find_all("a")
chap_data = []
for chap in chap_list:
chap_title = chap.get_text().strip()
chap_link = chap['href']
chap_data.append({'href': chap_link, 'title': chap_title})
table_of_contents.append({'title': section_title, 'chapters': chap_data})
# 查找下一章的链接
next_div = soup.find('div', class_='next_arrow')
if next_div:
next_page_tag = next_div.find('a', href=True, title=re.compile(r'下一章'))
if next_page_tag:
next_url = f'{host_url}/{next_page_tag['href']}' if next_page_tag['href'] else ''
data = {'title': title, 'content_url': content_url, 'table_of_contents': table_of_contents}
return data, next_url
def process_paragraph(paragraph):
# 获取完整的 HTML 结构,而不是 get_text()
paragraph_html = str(paragraph)
# 移除水印标签
cleaned_html = re.sub(r'<[^>]+class="[^"]+">.*?</[^>]+>', '', paragraph_html, flags=re.DOTALL)
# 使用 BeautifulSoup 解析移除水印标签后的 HTML 并提取文本
soup = BeautifulSoup(cleaned_html, 'html.parser')
cleaned_text = soup.get_text().strip()
return cleaned_text
# 解析内容页
def parse_content_page(soup, url):
content = []
paragraphs = soup.find_all('p')
if paragraphs:
for paragraph in paragraphs:
cleaned_text = process_paragraph(paragraph)
content.append(cleaned_text)
return content
# 通用的 HTML 结构验证器
def generic_validator(soup, tag, identifier, attr_type="id"):
if attr_type == "id":
return soup.find(tag, id=identifier) is not None
elif attr_type == "class":
return bool(soup.find_all(tag, class_=identifier))
elif attr_type == "name":
return bool(soup.find('select', {'name': identifier}))
return False
# 对内容是否被污染的判断
def content_validator(soup):
text = str(soup)
dirty_words = ['2005-2024 疯情书库', '2005-2025 疯情书库', '2025 疯情书库', '2026 疯情书库', '2027 疯情书库']
for word in dirty_words:
if word in text:
return False
return True
def test_content_page(url):
soup, status_code = fetch_page(url, content_validator)
if soup:
data = parse_content_page(soup, url)
if data:
return data
else :
return []
def test_chapter_page(url):
soup, status_code = fetch_page(url, partial(generic_validator, tag="h1", identifier="chapter_title", attr_type="class"))
if soup:
data, next_url = parse_chapter_page(soup, url)
if data:
return data
else :
return None
def test_book_detail(url):
soup, status_code = fetch_page(url, partial(generic_validator, tag="li", identifier="zuopinxinxi", attr_type="class"))
if soup:
detail = parse_book_detail(soup, url)
return detail
def test_book_list():
for num in range(5):
url = list_url_update.format(num)
soup, status_code = fetch_page(url, partial(generic_validator, tag="div", identifier="list_main", attr_type="class"))
if soup:
# 获取书籍列表
list_data, next_url = parse_book_list(soup, url=url)
for item in list_data:
# 获取详情页
detail = test_book_detail(item['href'])
if detail:
print({
**item,
**detail
})
# 获取内容页
page_data = test_chapter_page(detail['start_page_href'])
if page_data:
print(page_data)
# 获取内容
contents = test_content_page(page_data['content_url'])
if contents and len(contents)>0:
print (contents[0])
else:
print('get detail error.')
return
if __name__ == "__main__":
test_book_list()

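The fetch_page / validator pattern used throughout: a validator is bound with functools.partial and decides whether the fetched page is usable. A minimal sketch mirroring test_book_detail, hitting the live site (the book id is only an example):

from functools import partial
import scraper

url = 'https://aabook.xyz/book-5549.html'  # example id from the comments above
soup, status = scraper.fetch_page(
    url,
    partial(scraper.generic_validator, tag='li', identifier='zuopinxinxi', attr_type='class'),
)
if soup:
    print(scraper.parse_book_detail(soup, url))
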
aabook/src/sqlite_utils.py (new file, +278 lines)

@@ -0,0 +1,278 @@
import sqlite3
import json
import config
import utils
import logging
import sys
from datetime import datetime
# 连接 SQLite 数据库
DB_PATH = config.global_sqlite_path # 替换为你的数据库文件
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
tbl_name_books = 'books'
tbl_name_chapters_prefix = 'chapters'
tbl_name_section = 'books_sections'
# 获取表的列名和默认值
def get_table_columns_and_defaults(tbl_name):
try:
cursor.execute(f"PRAGMA table_info({tbl_name})")
columns = cursor.fetchall()
column_info = {}
for col in columns:
col_name = col[1]
default_value = col[4]
column_info[col_name] = default_value
return column_info
except sqlite3.Error as e:
logging.error(f"Error getting table columns: {e}")
return None
# 检查并处理数据
def check_and_process_data(data, tbl_name):
column_info = get_table_columns_and_defaults(tbl_name=tbl_name)
if column_info is None:
return None
processed_data = {}
for col, default in column_info.items():
if col == 'id': # 自增主键,不需要用户提供
continue
if col == 'created_at' or col == 'updated_at': # 日期函数,用户自己指定即可
continue
elif col in data:
processed_data[col] = data[col]
else:
if default is not None:
processed_data[col] = default
else:
processed_data[col] = None
return processed_data
# 插入或更新数据
def insert_or_update_common(data, tbl_name, uniq_key='href'):
try:
processed_data = check_and_process_data(data, tbl_name)
if processed_data is None:
return None
columns = ', '.join(processed_data.keys())
values = list(processed_data.values())
placeholders = ', '.join(['?' for _ in values])
update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key]) + ', updated_at=datetime(\'now\', \'localtime\')'
sql = f'''
INSERT INTO {tbl_name} ({columns}, updated_at)
VALUES ({placeholders}, datetime('now', 'localtime'))
ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}
'''
cursor.execute(sql, values)
conn.commit()
# 获取插入或更新后的 report_id
cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
report_id = cursor.fetchone()[0]
return report_id
except sqlite3.Error as e:
logging.error(f"Error inserting or updating data: {e}")
return None
# 插入books表并判断是否需要更新
def insert_books_index(data):
try:
# 查询是否存在以及是否需要更新
cursor.execute(f"SELECT id FROM books WHERE href = ? and update_time >= ?", (data['href'], data['update_time'], ))
existing_book = cursor.fetchone()
if existing_book: # 已存在且无需更新,直接返回 id
return existing_book[0]
# 不存在,或者需要更新
data['is_latest'] = 0
return insert_or_update_common(data, tbl_name_books)
except sqlite3.Error as e:
logging.error(f"Error inserting or updating data: {e}")
return None
# 更新详细信息
def update_book_detail(data):
try:
data['is_latest'] = 1
# 排除不更新的字段只更新data中含有的字段
fields_to_update = [field for field in data if field not in ['id', 'href', 'created_at']]
# 构建更新语句
set_clause = ', '.join([f"{field} = ?" for field in fields_to_update])
sql = f"UPDATE {tbl_name_books} SET {set_clause}, updated_at = datetime('now', 'localtime') WHERE href = ?"
# 准备参数
values = [data[field] for field in fields_to_update]
values.append(data['href'])
cursor.execute(sql, values)
conn.commit()
# 获取插入或更新后的 report_id
cursor.execute(f"SELECT id FROM {tbl_name_books} WHERE href = ?", (data['href'],))
report_id = cursor.fetchone()[0]
return report_id
except sqlite3.Error as e:
logging.error(f"Error inserting or updating data: {e}")
return None
# 按条件查询 href 列表
def query_books(**filters):
try:
sql = f"SELECT href, name, id FROM {tbl_name_books} WHERE 1=1"
params = []
if "id" in filters:
sql += " AND id = ?"
params.append(filters["id"])
if "href" in filters:
sql += " AND href = ?"
params.append(filters["href"])
if "name" in filters:
sql += " AND name LIKE ?"
params.append(f"%{filters['name']}%")
if "is_latest" in filters:
sql += " AND is_latest = ?"
params.append(filters["is_latest"])
if 'limit' in filters:
sql += " limit ?"
params.append(filters["limit"])
cursor.execute(sql, params)
return [{'href': row[0], 'name': row[1], 'id': row[2]} for row in cursor.fetchall()]
except sqlite3.Error as e:
logging.error(f"查询 href 失败: {e}")
return None
# 检查表是否存在,不存在就创建
def check_and_create_chapters_table(book_number):
table_name = f"{tbl_name_chapters_prefix}_{book_number}"
try:
create_table_query = f'''
CREATE TABLE if not exists {table_name} (
id INTEGER PRIMARY KEY AUTOINCREMENT,
book_id INTEGER,
chapter_id INTEGER,
section_id INTEGER,
title TEXT,
href TEXT UNIQUE,
content TEXT,
has_content INTEGER default 0,
created_at TEXT DEFAULT (datetime('now', 'localtime')),
updated_at TEXT DEFAULT (datetime('now', 'localtime')),
FOREIGN KEY(book_id) REFERENCES books(id) ON DELETE CASCADE
);
'''
cursor.execute(create_table_query)
conn.commit()
return table_name
except sqlite3.Error as e:
logging.error(f"create table failed: {e}")
return None
# 插入到数据表中
def insert_chapter_data(data):
tbl_num = int(data['book_id']) % 100
tbl_name = check_and_create_chapters_table(tbl_num)
if tbl_name :
return insert_or_update_common(data, tbl_name)
else:
return None
# 查询某本书最后的获取页码
def query_last_chapter_by_book(bookid):
tbl_num = int(bookid) % 100
tbl_name = check_and_create_chapters_table(tbl_num)
if tbl_name is None:
return None
try:
sql = f"SELECT href FROM {tbl_name} WHERE book_id={bookid} order by id desc limit 1"
cursor.execute(sql)
row = cursor.fetchone()
if row: # 已有抓取记录,返回最后一章的链接
return row[0]
except sqlite3.Error as e:
logging.error(f"查询 href 失败: {e}")
return None
# 获取没有内容的章节链接
def query_no_content_chapters(limit = 100):
# 用于存储所有结果的列表
all_results = []
# 循环遍历 0 到 100 的数字
for i in range(100):
table_name = f'{tbl_name_chapters_prefix}_{i}'
try:
# 计算还需要多少条数据
remaining_count = limit - len(all_results)
if remaining_count <= 0:
break
# 执行 SQL 查询,从每个表中获取 has_content = 0 的数据,数量不超过剩余所需数量
query = f"SELECT href, title, book_id, chapter_id, section_id FROM {table_name} WHERE has_content = 0 LIMIT {remaining_count}"
cursor.execute(query)
results = [{'href': row[0], 'title': row[1], 'book_id': row[2], 'chapter_id': row[3], 'section_id': row[4]} for row in cursor.fetchall()]
all_results.extend(results)
except sqlite3.Error as e:
print(f"Error querying table {table_name}: {e}")
return all_results
# 插入书本的卷信息
def insert_or_update_book_sections(data):
return insert_or_update_common(data, tbl_name_section, uniq_key='bookid_section')
# 统计信息
def get_statics():
result = {}
try:
# 获取 books 表的统计行数
cursor.execute(f"SELECT COUNT(*) FROM {tbl_name_books} ")
result['all_books'] = cursor.fetchone()[0]
cursor.execute(f"SELECT COUNT(*) FROM {tbl_name_books} where is_latest=1")
result['all_books_latest'] = cursor.fetchone()[0]
except sqlite3.Error as e:
logging.error(f"query error: {e}")
all_chapters = 0
all_chapters_has_contents = 0
# 循环遍历 0 到 100 的数字
for i in range(100):
table_name = f'{tbl_name_chapters_prefix}_{i}'
try:
cursor.execute(f"SELECT COUNT(*) FROM {table_name} ")
all_chapters += cursor.fetchone()[0]
cursor.execute(f"SELECT COUNT(*) FROM {table_name} where has_content=1")
all_chapters_has_contents += cursor.fetchone()[0]
except sqlite3.Error as e:
logging.debug(f"Error querying table {table_name}: {e}")
result['all_chapters'] = all_chapters
result['all_chapters_has_contents'] = all_chapters_has_contents
return result

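Chapter rows are sharded across chapters_0 … chapters_99 by book_id % 100, and each shard table is created on first use. A minimal sketch with made-up values:

import sqlite_utils as db_tools

row_id = db_tools.insert_chapter_data({
    'book_id': 2547,                 # lands in chapters_47 (2547 % 100)
    'chapter_id': 374864,
    'section_id': 1,
    'title': '第一章',
    'href': 'https://aabook.xyz/read-374864.html',  # unique key used for upserts
    'content': '',
    'has_content': 0,
})
pending = db_tools.query_no_content_chapters(limit=10)  # scans all 100 shards
print(row_id, len(pending))
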
aabook/src/utils.py (new file, +53 lines)

@@ -0,0 +1,53 @@
import requests
import re
import os
import json
import time
import csv
import logging
from datetime import datetime
import config
# 从"创建时间 2025-03-08 13:57:00" 中提取时间
def extract_create_time(input_str):
pattern = r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'
match = re.search(pattern, input_str)
if match:
datetime_str = match.group(0)
return datetime_str
else:
return input_str
# 从 "read-374864.html" 中获取数字编号
def extract_page_num(page_str, default_num = 0):
# 定义正则表达式模式
pattern = r'read-(\d+)\.html'
# 使用 re.search 查找匹配项
match = re.search(pattern, page_str)
if match:
number = match.group(1)
return number
else:
return default_num
# 从 "book-5549.html" 中获取数字编号
def extract_book_num(page_str, default_num = 0):
# 定义正则表达式模式
pattern = r'book-(\d+)\.html'
# 使用 re.search 查找匹配项
match = re.search(pattern, page_str)
if match:
number = match.group(1)
return number
else:
return default_num
# 处理 [都市] 的方括号
def remove_brackets_regex(input_str):
pattern = r'\[(.*?)\]'
match = re.match(pattern, input_str)
if match:
return match.group(1)
return input_str

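Expected behaviour of these helpers, shown with the example strings from the comments above:

import utils  # aabook/src/utils.py

print(utils.extract_create_time('创建时间 2025-03-08 13:57:00'))  # '2025-03-08 13:57:00'
print(utils.extract_page_num('read-374864.html'))                  # '374864' (returned as a string)
print(utils.extract_book_num('book-5549.html'))                    # '5549'
print(utils.remove_brackets_regex('[都市]'))                        # '都市'
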
aabook/utils.py (new file, +122 lines)

@@ -0,0 +1,122 @@
import requests
from bs4 import BeautifulSoup
from ebooklib import epub
import re
import os
import json
import time
import csv
import logging
from datetime import datetime
import config
# 从"创建时间 2025-03-08 13:57:00" 中提取时间
def extract_create_time(input_str):
pattern = r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'
match = re.search(pattern, input_str)
if match:
datetime_str = match.group(0)
return datetime_str
else:
return input_str
# 从 "read-374864.html" 中获取数字编号
def extract_page_num(page_str, default_num = 0):
# 定义正则表达式模式
pattern = r'read-(\d+)\.html'
# 使用 re.search 查找匹配项
match = re.search(pattern, page_str)
if match:
number = match.group(1)
return number
else:
return default_num
# 从 "book-5549.html" 中获取数字编号
def extract_book_num(page_str, default_num = 0):
# 定义正则表达式模式
pattern = r'book-(\d+)\.html'
# 使用 re.search 查找匹配项
match = re.search(pattern, page_str)
if match:
number = match.group(1)
return number
else:
return default_num
# 处理 [都市] 的方括号
def remove_brackets_regex(input_str):
pattern = r'\[(.*?)\]'
match = re.match(pattern, input_str)
if match:
return match.group(1)
return input_str
# 定义函数来抓取小说章节内容
def fetch_chapter(url):
try:
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# 这里需要根据实际网页结构修改选择器
chapter_content = soup.find('div', class_='chapter-content').get_text()
return chapter_content
except requests.RequestException as e:
print(f"请求出错: {e}")
return None
# 定义函数来生成 EPUB 文件
def generate_epub(title, author, chapters, path):
book = epub.EpubBook()
book.set_title(title)
book.set_language('zh')
book.add_author(author)
epub_chapters = []
for chapter_title, chapter_content in chapters:
c = epub.EpubHtml(title=chapter_title, file_name=f'{chapter_title}.xhtml', lang='zh')
c.content = f'<h1>{chapter_title}</h1><p>{chapter_content}</p>'
book.add_item(c)
epub_chapters.append(c)
# 定义书的结构
book.toc = tuple(epub_chapters)
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())
# 定义样式
style = 'body { font-family: Times, serif; }'
nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
book.add_item(nav_css)
# 定义书的结构
book.spine = ['nav'] + epub_chapters
# 保存 EPUB 文件
epub.write_epub(f'{path}/{title}.epub', book, {})
# 示例使用
if __name__ == "__main__":
# 这里需要替换为实际的小说章节链接
chapter_info = [
('第一章', 'https://example.com/chapter1'),
('第二章', 'https://example.com/chapter2')
]
title = '小说标题'
author = '小说作者'
chapters = []
for chapter_title, url in chapter_info:
content = fetch_chapter(url)
if content:
chapters.append((chapter_title, content))
if chapters:
generate_epub(title, author, chapters, '.') # path 为必填参数,这里输出到当前目录
print(f'{title}.epub 文件生成成功。')
else:
print('未获取到有效章节内容,无法生成 EPUB 文件。')