diff --git a/aabook/aabook_fetch.py b/aabook/aabook_fetch.py
index 96643c5..d3b7f4c 100644
--- a/aabook/aabook_fetch.py
+++ b/aabook/aabook_fetch.py
@@ -10,6 +10,7 @@ from datetime import datetime
from datetime import date
import config # 日志配置
from down_list import novel_map
+import utils
# 日志
@@ -21,7 +22,8 @@ list_url_wordcount = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&ca
list_url_update = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update'
curr_novel_pages = 0
-meta_dir = 'meta'
+meta_dir = f'{config.global_host_data_dir}/aabook/meta'
+novel_dir = f'{config.global_host_data_dir}/aabook/data'
list_file = f'{meta_dir}/list.txt'
details_file = f'{meta_dir}/details.txt'
@@ -246,7 +248,7 @@ def extract_content_url(soup, base_url, chapid):
# 判断内容是否被污染
def check_content(content):
- if '2005-2024 疯情书库' in content:
+ if '2005-2024 疯情书库' in content or '2005-2025 疯情书库' in content:
return False
return True
@@ -263,13 +265,15 @@ def get_novel_pages():
return curr_novel_pages
# 解析章节内容并保存到文件中
-def download_novel(chapid, novel_name, dir_prefix='./aabook'):
+def download_novel(chapid, novel_name, dir_prefix=novel_dir):
chapter_url = f'{base_url}/read-{chapid}.html'
novel_file = f'{dir_prefix}/{chapid}_{novel_name}.txt'
if os.path.exists(novel_file):
os.remove(novel_file) # 如果存在同名文件,删除重新下载
+ # 保存到其他类型的文件
+ chapters = []
reset_novel_pages()
while chapter_url:
logging.info(f"Processing: [{novel_name}] [{chapid}] {chapter_url}")
@@ -314,6 +318,7 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
f.write(chapter_title + '\n\n')
# 写入每个段落内容到文件
+ content = ''
with open(novel_file, 'a', encoding='utf-8') as f:
for paragraph in paragraphs:
#cleaned_part = clean_watermarks(paragraph.get_text().strip())
@@ -321,7 +326,9 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
#f.write(cleaned_part + '\n\n')
cleaned_text = process_paragraph(paragraph)
f.write(cleaned_text + '\n\n')
+                    content = content + '<p>' + cleaned_text + '</p>'  # in the epub, paragraphs are separated with HTML <p> tags
logging.info(f"Writting content to file. [{novel_name}] [{chapid}] [{chapter_title}]")
+ chapters.append((chapter_title, content))
else:
logging.info(f"Fetching content error: [{novel_name}] {content_url}, retry...")
continue
@@ -356,6 +363,8 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
break
time.sleep(3)
+ # 全部获取完,生成epub文件
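+    # chapters is a list of (chapter_title, html_content) tuples collected above;
+    # utils.generate_epub is assumed here to take (book_name, author, chapters, output_dir).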
+ utils.generate_epub(novel_name, 'nobody', chapters, dir_prefix)
# 检查子目录是否存在,不存在则创建
@@ -400,7 +409,7 @@ def download_books(need_down_list_file = details_file, cursor_file = down_list_f
continue # 已经下载过,跳过
# 创建分类目录
- down_dir = './data/' + category
+ down_dir = f'{novel_dir}/{category}'
create_directory_if_not_exists(down_dir)
# 调用下载函数下载书籍
@@ -420,7 +429,7 @@ def download_map():
# 遍历 novel_map,下载所有小说
for novel_id, novel_name in novel_map.items():
logging.info(f"Starting download for {novel_name} (ID: {novel_id})")
- download_novel(novel_id, novel_name, './local')
+ download_novel(novel_id, novel_name, novel_dir)
logging.info(f"Completed download for {novel_id}_{novel_name}.\n")
# 获取更新列表,并下载
@@ -444,6 +453,10 @@ def main():
print("cmd: get_list, get_detail, get_all, get_update, get_update_all, download, download_map")
sys.exit(1)
+ # 确保目录存在
+ create_directory_if_not_exists(meta_dir)
+ create_directory_if_not_exists(novel_dir)
+
cmd = sys.argv[1]
if cmd == "get_list":
diff --git a/aabook/config.py b/aabook/config.py
index 27942b9..b62e060 100644
--- a/aabook/config.py
+++ b/aabook/config.py
@@ -3,13 +3,9 @@ import os
import inspect
from datetime import datetime
-# MySQL 配置
-db_config = {
- 'host': '172.18.0.3',
- 'user': 'root',
- 'password': 'mysqlpw',
- 'database': 'stockdb'
-}
+home_dir = os.path.expanduser("~")
+global_host_data_dir = f'{home_dir}/hostdir/scripts_data'
+global_share_data_dir = f'{home_dir}/sharedata'
# 设置日志配置
def setup_logging(log_filename=None):
diff --git a/aabook/down_list.py b/aabook/down_list.py
index ff46870..1eef720 100644
--- a/aabook/down_list.py
+++ b/aabook/down_list.py
@@ -10,7 +10,7 @@ novel_map_new = {
}
# 定义小说映射
novel_map = {
- 364489: '诸天之乡村爱情',
+ 371300: '临时夫妻',
}
diff --git a/aabook/src/check_status.py b/aabook/src/check_status.py
new file mode 100644
index 0000000..5cd5b04
--- /dev/null
+++ b/aabook/src/check_status.py
@@ -0,0 +1,12 @@
+import json
+import time
+import sqlite_utils as db_tools
+
+
+if __name__ == "__main__":
+    # query and print statistics via db_tools
+    result = db_tools.get_statics()
+ print(result)
+
+
+
diff --git a/aabook/src/config.py b/aabook/src/config.py
new file mode 100644
index 0000000..47e13a1
--- /dev/null
+++ b/aabook/src/config.py
@@ -0,0 +1,80 @@
+import logging
+import os
+import inspect
+import time
+from datetime import datetime
+from logging.handlers import RotatingFileHandler
+from collections import defaultdict
+
+home_dir = os.path.expanduser("~")
+global_host_data_dir = f'{home_dir}/hostdir/scripts_data'
+global_share_data_dir = f'{home_dir}/sharedata'
+global_sqlite_path = f'{global_share_data_dir}/sqlite/books.db'
+
+log_dir = '../log'
+# 统计日志频率
+log_count = defaultdict(int) # 记录日志的次数
+last_log_time = defaultdict(float) # 记录上次写入的时间戳
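+# Note: both dicts are keyed by the full log message and are never pruned, so memory
+# grows with the number of distinct messages in a long-running process.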
+
+class RateLimitFilter(logging.Filter):
+    """
+    Rate-limit filter: within a 60-second window the same log message is written
+    at most LOG_LIMIT times; anything beyond that is dropped.
+    """
+    LOG_LIMIT = 60  # at most 60 identical messages per minute
+
+ def filter(self, record):
+ global log_count, last_log_time
+ message_key = record.getMessage() # 获取日志内容
+
+ # 计算当前时间
+ now = time.time()
+ elapsed = now - last_log_time[message_key]
+
+ # 限制相同日志的写入频率
+ if elapsed < 60: # 60 秒内
+ log_count[message_key] += 1
+ if log_count[message_key] > self.LOG_LIMIT:
+ print('reach limit.')
+ return False # 直接丢弃
+ else:
+ log_count[message_key] = 1 # 超过 60 秒,重新计数
+
+ last_log_time[message_key] = now
+
+ return True # 允许写入日志
+
+
+
+def setup_logging(log_filename=None):
+ if log_filename is None:
+ caller_frame = inspect.stack()[1]
+ caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]
+ current_date = datetime.now().strftime('%Y%m%d')
+ log_filename = f'{log_dir}/{caller_filename}_{current_date}.log'
+
+    max_log_size = 100 * 1024 * 1024  # 100 MB
+ max_log_files = 10 # 最多保留 10 个日志文件
+
+ file_handler = RotatingFileHandler(log_filename, maxBytes=max_log_size, backupCount=max_log_files)
+ file_handler.setFormatter(logging.Formatter(
+ '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
+ ))
+
+ console_handler = logging.StreamHandler()
+ console_handler.setFormatter(logging.Formatter(
+ '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
+ ))
+
+ # 创建 logger
+ logger = logging.getLogger()
+ logger.setLevel(logging.INFO)
+ logger.handlers = [] # 避免重复添加 handler
+ logger.addHandler(file_handler)
+ logger.addHandler(console_handler)
+
+ # 添加频率限制
+ rate_limit_filter = RateLimitFilter()
+ file_handler.addFilter(rate_limit_filter)
+ console_handler.addFilter(rate_limit_filter)
\ No newline at end of file
diff --git a/aabook/src/convert_utils.py b/aabook/src/convert_utils.py
new file mode 100644
index 0000000..bca87ad
--- /dev/null
+++ b/aabook/src/convert_utils.py
@@ -0,0 +1,126 @@
+from ebooklib import epub
+from reportlab.lib.pagesizes import letter
+from reportlab.lib.styles import getSampleStyleSheet
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
+
+
+def generate_epub(data, save_path):
+ # 创建 EPUB 书籍对象
+ book = epub.EpubBook()
+
+ # 设置书籍元数据
+ book.set_title(data.get('title', '未知标题'))
+ book.set_language('zh')
+ book.add_author(data.get('author', '未知作者'))
+
+ # 存储所有章节对象
+ all_chapters = []
+
+ sections = data.get('sections', [])
+
+ if len(sections) == 1:
+ # 如果只有一个 section,忽略 section 的 title,按一级目录处理
+ for chapter in sections[0].get('chapters', []):
+ chapter_title = chapter.get('title', '未知章节')
+ chapter_content = chapter.get('content', '')
+ paragraphs = chapter_content.split('\n\n')
+            html_content = ''.join([f'<p>{para}</p>' for para in paragraphs])
+ chapter_obj = epub.EpubHtml(title=chapter_title, file_name=f'{chapter_title}.xhtml', lang='zh')
+            chapter_obj.content = f'<h1>{chapter_title}</h1>{html_content}'
+ book.add_item(chapter_obj)
+ all_chapters.append(chapter_obj)
+ else:
+ # 如果有多个 section,按两级目录处理
+ for section in sections:
+ section_title = section.get('title', '未知卷')
+ section_chapter = epub.EpubHtml(title=section_title, file_name=f'{section_title}.xhtml', lang='zh')
+            section_chapter.content = f'<h1>{section_title}</h1>'
+ book.add_item(section_chapter)
+ all_chapters.append(section_chapter)
+
+ for chapter in section.get('chapters', []):
+ chapter_title = chapter.get('title', '未知章节')
+ chapter_content = chapter.get('content', '')
+ paragraphs = chapter_content.split('\n\n')
+                html_content = ''.join([f'<p>{para}</p>' for para in paragraphs])
+ chapter_obj = epub.EpubHtml(title=chapter_title, file_name=f'{chapter_title}.xhtml', lang='zh')
+                chapter_obj.content = f'<h2>{chapter_title}</h2>{html_content}'
+ book.add_item(chapter_obj)
+ all_chapters.append(chapter_obj)
+
+ # 定义书籍的目录
+ book.toc = tuple(all_chapters)
+
+ # 定义书的结构
+ book.add_item(epub.EpubNcx())
+ book.add_item(epub.EpubNav())
+
+ # 定义样式
+ style = 'body { font-family: Times, serif; }'
+ nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
+ book.add_item(nav_css)
+
+ # 定义书的结构
+ book.spine = ['nav'] + all_chapters
+
+ # 保存 EPUB 文件
+ epub.write_epub(save_path, book, {})
+
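+# Illustrative call (field names inferred from the code above, values are placeholders):
+#   generate_epub({
+#       'title': 'Example Book',
+#       'author': 'Anonymous',
+#       'sections': [
+#           {'title': 'Volume 1',
+#            'chapters': [{'title': 'Chapter 1', 'content': 'First paragraph\n\nSecond paragraph'}]},
+#       ],
+#   }, '/tmp/example.epub')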
+
+def generate_pdf(data, save_path):
+    # build the PDF with SimpleDocTemplate so pagination and layout are handled automatically
+    doc = SimpleDocTemplate(save_path, pagesize=letter)
+    styles = getSampleStyleSheet()
+    story = []
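+    # Note: ReportLab's built-in fonts have no CJK glyphs, so Chinese text will not render
+    # correctly unless a CJK-capable font is registered and applied to the styles.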
+
+ # 设置标题
+ title = data.get('title', '未知标题')
+ story.append(Paragraph(f'{title}', styles['Title']))
+ story.append(Spacer(1, 20))
+
+ # 设置作者
+ author = data.get('author', '未知作者')
+ story.append(Paragraph(f'作者: {author}', styles['Normal']))
+ story.append(Spacer(1, 40))
+
+ sections = data.get('sections', [])
+
+ if len(sections) == 1:
+ # 如果只有一个 section,忽略 section 的 title,按一级目录处理
+ for chapter in sections[0].get('chapters', []):
+ chapter_title = chapter.get('title', '未知章节')
+ chapter_content = chapter.get('content', '')
+ story.append(Paragraph(f'{chapter_title}', styles['Heading1']))
+ story.append(Spacer(1, 10))
+ paragraphs = chapter_content.split('\n\n')
+ for para in paragraphs:
+ story.append(Paragraph(para, styles['Normal']))
+ story.append(Spacer(1, 10))
+ story.append(Spacer(1, 20))
+ else:
+ # 如果有多个 section,按两级目录处理
+ for section in sections:
+ section_title = section.get('title', '未知卷')
+ story.append(Paragraph(f'{section_title}', styles['Heading1']))
+ story.append(Spacer(1, 15))
+ for chapter in section.get('chapters', []):
+ chapter_title = chapter.get('title', '未知章节')
+ chapter_content = chapter.get('content', '')
+ story.append(Paragraph(f'{chapter_title}', styles['Heading2']))
+ story.append(Spacer(1, 10))
+ paragraphs = chapter_content.split('\n\n')
+ for para in paragraphs:
+ story.append(Paragraph(para, styles['Normal']))
+ story.append(Spacer(1, 10))
+ story.append(Spacer(1, 15))
+
+    # lay out all flowables and write the PDF file
+    doc.build(story)
+
\ No newline at end of file
diff --git a/aabook/src/fetch.py b/aabook/src/fetch.py
new file mode 100644
index 0000000..c44ee54
--- /dev/null
+++ b/aabook/src/fetch.py
@@ -0,0 +1,312 @@
+import json
+import time
+import csv
+import argparse
+import logging
+from functools import partial
+import config
+import sqlite_utils as db_tools
+import scraper
+import utils
+
+config.setup_logging()
+
+debug = False
+force = False
+
+# 获取列表
+def fetch_book_list():
+ url = scraper.list_url_update
+ while True:
+ logging.info(f'fetching book list. url: {url}')
+ soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="list_main", attr_type="class"))
+ if soup:
+ # 获取书籍列表
+ list_data, next_url = scraper.parse_book_list(soup, url=url)
+ for item in list_data:
+ row_id = db_tools.insert_or_update_common(item, db_tools.tbl_name_books)
+ if row_id:
+                    logging.debug(f"insert one book. row_id: {row_id}, name: {item['name']}")
+                else:
+                    logging.warning(f"insert book error. name: {item['name']}, href: {item['href']}")
+ if next_url is None:
+                logging.info('fetched all pages.')
+ return True
+ else:
+ url = next_url
+ elif status_code and status_code == 404:
+            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
+ else:
+ logging.warning(f'fetch page error. {url} ...')
+
+
+# 获取详情
+def fetch_real_content(url):
+ soup, status_code = scraper.fetch_page(url, scraper.content_validator)
+ if soup:
+ data = scraper.parse_content_page(soup, url)
+ if data:
+ return data # 段落的数组
+ elif status_code and status_code == 404:
+        logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
+ else:
+ logging.warning(f'fetch page error. {url} ...')
+ return None
+
+
+# 获取内容页
+def fetch_chapter_content(url):
+ chapter_data = {}
+ next_url = None
+
+ soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="h1", identifier="chapter_title", attr_type="class"))
+ if soup:
+ data, next_url = scraper.parse_chapter_page(soup, url)
+ if data:
+ chapter_data['title'] = data['title']
+ contents = fetch_real_content(data['content_url'])
+ if contents:
+ chapter_data['contents'] = contents
+ else:
+                logging.warning(f"fetching real content failed. url: {data['content_url']}")
+ return None, None
+ else:
+ logging.warning(f'fetch chapter page no data. url: {url}')
+ return None, None
+ else:
+ logging.warning(f'fetch chapter page error. url: {url}, status_code: {status_code}')
+ return None, None
+
+ return chapter_data, next_url
+
+# 获取小说详情页,获得首页地址
+def fetch_book_detail(url):
+ soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="li", identifier="zuopinxinxi", attr_type="class"))
+ if soup:
+ detail = scraper.parse_book_detail(soup, url)
+ return detail
+ else:
+ return None
+
+# 获取某本小说的目录页
+def fetch_book_toc(url):
+ soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="page_main", attr_type="class"))
+ if soup:
+        listdata = scraper.parse_chapter_list(soup, url)
+ return listdata
+ else:
+ return None
+
+# 获取小说的目录页,并插入到数据库
+def fetch_table_of_contents():
+ while True:
+ update_list = db_tools.query_books(id=2547, is_latest=0, limit = 2 if debug else 100)
+ if update_list is None or len(update_list) <1 :
+            logging.info('no more data needs fetching.')
+ return
+
+ for row in update_list:
+ name = row['name']
+ href = row['href']
+ bookid = row['id']
+ # 先打开详情页
+ logging.info(f'----------fetching book {name}: {href}-------------')
+ book_detail = fetch_book_detail(href)
+ if book_detail is None:
+ logging.warning(f'get book detail failed. url: {href}')
+ continue
+
+ # 获取目录页
+ toc_url = book_detail['table_of_contents_href']
+ if toc_url is None or toc_url == '':
+ logging.warning(f'table_of_contents_href is not correct. url: {href}')
+ continue
+
+ logging.info(f'fetching page: {toc_url}')
+ toc_data = fetch_book_toc(toc_url)
+
+ # 解析目录页
+ if toc_data is None:
+ logging.warning(f'fetch_book_toc error. url: {toc_url}')
+ continue
+
+ # 插入所有的目录数据
+ succ = 1
+ for row in toc_data:
+ section_title = row['title']
+ chapters = row['chapters']
+ section_id = db_tools.insert_or_update_book_sections({
+ 'book_id' : int(bookid),
+ 'section' : section_title,
+ 'bookid_section': f'{bookid}_{section_title}'
+ })
+ if section_id is None:
+ logging.warning(f'insert section error. url: {toc_url}, section: {section_title}')
+ succ = 0
+ break
+ else:
+ logging.debug(f'insert one books_sections record. id:{section_id}, key: {bookid}_{section_title}')
+
+ # 插入目录数据
+ for chap in chapters:
+ chap_row_id = db_tools.insert_chapter_data({
+ 'book_id': bookid,
+ 'chapter_id': chap['chapter_id'],
+ 'section_id': section_id,
+ 'title': chap['title'],
+ 'href': chap['href'],
+ 'content': '',
+ 'has_content' : 0
+ })
+ if chap_row_id is None:
+ logging.warning(f'insert_chapter_data error. url: {toc_url}')
+ succ = 0
+ break
+ if succ == 0 :
+ logging.warning(f'fetch_book_toc data error. url: {toc_url}')
+ continue
+
+ # 读取完毕,更新列表
+ row_id = db_tools.update_book_detail({
+ 'href' : href,
+ **book_detail
+ })
+ if row_id:
+ logging.debug(f'update book succ. id: {row_id}, url: {href}')
+ else:
+ logging.warning(f'update book failed. url: {href}')
+ if debug:
+ return
+
+# 直接获取小说内容
+def fetch_contents():
+ while True:
+ list_data = db_tools.query_no_content_chapters(limit = 10 if debug else 100)
+ if list_data is None or len(list_data) <1 :
+            logging.info('no more data needs fetching.')
+ return
+
+ for row in list_data:
+ url = row['href']
+            logging.info(f"fetching content ({row['title']}) from {url}")
+ content, next_url = fetch_chapter_content(url)
+ if content and content['title'] and content['contents']:
+ # 写入到数据表里
+ db_tools.insert_chapter_data({
+ 'book_id': row['book_id'],
+ 'chapter_id': row['chapter_id'],
+ 'section_id': row['section_id'],
+ 'title': row['title'],
+ 'href': url,
+ 'content': '\n\n'.join(content['contents']),
+ 'has_content': 1
+ })
+ else:
+ logging.warning(f'fetch content error. url: {url}')
+ if debug:
+ return
+
+
+'''
+# 下载完整的小说
+def fetch_book_data():
+ update_list = db_tools.query_books(need_update=1, limit = 1)
+ if update_list:
+ for row in update_list:
+ name = row['name']
+ href = row['href']
+ bookid = row['id']
+ # 先打开详情页
+ logging.info(f'----------fetching book {name}: {href}-------------')
+ book_detail = fetch_book_detail(href)
+ if book_detail:
+ # 获取内容页,然后循环读取内容
+ chapter_url = book_detail['start_page_href']
+ chapter_id = utils.extract_page_num(chapter_url)
+ # 断点续传,从上次拉取的最后一页开始
+ if not force:
+ last_chapter_url = db_tools.query_last_chapter_by_book(bookid)
+ if last_chapter_url:
+ chapter_url = last_chapter_url
+ while chapter_url:
+ logging.info(f'fetching page: {chapter_url}')
+ content, next_url = fetch_chapter_content(chapter_url)
+ if content and content['title'] and content['contents']:
+ # 写入到数据表里
+ db_tools.insert_chapter_data({
+ 'book_id': bookid,
+ 'chapter_id': chapter_id,
+ 'title': content['title'],
+ 'href': chapter_url,
+ 'content': '\n\n'.join(content['contents']),
+ 'has_content': 1
+ })
+
+ if debug:
+ return
+ else:
+ logging.warning(f'fetch content error. url: {chapter_url}')
+ chapter_url = next_url
+ # 读取完毕,更新列表
+ row_id = db_tools.update_book_detail({
+ 'href' : href,
+ **book_detail
+ })
+ if row_id:
+ logging.debug(f'update book succ. id: {row_id}, url: {href}')
+ else:
+ logging.warning(f'update book failed. url: {href}')
+ else:
+ logging.warning(f'get book detail failed. url: {href}')
+ else:
+ logging.warning(f'get no data needed update.')
+'''
+
+# 建立缩写到函数的映射
+function_map = {
+ "list": fetch_book_list,
+ "toc" : fetch_table_of_contents,
+ "content": fetch_contents,
+}
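+# Typical invocations (illustrative):
+#   python fetch.py --cmd list           # refresh the book list only
+#   python fetch.py --cmd toc,content    # fetch tables of contents, then chapter contents
+#   python fetch.py --debug              # run every step with small record limits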
+
+# 主函数
+def main(cmd, args_debug, args_force):
+ global debug
+ debug = args_debug
+
+ global force
+ force = args_force
+
+ # 执行指定的函数
+ if cmd:
+        function_names = cmd.split(",")  # split the comma-separated command list
+ for short_name in function_names:
+ func = function_map.get(short_name.strip()) # 从映射中获取对应的函数
+ if callable(func):
+ func()
+ else:
+ logging.warning(f" {short_name} is not a valid function shortcut.")
+ else: # 全量执行
+ for name, func in function_map.items():
+ if callable(func):
+ func()
+ else:
+                logging.warning(f" {name} is not a valid function shortcut.")
+
+ logging.info(f'all process completed!')
+
+ # TODO:
+ # 1,
+
+if __name__ == "__main__":
+ # 命令行参数处理
+ keys_str = ",".join(function_map.keys())
+
+ parser = argparse.ArgumentParser(description='fetch aabook data.')
+ parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}")
+ parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
+ parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
+ args = parser.parse_args()
+
+ main(args.cmd, args.debug, args.force)
diff --git a/aabook/src/scraper.py b/aabook/src/scraper.py
new file mode 100644
index 0000000..6eb194d
--- /dev/null
+++ b/aabook/src/scraper.py
@@ -0,0 +1,364 @@
+import time
+import json
+import csv
+import logging
+import signal
+import sys
+import os
+import re
+import requests
+import random
+from bs4 import BeautifulSoup
+from requests.exceptions import RequestException
+from functools import partial
+import config
+import utils
+
+# 定义基础 URL 和可变参数
+host_url = 'https://aabook.xyz'
+list_url_update = f'{host_url}/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update'
+#list_url_wordcount = 'https://aabook.xyz/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
+
+# User-Agent 列表
+user_agents = [
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67",
+ "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0",
+ "Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36"
+]
+
+# Fetch a page with requests (random User-Agent), validate the parsed result, and optionally preprocess the HTML; supports different parsers
+def fetch_page(url, validator, parser="html.parser", preprocessor=None, max_retries=3, sleep_time=5, default_timeout=10):
+ for attempt in range(max_retries):
+ try:
+ if 'aabook.xyz' not in url.lower():
+ logging.error(f'wrong url format: {url}')
+ return None, None
+
+ # 随机选择一个 User-Agent
+ headers = {
+ 'User-Agent': random.choice(user_agents)
+ }
+ response = requests.get(url, headers=headers, timeout=default_timeout, stream=True)
+
+ # 处理 HTTP 状态码
+ if response.status_code == 404:
+ logging.warning(f"Page not found (404): {url}")
+ return None, 404 # 直接返回 404,调用方可以跳过
+
+ response.raise_for_status() # 处理 HTTP 错误
+
+ # 预处理 HTML(如果提供了 preprocessor)
+ html_text = preprocessor(response.text) if preprocessor else response.text
+
+ soup = BeautifulSoup(html_text, parser)
+ if validator(soup): # 进行自定义页面检查
+ return soup, response.status_code
+
+ logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
+ except requests.RequestException as e:
+            logging.info(f"Error fetching page {url}: {e}. Retrying ...")
+ time.sleep(sleep_time) # 休眠指定的时间,然后重试
+
+ logging.error(f'Fetching failed after max retries. {url}')
+ return None, None # 达到最大重试次数仍然失败
+
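+# Illustrative usage (generic_validator is defined elsewhere in this module):
+#   soup, code = fetch_page(url, partial(generic_validator, tag="div", identifier="list_main", attr_type="class"))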
+
+# 解析列表页
+def parse_book_list(soup, url):
+ # 查找书籍列表
+ list_main = soup.find('div', class_='list_main')
+ if not list_main:
+ logging.warning(f"No list_main Found in {url}")
+ return None, None
+
+ tbody = list_main.find('tbody')
+ if not tbody:
+ logging.warning(f"No tbody found in {url}")
+        return None, None
+
+ list_data = []
+ next_url = None
+ # 获取每本书的基础信息:排名、分类、书名、作者、月票、更新时间(按字数排序时是总字数,按日期排序时是最后更新日期)
+ for tr in tbody.find_all('tr'):
+ tds = tr.find_all('td')
+        if len(tds) < 6:
+            logging.info("Invalid tr format, skipping row.")
+            continue
+ ranking = tds[0].text.strip()
+ category = utils.remove_brackets_regex(tds[1].text.strip())
+ book_link_tag = tds[2].find('a')
+ book_name = book_link_tag.text.strip()
+ book_link = host_url + '/' + book_link_tag['href'] if book_link_tag.get('href') else ''
+ book_num = utils.extract_book_num(book_link_tag['href'])
+ author = tds[3].text.strip()
+ monthly_tickets = tds[4].text.strip()
+        update_time = tds[5].text.strip()  # last-update date when sorted by update; total word count when sorted by wordcount
+
+ list_data.append({
+ 'rank': ranking,
+ 'category': category,
+ 'name': book_name,
+ 'href': book_link,
+ 'num': book_num,
+ 'author': author,
+ 'tickets': monthly_tickets,
+ 'update_time': update_time
+ })
+
+ # 查找下一页链接
+ next_page_tag = soup.find('a', title='下一页')
+ if next_page_tag:
+ next_url = host_url + next_page_tag['href']
+
+ return list_data, next_url
+
+# 解析详情页
+def parse_book_detail(soup, url):
+ # 解析书籍详细信息
+ book_info_tag = soup.find('li', class_='zuopinxinxi')
+ if not book_info_tag:
+ logging.warning(f"No details found in {url}")
+ return None
+
+ table_of_contents_href = ''
+ table_of_contents_href_tag = soup.find('li', class_='xiaoshuomulu')
+ if table_of_contents_href_tag:
+ table_of_contents_href = host_url + table_of_contents_href_tag.find('a')['href']
+
+ book_info_lis = book_info_tag.find_all('li')
+ if len(book_info_lis) < 4:
+ logging.info(f"invalid book info in {url}")
+ return None
+
+ book_category = book_info_lis[0].find('span').text.strip()
+ book_status = book_info_lis[1].find('span').text.strip()
+ # 去掉后面的汉字,只要数字
+ total_word_count = book_info_lis[2].find('span').text.strip()
+ total_word_count = int(re.search(r'\d+', total_word_count).group())
+
+ total_clicks = book_info_lis[3].find('span').text.strip()
+ month_clicks = book_info_lis[4].find('span').text.strip() if len(book_info_lis) >4 else '0'
+ week_clicks = book_info_lis[5].find('span').text.strip() if len(book_info_lis) >5 else '0'
+ total_recommend = book_info_lis[6].find('span').text.strip() if len(book_info_lis) >6 else '0'
+ month_recommend = book_info_lis[7].find('span').text.strip() if len(book_info_lis) >7 else '0'
+ week_recommend = book_info_lis[8].find('span').text.strip() if len(book_info_lis) >8 else '0'
+
+ # 读取创建时间
+ creation_time_tag = soup.find('li', class_='update_time')
+ created_time = utils.extract_create_time(creation_time_tag.text.strip() if creation_time_tag else '')
+
+ # 获取起始页链接和编号
+ start_page_tag = soup.find('ul', class_='gezhonganniu').find_all('li')[0].find('a')
+ start_page_link = host_url + '/' + start_page_tag['href']
+ start_page_number = start_page_link.split('-')[-1].replace('.html', '')
+
+ return {
+ 'category': book_category,
+ 'status' : book_status,
+ 'total_words' : total_word_count,
+ 'total_clicks': total_clicks,
+ 'month_clicks': month_clicks,
+ 'week_clicks': week_clicks,
+ 'total_recommend': total_recommend,
+ 'month_recommend': month_recommend,
+ 'week_recommend': week_recommend,
+ 'created_time': created_time,
+ 'start_page_href': start_page_link,
+ 'start_page_num': start_page_number,
+ 'table_of_contents_href': table_of_contents_href
+ }
+
+# 解析书籍的目录页
+def parse_chapter_list(soup, url):
+ # 获取小说的目录
+ table_of_contents = []
+ div_table_of_contents = soup.find('div', class_='page_main')
+ if not div_table_of_contents:
+ return None
+
+ section_titles = div_table_of_contents.find_all('p', class_='section_title')
+ sections = div_table_of_contents.find_all('ul', class_='section_list')
+ if len(sections) > len(section_titles): # 一般是 后者比前者多1个,最后一个是广告
+ logging.warning(f'sections not matched titles, url: {url}, titles: {len(section_titles)}, sections: {len(sections)}')
+ return None
+ else:
+ for i in range(len(sections)):
+ section_title = section_titles[i].get_text().strip()
+ chap_list = sections[i].find_all("a")
+ chap_data = []
+ for chap in chap_list:
+ chap_title = chap.get_text().strip()
+                chap_link = f"{host_url}/{chap['href']}"
+ chap_id = utils.extract_page_num(chap_link)
+ chap_data.append({'href': chap_link, 'title': chap_title, 'chapter_id': chap_id})
+ table_of_contents.append({'title': section_title, 'chapters': chap_data})
+
+ return table_of_contents
+
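+# parse_chapter_list returns roughly:
+#   [{'title': '<section title>', 'chapters': [{'href': ..., 'title': ..., 'chapter_id': ...}]}, ...]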
+# 解析书籍的章节页
+def parse_chapter_page(soup, url):
+ # 获取章节标题
+ chapter_title_tag = soup.find('h1', class_='chapter_title')
+ if chapter_title_tag is None:
+ logging.warning(f'Chapter title not found in {url}')
+ return None, None
+
+ title = chapter_title_tag.get_text().strip()
+ content_url = None
+ next_url = None
+ chapid = utils.extract_page_num(url)
+
+ # 遍历每一个