From a4ea79d4db20b0cd21f1367950c93c887e4a458a Mon Sep 17 00:00:00 2001 From: oscarz Date: Tue, 18 Mar 2025 17:45:20 +0800 Subject: [PATCH] modify scripts --- aabook/aabook_fetch.py | 23 +- aabook/config.py | 10 +- aabook/down_list.py | 2 +- aabook/src/check_status.py | 12 + aabook/src/config.py | 80 +++++++ aabook/src/convert_utils.py | 126 +++++++++++ aabook/src/fetch.py | 312 +++++++++++++++++++++++++ aabook/src/scraper.py | 364 ++++++++++++++++++++++++++++++ aabook/src/sqlite_utils.py | 278 +++++++++++++++++++++++ aabook/src/utils.py | 53 +++++ aabook/{ => tools}/tools_diff.py | 0 aabook/{ => tools}/tools_dir.py | 0 aabook/{ => tools}/tools_other.py | 0 aabook/utils.py | 122 ++++++++++ 14 files changed, 1369 insertions(+), 13 deletions(-) create mode 100644 aabook/src/check_status.py create mode 100644 aabook/src/config.py create mode 100644 aabook/src/convert_utils.py create mode 100644 aabook/src/fetch.py create mode 100644 aabook/src/scraper.py create mode 100644 aabook/src/sqlite_utils.py create mode 100644 aabook/src/utils.py rename aabook/{ => tools}/tools_diff.py (100%) rename aabook/{ => tools}/tools_dir.py (100%) rename aabook/{ => tools}/tools_other.py (100%) create mode 100644 aabook/utils.py diff --git a/aabook/aabook_fetch.py b/aabook/aabook_fetch.py index 96643c5..d3b7f4c 100644 --- a/aabook/aabook_fetch.py +++ b/aabook/aabook_fetch.py @@ -10,6 +10,7 @@ from datetime import datetime from datetime import date import config # 日志配置 from down_list import novel_map +import utils # 日志 @@ -21,7 +22,8 @@ list_url_wordcount = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&ca list_url_update = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update' curr_novel_pages = 0 -meta_dir = 'meta' +meta_dir = f'{config.global_host_data_dir}/aabook/meta' +novel_dir = f'{config.global_host_data_dir}/aabook/data' list_file = f'{meta_dir}/list.txt' details_file = f'{meta_dir}/details.txt' @@ -246,7 +248,7 @@ def extract_content_url(soup, base_url, chapid): # 判断内容是否被污染 def check_content(content): - if '2005-2024 疯情书库' in content: + if '2005-2024 疯情书库' in content or '2005-2025 疯情书库' in content: return False return True @@ -263,13 +265,15 @@ def get_novel_pages(): return curr_novel_pages # 解析章节内容并保存到文件中 -def download_novel(chapid, novel_name, dir_prefix='./aabook'): +def download_novel(chapid, novel_name, dir_prefix=novel_dir): chapter_url = f'{base_url}/read-{chapid}.html' novel_file = f'{dir_prefix}/{chapid}_{novel_name}.txt' if os.path.exists(novel_file): os.remove(novel_file) # 如果存在同名文件,删除重新下载 + # 保存到其他类型的文件 + chapters = [] reset_novel_pages() while chapter_url: logging.info(f"Processing: [{novel_name}] [{chapid}] {chapter_url}") @@ -314,6 +318,7 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'): f.write(chapter_title + '\n\n') # 写入每个段落内容到文件 + content = '' with open(novel_file, 'a', encoding='utf-8') as f: for paragraph in paragraphs: #cleaned_part = clean_watermarks(paragraph.get_text().strip()) @@ -321,7 +326,9 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'): #f.write(cleaned_part + '\n\n') cleaned_text = process_paragraph(paragraph) f.write(cleaned_text + '\n\n') + content = content + '
<p>' + cleaned_text + '</p>
' # epub 里面,用html标签来分段落 logging.info(f"Writting content to file. [{novel_name}] [{chapid}] [{chapter_title}]") + chapters.append((chapter_title, content)) else: logging.info(f"Fetching content error: [{novel_name}] {content_url}, retry...") continue @@ -356,6 +363,8 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'): break time.sleep(3) + # 全部获取完,生成epub文件 + utils.generate_epub(novel_name, 'nobody', chapters, dir_prefix) # 检查子目录是否存在,不存在则创建 @@ -400,7 +409,7 @@ def download_books(need_down_list_file = details_file, cursor_file = down_list_f continue # 已经下载过,跳过 # 创建分类目录 - down_dir = './data/' + category + down_dir = f'{novel_dir}/{category}' create_directory_if_not_exists(down_dir) # 调用下载函数下载书籍 @@ -420,7 +429,7 @@ def download_map(): # 遍历 novel_map,下载所有小说 for novel_id, novel_name in novel_map.items(): logging.info(f"Starting download for {novel_name} (ID: {novel_id})") - download_novel(novel_id, novel_name, './local') + download_novel(novel_id, novel_name, novel_dir) logging.info(f"Completed download for {novel_id}_{novel_name}.\n") # 获取更新列表,并下载 @@ -444,6 +453,10 @@ def main(): print("cmd: get_list, get_detail, get_all, get_update, get_update_all, download, download_map") sys.exit(1) + # 确保目录存在 + create_directory_if_not_exists(meta_dir) + create_directory_if_not_exists(novel_dir) + cmd = sys.argv[1] if cmd == "get_list": diff --git a/aabook/config.py b/aabook/config.py index 27942b9..b62e060 100644 --- a/aabook/config.py +++ b/aabook/config.py @@ -3,13 +3,9 @@ import os import inspect from datetime import datetime -# MySQL 配置 -db_config = { - 'host': '172.18.0.3', - 'user': 'root', - 'password': 'mysqlpw', - 'database': 'stockdb' -} +home_dir = os.path.expanduser("~") +global_host_data_dir = f'{home_dir}/hostdir/scripts_data' +global_share_data_dir = f'{home_dir}/sharedata' # 设置日志配置 def setup_logging(log_filename=None): diff --git a/aabook/down_list.py b/aabook/down_list.py index ff46870..1eef720 100644 --- a/aabook/down_list.py +++ b/aabook/down_list.py @@ -10,7 +10,7 @@ novel_map_new = { } # 定义小说映射 novel_map = { - 364489: '诸天之乡村爱情', + 371300: '临时夫妻', } diff --git a/aabook/src/check_status.py b/aabook/src/check_status.py new file mode 100644 index 0000000..5cd5b04 --- /dev/null +++ b/aabook/src/check_status.py @@ -0,0 +1,12 @@ +import json +import time +import sqlite_utils as db_tools + + +if __name__ == "__main__": + # 命令行参数处理 + result = db_tools.get_statics() + print(result) + + + diff --git a/aabook/src/config.py b/aabook/src/config.py new file mode 100644 index 0000000..47e13a1 --- /dev/null +++ b/aabook/src/config.py @@ -0,0 +1,80 @@ +import logging +import os +import inspect +import time +from datetime import datetime +from logging.handlers import RotatingFileHandler +from collections import defaultdict + +home_dir = os.path.expanduser("~") +global_host_data_dir = f'{home_dir}/hostdir/scripts_data' +global_share_data_dir = f'{home_dir}/sharedata' +global_sqlite_path = f'{global_share_data_dir}/sqlite/books.db' + +log_dir = '../log' +# 统计日志频率 +log_count = defaultdict(int) # 记录日志的次数 +last_log_time = defaultdict(float) # 记录上次写入的时间戳 + +class RateLimitFilter(logging.Filter): + """ + 频率限制过滤器: + 1. 在 60 秒内,同样的日志最多写入 60 次,超过则忽略 + 2. 
如果日志速率超过 100 条/秒,发出告警 + """ + LOG_LIMIT = 60 # 每分钟最多记录相同消息 10 次 + + def filter(self, record): + global log_count, last_log_time + message_key = record.getMessage() # 获取日志内容 + + # 计算当前时间 + now = time.time() + elapsed = now - last_log_time[message_key] + + # 限制相同日志的写入频率 + if elapsed < 60: # 60 秒内 + log_count[message_key] += 1 + if log_count[message_key] > self.LOG_LIMIT: + print('reach limit.') + return False # 直接丢弃 + else: + log_count[message_key] = 1 # 超过 60 秒,重新计数 + + last_log_time[message_key] = now + + return True # 允许写入日志 + + + +def setup_logging(log_filename=None): + if log_filename is None: + caller_frame = inspect.stack()[1] + caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0] + current_date = datetime.now().strftime('%Y%m%d') + log_filename = f'{log_dir}/{caller_filename}_{current_date}.log' + + max_log_size = 100 * 1024 * 1024 # 10 MB + max_log_files = 10 # 最多保留 10 个日志文件 + + file_handler = RotatingFileHandler(log_filename, maxBytes=max_log_size, backupCount=max_log_files) + file_handler.setFormatter(logging.Formatter( + '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s' + )) + + console_handler = logging.StreamHandler() + console_handler.setFormatter(logging.Formatter( + '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s' + )) + + # 创建 logger + logger = logging.getLogger() + logger.setLevel(logging.INFO) + logger.handlers = [] # 避免重复添加 handler + logger.addHandler(file_handler) + logger.addHandler(console_handler) + + # 添加频率限制 + rate_limit_filter = RateLimitFilter() + file_handler.addFilter(rate_limit_filter) + console_handler.addFilter(rate_limit_filter) \ No newline at end of file diff --git a/aabook/src/convert_utils.py b/aabook/src/convert_utils.py new file mode 100644 index 0000000..bca87ad --- /dev/null +++ b/aabook/src/convert_utils.py @@ -0,0 +1,126 @@ +from ebooklib import epub +from reportlab.lib.pagesizes import letter +from reportlab.pdfgen import canvas +from reportlab.lib.styles import getSampleStyleSheet +from reportlab.platypus import Paragraph, Spacer + + +def generate_epub(data, save_path): + # 创建 EPUB 书籍对象 + book = epub.EpubBook() + + # 设置书籍元数据 + book.set_title(data.get('title', '未知标题')) + book.set_language('zh') + book.add_author(data.get('author', '未知作者')) + + # 存储所有章节对象 + all_chapters = [] + + sections = data.get('sections', []) + + if len(sections) == 1: + # 如果只有一个 section,忽略 section 的 title,按一级目录处理 + for chapter in sections[0].get('chapters', []): + chapter_title = chapter.get('title', '未知章节') + chapter_content = chapter.get('content', '') + paragraphs = chapter_content.split('\n\n') + html_content = ''.join([f'
<p>{para}</p>
' for para in paragraphs]) + chapter_obj = epub.EpubHtml(title=chapter_title, file_name=f'{chapter_title}.xhtml', lang='zh') + chapter_obj.content = f'
<h1>{chapter_title}</h1>
{html_content}' + book.add_item(chapter_obj) + all_chapters.append(chapter_obj) + else: + # 如果有多个 section,按两级目录处理 + for section in sections: + section_title = section.get('title', '未知卷') + section_chapter = epub.EpubHtml(title=section_title, file_name=f'{section_title}.xhtml', lang='zh') + section_chapter.content = f'
<h1>{section_title}</h1>
' + book.add_item(section_chapter) + all_chapters.append(section_chapter) + + for chapter in section.get('chapters', []): + chapter_title = chapter.get('title', '未知章节') + chapter_content = chapter.get('content', '') + paragraphs = chapter_content.split('\n\n') + html_content = ''.join([f'
<p>{para}</p>
' for para in paragraphs]) + chapter_obj = epub.EpubHtml(title=chapter_title, file_name=f'{chapter_title}.xhtml', lang='zh') + chapter_obj.content = f'
<h1>{chapter_title}</h1>
{html_content}' + book.add_item(chapter_obj) + all_chapters.append(chapter_obj) + + # 定义书籍的目录 + book.toc = tuple(all_chapters) + + # 定义书的结构 + book.add_item(epub.EpubNcx()) + book.add_item(epub.EpubNav()) + + # 定义样式 + style = 'body { font-family: Times, serif; }' + nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style) + book.add_item(nav_css) + + # 定义书的结构 + book.spine = ['nav'] + all_chapters + + # 保存 EPUB 文件 + epub.write_epub(save_path, book, {}) + + +def generate_pdf(data, save_path): + # 创建 PDF 画布 + c = canvas.Canvas(save_path, pagesize=letter) + styles = getSampleStyleSheet() + story = [] + + # 设置标题 + title = data.get('title', '未知标题') + story.append(Paragraph(f'{title}', styles['Title'])) + story.append(Spacer(1, 20)) + + # 设置作者 + author = data.get('author', '未知作者') + story.append(Paragraph(f'作者: {author}', styles['Normal'])) + story.append(Spacer(1, 40)) + + sections = data.get('sections', []) + + if len(sections) == 1: + # 如果只有一个 section,忽略 section 的 title,按一级目录处理 + for chapter in sections[0].get('chapters', []): + chapter_title = chapter.get('title', '未知章节') + chapter_content = chapter.get('content', '') + story.append(Paragraph(f'{chapter_title}', styles['Heading1'])) + story.append(Spacer(1, 10)) + paragraphs = chapter_content.split('\n\n') + for para in paragraphs: + story.append(Paragraph(para, styles['Normal'])) + story.append(Spacer(1, 10)) + story.append(Spacer(1, 20)) + else: + # 如果有多个 section,按两级目录处理 + for section in sections: + section_title = section.get('title', '未知卷') + story.append(Paragraph(f'{section_title}', styles['Heading1'])) + story.append(Spacer(1, 15)) + for chapter in section.get('chapters', []): + chapter_title = chapter.get('title', '未知章节') + chapter_content = chapter.get('content', '') + story.append(Paragraph(f'{chapter_title}', styles['Heading2'])) + story.append(Spacer(1, 10)) + paragraphs = chapter_content.split('\n\n') + for para in paragraphs: + story.append(Paragraph(para, styles['Normal'])) + story.append(Spacer(1, 10)) + story.append(Spacer(1, 15)) + + # 构建 PDF + for element in story: + element.wrapOn(c, letter[0] - 100, letter[1] - 100) + element.drawOn(c, 50, letter[1] - element.wrapOn(c, letter[0] - 100, letter[1] - 100)[1] - 50) + c.showPage() + + # 保存 PDF 文件 + c.save() + \ No newline at end of file diff --git a/aabook/src/fetch.py b/aabook/src/fetch.py new file mode 100644 index 0000000..c44ee54 --- /dev/null +++ b/aabook/src/fetch.py @@ -0,0 +1,312 @@ +import json +import time +import csv +import argparse +import logging +from functools import partial +import config +import sqlite_utils as db_tools +import scraper +import utils +import config + +config.setup_logging() + +debug = False +force = False + +# 获取列表 +def fetch_book_list(): + url = scraper.list_url_update + while True: + logging.info(f'fetching book list. url: {url}') + soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="list_main", attr_type="class")) + if soup: + # 获取书籍列表 + list_data, next_url = scraper.parse_book_list(soup, url=url) + for item in list_data: + row_id = db_tools.insert_or_update_common(item, db_tools.tbl_name_books) + if row_id: + logging.debug(f'insert one book. row_id: {row_id}, name: {item['name']}') + else: + logging.warning(f'insert book error. 
name: {item['name']}, href: {item['href']}') + if next_url is None: + logging.info(f'get all pages.') + return True + else: + url = next_url + elif status_code and status_code == 404: + logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...') + else: + logging.warning(f'fetch page error. {url} ...') + + +# 获取详情 +def fetch_real_content(url): + soup, status_code = scraper.fetch_page(url, scraper.content_validator) + if soup: + data = scraper.parse_content_page(soup, url) + if data: + return data # 段落的数组 + elif status_code and status_code == 404: + logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...') + else: + logging.warning(f'fetch page error. {url} ...') + return None + + +# 获取内容页 +def fetch_chapter_content(url): + chapter_data = {} + next_url = None + + soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="h1", identifier="chapter_title", attr_type="class")) + if soup: + data, next_url = scraper.parse_chapter_page(soup, url) + if data: + chapter_data['title'] = data['title'] + contents = fetch_real_content(data['content_url']) + if contents: + chapter_data['contents'] = contents + else: + logging.warning(f'fetching real content faild. url: {data['content_url']}') + return None, None + else: + logging.warning(f'fetch chapter page no data. url: {url}') + return None, None + else: + logging.warning(f'fetch chapter page error. url: {url}, status_code: {status_code}') + return None, None + + return chapter_data, next_url + +# 获取小说详情页,获得首页地址 +def fetch_book_detail(url): + soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="li", identifier="zuopinxinxi", attr_type="class")) + if soup: + detail = scraper.parse_book_detail(soup, url) + return detail + else: + return None + +# 获取某本小说的目录页 +def fetch_book_toc(url): + soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="page_main", attr_type="class")) + if soup: + listdata = scraper.pase_chapter_list(soup, url) + return listdata + else: + return None + +# 获取小说的目录页,并插入到数据库 +def fetch_table_of_contents(): + while True: + update_list = db_tools.query_books(id=2547, is_latest=0, limit = 2 if debug else 100) + if update_list is None or len(update_list) <1 : + logging.info(f'no more data need fecth.') + return + + for row in update_list: + name = row['name'] + href = row['href'] + bookid = row['id'] + # 先打开详情页 + logging.info(f'----------fetching book {name}: {href}-------------') + book_detail = fetch_book_detail(href) + if book_detail is None: + logging.warning(f'get book detail failed. url: {href}') + continue + + # 获取目录页 + toc_url = book_detail['table_of_contents_href'] + if toc_url is None or toc_url == '': + logging.warning(f'table_of_contents_href is not correct. url: {href}') + continue + + logging.info(f'fetching page: {toc_url}') + toc_data = fetch_book_toc(toc_url) + + # 解析目录页 + if toc_data is None: + logging.warning(f'fetch_book_toc error. url: {toc_url}') + continue + + # 插入所有的目录数据 + succ = 1 + for row in toc_data: + section_title = row['title'] + chapters = row['chapters'] + section_id = db_tools.insert_or_update_book_sections({ + 'book_id' : int(bookid), + 'section' : section_title, + 'bookid_section': f'{bookid}_{section_title}' + }) + if section_id is None: + logging.warning(f'insert section error. url: {toc_url}, section: {section_title}') + succ = 0 + break + else: + logging.debug(f'insert one books_sections record. 
id:{section_id}, key: {bookid}_{section_title}') + + # 插入目录数据 + for chap in chapters: + chap_row_id = db_tools.insert_chapter_data({ + 'book_id': bookid, + 'chapter_id': chap['chapter_id'], + 'section_id': section_id, + 'title': chap['title'], + 'href': chap['href'], + 'content': '', + 'has_content' : 0 + }) + if chap_row_id is None: + logging.warning(f'insert_chapter_data error. url: {toc_url}') + succ = 0 + break + if succ == 0 : + logging.warning(f'fetch_book_toc data error. url: {toc_url}') + continue + + # 读取完毕,更新列表 + row_id = db_tools.update_book_detail({ + 'href' : href, + **book_detail + }) + if row_id: + logging.debug(f'update book succ. id: {row_id}, url: {href}') + else: + logging.warning(f'update book failed. url: {href}') + if debug: + return + +# 直接获取小说内容 +def fetch_contents(): + while True: + list_data = db_tools.query_no_content_chapters(limit = 10 if debug else 100) + if list_data is None or len(list_data) <1 : + logging.info(f'no more data need fecth.') + return + + for row in list_data: + url = row['href'] + logging.info(f'fetching content ({row['title']}) from {url}') + content, next_url = fetch_chapter_content(url) + if content and content['title'] and content['contents']: + # 写入到数据表里 + db_tools.insert_chapter_data({ + 'book_id': row['book_id'], + 'chapter_id': row['chapter_id'], + 'section_id': row['section_id'], + 'title': row['title'], + 'href': url, + 'content': '\n\n'.join(content['contents']), + 'has_content': 1 + }) + else: + logging.warning(f'fetch content error. url: {url}') + if debug: + return + + +''' +# 下载完整的小说 +def fetch_book_data(): + update_list = db_tools.query_books(need_update=1, limit = 1) + if update_list: + for row in update_list: + name = row['name'] + href = row['href'] + bookid = row['id'] + # 先打开详情页 + logging.info(f'----------fetching book {name}: {href}-------------') + book_detail = fetch_book_detail(href) + if book_detail: + # 获取内容页,然后循环读取内容 + chapter_url = book_detail['start_page_href'] + chapter_id = utils.extract_page_num(chapter_url) + # 断点续传,从上次拉取的最后一页开始 + if not force: + last_chapter_url = db_tools.query_last_chapter_by_book(bookid) + if last_chapter_url: + chapter_url = last_chapter_url + while chapter_url: + logging.info(f'fetching page: {chapter_url}') + content, next_url = fetch_chapter_content(chapter_url) + if content and content['title'] and content['contents']: + # 写入到数据表里 + db_tools.insert_chapter_data({ + 'book_id': bookid, + 'chapter_id': chapter_id, + 'title': content['title'], + 'href': chapter_url, + 'content': '\n\n'.join(content['contents']), + 'has_content': 1 + }) + + if debug: + return + else: + logging.warning(f'fetch content error. url: {chapter_url}') + chapter_url = next_url + # 读取完毕,更新列表 + row_id = db_tools.update_book_detail({ + 'href' : href, + **book_detail + }) + if row_id: + logging.debug(f'update book succ. id: {row_id}, url: {href}') + else: + logging.warning(f'update book failed. url: {href}') + else: + logging.warning(f'get book detail failed. 
url: {href}') + else: + logging.warning(f'get no data needed update.') +''' + +# 建立缩写到函数的映射 +function_map = { + "list": fetch_book_list, + "toc" : fetch_table_of_contents, + "content": fetch_contents, +} + +# 主函数 +def main(cmd, args_debug, args_force): + global debug + debug = args_debug + + global force + force = args_force + + # 执行指定的函数 + if cmd: + function_names = args.cmd.split(",") # 拆分输入 + for short_name in function_names: + func = function_map.get(short_name.strip()) # 从映射中获取对应的函数 + if callable(func): + func() + else: + logging.warning(f" {short_name} is not a valid function shortcut.") + else: # 全量执行 + for name, func in function_map.items(): + if callable(func): + func() + else: + logging.warning(f" {short_name} is not a valid function shortcut.") + + logging.info(f'all process completed!') + + # TODO: + # 1, + +if __name__ == "__main__": + # 命令行参数处理 + keys_str = ",".join(function_map.keys()) + + parser = argparse.ArgumentParser(description='fetch aabook data.') + parser.add_argument("--cmd", type=str, help=f"Comma-separated list of function shortcuts: {keys_str}") + parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)') + parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)') + args = parser.parse_args() + + main(args.cmd, args.debug, args.force) diff --git a/aabook/src/scraper.py b/aabook/src/scraper.py new file mode 100644 index 0000000..6eb194d --- /dev/null +++ b/aabook/src/scraper.py @@ -0,0 +1,364 @@ +import time +import json +import csv +import logging +import signal +import sys +import os +import re +import requests +import random +from bs4 import BeautifulSoup +from requests.exceptions import RequestException +from functools import partial +import config +import utils + +# 定义基础 URL 和可变参数 +host_url = 'https://aabook.xyz' +list_url_update = f'{host_url}/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update' +#list_url_wordcount = 'https://aabook.xyz/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount' + +# User-Agent 列表 +user_agents = [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67", + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0", + "Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36" +] + +#使用 CloudScraper 进行网络请求,并执行页面验证,支持不同解析器和预处理 +def fetch_page(url, validator, parser="html.parser", preprocessor=None, max_retries=3, sleep_time=5, default_timeout=10): + for attempt in range(max_retries): + try: + if 'aabook.xyz' not in url.lower(): + logging.error(f'wrong url format: {url}') + return None, None + + # 随机选择一个 User-Agent + headers = { + 'User-Agent': random.choice(user_agents) + } + response = requests.get(url, headers=headers, timeout=default_timeout, stream=True) + + # 处理 HTTP 状态码 + if response.status_code == 404: + logging.warning(f"Page not found (404): {url}") + return None, 404 # 直接返回 404,调用方可以跳过 + + response.raise_for_status() # 处理 HTTP 错误 + + # 预处理 HTML(如果提供了 preprocessor) + html_text = preprocessor(response.text) if preprocessor else response.text + + soup = BeautifulSoup(html_text, 
parser) + if validator(soup): # 进行自定义页面检查 + return soup, response.status_code + + logging.warning(f"Validation failed on attempt {attempt + 1} for {url}") + except requests.RequestException as e: + logging.info(f"Warn fetching page {url}: {e}. Retrying ...") + time.sleep(sleep_time) # 休眠指定的时间,然后重试 + + logging.error(f'Fetching failed after max retries. {url}') + return None, None # 达到最大重试次数仍然失败 + + +# 解析列表页 +def parse_book_list(soup, url): + # 查找书籍列表 + list_main = soup.find('div', class_='list_main') + if not list_main: + logging.warning(f"No list_main Found in {url}") + return None, None + + tbody = list_main.find('tbody') + if not tbody: + logging.warning(f"No tbody found in {url}") + None, None + + list_data = [] + next_url = None + # 获取每本书的基础信息:排名、分类、书名、作者、月票、更新时间(按字数排序时是总字数,按日期排序时是最后更新日期) + for tr in tbody.find_all('tr'): + tds = tr.find_all('td') + if len(tds) < 6: + logging.info("Invalid tr format.") + ranking = tds[0].text.strip() + category = utils.remove_brackets_regex(tds[1].text.strip()) + book_link_tag = tds[2].find('a') + book_name = book_link_tag.text.strip() + book_link = host_url + '/' + book_link_tag['href'] if book_link_tag.get('href') else '' + book_num = utils.extract_book_num(book_link_tag['href']) + author = tds[3].text.strip() + monthly_tickets = tds[4].text.strip() + update_time = tds[5].text.strip() #实际上是字数(按字数排序时是总字数,按日期排序时是最后更新日期) + + list_data.append({ + 'rank': ranking, + 'category': category, + 'name': book_name, + 'href': book_link, + 'num': book_num, + 'author': author, + 'tickets': monthly_tickets, + 'update_time': update_time + }) + + # 查找下一页链接 + next_page_tag = soup.find('a', title='下一页') + if next_page_tag: + next_url = host_url + next_page_tag['href'] + + return list_data, next_url + +# 解析详情页 +def parse_book_detail(soup, url): + # 解析书籍详细信息 + book_info_tag = soup.find('li', class_='zuopinxinxi') + if not book_info_tag: + logging.warning(f"No details found in {url}") + return None + + table_of_contents_href = '' + table_of_contents_href_tag = soup.find('li', class_='xiaoshuomulu') + if table_of_contents_href_tag: + table_of_contents_href = host_url + table_of_contents_href_tag.find('a')['href'] + + book_info_lis = book_info_tag.find_all('li') + if len(book_info_lis) < 4: + logging.info(f"invalid book info in {url}") + return None + + book_category = book_info_lis[0].find('span').text.strip() + book_status = book_info_lis[1].find('span').text.strip() + # 去掉后面的汉字,只要数字 + total_word_count = book_info_lis[2].find('span').text.strip() + total_word_count = int(re.search(r'\d+', total_word_count).group()) + + total_clicks = book_info_lis[3].find('span').text.strip() + month_clicks = book_info_lis[4].find('span').text.strip() if len(book_info_lis) >4 else '0' + week_clicks = book_info_lis[5].find('span').text.strip() if len(book_info_lis) >5 else '0' + total_recommend = book_info_lis[6].find('span').text.strip() if len(book_info_lis) >6 else '0' + month_recommend = book_info_lis[7].find('span').text.strip() if len(book_info_lis) >7 else '0' + week_recommend = book_info_lis[8].find('span').text.strip() if len(book_info_lis) >8 else '0' + + # 读取创建时间 + creation_time_tag = soup.find('li', class_='update_time') + created_time = utils.extract_create_time(creation_time_tag.text.strip() if creation_time_tag else '') + + # 获取起始页链接和编号 + start_page_tag = soup.find('ul', class_='gezhonganniu').find_all('li')[0].find('a') + start_page_link = host_url + '/' + start_page_tag['href'] + start_page_number = start_page_link.split('-')[-1].replace('.html', '') + + return { + 
'category': book_category, + 'status' : book_status, + 'total_words' : total_word_count, + 'total_clicks': total_clicks, + 'month_clicks': month_clicks, + 'week_clicks': week_clicks, + 'total_recommend': total_recommend, + 'month_recommend': month_recommend, + 'week_recommend': week_recommend, + 'created_time': created_time, + 'start_page_href': start_page_link, + 'start_page_num': start_page_number, + 'table_of_contents_href': table_of_contents_href + } + +# 解析书籍的目录页 +def pase_chapter_list(soup, url): + # 获取小说的目录 + table_of_contents = [] + div_table_of_contents = soup.find('div', class_='page_main') + if not div_table_of_contents: + return None + + section_titles = div_table_of_contents.find_all('p', class_='section_title') + sections = div_table_of_contents.find_all('ul', class_='section_list') + if len(sections) > len(section_titles): # 一般是 后者比前者多1个,最后一个是广告 + logging.warning(f'sections not matched titles, url: {url}, titles: {len(section_titles)}, sections: {len(sections)}') + return None + else: + for i in range(len(sections)): + section_title = section_titles[i].get_text().strip() + chap_list = sections[i].find_all("a") + chap_data = [] + for chap in chap_list: + chap_title = chap.get_text().strip() + chap_link = f'{host_url}/{chap['href']}' + chap_id = utils.extract_page_num(chap_link) + chap_data.append({'href': chap_link, 'title': chap_title, 'chapter_id': chap_id}) + table_of_contents.append({'title': section_title, 'chapters': chap_data}) + + return table_of_contents + +# 解析书籍的章节页 +def parse_chapter_page(soup, url): + # 获取章节标题 + chapter_title_tag = soup.find('h1', class_='chapter_title') + if chapter_title_tag is None: + logging.warning(f'Chapter title not found in {url}') + return None, None + + title = chapter_title_tag.get_text().strip() + content_url = None + next_url = None + chapid = utils.extract_page_num(url) + + # 遍历每一个