import time
import json
import csv
import logging
import signal
import sys
import os
import re
import requests
import random
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial

import config
import utils

# Base URL and list-page URLs with their query parameters
host_url = 'https://aabook.xyz'
list_url_update = f'{host_url}/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update'
#list_url_wordcount = 'https://aabook.xyz/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'

# User-Agent pool for request header rotation
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0",
    "Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36"
]

# Fetch a page with requests, validate it with a caller-supplied check, and optionally
# preprocess the HTML before parsing; supports different BeautifulSoup parsers.
def fetch_page(url, validator, parser="html.parser", preprocessor=None, max_retries=3, sleep_time=5, default_timeout=10):
    for attempt in range(max_retries):
        try:
            if 'aabook.xyz' not in url.lower():
                logging.error(f'wrong url format: {url}')
                return None, None

            # Pick a random User-Agent for this request
            headers = {'User-Agent': random.choice(user_agents)}
            response = requests.get(url, headers=headers, timeout=default_timeout, stream=True)

            # Handle HTTP status codes
            if response.status_code == 404:
                logging.warning(f"Page not found (404): {url}")
                return None, 404  # Return 404 directly so the caller can skip this page
            response.raise_for_status()  # Raise on other HTTP errors

            # Preprocess the HTML if a preprocessor was supplied
            html_text = preprocessor(response.text) if preprocessor else response.text
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # Run the caller-supplied page check
                return soup, response.status_code
            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except requests.RequestException as e:
            logging.warning(f"fetching page ({url}) error: {e}, Retrying ...")
        time.sleep(sleep_time)  # Sleep for the configured interval, then retry

    logging.error(f'Fetching failed after max retries: {url}')
    return None, None  # Reached the maximum number of retries and still failed
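
# Illustrative sketch only (not part of the original module): one way fetch_page and
# parse_book_list (defined below) could be chained to walk the update-ordered listing.
# The helper name _example_crawl_update_list and the max_pages limit are assumptions
# made for illustration, not an existing entry point.
def _example_crawl_update_list(max_pages=1):
    url = list_url_update
    books = []
    for _ in range(max_pages):
        # Validate that the page actually contains the list container before parsing
        soup, status = fetch_page(url, validator=lambda s: s.find('div', class_='list_main') is not None)
        if soup is None:
            break
        page_books, url = parse_book_list(soup, url)
        if page_books:
            books.extend(page_books)
        if not url:  # No "next page" link found
            break
    return books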

# Parse a list page
def parse_book_list(soup, url):
    # Locate the book list container
    list_main = soup.find('div', class_='list_main')
    if not list_main:
        logging.warning(f"No list_main found in {url}")
        return None, None
    tbody = list_main.find('tbody')
    if not tbody:
        logging.warning(f"No tbody found in {url}")
        return None, None

    list_data = []
    next_url = None

    # Collect the basic fields for each book: rank, category, title, author, monthly
    # tickets, and the last column (total word count when sorted by word count,
    # last update date when sorted by update time).
    for tr in tbody.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) < 6:
            logging.info("Invalid tr format.")
            continue
        ranking = tds[0].text.strip()
        category = utils.remove_brackets_regex(tds[1].text.strip())
        book_link_tag = tds[2].find('a')
        book_name = book_link_tag.text.strip()
        book_link = host_url + '/' + book_link_tag['href'] if book_link_tag.get('href') else ''
        book_num = utils.extract_book_num(book_link_tag['href'])
        author = tds[3].text.strip()
        monthly_tickets = tds[4].text.strip()
        update_time = tds[5].text.strip()  # Total word count or last update date, depending on the sort order

        list_data.append({
            'rank': ranking,
            'category': category,
            'name': book_name,
            'href': book_link,
            'num': book_num,
            'author': author,
            'tickets': monthly_tickets,
            'update_time': update_time
        })

    # Find the link to the next page
    next_page_tag = soup.find('a', title='下一页')
    if next_page_tag:
        next_url = host_url + next_page_tag['href']

    return list_data, next_url


# Parse a book's detail page
def parse_book_detail(soup, url):
    # Locate the block holding the book's metadata
    book_info_tag = soup.find('li', class_='zuopinxinxi')
    if not book_info_tag:
        logging.warning(f"No details found in {url}")
        return None

    table_of_contents_href = ''
    table_of_contents_href_tag = soup.find('li', class_='xiaoshuomulu')
    if table_of_contents_href_tag:
        table_of_contents_href = host_url + table_of_contents_href_tag.find('a')['href']

    book_info_lis = book_info_tag.find_all('li')
    if len(book_info_lis) < 4:
        logging.info(f"invalid book info in {url}")
        return None
    book_category = book_info_lis[0].find('span').text.strip()
    book_status = book_info_lis[1].find('span').text.strip()
    # Keep only the digits, dropping the trailing Chinese characters
    total_word_count = book_info_lis[2].find('span').text.strip()
    total_word_count = int(re.search(r'\d+', total_word_count).group())
    total_clicks = book_info_lis[3].find('span').text.strip()
    month_clicks = book_info_lis[4].find('span').text.strip() if len(book_info_lis) > 4 else '0'
    week_clicks = book_info_lis[5].find('span').text.strip() if len(book_info_lis) > 5 else '0'
    total_recommend = book_info_lis[6].find('span').text.strip() if len(book_info_lis) > 6 else '0'
    month_recommend = book_info_lis[7].find('span').text.strip() if len(book_info_lis) > 7 else '0'
    week_recommend = book_info_lis[8].find('span').text.strip() if len(book_info_lis) > 8 else '0'

    # Read the creation time
    creation_time_tag = soup.find('li', class_='update_time')
    created_time = utils.extract_create_time(creation_time_tag.text.strip() if creation_time_tag else '')

    # Get the link and number of the first chapter page
    start_page_tag = soup.find('ul', class_='gezhonganniu').find_all('li')[0].find('a')
    start_page_link = host_url + '/' + start_page_tag['href']
    start_page_number = start_page_link.split('-')[-1].replace('.html', '')

    return {
        'category': book_category,
        'status': book_status,
        'total_words': total_word_count,
        'total_clicks': total_clicks,
        'month_clicks': month_clicks,
        'week_clicks': week_clicks,
        'total_recommend': total_recommend,
        'month_recommend': month_recommend,
        'week_recommend': week_recommend,
        'created_time': created_time,
        'start_page_href': start_page_link,
        'start_page_num': start_page_number,
        'table_of_contents_href': table_of_contents_href
    }
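
# Illustrative sketch only (not part of the original module): fetch a single book's
# detail page and merge the parsed fields into its listing record. The helper name
# _example_fetch_book_detail and the merged-dict shape are assumptions for illustration.
def _example_fetch_book_detail(book):
    # Validate that the detail block exists before handing the page to the parser
    soup, status = fetch_page(book['href'], validator=lambda s: s.find('li', class_='zuopinxinxi') is not None)
    if soup is None:
        return None
    detail = parse_book_detail(soup, book['href'])
    return {**book, **detail} if detail else None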

# Parse a book's table-of-contents page
def pase_chapter_list(soup, url):
    # Collect the novel's table of contents
    table_of_contents = []
    div_table_of_contents = soup.find('div', class_='page_main')
    if not div_table_of_contents:
        return None
    section_titles = div_table_of_contents.find_all('p', class_='section_title')
    sections = div_table_of_contents.find_all('ul', class_='section_list')
    if len(sections) > len(section_titles):
        # Usually the section lists outnumber the titles by one; the extra one is an advertisement
        logging.warning(f"sections not matched titles, url: {url}, titles: {len(section_titles)}, sections: {len(sections)}")
        return None
    else:
        for i in range(len(sections)):
            section_title = section_titles[i].get_text().strip()
            chap_list = sections[i].find_all("a")
            chap_data = []
            for chap in chap_list:
                chap_title = chap.get_text().strip()  # Chapter title
                chap_link = f"{host_url}/{chap['href']}"  # Chapter link
                chap_id = utils.extract_page_num(chap_link)
                chap_words, chap_uptime = utils.extract_chapter_uptime_words(chap['title'])  # Word count and update time
                chap_data.append({
                    'href': chap_link,
                    'title': chap_title,
                    'chapter_id': chap_id,
                    'words': chap_words,
                    'update_time': chap_uptime,
                })
            table_of_contents.append({'title': section_title, 'chapters': chap_data})
    return table_of_contents


# Parse a book's chapter page
def parse_chapter_page(soup, url):
    # Get the chapter title
    chapter_title_tag = soup.find('h1', class_='chapter_title')
    if chapter_title_tag is None:
        logging.warning(f'Chapter title not found in {url}')
        return None, None
    title = chapter_title_tag.get_text().strip()

    content_url = None
    next_url = None
    chapid = utils.extract_page_num(url)

    # Iterate over each