# resources/aabook/src/scraper.py

import time
import json
import csv
import logging
import signal
import sys
import os
import re
import requests
import random
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
import config
import utils
# Base URL and list-page URL template (pageNum is left as a placeholder for .format())
host_url = 'https://aabook.xyz'
list_url_update = f'{host_url}/category.html?pageNum={{}}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update'
#list_url_wordcount = 'https://aabook.xyz/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
# Pool of User-Agent strings, one picked at random per request
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0",
    "Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36"
]
# Fetch a page with requests, validate the parsed result with a caller-supplied
# check, and optionally preprocess the HTML and choose the parser.
def fetch_page(url, validator, parser="html.parser", preprocessor=None, max_retries=3, sleep_time=5, default_timeout=10):
    for attempt in range(max_retries):
        try:
            if 'aabook.xyz' not in url.lower():
                logging.error(f'wrong url format: {url}')
                return None, None
            # Pick a random User-Agent for this request
            headers = {
                'User-Agent': random.choice(user_agents)
            }
            response = requests.get(url, headers=headers, timeout=default_timeout, stream=True)
            # Handle HTTP status codes
            if response.status_code == 404:
                logging.warning(f"Page not found (404): {url}")
                return None, 404  # Return 404 directly so the caller can skip this page
            response.raise_for_status()  # Raise on other HTTP errors
            # Preprocess the HTML if a preprocessor was supplied
            html_text = preprocessor(response.text) if preprocessor else response.text
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # Run the caller-supplied page check
                return soup, response.status_code
            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except requests.RequestException as e:
            logging.warning(f"fetching page ({url}) error: {e}, Retrying ...")
        time.sleep(sleep_time)  # Sleep for the given interval, then retry
    logging.error(f'Fetching failed after max retries. {url}')
    return None, None  # Still failing after the maximum number of retries
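# A minimal usage sketch (not part of the original flow): wiring fetch_page to
# generic_validator (defined further down) via functools.partial, the same
# pattern the test_* helpers at the bottom of this file use. The function name
# is hypothetical.
def demo_fetch_first_list_page():
    url = list_url_update.format(1)
    validator = partial(generic_validator, tag="div", identifier="list_main", attr_type="class")
    soup, status = fetch_page(url, validator)
    return soup is not None, status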
# Parse a list page
def parse_book_list(soup, url):
    # Locate the book list container
    list_main = soup.find('div', class_='list_main')
    if not list_main:
        logging.warning(f"No list_main Found in {url}")
        return None, None
    tbody = list_main.find('tbody')
    if not tbody:
        logging.warning(f"No tbody found in {url}")
        return None, None
    list_data = []
    next_url = None
    # Basic fields per book: rank, category, title, author, monthly tickets, and
    # a final column that is the total word count when sorted by word count, or
    # the last-update date when sorted by date
    for tr in tbody.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) < 6:
            logging.info("Invalid tr format.")
            continue
        ranking = tds[0].text.strip()
        category = utils.remove_brackets_regex(tds[1].text.strip())
        book_link_tag = tds[2].find('a')
        book_name = book_link_tag.text.strip()
        book_link = host_url + '/' + book_link_tag['href'] if book_link_tag.get('href') else ''
        book_num = utils.extract_book_num(book_link_tag['href'])
        author = tds[3].text.strip()
        monthly_tickets = tds[4].text.strip()
        update_time = tds[5].text.strip()  # Word count or last-update date, depending on the sort order
        list_data.append({
            'rank': ranking,
            'category': category,
            'name': book_name,
            'href': book_link,
            'num': book_num,
            'author': author,
            'tickets': monthly_tickets,
            'update_time': update_time
        })
    # Find the next-page link ('下一页' is the site's "next page" label)
    next_page_tag = soup.find('a', title='下一页')
    if next_page_tag:
        next_url = host_url + next_page_tag['href']
    return list_data, next_url
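# A minimal pagination sketch (an assumption, not in the original file): follow
# the next_url returned by parse_book_list until it runs out, collecting rows.
# The function name and the max_pages safety cap are hypothetical.
def demo_crawl_list_pages(max_pages=3):
    validator = partial(generic_validator, tag="div", identifier="list_main", attr_type="class")
    url = list_url_update.format(1)
    rows = []
    for _ in range(max_pages):
        soup, _status = fetch_page(url, validator)
        if soup is None:
            break
        page_rows, next_url = parse_book_list(soup, url)
        if page_rows:
            rows.extend(page_rows)
        if not next_url:
            break
        url = next_url
        time.sleep(random.uniform(1, 3))  # be polite between list pages
    return rows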
# Parse a book detail page
def parse_book_detail(soup, url):
    # Parse the book's metadata block
    book_info_tag = soup.find('li', class_='zuopinxinxi')
    if not book_info_tag:
        logging.warning(f"No details found in {url}")
        return None
    table_of_contents_href = ''
    table_of_contents_href_tag = soup.find('li', class_='xiaoshuomulu')
    if table_of_contents_href_tag:
        table_of_contents_href = host_url + table_of_contents_href_tag.find('a')['href']
    book_info_lis = book_info_tag.find_all('li')
    if len(book_info_lis) < 4:
        logging.info(f"invalid book info in {url}")
        return None
    book_category = book_info_lis[0].find('span').text.strip()
    book_status = book_info_lis[1].find('span').text.strip()
    # Keep only the digits; drop the trailing Chinese unit characters
    total_word_count = book_info_lis[2].find('span').text.strip()
    word_count_match = re.search(r'\d+', total_word_count)
    total_word_count = int(word_count_match.group()) if word_count_match else 0
    total_clicks = book_info_lis[3].find('span').text.strip()
    month_clicks = book_info_lis[4].find('span').text.strip() if len(book_info_lis) > 4 else '0'
    week_clicks = book_info_lis[5].find('span').text.strip() if len(book_info_lis) > 5 else '0'
    total_recommend = book_info_lis[6].find('span').text.strip() if len(book_info_lis) > 6 else '0'
    month_recommend = book_info_lis[7].find('span').text.strip() if len(book_info_lis) > 7 else '0'
    week_recommend = book_info_lis[8].find('span').text.strip() if len(book_info_lis) > 8 else '0'
    # Read the creation time
    creation_time_tag = soup.find('li', class_='update_time')
    created_time = utils.extract_create_time(creation_time_tag.text.strip() if creation_time_tag else '')
    # Get the first-chapter link and its page number
    start_page_tag = soup.find('ul', class_='gezhonganniu').find_all('li')[0].find('a')
    start_page_link = host_url + '/' + start_page_tag['href']
    start_page_number = start_page_link.split('-')[-1].replace('.html', '')
    return {
        'category': book_category,
        'status': book_status,
        'total_words': total_word_count,
        'total_clicks': total_clicks,
        'month_clicks': month_clicks,
        'week_clicks': week_clicks,
        'total_recommend': total_recommend,
        'month_recommend': month_recommend,
        'week_recommend': week_recommend,
        'created_time': created_time,
        'start_page_href': start_page_link,
        'start_page_num': start_page_number,
        'table_of_contents_href': table_of_contents_href
    }
# Parse a book's table-of-contents page
def parse_chapter_list(soup, url):
    # Collect the novel's table of contents
    table_of_contents = []
    div_table_of_contents = soup.find('div', class_='page_main')
    if not div_table_of_contents:
        return None
    section_titles = div_table_of_contents.find_all('p', class_='section_title')
    sections = div_table_of_contents.find_all('ul', class_='section_list')
    if len(sections) > len(section_titles):  # Typically sections has one more entry than titles; the extra one is an ad
        logging.warning(f"sections not matched titles, url: {url}, titles: {len(section_titles)}, sections: {len(sections)}")
        return None
    for i in range(len(sections)):
        section_title = section_titles[i].get_text().strip()
        chap_list = sections[i].find_all("a")
        chap_data = []
        for chap in chap_list:
            chap_title = chap.get_text().strip()  # Chapter title
            chap_link = f"{host_url}/{chap['href']}"  # Chapter link
            chap_id = utils.extract_page_num(chap_link)
            chap_words, chap_uptime = utils.extract_chapter_uptime_words(chap['title'])  # Update time and word count
            chap_data.append({
                'href': chap_link,
                'title': chap_title,
                'chapter_id': chap_id,
                'words': chap_words,
                'update_time': chap_uptime,
            })
        table_of_contents.append({'title': section_title, 'chapters': chap_data})
    return table_of_contents
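# A minimal export sketch (an assumption; this file imports csv but never uses
# it in the code above): flatten the nested structure returned by
# parse_chapter_list into one CSV row per chapter. The function name and
# output path are hypothetical.
def demo_write_chapter_csv(table_of_contents, out_path='chapters.csv'):
    with open(out_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['section', 'chapter_id', 'title', 'words', 'update_time', 'href'])
        for section in table_of_contents:
            for chap in section['chapters']:
                writer.writerow([section['title'], chap['chapter_id'], chap['title'],
                                 chap['words'], chap['update_time'], chap['href']])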
# Parse a chapter page
def parse_chapter_page(soup, url):
    # Get the chapter title
    chapter_title_tag = soup.find('h1', class_='chapter_title')
    if chapter_title_tag is None:
        logging.warning(f'Chapter title not found in {url}')
        return None, None
    title = chapter_title_tag.get_text().strip()
    content_url = None
    next_url = None
    chapid = utils.extract_page_num(url)
    # Walk every <script> tag looking for the content-page link
    script_tags = soup.find_all('script')
    for script_tag in script_tags:
        script_content = script_tag.string
        if script_content and re.search(r'\.get\("./_getcontent\.php', script_content):
            # Found the marker; extract the _getcontent.php URL template
            match = re.search(r'\.get\("\./_getcontent\.php\?id="\+\s*chapid\s*\+\s*"&v=([^"]+)"', script_content)
            if match:
                # Pull the v parameter value out of the match
                v_value = match.group(1)
                # Build the full content_url
                content_url = f"{host_url}/_getcontent.php?id={chapid}&v={v_value}"
                break
    if content_url is None:
        logging.warning(f'Content url not found in {url}')
        return None, None
    # Collect the novel's table of contents
    table_of_contents = []
    div_table_of_contents = soup.find('div', class_='mulu_con')
    if div_table_of_contents:  # Optional block; the original author was unsure whether to keep it
        section_titles = div_table_of_contents.find_all('p')
        sections = div_table_of_contents.find_all('ul')
        if len(sections) != len(section_titles):
            logging.warning('sections not matched titles')
        else:
            for i in range(len(sections)):
                section_title = section_titles[i].get_text().strip()
                chap_list = sections[i].find_all("a")
                chap_data = []
                for chap in chap_list:
                    chap_title = chap.get_text().strip()
                    chap_link = chap['href']
                    chap_data.append({'href': chap_link, 'title': chap_title})
                table_of_contents.append({'title': section_title, 'chapters': chap_data})
    # Find the next-chapter link ('下一章' is the site's "next chapter" label)
    next_div = soup.find('div', class_='next_arrow')
    if next_div:
        next_page_tag = next_div.find('a', href=True, title=re.compile(r'下一章'))
        if next_page_tag:
            next_url = f"{host_url}/{next_page_tag['href']}" if next_page_tag['href'] else ''
    data = {'title': title, 'content_url': content_url, 'table_of_contents': table_of_contents}
    return data, next_url
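# A minimal crawl sketch (an assumption, not in the original file): follow
# next_url from chapter to chapter starting at a book's start page, with a
# random delay between requests. The function name and the max_chapters
# safety cap are hypothetical.
def demo_walk_chapters(start_url, max_chapters=5):
    validator = partial(generic_validator, tag="h1", identifier="chapter_title", attr_type="class")
    url = start_url
    titles = []
    while url and len(titles) < max_chapters:
        soup, _status = fetch_page(url, validator)
        if soup is None:
            break
        data, url = parse_chapter_page(soup, url)
        if data is None:
            break
        titles.append(data['title'])
        time.sleep(random.uniform(1, 3))  # throttle between chapter requests
    return titles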
def process_paragraph(paragraph):
    # Work on the node's full HTML rather than get_text()
    paragraph_html = str(paragraph)
    # Strip watermark tags (any element carrying a class attribute)
    cleaned_html = re.sub(r'<[^>]+class="[^"]+">.*?</[^>]+>', '', paragraph_html, flags=re.DOTALL)
    # Re-parse the cleaned HTML with BeautifulSoup and extract the text
    soup = BeautifulSoup(cleaned_html, 'html.parser')
    cleaned_text = soup.get_text().strip()
    return cleaned_text
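# A worked example (an assumption; the sample markup is made up) showing what
# process_paragraph does: elements carrying a class attribute are treated as
# watermarks and dropped, leaving only the real paragraph text.
def demo_process_paragraph():
    sample = BeautifulSoup('<p>real text<span class="wm">watermark</span> more</p>', 'html.parser').p
    return process_paragraph(sample)  # -> 'real text more'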
# Parse a content page (alternate implementation; test_content_page below uses parse_content_page)
def parse_content_page2(soup, url):
    content = []
    paragraphs = soup.find_all('p')
    if paragraphs:
        for paragraph in paragraphs:
            cleaned_text = process_paragraph(paragraph)
            content.append(cleaned_text)
    else:
        # Some pages have no <p> tags, only a single <h1>; handle that case too
        paragraphs = soup.find_all('h1')
        if paragraphs:
            for paragraph in paragraphs:
                cleaned_text = process_paragraph(paragraph)
                content.append(cleaned_text)
    # Some pages only use <br> tags; soup.stripped_strings yields every text node with whitespace stripped
    if len(content) == 0:
        content = [block.strip() for block in soup.stripped_strings if block.strip()]
    return content
def parse_content_page(soup, url):
    content = []
    # Extract every <p> and <h1> tag
    paragraphs = soup.find_all(['p', 'h1'])
    if paragraphs:
        for paragraph in paragraphs:
            cleaned_text = process_paragraph(paragraph)
            if cleaned_text:
                content.append(cleaned_text)
    # If no <p> or <h1> was found, fall back to extracting all text, still removing watermarks
    if not content:
        cleaned_html = process_paragraph(soup)
        cleaned_soup = BeautifulSoup(cleaned_html, 'html.parser')
        content = [block.strip() for block in cleaned_soup.stripped_strings if block.strip()]
    return content
# Generic HTML structure validator
def generic_validator(soup, tag, identifier, attr_type="id"):
    if attr_type == "id":
        return soup.find(tag, id=identifier) is not None
    elif attr_type == "class":
        return bool(soup.find_all(tag, class_=identifier))
    elif attr_type == "name":
        return bool(soup.find('select', {'name': identifier}))
    return False
# Check whether the content has been polluted with the site's copyright watermark
def content_validator(soup):
    text = str(soup)
    dirty_words = ['2005-2024 疯情书库', '2005-2025 疯情书库', '2025 疯情书库', '2026 疯情书库', '2027 疯情书库']
    for word in dirty_words:
        if word in text:
            return False
    return True
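# A tiny check (an assumption; the markup is made up) illustrating
# content_validator: pages carrying the copyright watermark fail validation,
# clean pages pass. The function name is hypothetical.
def demo_content_validator():
    dirty = BeautifulSoup('<p>2005-2025 疯情书库</p>', 'html.parser')
    clean = BeautifulSoup('<p>正文内容</p>', 'html.parser')
    return content_validator(dirty) is False and content_validator(clean) is True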
def test_content_page(url):
    soup, status_code = fetch_page(url, content_validator)
    if soup:
        data = parse_content_page(soup, url)
        if data:
            return data
    return []
def test_chapter_page(url):
    soup, status_code = fetch_page(url, partial(generic_validator, tag="h1", identifier="chapter_title", attr_type="class"))
    if soup:
        data, next_url = parse_chapter_page(soup, url)
        if data:
            return data
    return None
def test_book_detail(url):
    soup, status_code = fetch_page(url, partial(generic_validator, tag="li", identifier="zuopinxinxi", attr_type="class"))
    if soup:
        return parse_book_detail(soup, url)
    return None
def test_book_list():
    for num in range(5):
        url = list_url_update.format(num + 1)  # pageNum starts at 1
        soup, status_code = fetch_page(url, partial(generic_validator, tag="div", identifier="list_main", attr_type="class"))
        if soup:
            # Get the book list
            list_data, next_url = parse_book_list(soup, url=url)
            for item in list_data or []:
                # Get the detail page
                detail = test_book_detail(item['href'])
                if detail:
                    print({
                        **item,
                        **detail
                    })
                    # Get the first chapter page
                    page_data = test_chapter_page(detail['start_page_href'])
                    if page_data:
                        print(page_data)
                        # Get the chapter content
                        contents = test_content_page(page_data['content_url'])
                        if contents:
                            print(contents[0])
                else:
                    print('get detail error.')
                    return
if __name__ == "__main__":
    test_book_list()