import time
import json
import csv
import logging
import signal
import sys
import os
import re
import requests
import random
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
import config
import utils

# Base URL and list-page URL template (pageNum is filled in via str.format in test_book_list)
host_url = 'https://aabook.xyz'
list_url_update = host_url + '/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update'
#list_url_wordcount = 'https://aabook.xyz/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'

# User-Agent pool; one is chosen at random per request
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0",
    "Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36"
]

# Fetch a page with requests, validate the parsed result with a custom check,
# and optionally preprocess the HTML; the parser and timeouts are configurable
def fetch_page(url, validator, parser="html.parser", preprocessor=None, max_retries=3, sleep_time=5, default_timeout=10):
    for attempt in range(max_retries):
        try:
            if 'aabook.xyz' not in url.lower():
                logging.error(f'wrong url format: {url}')
                return None, None

            # Pick a random User-Agent for this request
            headers = {
                'User-Agent': random.choice(user_agents)
            }
            response = requests.get(url, headers=headers, timeout=default_timeout, stream=True)

            # Handle HTTP status codes
            if response.status_code == 404:
                logging.warning(f"Page not found (404): {url}")
                return None, 404  # Return 404 directly so the caller can skip this page

            response.raise_for_status()  # Raise on other HTTP errors

            # Preprocess the HTML if a preprocessor was provided
            html_text = preprocessor(response.text) if preprocessor else response.text

            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # Run the custom page check
                return soup, response.status_code

            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except requests.RequestException as e:
            logging.warning(f"fetching page ({url}) error: {e}, Retrying ...")
            time.sleep(sleep_time)  # Sleep for the given time, then retry

    logging.error(f'Fetching failed after max retries. {url}')
    return None, None  # Still failing after the maximum number of retries
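
# Usage sketch (illustrative only, not part of the scraper's call chain): fetch_page
# accepts any callable that takes the parsed soup and returns True when the page is
# usable. A throwaway lambda works for quick checks; the helpers further down bind
# generic_validator / content_validator with functools.partial instead.
def _demo_fetch_front_page():
    soup, status = fetch_page(host_url, lambda s: s.find('body') is not None)
    if soup is None:
        return status
    return soup.title.get_text(strip=True) if soup.title else ''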

# Parse a list page
def parse_book_list(soup, url):
    # Locate the book list container
    list_main = soup.find('div', class_='list_main')
    if not list_main:
        logging.warning(f"No list_main found in {url}")
        return None, None

    tbody = list_main.find('tbody')
    if not tbody:
        logging.warning(f"No tbody found in {url}")
        return None, None

    list_data = []
    next_url = None
    # Basic info per book: rank, category, title, author, monthly tickets, and update time
    # (the last column is total word count when sorted by word count, last update date when sorted by date)
    for tr in tbody.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) < 6:
            logging.info("Invalid tr format.")
            continue
        ranking = tds[0].text.strip()
        category = utils.remove_brackets_regex(tds[1].text.strip())
        book_link_tag = tds[2].find('a')
        book_name = book_link_tag.text.strip()
        book_link = host_url + '/' + book_link_tag['href'] if book_link_tag.get('href') else ''
        book_num = utils.extract_book_num(book_link_tag['href'])
        author = tds[3].text.strip()
        monthly_tickets = tds[4].text.strip()
        update_time = tds[5].text.strip()  # Word count or last update date, depending on the sort order

        list_data.append({
            'rank': ranking,
            'category': category,
            'name': book_name,
            'href': book_link,
            'num': book_num,
            'author': author,
            'tickets': monthly_tickets,
            'update_time': update_time
        })

    # Find the link to the next page
    next_page_tag = soup.find('a', title='下一页')
    if next_page_tag:
        next_url = host_url + next_page_tag['href']

    return list_data, next_url
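
# Pagination sketch (a minimal example; _demo_walk_list_pages is not used elsewhere in
# this module): besides formatting pageNum directly, list pages can be walked by
# following the next_url that parse_book_list extracts from the "下一页" link.
def _demo_walk_list_pages(first_url, max_pages=3):
    validator = partial(generic_validator, tag="div", identifier="list_main", attr_type="class")
    url, pages = first_url, []
    while url and len(pages) < max_pages:
        soup, _ = fetch_page(url, validator)
        if not soup:
            break
        list_data, url = parse_book_list(soup, url)
        if not list_data:
            break
        pages.append(list_data)
    return pages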

# Parse a book detail page
def parse_book_detail(soup, url):
    # Parse the book's information block
    book_info_tag = soup.find('li', class_='zuopinxinxi')
    if not book_info_tag:
        logging.warning(f"No details found in {url}")
        return None

    table_of_contents_href = ''
    table_of_contents_href_tag = soup.find('li', class_='xiaoshuomulu')
    if table_of_contents_href_tag:
        table_of_contents_href = host_url + table_of_contents_href_tag.find('a')['href']

    book_info_lis = book_info_tag.find_all('li')
    if len(book_info_lis) < 4:
        logging.info(f"invalid book info in {url}")
        return None

    book_category = book_info_lis[0].find('span').text.strip()
    book_status = book_info_lis[1].find('span').text.strip()
    # Keep only the digits, dropping the trailing Chinese characters
    total_word_count = book_info_lis[2].find('span').text.strip()
    total_word_count = int(re.search(r'\d+', total_word_count).group())

    total_clicks = book_info_lis[3].find('span').text.strip()
    month_clicks = book_info_lis[4].find('span').text.strip() if len(book_info_lis) > 4 else '0'
    week_clicks = book_info_lis[5].find('span').text.strip() if len(book_info_lis) > 5 else '0'
    total_recommend = book_info_lis[6].find('span').text.strip() if len(book_info_lis) > 6 else '0'
    month_recommend = book_info_lis[7].find('span').text.strip() if len(book_info_lis) > 7 else '0'
    week_recommend = book_info_lis[8].find('span').text.strip() if len(book_info_lis) > 8 else '0'

    # Read the creation time
    creation_time_tag = soup.find('li', class_='update_time')
    created_time = utils.extract_create_time(creation_time_tag.text.strip() if creation_time_tag else '')

    # Get the link and number of the first chapter page
    start_page_tag = soup.find('ul', class_='gezhonganniu').find_all('li')[0].find('a')
    start_page_link = host_url + '/' + start_page_tag['href']
    start_page_number = start_page_link.split('-')[-1].replace('.html', '')

    return {
        'category': book_category,
        'status': book_status,
        'total_words': total_word_count,
        'total_clicks': total_clicks,
        'month_clicks': month_clicks,
        'week_clicks': week_clicks,
        'total_recommend': total_recommend,
        'month_recommend': month_recommend,
        'week_recommend': week_recommend,
        'created_time': created_time,
        'start_page_href': start_page_link,
        'start_page_num': start_page_number,
        'table_of_contents_href': table_of_contents_href
    }

# Parse a book's table-of-contents page
def pase_chapter_list(soup, url):
    # Collect the novel's table of contents
    table_of_contents = []
    div_table_of_contents = soup.find('div', class_='page_main')
    if not div_table_of_contents:
        return None

    section_titles = div_table_of_contents.find_all('p', class_='section_title')
    sections = div_table_of_contents.find_all('ul', class_='section_list')
    if len(sections) > len(section_titles):  # Typically one list has one more entry than the other; the extra one is an ad
        logging.warning(f"sections not matched titles, url: {url}, titles: {len(section_titles)}, sections: {len(sections)}")
        return None
    else:
        for i in range(len(sections)):
            section_title = section_titles[i].get_text().strip()
            chap_list = sections[i].find_all("a")
            chap_data = []
            for chap in chap_list:
                chap_title = chap.get_text().strip()  # Chapter title
                chap_link = f"{host_url}/{chap['href']}"  # Chapter link
                chap_id = utils.extract_page_num(chap_link)
                chap_words, chap_uptime = utils.extract_chapter_uptime_words(chap['title'])  # Update time and word count
                chap_data.append({
                    'href': chap_link,
                    'title': chap_title,
                    'chapter_id': chap_id,
                    'words': chap_words,
                    'update_time': chap_uptime,
                })
            table_of_contents.append({'title': section_title, 'chapters': chap_data})

    return table_of_contents
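
# Return-shape sketch for pase_chapter_list (placeholders only; the actual values come
# from the page and the utils helpers):
#   [{'title': '<section title>',
#     'chapters': [{'href': 'https://aabook.xyz/<chapter page>', 'title': '<chapter title>',
#                   'chapter_id': ..., 'words': ..., 'update_time': ...}, ...]},
#    ...]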

# Parse a book's chapter page
def parse_chapter_page(soup, url):
    # Get the chapter title
    chapter_title_tag = soup.find('h1', class_='chapter_title')
    if chapter_title_tag is None:
        logging.warning(f'Chapter title not found in {url}')
        return None, None

    title = chapter_title_tag.get_text().strip()
    content_url = None
    next_url = None
    chapid = utils.extract_page_num(url)

    # Walk every <script> tag looking for the content-page link
    script_tags = soup.find_all('script')
    for script_tag in script_tags:
        script_content = script_tag.string
        if script_content and re.search(r'\.get\("./_getcontent\.php', script_content):
            # Found the target script; extract the _getcontent.php URL template
            match = re.search(r'\.get\("\./_getcontent\.php\?id="\+\s*chapid\s*\+\s*"&v=([^"]+)"', script_content)
            if match:
                # Pull the v parameter out of the match
                v_value = match.group(1)
                # Build the full content_url
                content_url = f"{host_url}/_getcontent.php?id={chapid}&v={v_value}"
                break
    if content_url is None:
        logging.warning(f'Content url not found in {url}')
        return None, None

    # Collect the novel's table of contents
    table_of_contents = []
    div_table_of_contents = soup.find('div', class_='mulu_con')
    if div_table_of_contents:  # TODO: decide whether this block should be kept
        section_titles = div_table_of_contents.find_all('p')
        sections = div_table_of_contents.find_all('ul')
        if len(sections) != len(section_titles):
            logging.warning('sections not matched titles')
        else:
            for i in range(len(sections)):
                section_title = section_titles[i].get_text().strip()
                chap_list = sections[i].find_all("a")
                chap_data = []
                for chap in chap_list:
                    chap_title = chap.get_text().strip()
                    chap_link = chap['href']
                    chap_data.append({'href': chap_link, 'title': chap_title})
                table_of_contents.append({'title': section_title, 'chapters': chap_data})

    # Find the link to the next chapter
    next_div = soup.find('div', class_='next_arrow')
    if next_div:
        next_page_tag = next_div.find('a', href=True, title=re.compile(r'下一章'))
        if next_page_tag:
            next_url = f"{host_url}/{next_page_tag['href']}" if next_page_tag['href'] else ''

    data = {'title': title, 'content_url': content_url, 'table_of_contents': table_of_contents}
    return data, next_url
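
# Illustrative sketch of the content-URL extraction above. The inline script on a
# chapter page is expected to contain something like
#   $.get("./_getcontent.php?id="+ chapid + "&v=abc123", ...)
# where "abc123" and the default chapid are made-up values; the regex pulls out the
# v parameter so the content endpoint can be requested directly.
def _demo_extract_content_url(chapid='12345'):
    script = '$.get("./_getcontent.php?id="+ chapid + "&v=abc123", function(d){});'
    match = re.search(r'\.get\("\./_getcontent\.php\?id="\+\s*chapid\s*\+\s*"&v=([^"]+)"', script)
    return f"{host_url}/_getcontent.php?id={chapid}&v={match.group(1)}" if match else None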

def process_paragraph(paragraph):
    # Work on the full HTML of the tag rather than get_text(),
    # so watermark elements can be removed first
    paragraph_html = str(paragraph)

    # Remove watermark tags (elements carrying a class attribute)
    cleaned_html = re.sub(r'<[^>]+class="[^"]+">.*?</[^>]+>', '', paragraph_html, flags=re.DOTALL)

    # Parse the cleaned HTML with BeautifulSoup and extract the text
    soup = BeautifulSoup(cleaned_html, 'html.parser')
    cleaned_text = soup.get_text().strip()

    return cleaned_text
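
# Quick sketch of what process_paragraph does (hypothetical markup; the site's real
# watermark tags may differ): any child element carrying a class attribute is
# stripped before the text is extracted.
def _demo_process_paragraph():
    sample = BeautifulSoup('<p>Chapter text<span class="mark">watermark</span></p>', 'html.parser').find('p')
    return process_paragraph(sample)  # expected: 'Chapter text'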

# Parse a content page
def parse_content_page2(soup, url):
    content = []
    paragraphs = soup.find_all('p')
    if paragraphs:
        for paragraph in paragraphs:
            cleaned_text = process_paragraph(paragraph)
            content.append(cleaned_text)
    else:
        # Some pages have no <p> tags, only a single <h1>; handle that case too
        paragraphs = soup.find_all('h1')
        if paragraphs:
            for paragraph in paragraphs:
                cleaned_text = process_paragraph(paragraph)
                content.append(cleaned_text)

    # Some pages only contain <br> tags; soup.stripped_strings yields all
    # text nodes with surrounding whitespace removed
    if len(content) == 0:
        content = [block.strip() for block in soup.stripped_strings if block.strip()]

    return content

def parse_content_page(soup, url):
    content = []

    # Extract all <p> and <h1> tags
    paragraphs = soup.find_all(['p', 'h1'])
    if paragraphs:
        for paragraph in paragraphs:
            cleaned_text = process_paragraph(paragraph)
            if cleaned_text:
                content.append(cleaned_text)

    # If no <p> or <h1> was found, fall back to extracting all text, still removing watermarks
    if not content:
        cleaned_html = process_paragraph(soup)
        cleaned_soup = BeautifulSoup(cleaned_html, 'html.parser')
        content = [block.strip() for block in cleaned_soup.stripped_strings if block.strip()]

    return content

# Generic HTML structure validator
def generic_validator(soup, tag, identifier, attr_type="id"):
    if attr_type == "id":
        return soup.find(tag, id=identifier) is not None
    elif attr_type == "class":
        return bool(soup.find_all(tag, class_=identifier))
    elif attr_type == "name":
        return bool(soup.find('select', {'name': identifier}))
    return False
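
# generic_validator supports three lookup modes and is normally bound with
# functools.partial before being handed to fetch_page, e.g. (sketch only; the "id"
# and "name" identifiers below are placeholders, only the "class" bindings appear
# in the test helpers further down):
#   partial(generic_validator, tag="div", identifier="list_main", attr_type="class")
#   partial(generic_validator, tag="div", identifier="some_id", attr_type="id")
#   partial(generic_validator, tag="select", identifier="some_name", attr_type="name")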

# Check whether the content is polluted with watermark text
def content_validator(soup):
    text = str(soup)
    dirty_words = ['2005-2024 疯情书库', '2005-2025 疯情书库', '2025 疯情书库', '2026 疯情书库', '2027 疯情书库']
    for word in dirty_words:
        if word in text:
            return False

    return True
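
# Behaviour sketch for content_validator (illustrative inputs only):
#   content_validator(BeautifulSoup('<p>normal chapter text</p>', 'html.parser'))      # True
#   content_validator(BeautifulSoup('<p>2005-2024 疯情书库</p>', 'html.parser'))        # False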

def test_content_page(url):
    soup, status_code = fetch_page(url, content_validator)
    if soup:
        data = parse_content_page(soup, url)
        if data:
            return data
    return []

def test_chapter_page(url):
    soup, status_code = fetch_page(url, partial(generic_validator, tag="h1", identifier="chapter_title", attr_type="class"))
    if soup:
        data, next_url = parse_chapter_page(soup, url)
        if data:
            return data
    return None

def test_book_detail(url):
    soup, status_code = fetch_page(url, partial(generic_validator, tag="li", identifier="zuopinxinxi", attr_type="class"))
    if soup:
        detail = parse_book_detail(soup, url)
        return detail

def test_book_list():
    for num in range(5):
        url = list_url_update.format(num + 1)  # Page numbers start at 1
        soup, status_code = fetch_page(url, partial(generic_validator, tag="div", identifier="list_main", attr_type="class"))
        if soup:
            # Get the book list
            list_data, next_url = parse_book_list(soup, url=url)
            for item in list_data:
                # Get the detail page
                detail = test_book_detail(item['href'])
                if detail:
                    print({
                        **item,
                        **detail
                    })

                    # Get the chapter page
                    page_data = test_chapter_page(detail['start_page_href'])
                    if page_data:
                        print(page_data)
                        # Get the content
                        contents = test_content_page(page_data['content_url'])
                        if contents:
                            print(contents[0])

                else:
                    print('get detail error.')
                    return

if __name__ == "__main__":
    test_book_list()