# resources/aabook/src/scraper.py

import time
import json
import csv
import logging
import signal
import sys
import os
import re
import requests
import random
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from functools import partial
import config
import utils
# Base URL and list-page URL template (pageNum is left as a placeholder for .format())
host_url = 'https://aabook.xyz'
list_url_update = f'{host_url}/category.html?pageNum={{}}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update'
#list_url_wordcount = 'https://aabook.xyz/category.html?pageNum=1&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
# Pool of User-Agent strings, one picked at random per request
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0",
    "Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36"
]
# Fetch a page with requests, validate the parsed result with a caller-supplied
# check, and optionally preprocess the HTML and choose the parser.
def fetch_page(url, validator, parser="html.parser", preprocessor=None, max_retries=3, sleep_time=5, default_timeout=10):
    for attempt in range(max_retries):
        try:
            if 'aabook.xyz' not in url.lower():
                logging.error(f'wrong url format: {url}')
                return None, None
            # Pick a random User-Agent for this request
            headers = {
                'User-Agent': random.choice(user_agents)
            }
            response = requests.get(url, headers=headers, timeout=default_timeout, stream=True)
            # Handle HTTP status codes
            if response.status_code == 404:
                logging.warning(f"Page not found (404): {url}")
                return None, 404  # Return 404 directly so the caller can skip this page
            response.raise_for_status()  # Raise on other HTTP errors
            # Preprocess the HTML if a preprocessor was supplied
            html_text = preprocessor(response.text) if preprocessor else response.text
            soup = BeautifulSoup(html_text, parser)
            if validator(soup):  # Run the caller-supplied page check
                return soup, response.status_code
            logging.warning(f"Validation failed on attempt {attempt + 1} for {url}")
        except requests.RequestException as e:
            logging.warning(f"fetching page ({url}) error: {e}, Retrying ...")
        time.sleep(sleep_time)  # Sleep for the given interval, then retry
    logging.error(f'Fetching failed after max retries. {url}')
    return None, None  # Still failing after the maximum number of retries
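# A minimal usage sketch (not part of the original flow): wiring fetch_page to
# generic_validator (defined further down) via functools.partial, the same
# pattern the test_* helpers at the bottom of this file use. The function name
# is hypothetical.
def demo_fetch_first_list_page():
    url = list_url_update.format(1)
    validator = partial(generic_validator, tag="div", identifier="list_main", attr_type="class")
    soup, status = fetch_page(url, validator)
    return soup is not None, status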
# Parse a list page
def parse_book_list(soup, url):
    # Locate the book list container
    list_main = soup.find('div', class_='list_main')
    if not list_main:
        logging.warning(f"No list_main Found in {url}")
        return None, None
    tbody = list_main.find('tbody')
    if not tbody:
        logging.warning(f"No tbody found in {url}")
        return None, None
    list_data = []
    next_url = None
    # Basic fields per book: rank, category, title, author, monthly tickets, and
    # a final column that is the total word count when sorted by word count, or
    # the last-update date when sorted by date
    for tr in tbody.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) < 6:
            logging.info("Invalid tr format.")
            continue
        ranking = tds[0].text.strip()
        category = utils.remove_brackets_regex(tds[1].text.strip())
        book_link_tag = tds[2].find('a')
        book_name = book_link_tag.text.strip()
        book_link = host_url + '/' + book_link_tag['href'] if book_link_tag.get('href') else ''
        book_num = utils.extract_book_num(book_link_tag['href'])
        author = tds[3].text.strip()
        monthly_tickets = tds[4].text.strip()
        update_time = tds[5].text.strip()  # Word count or last-update date, depending on the sort order
        list_data.append({
            'rank': ranking,
            'category': category,
            'name': book_name,
            'href': book_link,
            'num': book_num,
            'author': author,
            'tickets': monthly_tickets,
            'update_time': update_time
        })
    # Find the next-page link ('下一页' is the site's "next page" label)
    next_page_tag = soup.find('a', title='下一页')
    if next_page_tag:
        next_url = host_url + next_page_tag['href']
    return list_data, next_url
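# A minimal pagination sketch (an assumption, not in the original file): follow
# the next_url returned by parse_book_list until it runs out, collecting rows.
# The function name and the max_pages safety cap are hypothetical.
def demo_crawl_list_pages(max_pages=3):
    validator = partial(generic_validator, tag="div", identifier="list_main", attr_type="class")
    url = list_url_update.format(1)
    rows = []
    for _ in range(max_pages):
        soup, _status = fetch_page(url, validator)
        if soup is None:
            break
        page_rows, next_url = parse_book_list(soup, url)
        if page_rows:
            rows.extend(page_rows)
        if not next_url:
            break
        url = next_url
        time.sleep(random.uniform(1, 3))  # be polite between list pages
    return rows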
# Parse a book detail page
def parse_book_detail(soup, url):
    # Parse the book's metadata block
    book_info_tag = soup.find('li', class_='zuopinxinxi')
    if not book_info_tag:
        logging.warning(f"No details found in {url}")
        return None
    table_of_contents_href = ''
    table_of_contents_href_tag = soup.find('li', class_='xiaoshuomulu')
    if table_of_contents_href_tag:
        table_of_contents_href = host_url + table_of_contents_href_tag.find('a')['href']
    book_info_lis = book_info_tag.find_all('li')
    if len(book_info_lis) < 4:
        logging.info(f"invalid book info in {url}")
        return None
    book_category = book_info_lis[0].find('span').text.strip()
    book_status = book_info_lis[1].find('span').text.strip()
    # Keep only the digits; drop the trailing Chinese unit characters
    total_word_count = book_info_lis[2].find('span').text.strip()
    word_count_match = re.search(r'\d+', total_word_count)
    total_word_count = int(word_count_match.group()) if word_count_match else 0
    total_clicks = book_info_lis[3].find('span').text.strip()
    month_clicks = book_info_lis[4].find('span').text.strip() if len(book_info_lis) > 4 else '0'
    week_clicks = book_info_lis[5].find('span').text.strip() if len(book_info_lis) > 5 else '0'
    total_recommend = book_info_lis[6].find('span').text.strip() if len(book_info_lis) > 6 else '0'
    month_recommend = book_info_lis[7].find('span').text.strip() if len(book_info_lis) > 7 else '0'
    week_recommend = book_info_lis[8].find('span').text.strip() if len(book_info_lis) > 8 else '0'
    # Read the creation time
    creation_time_tag = soup.find('li', class_='update_time')
    created_time = utils.extract_create_time(creation_time_tag.text.strip() if creation_time_tag else '')
    # Get the first-chapter link and its page number
    start_page_tag = soup.find('ul', class_='gezhonganniu').find_all('li')[0].find('a')
    start_page_link = host_url + '/' + start_page_tag['href']
    start_page_number = start_page_link.split('-')[-1].replace('.html', '')
    return {
        'category': book_category,
        'status': book_status,
        'total_words': total_word_count,
        'total_clicks': total_clicks,
        'month_clicks': month_clicks,
        'week_clicks': week_clicks,
        'total_recommend': total_recommend,
        'month_recommend': month_recommend,
        'week_recommend': week_recommend,
        'created_time': created_time,
        'start_page_href': start_page_link,
        'start_page_num': start_page_number,
        'table_of_contents_href': table_of_contents_href
    }
# Parse a book's table-of-contents page
def parse_chapter_list(soup, url):
    # Collect the novel's table of contents
    table_of_contents = []
    div_table_of_contents = soup.find('div', class_='page_main')
    if not div_table_of_contents:
        return None
    section_titles = div_table_of_contents.find_all('p', class_='section_title')
    sections = div_table_of_contents.find_all('ul', class_='section_list')
    if len(sections) > len(section_titles):  # Typically sections has one more entry than titles; the extra one is an ad
        logging.warning(f"sections not matched titles, url: {url}, titles: {len(section_titles)}, sections: {len(sections)}")
        return None
    for i in range(len(sections)):
        section_title = section_titles[i].get_text().strip()
        chap_list = sections[i].find_all("a")
        chap_data = []
        for chap in chap_list:
            chap_title = chap.get_text().strip()  # Chapter title
            chap_link = f"{host_url}/{chap['href']}"  # Chapter link
            chap_id = utils.extract_page_num(chap_link)
            chap_words, chap_uptime = utils.extract_chapter_uptime_words(chap['title'])  # Update time and word count
            chap_data.append({
                'href': chap_link,
                'title': chap_title,
                'chapter_id': chap_id,
                'words': chap_words,
                'update_time': chap_uptime,
            })
        table_of_contents.append({'title': section_title, 'chapters': chap_data})
    return table_of_contents
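# A minimal export sketch (an assumption; this file imports csv but never uses
# it in the code above): flatten the nested structure returned by
# parse_chapter_list into one CSV row per chapter. The function name and
# output path are hypothetical.
def demo_write_chapter_csv(table_of_contents, out_path='chapters.csv'):
    with open(out_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['section', 'chapter_id', 'title', 'words', 'update_time', 'href'])
        for section in table_of_contents:
            for chap in section['chapters']:
                writer.writerow([section['title'], chap['chapter_id'], chap['title'],
                                 chap['words'], chap['update_time'], chap['href']])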
# Parse a chapter page
def parse_chapter_page(soup, url):
    # Get the chapter title
    chapter_title_tag = soup.find('h1', class_='chapter_title')
    if chapter_title_tag is None:
        logging.warning(f'Chapter title not found in {url}')
        return None, None
    title = chapter_title_tag.get_text().strip()
    content_url = None
    next_url = None
    chapid = utils.extract_page_num(url)
    # Walk every <script> tag looking for the content-page link
    script_tags = soup.find_all('script')
    for script_tag in script_tags:
        script_content = script_tag.string
        if script_content and re.search(r'\.get\("./_getcontent\.php', script_content):
            # Found the marker; extract the _getcontent.php URL template
            match = re.search(r'\.get\("\./_getcontent\.php\?id="\+\s*chapid\s*\+\s*"&v=([^"]+)"', script_content)
            if match:
                # Pull the v parameter value out of the match
                v_value = match.group(1)
                # Build the full content_url
                content_url = f"{host_url}/_getcontent.php?id={chapid}&v={v_value}"
                break
    if content_url is None:
        logging.warning(f'Content url not found in {url}')
        return None, None
    # Collect the novel's table of contents
    table_of_contents = []
    div_table_of_contents = soup.find('div', class_='mulu_con')
    if div_table_of_contents:  # Optional block; the original author was unsure whether to keep it
        section_titles = div_table_of_contents.find_all('p')
        sections = div_table_of_contents.find_all('ul')
        if len(sections) != len(section_titles):
            logging.warning('sections not matched titles')
        else:
            for i in range(len(sections)):
                section_title = section_titles[i].get_text().strip()
                chap_list = sections[i].find_all("a")
                chap_data = []
                for chap in chap_list:
                    chap_title = chap.get_text().strip()
                    chap_link = chap['href']
                    chap_data.append({'href': chap_link, 'title': chap_title})
                table_of_contents.append({'title': section_title, 'chapters': chap_data})
    # Find the next-chapter link ('下一章' is the site's "next chapter" label)
    next_div = soup.find('div', class_='next_arrow')
    if next_div:
        next_page_tag = next_div.find('a', href=True, title=re.compile(r'下一章'))
        if next_page_tag:
            next_url = f"{host_url}/{next_page_tag['href']}" if next_page_tag['href'] else ''
    data = {'title': title, 'content_url': content_url, 'table_of_contents': table_of_contents}
    return data, next_url
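# A minimal crawl sketch (an assumption, not in the original file): follow
# next_url from chapter to chapter starting at a book's start page, with a
# random delay between requests. The function name and the max_chapters
# safety cap are hypothetical.
def demo_walk_chapters(start_url, max_chapters=5):
    validator = partial(generic_validator, tag="h1", identifier="chapter_title", attr_type="class")
    url = start_url
    titles = []
    while url and len(titles) < max_chapters:
        soup, _status = fetch_page(url, validator)
        if soup is None:
            break
        data, url = parse_chapter_page(soup, url)
        if data is None:
            break
        titles.append(data['title'])
        time.sleep(random.uniform(1, 3))  # throttle between chapter requests
    return titles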
def process_paragraph(paragraph):
    # Work on the node's full HTML rather than get_text()
    paragraph_html = str(paragraph)
    # Strip watermark tags (any element carrying a class attribute)
    cleaned_html = re.sub(r'<[^>]+class="[^"]+">.*?</[^>]+>', '', paragraph_html, flags=re.DOTALL)
    # Re-parse the cleaned HTML with BeautifulSoup and extract the text
    soup = BeautifulSoup(cleaned_html, 'html.parser')
    cleaned_text = soup.get_text().strip()
    return cleaned_text
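# A worked example (an assumption; the sample markup is made up) showing what
# process_paragraph does: elements carrying a class attribute are treated as
# watermarks and dropped, leaving only the real paragraph text.
def demo_process_paragraph():
    sample = BeautifulSoup('<p>real text<span class="wm">watermark</span> more</p>', 'html.parser').p
    return process_paragraph(sample)  # -> 'real text more'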
# Parse a content page (alternate implementation; test_content_page below uses parse_content_page)
def parse_content_page2(soup, url):
    content = []
    paragraphs = soup.find_all('p')
    if paragraphs:
        for paragraph in paragraphs:
            cleaned_text = process_paragraph(paragraph)
            content.append(cleaned_text)
    else:
        # Some pages have no <p> tags, only a single <h1>; handle that case too
        paragraphs = soup.find_all('h1')
        if paragraphs:
            for paragraph in paragraphs:
                cleaned_text = process_paragraph(paragraph)
                content.append(cleaned_text)
    # Some pages only use <br> tags; soup.stripped_strings yields every text node with whitespace stripped
    if len(content) == 0:
        content = [block.strip() for block in soup.stripped_strings if block.strip()]
    return content
def parse_content_page(soup, url):
    content = []
    # Extract every <p> and <h1> tag
    paragraphs = soup.find_all(['p', 'h1'])
    if paragraphs:
        for paragraph in paragraphs:
            cleaned_text = process_paragraph(paragraph)
            if cleaned_text:
                content.append(cleaned_text)
    # If no <p> or <h1> was found, fall back to extracting all text, still removing watermarks
    if not content:
        cleaned_html = process_paragraph(soup)
        cleaned_soup = BeautifulSoup(cleaned_html, 'html.parser')
        content = [block.strip() for block in cleaned_soup.stripped_strings if block.strip()]
    return content
# Generic HTML structure validator
def generic_validator(soup, tag, identifier, attr_type="id"):
    if attr_type == "id":
        return soup.find(tag, id=identifier) is not None
    elif attr_type == "class":
        return bool(soup.find_all(tag, class_=identifier))
    elif attr_type == "name":
        return bool(soup.find('select', {'name': identifier}))
    return False
# Check whether the content has been polluted with the site's copyright watermark
def content_validator(soup):
    text = str(soup)
    dirty_words = ['2005-2024 疯情书库', '2005-2025 疯情书库', '2025 疯情书库', '2026 疯情书库', '2027 疯情书库']
    for word in dirty_words:
        if word in text:
            return False
    return True
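# A tiny check (an assumption; the markup is made up) illustrating
# content_validator: pages carrying the copyright watermark fail validation,
# clean pages pass. The function name is hypothetical.
def demo_content_validator():
    dirty = BeautifulSoup('<p>2005-2025 疯情书库</p>', 'html.parser')
    clean = BeautifulSoup('<p>正文内容</p>', 'html.parser')
    return content_validator(dirty) is False and content_validator(clean) is True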
def test_content_page(url):
    soup, status_code = fetch_page(url, content_validator)
    if soup:
        data = parse_content_page(soup, url)
        if data:
            return data
    return []
def test_chapter_page(url):
    soup, status_code = fetch_page(url, partial(generic_validator, tag="h1", identifier="chapter_title", attr_type="class"))
    if soup:
        data, next_url = parse_chapter_page(soup, url)
        if data:
            return data
    return None
def test_book_detail(url):
    soup, status_code = fetch_page(url, partial(generic_validator, tag="li", identifier="zuopinxinxi", attr_type="class"))
    if soup:
        return parse_book_detail(soup, url)
    return None
def test_book_list():
    for num in range(5):
        url = list_url_update.format(num + 1)  # pageNum starts at 1
        soup, status_code = fetch_page(url, partial(generic_validator, tag="div", identifier="list_main", attr_type="class"))
        if soup:
            # Get the book list
            list_data, next_url = parse_book_list(soup, url=url)
            for item in list_data or []:
                # Get the detail page
                detail = test_book_detail(item['href'])
                if detail:
                    print({
                        **item,
                        **detail
                    })
                    # Get the first chapter page
                    page_data = test_chapter_page(detail['start_page_href'])
                    if page_data:
                        print(page_data)
                        # Get the chapter content
                        contents = test_content_page(page_data['content_url'])
                        if contents:
                            print(contents[0])
                else:
                    print('get detail error.')
                    return
if __name__ == "__main__":
    test_book_list()