"""
Script Name:
Description: 获取 aabook.xyz 数据, prompt:
我们需要访问 https://aabook.xyz/read-{chapid}.html 这个地址,并解析其中的元素,发起下载链接,格式化返回的数据,以下是需求详细描述:
chapid 参数,它代表的是小说编号,我们定义一个映射,比如 novel_map = {350316:'novel1', ...} 等等
我们遍历 novel_map对每一个key值假设为 novel_id传入上面的URL组成一个访问地址获取它的内容它是一个HTML页面把对应的 value 记为 novel_name创建 {novel_name}.txt 文件;
我们解析它的 body 中的 <h1 class="chapter_title">第三章 惊人的任务</h1> 标签,获得对应的 title
我们解析它的 body 中的 <div class="next_arrow"><a href="read-350317.html" title="下一章 第四章 急病急医" class="pngFix"></a></div> 标签,得到里面的链接地址,拼上访问域名,就是 next_page 的地址;如果标签不存在,则说明已经全部读取完毕;
在 body中有一段 javascript 代码,$.get("./_getcontent.php?id="+chapid+"&v=f2cd0JFa_wH0alpBjF4xgS2WFKyo0mQijsHgPQhZmBEjKCEP0wes", 我们需要解析出它访问的地址,加上域名,得到真正的内容 content_url;
访问 content_url ,获取它的内容,解析 body 中所有 <p></p> 标签的内容,每一部分都是一个段落,我们定义为 part
解析part中的内容把其中诸如 <rt class="Odj9EB5dqNidqH7W57IvJMpHzRq5W">feng情书库</rt> 这样的内容直接去掉,它是网站的隐藏水印;其特征是用 任意 HTML 标签包裹着的部分, <{label} class="" >XXXX</{label}> 其中label可能为任意的字符串
我们把 title 写入到 {novel_name}.txt 中;并循环写入所有的 part 注意每次写入一个part都添加换行。
如果有 next_page 那么就继续这个过程,一直到全部完成,这样我们就完成了 novel_id 对应的小说的下载。
继续遍历 novel_map完成所有小说的下载。
请你理解上述需求并写出对应的python代码。
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import re
import os
import requests
from bs4 import BeautifulSoup
import time
import random
import logging
import config  # logging configuration
from aabook_list import novel_map
config.setup_logging()
# Pool of User-Agent strings; one is chosen at random for each request
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0",
    "Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36"
]
dir_prefix = './aabook'

# Fetch a page with a retry mechanism; returns the response body, or None once
# the retries are exhausted
def get_page_content(url, max_retries=100, sleep_time=5, default_timeout=10):
    retries = 0
    # Pick a random User-Agent for this request
    headers = {
        'User-Agent': random.choice(user_agents)
    }
    while retries < max_retries:
        try:
            response = requests.get(url, headers=headers, timeout=default_timeout, stream=True)
            response.raise_for_status()
            return response.text  # Request succeeded, return the body
        except requests.RequestException as e:
            retries += 1
            logging.info(f"Warn fetching page {url}: {e}. Retrying {retries}/{max_retries}...")
            if retries >= max_retries:
                logging.error(f"Failed to fetch page {url} after {max_retries} retries.")
                return None
            time.sleep(sleep_time)  # Wait before retrying

# Strip the watermark fragments embedded in the content
def clean_watermarks(html):
    """
    Remove watermark tags (any tag that carries a class attribute) together with
    their inner content, while keeping the rest of the tag structure.
    """
    # Match any HTML tag that has a class attribute plus everything up to its
    # closing tag, and drop the whole span
    cleaned_html = re.sub(r'<[^>]+class="[^"]+">.*?</[^>]+>', '', html, flags=re.DOTALL)
    return cleaned_html


def process_paragraph(paragraph):
    # Keep the full HTML of the tag rather than calling get_text() right away,
    # so the watermark tags are still visible to the regex
    paragraph_html = str(paragraph)
    # Remove the watermark tags
    cleaned_html = clean_watermarks(paragraph_html)
    # Re-parse the cleaned HTML and extract the plain text
    soup = BeautifulSoup(cleaned_html, 'html.parser')
    cleaned_text = soup.get_text().strip()
    return cleaned_text
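
# Example (watermark sample taken from the module docstring): given the HTML
#   '<p>text before<rt class="Odj9EB5dqNidqH7W57IvJMpHzRq5W">feng情书库</rt>text after</p>'
# clean_watermarks() drops the whole <rt ...>feng情书库</rt> span, and
# process_paragraph() then returns the plain text 'text beforetext after'.
# ("text before" / "text after" are placeholder strings, not real page content.)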

# Extract the content_url from the page's <script> tags
def extract_content_url(soup, base_url, chapid):
    # Collect every <script> tag
    script_tags = soup.find_all('script')
    # Look for the one that requests ./_getcontent.php
    for script_tag in script_tags:
        script_content = script_tag.string
        if script_content and re.search(r'\.get\("\./_getcontent\.php', script_content):
            # Found the call; pull the v parameter out of the URL template
            match = re.search(r'\.get\("\./_getcontent\.php\?id="\+\s*chapid\s*\+\s*"&v=([^"]+)"', script_content)
            if match:
                # Value of the v parameter
                v_value = match.group(1)
                # Build the full content_url
                content_url = f"{base_url}/_getcontent.php?id={chapid}&v={v_value}"
                return content_url
    # No matching <script> tag was found
    return None
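
# Example (script snippet from the module docstring): for a page containing
#   $.get("./_getcontent.php?id="+chapid+"&v=f2cd0JFa_wH0alpBjF4xgS2WFKyo0mQijsHgPQhZmBEjKCEP0wes", ...)
# extract_content_url() returns
#   https://aabook.xyz/_getcontent.php?id={chapid}&v=f2cd0JFa_wH0alpBjF4xgS2WFKyo0mQijsHgPQhZmBEjKCEP0wes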

# Walk a novel chapter by chapter and append every chapter to its text file
def download_novel(chapid, novel_name):
    base_url = 'https://aabook.xyz'
    chapter_url = f'{base_url}/read-{chapid}.html'
    while chapter_url:
        logging.info(f"Processing: [{novel_name}] [{chapid}] {chapter_url}")
        # Fetch the chapter page
        html_content = get_page_content(chapter_url)
        if html_content is None:
            logging.error(f"Get page error {chapter_url}, retry...")
            time.sleep(2)
            continue
        # Parse the chapter page
        soup = BeautifulSoup(html_content, 'html.parser')
        # Chapter title
        chapter_title_tag = soup.find('h1', class_='chapter_title')
        if chapter_title_tag:
            chapter_title = chapter_title_tag.get_text().strip()
            logging.info(f"Processing: [{novel_name}] [{chapid}] Chapter Title: {chapter_title}")
        else:
            logging.error(f"Chapter title not found in {chapter_url}, retry...")
            time.sleep(2)
            continue
        # Write the title to the file
        with open(f'{dir_prefix}/{novel_name}.txt', 'a', encoding='utf-8') as f:
            f.write(chapter_title + '\n\n')
        # Work out the URL that serves the chapter body
        content_url = extract_content_url(soup, base_url, chapid)
        if content_url:
            logging.info(f"Fetching content from: {content_url}")
            # Fetch the chapter body
            content_response = get_page_content(content_url)
            if content_response:
                content_soup = BeautifulSoup(content_response, 'html.parser')
                paragraphs = content_soup.find_all('p')
                # Write every paragraph to the file, watermarks removed
                with open(f'{dir_prefix}/{novel_name}.txt', 'a', encoding='utf-8') as f:
                    for paragraph in paragraphs:
                        cleaned_text = process_paragraph(paragraph)
                        f.write(cleaned_text + '\n\n')
                logging.info(f"Writing content to file. [{novel_name}] [{chapid}] [{chapter_title}]")
            else:
                logging.info(f"Fetching content error: [{novel_name}] {content_url}, retry...")
                continue
        else:
            logging.info(f"Content URL not found in [{novel_name}] {chapter_url}, retry...")
            continue
        # Look for the link to the next chapter
        next_div = soup.find('div', class_='next_arrow')
        # Only continue if the div holding the next-chapter link exists
        if next_div:
            next_page_tag = next_div.find('a', href=True, title=re.compile(r'下一章'))
            if next_page_tag:
                next_page_url = next_page_tag['href']
                # Pull the numeric chapter ID out of the href
                chapid_match = re.search(r'read-(\d+)\.html', next_page_url)
                if chapid_match:
                    chapid = chapid_match.group(1)  # extracted chapter ID
                    chapter_url = f"{base_url}/{next_page_url}"
                    logging.debug(f"Next chapter URL: {chapter_url}, chapid: {chapid}")
                else:
                    logging.info(f"Failed to extract chapid from next_page_url: {next_page_url}")
                    break
            else:
                logging.info(f"No next page found. Ending download for {novel_name}.")
                break
        else:
            logging.info(f"No 'next_arrow' div found in {chapter_url}. Ending download.")
            break
        time.sleep(2)
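
# Example usage (sample mapping from the module docstring): download_novel(350316, 'novel1')
# starts at https://aabook.xyz/read-350316.html and follows the next_arrow links
# chapter by chapter, appending each chapter to ./aabook/novel1.txt until no
# next-chapter link is found.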

# Make sure the output directory exists before writing any novel file
os.makedirs(dir_prefix, exist_ok=True)

# Iterate over novel_map and download every novel
for novel_id, novel_name in novel_map.items():
    logging.info(f"Starting download for {novel_name} (ID: {novel_id})")
    if os.path.exists(f'{dir_prefix}/{novel_name}.txt'):
        os.remove(f'{dir_prefix}/{novel_name}.txt')  # remove any existing file so the novel is re-downloaded from scratch
    download_novel(novel_id, novel_name)
    logging.info(f"Completed download for {novel_name}.\n")