modify aabook files.

This commit is contained in:
2024-11-04 10:56:43 +08:00
parent fcf6f8a945
commit 2c3b1b7cdf
4 changed files with 136 additions and 25 deletions

scripts/bak_get_aabook.py Normal file

@@ -0,0 +1,222 @@
"""
Script Name:
Description: Fetch data from aabook.xyz. Prompt:
We need to visit https://aabook.xyz/read-{chapid}.html, parse the elements in the page, issue the download request, and format the returned data. Detailed requirements:
The chapid parameter is the chapter number of a novel. We define a mapping, e.g. novel_map = {350316:'novel1', ...}.
Iterate over novel_map; for each key (call it novel_id), substitute it into the URL above to build the request address and fetch its content, which is an HTML page. Record the corresponding value as novel_name and create the file {novel_name}.txt.
Parse the <h1 class="chapter_title">第三章 惊人的任务</h1> tag in the body to obtain the title.
Parse the <div class="next_arrow"><a href="read-350317.html" title="下一章 第四章 急病急医" class="pngFix"></a></div> tag in the body to get the link it contains; prepend the site domain and this is the next_page address. If the tag is absent, every chapter has been read.
The body contains a piece of JavaScript, $.get("./_getcontent.php?id="+chapid+"&v=f2cd0JFa_wH0alpBjF4xgS2WFKyo0mQijsHgPQhZmBEjKCEP0wes", ...). Parse out the address it requests, prepend the domain, and that is the real content URL, content_url.
Fetch content_url and parse every <p></p> tag in its body; each one is a paragraph, which we call a part.
Inside each part, remove content such as <rt class="Odj9EB5dqNidqH7W57IvJMpHzRq5W">feng情书库</rt>; it is the site's hidden watermark. Its signature is a span wrapped in an arbitrary HTML tag, <{label} class="" >XXXX</{label}>, where label can be any string.
Write the title into {novel_name}.txt, then write every part in a loop, appending a newline after each part.
If there is a next_page, continue this process until everything is done; this completes the download of the novel for novel_id.
Keep iterating over novel_map until every novel is downloaded.
Please understand the requirements above and write the corresponding Python code.
Author: [Your Name]
Created Date: YYYY-MM-DD
Last Modified: YYYY-MM-DD
Version: 1.0
Modification History:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
- YYYY-MM-DD [Your Name]:
"""
import re
import os
import requests
from bs4 import BeautifulSoup
import time
import random
import logging
import config  # logging configuration
from aabook_list import novel_map
config.setup_logging()
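# Assumptions (the two local modules above are not shown here): per the prompt in the
# docstring, novel_map is expected to look like {350316: 'novel1', ...}, mapping a novel's
# first chapter ID to the name used for its output file, and config.setup_logging() is
# expected to configure the root logger (level, format, handlers).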
# User-Agent pool to rotate through
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0",
    "Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36"
]
dir_prefix = './aabook'
os.makedirs(dir_prefix, exist_ok=True)  # make sure the output directory exists before writing
# Fetch a page's content, with a retry mechanism
def get_page_content(url, max_retries=100, sleep_time=5, default_timeout=10):
    retries = 0
    # Pick a random User-Agent for this request
    headers = {
        'User-Agent': random.choice(user_agents)
    }
    while retries < max_retries:
        try:
            response = requests.get(url, headers=headers, timeout=default_timeout, stream=True)
            response.raise_for_status()
            return response.text  # request succeeded; return the body
        except requests.RequestException as e:
            retries += 1
            logging.warning(f"Warn fetching page {url}: {e}. Retrying {retries}/{max_retries}...")
            if retries >= max_retries:
                logging.error(f"Failed to fetch page {url} after {max_retries} retries.")
                return None
            time.sleep(sleep_time)  # sleep for the given interval, then retry
# Strip watermark fragments out of a chunk of HTML
def clean_watermarks(html):
    """
    Filter out watermark tags that carry a class attribute (and their inner content),
    keeping the rest of the tag structure.
    """
    # Use a regex to match and remove any HTML tag with a class attribute, together with its content
    cleaned_html = re.sub(r'<[^>]+class="[^"]+">.*?</[^>]+>', '', html, flags=re.DOTALL)
    return cleaned_html
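# Usage sketch (the surrounding text is illustrative; the watermark tag is the one quoted
# in the prompt above):
#   clean_watermarks('<p>text<rt class="Odj9EB5dqNidqH7W57IvJMpHzRq5W">feng情书库</rt>more</p>')
# returns '<p>textmore</p>' -- the whole watermark tag and its inner content are dropped.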
def process_paragraph(paragraph):
    # Keep the full HTML structure rather than calling get_text() right away
    paragraph_html = str(paragraph)
    # Remove the watermark tags
    cleaned_html = clean_watermarks(paragraph_html)
    # Parse the cleaned HTML with BeautifulSoup and extract the plain text
    soup = BeautifulSoup(cleaned_html, 'html.parser')
    cleaned_text = soup.get_text().strip()
    return cleaned_text
# Extract content_url from the page's <script> tags
def extract_content_url(soup, base_url, chapid):
    # Find all <script> tags
    script_tags = soup.find_all('script')
    # Walk every <script> tag and look for the one containing the _getcontent call
    for script_tag in script_tags:
        script_content = script_tag.string
        if script_content and re.search(r'\.get\("./_getcontent\.php', script_content):
            # Found the call; extract the _getcontent.php URL template
            match = re.search(r'\.get\("\./_getcontent\.php\?id="\+\s*chapid\s*\+\s*"&v=([^"]+)"', script_content)
            if match:
                # Pull the v parameter value out of the match
                v_value = match.group(1)
                # Build the full content_url
                content_url = f"{base_url}/_getcontent.php?id={chapid}&v={v_value}"
                return content_url
    # No matching <script> tag found; return None
    return None
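# Usage sketch, based on the JavaScript snippet quoted in the prompt above: for a script
# body containing
#   $.get("./_getcontent.php?id="+chapid+"&v=f2cd0JFa_wH0alpBjF4xgS2WFKyo0mQijsHgPQhZmBEjKCEP0wes", ...)
# and chapid=350316, extract_content_url() would return
# 'https://aabook.xyz/_getcontent.php?id=350316&v=f2cd0JFa_wH0alpBjF4xgS2WFKyo0mQijsHgPQhZmBEjKCEP0wes'.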
# Check whether the returned content is polluted
def check_content(content):
    if '2005-2024 疯情书库' in content:
        return False
    return True
# Parse a novel chapter by chapter and save the content to a file
def download_novel(chapid, novel_name, novel_file_str):
    base_url = 'https://aabook.xyz'
    chapter_url = f'{base_url}/read-{chapid}.html'
    while chapter_url:
        logging.info(f"Processing: [{novel_name}] [{chapid}] {chapter_url}")
        # Fetch the chapter page
        html_content = get_page_content(chapter_url)
        if html_content is None:
            logging.error(f"Get page error {chapter_url}, retry...")
            time.sleep(2)
            continue
        # Parse the chapter page
        soup = BeautifulSoup(html_content, 'html.parser')
        # Get the chapter title
        chapter_title_tag = soup.find('h1', class_='chapter_title')
        if chapter_title_tag:
            chapter_title = chapter_title_tag.get_text().strip()
            logging.info(f"Processing: [{novel_name}] [{chapid}] Chapter Title: {chapter_title}")
        else:
            logging.error(f"Chapter title not found in {chapter_url}, retry...")
            time.sleep(2)
            continue
        # Extract the request URL for the chapter body
        content_url = extract_content_url(soup, base_url, chapid)
        if content_url:
            logging.info(f"Fetching content from: {content_url}")
            # Fetch the chapter body
            content_response = get_page_content(content_url)
            if content_response:
                if not check_content(content_response):
                    logging.error(f'error response. {content_response}')
                    time.sleep(2)
                    continue
                content_soup = BeautifulSoup(content_response, 'html.parser')
                paragraphs = content_soup.find_all('p')
                # Write the chapter title and each cleaned paragraph to the file. The title
                # is written here, only after the body has been fetched and checked, so a
                # retry via `continue` above cannot append it more than once.
                with open(novel_file_str, 'a', encoding='utf-8') as f:
                    f.write(chapter_title + '\n\n')
                    for paragraph in paragraphs:
                        cleaned_text = process_paragraph(paragraph)
                        f.write(cleaned_text + '\n\n')
                logging.info(f"Writing content to file. [{novel_name}] [{chapid}] [{chapter_title}]")
            else:
                logging.info(f"Fetching content error: [{novel_name}] {content_url}, retry...")
                time.sleep(2)
                continue
        else:
            logging.info(f"Content URL not found in [{novel_name}] {chapter_url}, retry...")
            time.sleep(2)
            continue
        # Look for the link to the next chapter
        next_div = soup.find('div', class_='next_arrow')
        # Check whether the div holding the next-chapter link exists
        if next_div:
            next_page_tag = next_div.find('a', href=True, title=re.compile(r'下一章'))
            if next_page_tag:
                next_page_url = next_page_tag['href']
                # Extract the numeric chapter ID with a regex
                chapid_match = re.search(r'read-(\d+)\.html', next_page_url)
                if chapid_match:
                    chapid = chapid_match.group(1)  # the extracted chapter ID
                    chapter_url = f"{base_url}/{next_page_url}"
                    logging.debug(f"Next chapter URL: {chapter_url}, chapid: {chapid}")
                else:
                    logging.info(f"Failed to extract chapid from next_page_url: {next_page_url}")
                    break
            else:
                logging.info(f"No next page found. Ending download for {novel_name}.")
                break
        else:
            logging.info(f"No 'next_arrow' div found in {chapter_url}. Ending download.")
            break
        time.sleep(2)
# Iterate over novel_map and download every novel
for novel_id, novel_name in novel_map.items():
    logging.info(f"Starting download for {novel_name} (ID: {novel_id})")
    file_str = f'{dir_prefix}/{novel_id}_{novel_name}.txt'
    if os.path.exists(file_str):
        os.remove(file_str)  # if a file with the same name already exists, delete it and download again
    download_novel(novel_id, novel_name, file_str)
    logging.info(f"Completed download for {novel_id}_{novel_name}.\n")