From 2c3b1b7cdfd9a8ca28f600567f1cadf2e1dca7e3 Mon Sep 17 00:00:00 2001
From: oscar <oscar@easyprompt8.com>
Date: Mon, 4 Nov 2024 10:56:43 +0800
Subject: [PATCH] aabook: prefix output files with novel ID, skip polluted pages, add download_map and aabook_tools

---
 scripts/aabook_list.py                       | 22 ++++---
 scripts/aabook_tools.py                      | 61 ++++++++++++++++++++
 scripts/{get_aabook.py => bak_get_aabook.py} | 26 ++++++---
 scripts/get_aabook_list.py                   | 52 +++++++++++++----
 4 files changed, 136 insertions(+), 25 deletions(-)
 create mode 100644 scripts/aabook_tools.py
 rename scripts/{get_aabook.py => bak_get_aabook.py} (92%)

diff --git a/scripts/aabook_list.py b/scripts/aabook_list.py
index 419e445..bde6011 100644
--- a/scripts/aabook_list.py
+++ b/scripts/aabook_list.py
@@ -1,6 +1,20 @@
 
 # 定义小说映射
 novel_map_new = {
+    138219: '我的将军生涯',
+    6548: '我和我哥们的女友的女友的故事',
+}
+# Novel mapping: novel ID -> title (iterated by download_map in get_aabook_list.py)
+novel_map = {
+    605: '我的支书生涯',
+    138219: '我的将军生涯',
+    6548: '我和我哥们的女友的女友的故事',
+    203144: '我的校长生涯',
+}
+
+
+novel_map_done = {
+    5479: '倚天屠龙记（成人版）',
     269: '雪域往事',
     156643: '都市偷心龙爪手',
     85227: '明星潜规则之皇',
@@ -18,13 +32,6 @@ novel_map_new = {
     61336: '妻欲：欲望迷城（H 版）',
     104929: '都市奇缘',
     239682: '叶辰风流',
-}
-# 定义小说映射
-novel_map = {
-}
-
-
-novel_map_done = {
     261481: '我本风流',
     171107: '爱与欲的升华',
     171029: '亲爱的不要离开我',
@@ -110,7 +117,6 @@ novel_map_done = {
     4701: '艰难的借种经历',
     162845: '人妻牌坊——我和人妻的故事',
     183692: '幸福家庭背后的隐私',
-    203144: '我的校长生涯',
     140605: '东北大炕',
     24344: '淫乱一家亲（超级乱伦家庭）',
     25154: '全家人互爱共乐的日子',
diff --git a/scripts/aabook_tools.py b/scripts/aabook_tools.py
new file mode 100644
index 0000000..98b914d
--- /dev/null
+++ b/scripts/aabook_tools.py
@@ -0,0 +1,61 @@
+import os
+
+def rename_files(list_file, data_dir):
+    """
+    Rename "<novel_name>.txt" files under data_dir to "<novel_id>_<novel_name>.txt".
+
+    Args:
+        list_file: path of a UTF-8 file holding one "novel_id<TAB>novel_name" entry per line
+        data_dir: directory that is walked recursively for files to rename
+    """
+
+    # Read the list file and build a dict keyed by novel_name with novel_id as the value
+    id_dict = {}
+    with open(list_file, 'r', encoding='utf-8') as f:
+        for line in f:
+            novel_id, novel_name = line.strip().split('\t')  # raises ValueError unless the line has exactly one tab
+            id_dict[novel_name] = novel_id
+
+    # Walk every file below data_dir
+    for root, dirs, files in os.walk(data_dir):
+        for file in files:
+            if file.endswith('.txt'):
+                # File name without the ".txt" extension
+                novel_name = file[:-4]
+                # Only rename files whose bare name is listed in the dict
+                if novel_name in id_dict:
+                    old_file = os.path.join(root, file)
+                    new_file = os.path.join(root, f"{id_dict[novel_name]}_{novel_name}.txt")
+                    os.rename(old_file, new_file)
+                    print(f"Renamed {old_file} to {new_file}")
+
+
+def check_and_record(data_dir, search_string, output_file):
+    """
+    Scan the .txt files under data_dir and record those whose content contains search_string.
+
+    Args:
+        data_dir: directory that is walked recursively
+        search_string: text to look for in each file
+        output_file: result file; opened in 'w' mode, gets one matching file name (without ".txt") per line
+    """
+
+    with open(output_file, 'w', encoding='utf-8') as output:
+        for root, dirs, files in os.walk(data_dir):
+            for file in files:
+                if file.endswith('.txt'):
+                    novel_name = file[:-4]  # file name without the ".txt" extension
+                    file_path = os.path.join(root, file)
+                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:  # undecodable bytes are dropped
+                        if search_string in f.read():  # the whole file is read into memory for the check
+                            output.write(novel_name + '\n')
+                            print(f"need update: {novel_name}")
+
+
+if __name__ == '__main__':
+    # Disabled call, kept for reference: rename_files("aabook_down_list.txt", "data")
+
+    data_dir = "data"  # directory scanned for .txt files
+    search_string = "2005-2024 疯情书库"  # files containing this text are reported as "need update"
+    output_file = "aabook_need_update.txt"  # receives one matching name per line
+    check_and_record(data_dir, search_string, output_file)
\ No newline at end of file
diff --git a/scripts/get_aabook.py b/scripts/bak_get_aabook.py
similarity index 92%
rename from scripts/get_aabook.py
rename to scripts/bak_get_aabook.py
index 1ef5823..47694ee 100644
--- a/scripts/get_aabook.py
+++ b/scripts/bak_get_aabook.py
@@ -116,8 +116,15 @@ def extract_content_url(soup, base_url, chapid):
     # 如果未找到匹配的 script 标签，则返回 None
     return None
 
+# Report whether fetched content is clean: False when it is polluted, True otherwise
+def check_content(content):
+    if '2005-2024 疯情书库' in content:  # this text marks a polluted response
+        return False
+        
+    return True
+
 # 解析章节内容并保存到文件中
-def download_novel(chapid, novel_name):
+def download_novel(chapid, novel_name, novel_file_str):
     base_url = 'https://aabook.xyz'
     chapter_url = f'{base_url}/read-{chapid}.html'
     
@@ -145,7 +152,7 @@ def download_novel(chapid, novel_name):
             continue
         
         # 写入标题到文件
-        with open(f'{dir_prefix}/{novel_name}.txt', 'a', encoding='utf-8') as f:
+        with open(novel_file_str, 'a', encoding='utf-8') as f:
             f.write(chapter_title + '\n\n')
         
         # 提取正文内容的请求地址
@@ -156,11 +163,15 @@ def download_novel(chapid, novel_name):
             # 获取正文内容
             content_response = get_page_content(content_url)
             if content_response:
+                if not check_content(content_response):
+                    logging.error(f'error response. {content_response}')
+                    continue
+
                 content_soup = BeautifulSoup(content_response, 'html.parser')
                 paragraphs = content_soup.find_all('p')
                 
                 # 写入每个段落内容到文件
-                with open(f'{dir_prefix}/{novel_name}.txt', 'a', encoding='utf-8') as f:
+                with open(novel_file_str, 'a', encoding='utf-8') as f:
                     for paragraph in paragraphs:
                         #cleaned_part = clean_watermarks(paragraph.get_text().strip())
                         #f.write(paragraph.get_text() + '\n\n')
@@ -204,7 +215,8 @@ def download_novel(chapid, novel_name):
 # 遍历 novel_map，下载所有小说
 for novel_id, novel_name in novel_map.items():
     logging.info(f"Starting download for {novel_name} (ID: {novel_id})")
-    if os.path.exists(f'{dir_prefix}/{novel_name}.txt'):
-        os.remove(f'{dir_prefix}/{novel_name}.txt')  # 如果存在同名文件，删除重新下载
-    download_novel(novel_id, novel_name)
-    logging.info(f"Completed download for {novel_name}.\n")
\ No newline at end of file
+    file_str = f'{dir_prefix}/{novel_id}_{novel_name}.txt'
+    if os.path.exists(file_str):
+        os.remove(file_str)  # 如果存在同名文件，删除重新下载
+    download_novel(novel_id, novel_name, file_str)
+    logging.info(f"Completed download for {novel_id}_{novel_name}.\n")
\ No newline at end of file
diff --git a/scripts/get_aabook_list.py b/scripts/get_aabook_list.py
index 1bcaeb6..b40ea92 100644
--- a/scripts/get_aabook_list.py
+++ b/scripts/get_aabook_list.py
@@ -7,6 +7,8 @@ import time
 import re
 import logging
 import config # 日志配置
+from aabook_list import novel_map
+
 
 # 日志
 config.setup_logging()
@@ -16,9 +18,11 @@ base_url = 'https://aabook.xyz'
 list_url_template = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
 curr_novel_pages = 0
 
-list_file = 'aabook_list.txt'
-details_file = 'aabook_details.txt'
-down_list_file = 'aabook_down_list.txt'
+cursor_dir = 'cursor'
+
+list_file = f'{cursor_dir}/aabook_list.txt'
+details_file = f'{cursor_dir}/aabook_details.txt'
+down_list_file = f'{cursor_dir}/aabook_down_list.txt'
 
 # User-Agent 列表
 user_agents = [
@@ -208,6 +212,13 @@ def extract_content_url(soup, base_url, chapid):
     # 如果未找到匹配的 script 标签，则返回 None
     return None
 
+# Report whether fetched content is clean: False when it is polluted, True otherwise
+def check_content(content):
+    if '2005-2024 疯情书库' in content:  # this text marks a polluted response
+        return False
+        
+    return True
+
 # 计数器
 def reset_novel_pages():
     global curr_novel_pages
@@ -223,7 +234,7 @@ def get_novel_pages():
 def download_novel(chapid, novel_name, dir_prefix='./aabook'):
     chapter_url = f'{base_url}/read-{chapid}.html'
 
-    novel_file = dir_prefix + '/' + novel_name + '.txt'
+    novel_file = f'{dir_prefix}/{chapid}_{novel_name}.txt'
     if os.path.exists(novel_file):
         os.remove(novel_file)  # 如果存在同名文件，删除重新下载
 
@@ -250,11 +261,7 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
             logging.error(f"Chapter title not found in {chapter_url}, retry...")
             time.sleep(2)
             continue
-        
-        # 写入标题到文件
-        with open(novel_file, 'a', encoding='utf-8') as f:
-            f.write(chapter_title + '\n\n')
-        
+                
         # 提取正文内容的请求地址
         content_url = extract_content_url(soup, base_url, chapid)
         if content_url:
@@ -263,9 +270,17 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
             # 获取正文内容
             content_response = get_page_content(content_url)
             if content_response:
+                if not check_content(content_response):
+                    logging.error(f'error response. dirty page [{novel_name}] {content_url}, retry...')
+                    continue
+
                 content_soup = BeautifulSoup(content_response, 'html.parser')
                 paragraphs = content_soup.find_all('p')
                 
+                # 写入标题到文件
+                with open(novel_file, 'a', encoding='utf-8') as f:
+                    f.write(chapter_title + '\n\n')
+
                 # 写入每个段落内容到文件
                 with open(novel_file, 'a', encoding='utf-8') as f:
                     for paragraph in paragraphs:
@@ -319,6 +334,13 @@ def create_directory_if_not_exists(category_name):
 
 # 下载小说，检查是否已经下载过
 def download_books():    
+    if not os.path.isfile(details_file):
+        logging.error(f'input file {details_file} not exist!')
+        return
+    
+    if not os.path.isfile(down_list_file):
+        logging.info(f'input file {down_list_file} not exist, use empty dict instead.')
+    
     # 读取 aabook_down_list.txt 中已下载书籍的起始页数字编号和书名
     downloaded_books = {}
     if os.path.exists(down_list_file):
@@ -361,10 +383,18 @@ def download_books():
                 down_list.write(f"{novel_id}\t{book_name}\n")
                 logging.info(f"Downloaded and recorded: ({book_name}) (ID: {novel_id}) total pages: {novel_pages} time cost: {elapsed_time} s")
 
+# Download the explicitly listed novels
+def download_map():
+    # Iterate over novel_map (imported from aabook_list) and download every novel into ./aabook
+    for novel_id, novel_name in novel_map.items():
+        logging.info(f"Starting download for {novel_name} (ID: {novel_id})")
+        download_novel(novel_id, novel_name, './aabook')
+        logging.info(f"Completed download for {novel_id}_{novel_name}.\n")
+
 def main():
     if len(sys.argv) != 2:
         print("Usage: python script.py <cmd>")
-        print("cmd: get_list, get_detail, get_all, download")
+        print("cmd: get_list, get_detail, get_all, download, download_map")
         sys.exit(1)
 
     cmd = sys.argv[1]
@@ -378,6 +408,8 @@ def main():
         get_detail()
     elif cmd == "download":
         download_books()  # 下载书籍功能
+    elif cmd == "download_map":
+        download_map()  # 下载书籍功能
     else:
         print(f"Unknown command: {cmd}")