modify aabook files: move state files into cursor/, detect polluted pages, write titles only after the content check, and add a download_map command.

2024-11-04 10:56:43 +08:00
parent fcf6f8a945
commit 2c3b1b7cdf
4 changed files with 136 additions and 25 deletions


@@ -7,6 +7,8 @@ import time
import re
import logging
import config # logging configuration
from aabook_list import novel_map
# set up logging
config.setup_logging()
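The config module itself is not part of this diff; a minimal sketch of what its setup_logging() could look like, assuming a plain console-plus-file setup (the handler choices, log filename, and format string are assumptions):

import logging

def setup_logging():
    # route INFO-and-above records to both the console and a UTF-8 log file
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(message)s',
        handlers=[
            logging.StreamHandler(),
            logging.FileHandler('aabook.log', encoding='utf-8'),
        ],
    )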
@@ -16,9 +18,11 @@ base_url = 'https://aabook.xyz'
list_url_template = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
curr_novel_pages = 0
list_file = 'aabook_list.txt'
details_file = 'aabook_details.txt'
down_list_file = 'aabook_down_list.txt'
cursor_dir = 'cursor'
list_file = f'{cursor_dir}/aabook_list.txt'
details_file = f'{cursor_dir}/aabook_details.txt'
down_list_file = f'{cursor_dir}/aabook_down_list.txt'
# User-Agent list
user_agents = [
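Because the three state files now live under cursor/, that directory must exist before the first write; a one-line guard like the following would cover it (this call is an assumption, not part of the diff):

import os

os.makedirs(cursor_dir, exist_ok=True)  # create cursor/ if missing; no-op otherwise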
@@ -208,6 +212,13 @@ def extract_content_url(soup, base_url, chapid):
# return None if no matching script tag is found
return None
# check whether the content is polluted
def check_content(content):
if '2005-2024 疯情书库' in content:
return False
return True
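check_content() rejects any response that still carries the site's footer banner, i.e. an anti-scrape placeholder page rather than chapter text. If more such markers turn up, the same check generalizes to a tuple of bad substrings; a small sketch (any extra marker strings would be hypothetical):

DIRTY_MARKERS = ('2005-2024 疯情书库',)  # known banner strings from polluted pages

def check_content(content):
    # reject the page if any known pollution marker appears in the body
    return not any(marker in content for marker in DIRTY_MARKERS)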
# page counter
def reset_novel_pages():
global curr_novel_pages
@@ -223,7 +234,7 @@ def get_novel_pages():
def download_novel(chapid, novel_name, dir_prefix='./aabook'):
chapter_url = f'{base_url}/read-{chapid}.html'
novel_file = dir_prefix + '/' + novel_name + '.txt'
novel_file = f'{dir_prefix}/{chapid}_{novel_name}.txt'
if os.path.exists(novel_file):
os.remove(novel_file) # if a file with the same name exists, delete it and re-download
@@ -250,11 +261,7 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
logging.error(f"Chapter title not found in {chapter_url}, retry...")
time.sleep(2)
continue
# write the title to the file
with open(novel_file, 'a', encoding='utf-8') as f:
f.write(chapter_title + '\n\n')
# extract the request URL for the chapter body
content_url = extract_content_url(soup, base_url, chapid)
if content_url:
@@ -263,9 +270,17 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
# fetch the chapter body
content_response = get_page_content(content_url)
if content_response:
if not check_content(content_response):
logging.error(f'Polluted response for [{novel_name}] {content_url}, retrying...')
continue
content_soup = BeautifulSoup(content_response, 'html.parser')
paragraphs = content_soup.find_all('p')
# write the title to the file
with open(novel_file, 'a', encoding='utf-8') as f:
f.write(chapter_title + '\n\n')
# write each paragraph to the file
with open(novel_file, 'a', encoding='utf-8') as f:
for paragraph in paragraphs:
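Moving the title write below the pollution check is the point of this hunk: a retried chapter no longer leaves an orphaned title in the output file. The same guarantee can be made explicit by buffering the whole chapter and appending it in a single write; a sketch under that assumption (write_chapter is a hypothetical helper, not part of this diff):

def write_chapter(novel_file, chapter_title, paragraphs):
    # buffer the chapter in memory, then append it in one write,
    # so a failed or retried fetch never leaves a partial chapter behind
    lines = [chapter_title, '']
    lines.extend(p.get_text() for p in paragraphs)
    with open(novel_file, 'a', encoding='utf-8') as f:
        f.write('\n'.join(lines) + '\n\n')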
@@ -319,6 +334,13 @@ def create_directory_if_not_exists(category_name):
# download novels, checking whether each has already been downloaded
def download_books():
if not os.path.isfile(details_file):
logging.error(f'input file {details_file} does not exist!')
return
if not os.path.isfile(down_list_file):
logging.info(f'input file {down_list_file} does not exist; using an empty dict instead.')
# read the starting-page numeric IDs and titles of already-downloaded books from aabook_down_list.txt
downloaded_books = {}
if os.path.exists(down_list_file):
@@ -361,10 +383,18 @@ def download_books():
down_list.write(f"{novel_id}\t{book_name}\n")
logging.info(f"Downloaded and recorded: ({book_name}) (ID: {novel_id}) total pages: {novel_pages} time cost: {elapsed_time} s")
# download the specified novels
def download_map():
# iterate over novel_map and download every novel
for novel_id, novel_name in novel_map.items():
logging.info(f"Starting download for {novel_name} (ID: {novel_id})")
download_novel(novel_id, novel_name, './aabook')
logging.info(f"Completed download for {novel_id}_{novel_name}.\n")
def main():
if len(sys.argv) != 2:
print("Usage: python script.py <cmd>")
print("cmd: get_list, get_detail, get_all, download")
print("cmd: get_list, get_detail, get_all, download, download_map")
sys.exit(1)
cmd = sys.argv[1]
@@ -378,6 +408,8 @@ def main():
get_detail()
elif cmd == "download":
download_books() # download books
elif cmd == "download_map":
download_map() # download the novels listed in novel_map
else:
print(f"Unknown command: {cmd}")