Modify aabook files.

commit 2c3b1b7cdf (parent fcf6f8a945)
2024-11-04 10:56:43 +08:00
4 changed files with 136 additions and 25 deletions

View File

@@ -1,6 +1,20 @@
 # Define the novel map
 novel_map_new = {
+    138219: '我的将军生涯',
+    6548: '我和我哥们的女友的女友的故事',
+}
+# Define the novel map
+novel_map = {
+    605: '我的支书生涯',
+    138219: '我的将军生涯',
+    6548: '我和我哥们的女友的女友的故事',
+    203144: '我的校长生涯',
+}
+novel_map_done = {
+    5479: '倚天屠龙记(成人版)',
     269: '雪域往事',
     156643: '都市偷心龙爪手',
     85227: '明星潜规则之皇',
@@ -18,13 +32,6 @@ novel_map_new = {
     61336: '妻欲(欲望迷城H 版)',
     104929: '都市奇缘',
     239682: '叶辰风流',
-}
-# Define the novel map
-novel_map = {
-}
-novel_map_done = {
     261481: '我本风流',
     171107: '爱与欲的升华',
     171029: '亲爱的不要离开我',
@@ -110,7 +117,6 @@ novel_map_done = {
     4701: '艰难的借种经历',
     162845: '人妻牌坊——我和人妻的故事',
     183692: '幸福家庭背后的隐私',
-    203144: '我的校长生涯',
     140605: '东北大炕',
     24344: '淫乱一家亲(超级乱伦家庭)',
     25154: '全家人互爱共乐的日子',

scripts/aabook_tools.py (new file, 61 lines)
View File

@@ -0,0 +1,61 @@
+import os
+
+
+def rename_files(list_file, data_dir):
+    """
+    Rename files.
+
+    Args:
+        list_file: path of the file holding novel_id and novel_name pairs
+        data_dir: directory whose files should be renamed
+    """
+    # Read the list file and build a dict: key is novel_name, value is novel_id
+    id_dict = {}
+    with open(list_file, 'r', encoding='utf-8') as f:
+        for line in f:
+            novel_id, novel_name = line.strip().split('\t')
+            id_dict[novel_name] = novel_id
+
+    # Walk all files under the data directory
+    for root, dirs, files in os.walk(data_dir):
+        for file in files:
+            if file.endswith('.txt'):
+                # Get the file name without the extension
+                novel_name = file[:-4]
+                # Check whether the name is in the dict
+                if novel_name in id_dict:
+                    old_file = os.path.join(root, file)
+                    new_file = os.path.join(root, f"{id_dict[novel_name]}_{novel_name}.txt")
+                    os.rename(old_file, new_file)
+                    print(f"Renamed {old_file} to {new_file}")
+
+
+def check_and_record(data_dir, search_string, output_file):
+    """
+    Check file contents and record matches.
+
+    Args:
+        data_dir: directory to check
+        search_string: string to search for
+        output_file: file that records the results
+    """
+    with open(output_file, 'w', encoding='utf-8') as output:
+        for root, dirs, files in os.walk(data_dir):
+            for file in files:
+                if file.endswith('.txt'):
+                    novel_name = file[:-4]
+                    file_path = os.path.join(root, file)
+                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                        if search_string in f.read():
+                            output.write(novel_name + '\n')
+                            print(f"need update: {novel_name}")
+
+
+if __name__ == '__main__':
+    # rename_files("aabook_down_list.txt", "data")
+    data_dir = "data"
+    search_string = "2005-2024 疯情书库"
+    output_file = "aabook_need_update.txt"
+    check_and_record(data_dir, search_string, output_file)
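For reference, `rename_files` expects each line of the list file to be a tab-separated `novel_id` and `novel_name` pair, and prefixes matching `.txt` files with the id. A minimal sketch of the assumed input and call, mirroring the commented-out line in `__main__` (ids and titles taken from novel_map above):

    # aabook_down_list.txt, one tab-separated pair per line, e.g.:
    #   138219\t我的将军生涯
    #   6548\t我和我哥们的女友的女友的故事
    rename_files("aabook_down_list.txt", "data")
    # e.g. data/我的将军生涯.txt -> data/138219_我的将军生涯.txt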

View File

@@ -116,8 +116,15 @@ def extract_content_url(soup, base_url, chapid):
     # If no matching script tag is found, return None
     return None

+# Check whether the content is polluted (watermarked)
+def check_content(content):
+    if '2005-2024 疯情书库' in content:
+        return False
+    return True
+
 # Parse chapter content and save it to a file
-def download_novel(chapid, novel_name):
+def download_novel(chapid, novel_name, novel_file_str):
     base_url = 'https://aabook.xyz'
     chapter_url = f'{base_url}/read-{chapid}.html'
@@ -145,7 +152,7 @@ def download_novel(chapid, novel_name):
             continue

         # Write the chapter title to the file
-        with open(f'{dir_prefix}/{novel_name}.txt', 'a', encoding='utf-8') as f:
+        with open(novel_file_str, 'a', encoding='utf-8') as f:
             f.write(chapter_title + '\n\n')

         # Extract the request URL for the body content
@@ -156,11 +163,15 @@ def download_novel(chapid, novel_name):
         # Fetch the body content
         content_response = get_page_content(content_url)
         if content_response:
+            if not check_content(content_response):
+                logging.error(f'error response. {content_response}')
+                continue
+
             content_soup = BeautifulSoup(content_response, 'html.parser')
             paragraphs = content_soup.find_all('p')

             # Write each paragraph to the file
-            with open(f'{dir_prefix}/{novel_name}.txt', 'a', encoding='utf-8') as f:
+            with open(novel_file_str, 'a', encoding='utf-8') as f:
                 for paragraph in paragraphs:
                     #cleaned_part = clean_watermarks(paragraph.get_text().strip())
                     #f.write(paragraph.get_text() + '\n\n')
@@ -204,7 +215,8 @@ def download_novel(chapid, novel_name):
 # Iterate over novel_map and download every novel
 for novel_id, novel_name in novel_map.items():
     logging.info(f"Starting download for {novel_name} (ID: {novel_id})")
-    if os.path.exists(f'{dir_prefix}/{novel_name}.txt'):
-        os.remove(f'{dir_prefix}/{novel_name}.txt')  # if a file with the same name exists, delete it and re-download
-    download_novel(novel_id, novel_name)
-    logging.info(f"Completed download for {novel_name}.\n")
+    file_str = f'{dir_prefix}/{novel_id}_{novel_name}.txt'
+    if os.path.exists(file_str):
+        os.remove(file_str)  # if a file with the same name exists, delete it and re-download
+    download_novel(novel_id, novel_name, file_str)
+    logging.info(f"Completed download for {novel_id}_{novel_name}.\n")

View File

@@ -7,6 +7,8 @@ import time
 import re
 import logging
 import config  # logging configuration
+
+from aabook_list import novel_map

 # logging
 config.setup_logging()
@@ -16,9 +18,11 @@ base_url = 'https://aabook.xyz'
 list_url_template = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'

 curr_novel_pages = 0
-list_file = 'aabook_list.txt'
-details_file = 'aabook_details.txt'
-down_list_file = 'aabook_down_list.txt'
+cursor_dir = 'cursor'
+
+list_file = f'{cursor_dir}/aabook_list.txt'
+details_file = f'{cursor_dir}/aabook_details.txt'
+down_list_file = f'{cursor_dir}/aabook_down_list.txt'

 # User-Agent list
 user_agents = [
@@ -208,6 +212,13 @@ def extract_content_url(soup, base_url, chapid):
     # If no matching script tag is found, return None
     return None

+# Check whether the content is polluted (watermarked)
+def check_content(content):
+    if '2005-2024 疯情书库' in content:
+        return False
+    return True
+
+
 # Counter
 def reset_novel_pages():
     global curr_novel_pages
@@ -223,7 +234,7 @@ def get_novel_pages():
 def download_novel(chapid, novel_name, dir_prefix='./aabook'):
     chapter_url = f'{base_url}/read-{chapid}.html'
-    novel_file = dir_prefix + '/' + novel_name + '.txt'
+    novel_file = f'{dir_prefix}/{chapid}_{novel_name}.txt'
     if os.path.exists(novel_file):
         os.remove(novel_file)  # if a file with the same name exists, delete it and re-download
@@ -251,10 +262,6 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
             time.sleep(2)
             continue

-        # Write the chapter title to the file
-        with open(novel_file, 'a', encoding='utf-8') as f:
-            f.write(chapter_title + '\n\n')
-
         # Extract the request URL for the body content
         content_url = extract_content_url(soup, base_url, chapid)
         if content_url:
@@ -263,9 +270,17 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
         # Fetch the body content
         content_response = get_page_content(content_url)
         if content_response:
+            if not check_content(content_response):
+                logging.error(f'error response. dirty page [{novel_name}] {content_url}, retry...')
+                continue
+
             content_soup = BeautifulSoup(content_response, 'html.parser')
             paragraphs = content_soup.find_all('p')

+            # Write the chapter title to the file
+            with open(novel_file, 'a', encoding='utf-8') as f:
+                f.write(chapter_title + '\n\n')
+
             # Write each paragraph to the file
             with open(novel_file, 'a', encoding='utf-8') as f:
                 for paragraph in paragraphs:
@@ -319,6 +334,13 @@ def create_directory_if_not_exists(category_name):
 # Download novels, checking whether they have already been downloaded
 def download_books():
+    if not os.path.isfile(details_file):
+        logging.error(f'input file {details_file} not exist!')
+        return
+
+    if not os.path.isfile(down_list_file):
+        logging.info(f'input file {down_list_file} not exist, use empty dict instead.')
+
     # Read the starting-page ids and titles of already-downloaded books from aabook_down_list.txt
     downloaded_books = {}
     if os.path.exists(down_list_file):
@@ -361,10 +383,18 @@ def download_books():
                 down_list.write(f"{novel_id}\t{book_name}\n")
                 logging.info(f"Downloaded and recorded: ({book_name}) (ID: {novel_id}) total pages: {novel_pages} time cost: {elapsed_time} s")

+# Download the specified novels
+def download_map():
+    # Iterate over novel_map and download every novel
+    for novel_id, novel_name in novel_map.items():
+        logging.info(f"Starting download for {novel_name} (ID: {novel_id})")
+        download_novel(novel_id, novel_name, './aabook')
+        logging.info(f"Completed download for {novel_id}_{novel_name}.\n")
+
+
 def main():
     if len(sys.argv) != 2:
         print("Usage: python script.py <cmd>")
-        print("cmd: get_list, get_detail, get_all, download")
+        print("cmd: get_list, get_detail, get_all, download, download_map")
         sys.exit(1)

     cmd = sys.argv[1]
@@ -378,6 +408,8 @@ def main():
         get_detail()
     elif cmd == "download":
         download_books()  # book download feature
+    elif cmd == "download_map":
+        download_map()  # download the books listed in novel_map
     else:
         print(f"Unknown command: {cmd}")