From 2c3b1b7cdfd9a8ca28f600567f1cadf2e1dca7e3 Mon Sep 17 00:00:00 2001 From: oscar Date: Mon, 4 Nov 2024 10:56:43 +0800 Subject: [PATCH] modify aabook files. --- scripts/aabook_list.py | 22 ++++--- scripts/aabook_tools.py | 61 ++++++++++++++++++++ scripts/{get_aabook.py => bak_get_aabook.py} | 26 ++++++--- scripts/get_aabook_list.py | 52 +++++++++++++---- 4 files changed, 136 insertions(+), 25 deletions(-) create mode 100644 scripts/aabook_tools.py rename scripts/{get_aabook.py => bak_get_aabook.py} (92%) diff --git a/scripts/aabook_list.py b/scripts/aabook_list.py index 419e445..bde6011 100644 --- a/scripts/aabook_list.py +++ b/scripts/aabook_list.py @@ -1,6 +1,20 @@ # 定义小说映射 novel_map_new = { + 138219: '我的将军生涯', + 6548: '我和我哥们的女友的女友的故事', +} +# 定义小说映射 +novel_map = { + 605: '我的支书生涯', + 138219: '我的将军生涯', + 6548: '我和我哥们的女友的女友的故事', + 203144: '我的校长生涯', +} + + +novel_map_done = { + 5479: '倚天屠龙记(成人版)', 269: '雪域往事', 156643: '都市偷心龙爪手', 85227: '明星潜规则之皇', @@ -18,13 +32,6 @@ novel_map_new = { 61336: '妻欲:欲望迷城(H 版)', 104929: '都市奇缘', 239682: '叶辰风流', -} -# 定义小说映射 -novel_map = { -} - - -novel_map_done = { 261481: '我本风流', 171107: '爱与欲的升华', 171029: '亲爱的不要离开我', @@ -110,7 +117,6 @@ novel_map_done = { 4701: '艰难的借种经历', 162845: '人妻牌坊——我和人妻的故事', 183692: '幸福家庭背后的隐私', - 203144: '我的校长生涯', 140605: '东北大炕', 24344: '淫乱一家亲(超级乱伦家庭)', 25154: '全家人互爱共乐的日子', diff --git a/scripts/aabook_tools.py b/scripts/aabook_tools.py new file mode 100644 index 0000000..98b914d --- /dev/null +++ b/scripts/aabook_tools.py @@ -0,0 +1,61 @@ +import os + +def rename_files(list_file, data_dir): + """ + 重命名文件 + + Args: + list_file: 存放 novel_id 和 novel_name 的文件路径 + data_dir: 需要重命名文件的目录 + """ + + # 读取列表文件,构建一个字典,key为novel_name,value为novel_id + id_dict = {} + with open(list_file, 'r', encoding='utf-8') as f: + for line in f: + novel_id, novel_name = line.strip().split('\t') + id_dict[novel_name] = novel_id + + # 遍历 data 目录下的所有文件 + for root, dirs, files in os.walk(data_dir): + for file in files: + if file.endswith('.txt'): + # 获取文件名(不含扩展名) + novel_name = file[:-4] + # 判断文件名是否在字典中 + if novel_name in id_dict: + old_file = os.path.join(root, file) + new_file = os.path.join(root, f"{id_dict[novel_name]}_{novel_name}.txt") + os.rename(old_file, new_file) + print(f"Renamed {old_file} to {new_file}") + + +def check_and_record(data_dir, search_string, output_file): + """ + 检查文件内容并记录 + + Args: + data_dir: 需要检查的目录 + search_string: 需要搜索的字符串 + output_file: 记录结果的文件 + """ + + with open(output_file, 'w', encoding='utf-8') as output: + for root, dirs, files in os.walk(data_dir): + for file in files: + if file.endswith('.txt'): + novel_name = file[:-4] + file_path = os.path.join(root, file) + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + if search_string in f.read(): + output.write(novel_name + '\n') + print(f"need update: {novel_name}") + + +if __name__ == '__main__': + # rename_files("aabook_down_list.txt", "data") + + data_dir = "data" + search_string = "2005-2024 疯情书库" + output_file = "aabook_need_update.txt" + check_and_record(data_dir, search_string, output_file) \ No newline at end of file diff --git a/scripts/get_aabook.py b/scripts/bak_get_aabook.py similarity index 92% rename from scripts/get_aabook.py rename to scripts/bak_get_aabook.py index 1ef5823..47694ee 100644 --- a/scripts/get_aabook.py +++ b/scripts/bak_get_aabook.py @@ -116,8 +116,15 @@ def extract_content_url(soup, base_url, chapid): # 如果未找到匹配的 script 标签,则返回 None return None +# 判断内容是否被污染 +def check_content(content): + if '2005-2024 疯情书库' in content: + return False + + return True + # 解析章节内容并保存到文件中 -def download_novel(chapid, novel_name): +def download_novel(chapid, novel_name, novel_file_str): base_url = 'https://aabook.xyz' chapter_url = f'{base_url}/read-{chapid}.html' @@ -145,7 +152,7 @@ def download_novel(chapid, novel_name): continue # 写入标题到文件 - with open(f'{dir_prefix}/{novel_name}.txt', 'a', encoding='utf-8') as f: + with open(novel_file_str, 'a', encoding='utf-8') as f: f.write(chapter_title + '\n\n') # 提取正文内容的请求地址 @@ -156,11 +163,15 @@ def download_novel(chapid, novel_name): # 获取正文内容 content_response = get_page_content(content_url) if content_response: + if not check_content(content_response): + logging.error(f'error response. {content_response}') + continue + content_soup = BeautifulSoup(content_response, 'html.parser') paragraphs = content_soup.find_all('p') # 写入每个段落内容到文件 - with open(f'{dir_prefix}/{novel_name}.txt', 'a', encoding='utf-8') as f: + with open(novel_file_str, 'a', encoding='utf-8') as f: for paragraph in paragraphs: #cleaned_part = clean_watermarks(paragraph.get_text().strip()) #f.write(paragraph.get_text() + '\n\n') @@ -204,7 +215,8 @@ def download_novel(chapid, novel_name): # 遍历 novel_map,下载所有小说 for novel_id, novel_name in novel_map.items(): logging.info(f"Starting download for {novel_name} (ID: {novel_id})") - if os.path.exists(f'{dir_prefix}/{novel_name}.txt'): - os.remove(f'{dir_prefix}/{novel_name}.txt') # 如果存在同名文件,删除重新下载 - download_novel(novel_id, novel_name) - logging.info(f"Completed download for {novel_name}.\n") \ No newline at end of file + file_str = f'{dir_prefix}/{novel_id}_{novel_name}.txt' + if os.path.exists(file_str): + os.remove(file_str) # 如果存在同名文件,删除重新下载 + download_novel(novel_id, novel_name, file_str) + logging.info(f"Completed download for {novel_id}_{novel_name}.\n") \ No newline at end of file diff --git a/scripts/get_aabook_list.py b/scripts/get_aabook_list.py index 1bcaeb6..b40ea92 100644 --- a/scripts/get_aabook_list.py +++ b/scripts/get_aabook_list.py @@ -7,6 +7,8 @@ import time import re import logging import config # 日志配置 +from aabook_list import novel_map + # 日志 config.setup_logging() @@ -16,9 +18,11 @@ base_url = 'https://aabook.xyz' list_url_template = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount' curr_novel_pages = 0 -list_file = 'aabook_list.txt' -details_file = 'aabook_details.txt' -down_list_file = 'aabook_down_list.txt' +cursor_dir = 'cursor' + +list_file = f'{cursor_dir}/aabook_list.txt' +details_file = f'{cursor_dir}/aabook_details.txt' +down_list_file = f'{cursor_dir}/aabook_down_list.txt' # User-Agent 列表 user_agents = [ @@ -208,6 +212,13 @@ def extract_content_url(soup, base_url, chapid): # 如果未找到匹配的 script 标签,则返回 None return None +# 判断内容是否被污染 +def check_content(content): + if '2005-2024 疯情书库' in content: + return False + + return True + # 计数器 def reset_novel_pages(): global curr_novel_pages @@ -223,7 +234,7 @@ def get_novel_pages(): def download_novel(chapid, novel_name, dir_prefix='./aabook'): chapter_url = f'{base_url}/read-{chapid}.html' - novel_file = dir_prefix + '/' + novel_name + '.txt' + novel_file = f'{dir_prefix}/{chapid}_{novel_name}.txt' if os.path.exists(novel_file): os.remove(novel_file) # 如果存在同名文件,删除重新下载 @@ -250,11 +261,7 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'): logging.error(f"Chapter title not found in {chapter_url}, retry...") time.sleep(2) continue - - # 写入标题到文件 - with open(novel_file, 'a', encoding='utf-8') as f: - f.write(chapter_title + '\n\n') - + # 提取正文内容的请求地址 content_url = extract_content_url(soup, base_url, chapid) if content_url: @@ -263,9 +270,17 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'): # 获取正文内容 content_response = get_page_content(content_url) if content_response: + if not check_content(content_response): + logging.error(f'error response. dirty page [{novel_name}] {content_url}, retry...') + continue + content_soup = BeautifulSoup(content_response, 'html.parser') paragraphs = content_soup.find_all('p') + # 写入标题到文件 + with open(novel_file, 'a', encoding='utf-8') as f: + f.write(chapter_title + '\n\n') + # 写入每个段落内容到文件 with open(novel_file, 'a', encoding='utf-8') as f: for paragraph in paragraphs: @@ -319,6 +334,13 @@ def create_directory_if_not_exists(category_name): # 下载小说,检查是否已经下载过 def download_books(): + if not os.path.isfile(details_file): + logging.error(f'input file {details_file} not exist!') + return + + if not os.path.isfile(down_list_file): + logging.info(f'input file {down_list_file} not exist, use empty dict instead.') + # 读取 aabook_down_list.txt 中已下载书籍的起始页数字编号和书名 downloaded_books = {} if os.path.exists(down_list_file): @@ -361,10 +383,18 @@ def download_books(): down_list.write(f"{novel_id}\t{book_name}\n") logging.info(f"Downloaded and recorded: ({book_name}) (ID: {novel_id}) total pages: {novel_pages} time cost: {elapsed_time} s") +# 下载指定的小说 +def download_map(): + # 遍历 novel_map,下载所有小说 + for novel_id, novel_name in novel_map.items(): + logging.info(f"Starting download for {novel_name} (ID: {novel_id})") + download_novel(novel_id, novel_name, './aabook') + logging.info(f"Completed download for {novel_id}_{novel_name}.\n") + def main(): if len(sys.argv) != 2: print("Usage: python script.py ") - print("cmd: get_list, get_detail, get_all, download") + print("cmd: get_list, get_detail, get_all, download, download_map") sys.exit(1) cmd = sys.argv[1] @@ -378,6 +408,8 @@ def main(): get_detail() elif cmd == "download": download_books() # 下载书籍功能 + elif cmd == "download_map": + download_map() # 下载书籍功能 else: print(f"Unknown command: {cmd}")