Modify aabook files.

commit 2c3b1b7cdf (parent fcf6f8a945)
2024-11-04 10:56:43 +08:00
4 changed files with 136 additions and 25 deletions

View File

@@ -1,6 +1,20 @@
 # Define the novel map
 novel_map_new = {
+    138219: '我的将军生涯',
+    6548: '我和我哥们的女友的女友的故事',
+}
+# Define the novel map
+novel_map = {
+    605: '我的支书生涯',
+    138219: '我的将军生涯',
+    6548: '我和我哥们的女友的女友的故事',
+    203144: '我的校长生涯',
+}
+novel_map_done = {
+    5479: '倚天屠龙记(成人版)',
     269: '雪域往事',
     156643: '都市偷心龙爪手',
     85227: '明星潜规则之皇',
@@ -18,13 +32,6 @@ novel_map_new = {
     61336: '妻欲(欲望迷城H 版)',
     104929: '都市奇缘',
     239682: '叶辰风流',
-}
-# Define the novel map
-novel_map = {
-}
-novel_map_done = {
     261481: '我本风流',
     171107: '爱与欲的升华',
     171029: '亲爱的不要离开我',
@@ -110,7 +117,6 @@ novel_map_done = {
     4701: '艰难的借种经历',
     162845: '人妻牌坊——我和人妻的故事',
     183692: '幸福家庭背后的隐私',
-    203144: '我的校长生涯',
     140605: '东北大炕',
     24344: '淫乱一家亲(超级乱伦家庭)',
     25154: '全家人互爱共乐的日子',

scripts/aabook_tools.py (new file, 61 lines)
View File

@@ -0,0 +1,61 @@
+import os
+
+
+def rename_files(list_file, data_dir):
+    """
+    Rename files.
+
+    Args:
+        list_file: path of the file holding novel_id and novel_name pairs
+        data_dir: directory whose files should be renamed
+    """
+    # Read the list file and build a dict: key is novel_name, value is novel_id
+    id_dict = {}
+    with open(list_file, 'r', encoding='utf-8') as f:
+        for line in f:
+            novel_id, novel_name = line.strip().split('\t')
+            id_dict[novel_name] = novel_id
+
+    # Walk all files under the data directory
+    for root, dirs, files in os.walk(data_dir):
+        for file in files:
+            if file.endswith('.txt'):
+                # Get the file name without the extension
+                novel_name = file[:-4]
+                # Check whether the name is in the dict
+                if novel_name in id_dict:
+                    old_file = os.path.join(root, file)
+                    new_file = os.path.join(root, f"{id_dict[novel_name]}_{novel_name}.txt")
+                    os.rename(old_file, new_file)
+                    print(f"Renamed {old_file} to {new_file}")
+
+
+def check_and_record(data_dir, search_string, output_file):
+    """
+    Check file contents and record matches.
+
+    Args:
+        data_dir: directory to check
+        search_string: string to search for
+        output_file: file that records the results
+    """
+    with open(output_file, 'w', encoding='utf-8') as output:
+        for root, dirs, files in os.walk(data_dir):
+            for file in files:
+                if file.endswith('.txt'):
+                    novel_name = file[:-4]
+                    file_path = os.path.join(root, file)
+                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                        if search_string in f.read():
+                            output.write(novel_name + '\n')
+                            print(f"need update: {novel_name}")
+
+
+if __name__ == '__main__':
+    # rename_files("aabook_down_list.txt", "data")
+    data_dir = "data"
+    search_string = "2005-2024 疯情书库"
+    output_file = "aabook_need_update.txt"
+    check_and_record(data_dir, search_string, output_file)
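For reference, `rename_files` expects each line of the list file to be a tab-separated `novel_id` and `novel_name` pair, and prefixes matching `.txt` files with the id. A minimal sketch of the assumed input and call, mirroring the commented-out line in `__main__` (ids and titles taken from novel_map above):

    # aabook_down_list.txt, one tab-separated pair per line, e.g.:
    #   138219\t我的将军生涯
    #   6548\t我和我哥们的女友的女友的故事
    rename_files("aabook_down_list.txt", "data")
    # e.g. data/我的将军生涯.txt -> data/138219_我的将军生涯.txt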

View File

@@ -116,8 +116,15 @@ def extract_content_url(soup, base_url, chapid):
     # If no matching script tag is found, return None
     return None

+# Check whether the content is polluted (watermarked)
+def check_content(content):
+    if '2005-2024 疯情书库' in content:
+        return False
+    return True
+
 # Parse chapter content and save it to a file
-def download_novel(chapid, novel_name):
+def download_novel(chapid, novel_name, novel_file_str):
     base_url = 'https://aabook.xyz'
     chapter_url = f'{base_url}/read-{chapid}.html'
@@ -145,7 +152,7 @@ def download_novel(chapid, novel_name):
             continue

         # Write the chapter title to the file
-        with open(f'{dir_prefix}/{novel_name}.txt', 'a', encoding='utf-8') as f:
+        with open(novel_file_str, 'a', encoding='utf-8') as f:
             f.write(chapter_title + '\n\n')

         # Extract the request URL for the body content
@@ -156,11 +163,15 @@ def download_novel(chapid, novel_name):
         # Fetch the body content
         content_response = get_page_content(content_url)
         if content_response:
+            if not check_content(content_response):
+                logging.error(f'error response. {content_response}')
+                continue
+
             content_soup = BeautifulSoup(content_response, 'html.parser')
             paragraphs = content_soup.find_all('p')

             # Write each paragraph to the file
-            with open(f'{dir_prefix}/{novel_name}.txt', 'a', encoding='utf-8') as f:
+            with open(novel_file_str, 'a', encoding='utf-8') as f:
                 for paragraph in paragraphs:
                     #cleaned_part = clean_watermarks(paragraph.get_text().strip())
                     #f.write(paragraph.get_text() + '\n\n')
@@ -204,7 +215,8 @@ def download_novel(chapid, novel_name):
 # Iterate over novel_map and download every novel
 for novel_id, novel_name in novel_map.items():
     logging.info(f"Starting download for {novel_name} (ID: {novel_id})")
-    if os.path.exists(f'{dir_prefix}/{novel_name}.txt'):
-        os.remove(f'{dir_prefix}/{novel_name}.txt')  # if a file with the same name exists, delete it and re-download
-    download_novel(novel_id, novel_name)
-    logging.info(f"Completed download for {novel_name}.\n")
+    file_str = f'{dir_prefix}/{novel_id}_{novel_name}.txt'
+    if os.path.exists(file_str):
+        os.remove(file_str)  # if a file with the same name exists, delete it and re-download
+    download_novel(novel_id, novel_name, file_str)
+    logging.info(f"Completed download for {novel_id}_{novel_name}.\n")

View File

@@ -7,6 +7,8 @@ import time
 import re
 import logging
 import config  # logging configuration
+
+from aabook_list import novel_map

 # logging
 config.setup_logging()
@@ -16,9 +18,11 @@ base_url = 'https://aabook.xyz'
 list_url_template = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'

 curr_novel_pages = 0
-list_file = 'aabook_list.txt'
-details_file = 'aabook_details.txt'
-down_list_file = 'aabook_down_list.txt'
+cursor_dir = 'cursor'
+
+list_file = f'{cursor_dir}/aabook_list.txt'
+details_file = f'{cursor_dir}/aabook_details.txt'
+down_list_file = f'{cursor_dir}/aabook_down_list.txt'

 # User-Agent list
 user_agents = [
@@ -208,6 +212,13 @@ def extract_content_url(soup, base_url, chapid):
     # If no matching script tag is found, return None
     return None

+# Check whether the content is polluted (watermarked)
+def check_content(content):
+    if '2005-2024 疯情书库' in content:
+        return False
+    return True
+
+
 # Counter
 def reset_novel_pages():
     global curr_novel_pages
@@ -223,7 +234,7 @@ def get_novel_pages():
 def download_novel(chapid, novel_name, dir_prefix='./aabook'):
     chapter_url = f'{base_url}/read-{chapid}.html'
-    novel_file = dir_prefix + '/' + novel_name + '.txt'
+    novel_file = f'{dir_prefix}/{chapid}_{novel_name}.txt'
     if os.path.exists(novel_file):
         os.remove(novel_file)  # if a file with the same name exists, delete it and re-download
@@ -251,10 +262,6 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
             time.sleep(2)
             continue

-        # Write the chapter title to the file
-        with open(novel_file, 'a', encoding='utf-8') as f:
-            f.write(chapter_title + '\n\n')
-
         # Extract the request URL for the body content
         content_url = extract_content_url(soup, base_url, chapid)
         if content_url:
@@ -263,9 +270,17 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
         # Fetch the body content
         content_response = get_page_content(content_url)
         if content_response:
+            if not check_content(content_response):
+                logging.error(f'error response. dirty page [{novel_name}] {content_url}, retry...')
+                continue
+
             content_soup = BeautifulSoup(content_response, 'html.parser')
             paragraphs = content_soup.find_all('p')

+            # Write the chapter title to the file
+            with open(novel_file, 'a', encoding='utf-8') as f:
+                f.write(chapter_title + '\n\n')
+
             # Write each paragraph to the file
             with open(novel_file, 'a', encoding='utf-8') as f:
                 for paragraph in paragraphs:
@@ -319,6 +334,13 @@ def create_directory_if_not_exists(category_name):
 # Download novels, checking whether they have already been downloaded
 def download_books():
+    if not os.path.isfile(details_file):
+        logging.error(f'input file {details_file} not exist!')
+        return
+
+    if not os.path.isfile(down_list_file):
+        logging.info(f'input file {down_list_file} not exist, use empty dict instead.')
+
     # Read the starting-page ids and titles of already-downloaded books from aabook_down_list.txt
     downloaded_books = {}
     if os.path.exists(down_list_file):
@@ -361,10 +383,18 @@ def download_books():
                 down_list.write(f"{novel_id}\t{book_name}\n")
                 logging.info(f"Downloaded and recorded: ({book_name}) (ID: {novel_id}) total pages: {novel_pages} time cost: {elapsed_time} s")

+# Download the specified novels
+def download_map():
+    # Iterate over novel_map and download every novel
+    for novel_id, novel_name in novel_map.items():
+        logging.info(f"Starting download for {novel_name} (ID: {novel_id})")
+        download_novel(novel_id, novel_name, './aabook')
+        logging.info(f"Completed download for {novel_id}_{novel_name}.\n")
+
+
 def main():
     if len(sys.argv) != 2:
         print("Usage: python script.py <cmd>")
-        print("cmd: get_list, get_detail, get_all, download")
+        print("cmd: get_list, get_detail, get_all, download, download_map")
         sys.exit(1)

     cmd = sys.argv[1]
@@ -378,6 +408,8 @@ def main():
         get_detail()
     elif cmd == "download":
         download_books()  # book download feature
+    elif cmd == "download_map":
+        download_map()  # download the books listed in novel_map
     else:
         print(f"Unknown command: {cmd}")