Modify aabook files: reorganize the novel maps, add scripts/aabook_tools.py, prefix downloaded files with the novel id, and add a download_map command.
@@ -1,6 +1,20 @@
 
 # Define the novel maps
 novel_map_new = {
+    138219: '我的将军生涯',
+    6548: '我和我哥们的女友的女友的故事',
+}
+
+# Define the novel maps
+novel_map = {
+    605: '我的支书生涯',
+    138219: '我的将军生涯',
+    6548: '我和我哥们的女友的女友的故事',
+    203144: '我的校长生涯',
+}
+
+
+novel_map_done = {
+    5479: '倚天屠龙记(成人版)',
     269: '雪域往事',
     156643: '都市偷心龙爪手',
     85227: '明星潜规则之皇',
@@ -18,13 +32,6 @@ novel_map_new = {
     61336: '妻欲:欲望迷城(H 版)',
     104929: '都市奇缘',
     239682: '叶辰风流',
-}
-# Define the novel maps
-novel_map = {
-}
-
-
-novel_map_done = {
     261481: '我本风流',
     171107: '爱与欲的升华',
     171029: '亲爱的不要离开我',
@@ -110,7 +117,6 @@ novel_map_done = {
     4701: '艰难的借种经历',
     162845: '人妻牌坊——我和人妻的故事',
     183692: '幸福家庭背后的隐私',
-    203144: '我的校长生涯',
     140605: '东北大炕',
     24344: '淫乱一家亲(超级乱伦家庭)',
     25154: '全家人互爱共乐的日子',
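The three dictionaries above appear to live in the aabook_list module that the crawler imports later in this commit. A minimal consumption sketch, mirroring the driver loop shown further down; the module name is taken from that import, everything else here is illustrative:

    # Sketch only: iterate the active map the same way the download loop does.
    from aabook_list import novel_map

    for novel_id, novel_name in novel_map.items():
        print(f"{novel_id}_{novel_name}.txt")  # the id-prefixed filename scheme this commit adopts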
scripts/aabook_tools.py (new file, 61 lines)
@ -0,0 +1,61 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
def rename_files(list_file, data_dir):
|
||||||
|
"""
|
||||||
|
重命名文件
|
||||||
|
|
||||||
|
Args:
|
||||||
|
list_file: 存放 novel_id 和 novel_name 的文件路径
|
||||||
|
data_dir: 需要重命名文件的目录
|
||||||
|
"""
|
||||||
|
|
||||||
|
# 读取列表文件,构建一个字典,key为novel_name,value为novel_id
|
||||||
|
id_dict = {}
|
||||||
|
with open(list_file, 'r', encoding='utf-8') as f:
|
||||||
|
for line in f:
|
||||||
|
novel_id, novel_name = line.strip().split('\t')
|
||||||
|
id_dict[novel_name] = novel_id
|
||||||
|
|
||||||
|
# 遍历 data 目录下的所有文件
|
||||||
|
for root, dirs, files in os.walk(data_dir):
|
||||||
|
for file in files:
|
||||||
|
if file.endswith('.txt'):
|
||||||
|
# 获取文件名(不含扩展名)
|
||||||
|
novel_name = file[:-4]
|
||||||
|
# 判断文件名是否在字典中
|
||||||
|
if novel_name in id_dict:
|
||||||
|
old_file = os.path.join(root, file)
|
||||||
|
new_file = os.path.join(root, f"{id_dict[novel_name]}_{novel_name}.txt")
|
||||||
|
os.rename(old_file, new_file)
|
||||||
|
print(f"Renamed {old_file} to {new_file}")
|
||||||
|
|
||||||
|
|
||||||
|
def check_and_record(data_dir, search_string, output_file):
|
||||||
|
"""
|
||||||
|
检查文件内容并记录
|
||||||
|
|
||||||
|
Args:
|
||||||
|
data_dir: 需要检查的目录
|
||||||
|
search_string: 需要搜索的字符串
|
||||||
|
output_file: 记录结果的文件
|
||||||
|
"""
|
||||||
|
|
||||||
|
with open(output_file, 'w', encoding='utf-8') as output:
|
||||||
|
for root, dirs, files in os.walk(data_dir):
|
||||||
|
for file in files:
|
||||||
|
if file.endswith('.txt'):
|
||||||
|
novel_name = file[:-4]
|
||||||
|
file_path = os.path.join(root, file)
|
||||||
|
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
||||||
|
if search_string in f.read():
|
||||||
|
output.write(novel_name + '\n')
|
||||||
|
print(f"need update: {novel_name}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
# rename_files("aabook_down_list.txt", "data")
|
||||||
|
|
||||||
|
data_dir = "data"
|
||||||
|
search_string = "2005-2024 疯情书库"
|
||||||
|
output_file = "aabook_need_update.txt"
|
||||||
|
check_and_record(data_dir, search_string, output_file)
|
||||||
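A minimal usage sketch for the two helpers above, assuming the list file contains tab-separated novel_id<TAB>novel_name lines (the format split('\t') expects); the import path is hypothetical and the arguments simply mirror the __main__ block:

    from aabook_tools import rename_files, check_and_record  # hypothetical import path

    # Rename downloaded .txt files to "<novel_id>_<novel_name>.txt" ...
    rename_files("aabook_down_list.txt", "data")
    # ... then record every file that still carries the site watermark.
    check_and_record("data", "2005-2024 疯情书库", "aabook_need_update.txt")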
@@ -116,8 +116,15 @@ def extract_content_url(soup, base_url, chapid):
     # If no matching script tag is found, return None
     return None
 
+# Check whether the content is polluted (watermarked)
+def check_content(content):
+    if '2005-2024 疯情书库' in content:
+        return False
+
+    return True
+
 # Parse the chapter content and save it to a file
-def download_novel(chapid, novel_name):
+def download_novel(chapid, novel_name, novel_file_str):
     base_url = 'https://aabook.xyz'
     chapter_url = f'{base_url}/read-{chapid}.html'
 
@@ -145,7 +152,7 @@ def download_novel(chapid, novel_name):
             continue
 
         # Write the chapter title to the file
-        with open(f'{dir_prefix}/{novel_name}.txt', 'a', encoding='utf-8') as f:
+        with open(novel_file_str, 'a', encoding='utf-8') as f:
             f.write(chapter_title + '\n\n')
 
         # Request URL for extracting the chapter body
@@ -156,11 +163,15 @@ def download_novel(chapid, novel_name):
             # Fetch the chapter body
             content_response = get_page_content(content_url)
             if content_response:
+                if not check_content(content_response):
+                    logging.error(f'error response. {content_response}')
+                    continue
+
                 content_soup = BeautifulSoup(content_response, 'html.parser')
                 paragraphs = content_soup.find_all('p')
 
                 # Write each paragraph to the file
-                with open(f'{dir_prefix}/{novel_name}.txt', 'a', encoding='utf-8') as f:
+                with open(novel_file_str, 'a', encoding='utf-8') as f:
                     for paragraph in paragraphs:
                         #cleaned_part = clean_watermarks(paragraph.get_text().strip())
                         #f.write(paragraph.get_text() + '\n\n')
@@ -204,7 +215,8 @@ def download_novel(chapid, novel_name):
 # Iterate novel_map and download every novel
 for novel_id, novel_name in novel_map.items():
     logging.info(f"Starting download for {novel_name} (ID: {novel_id})")
-    if os.path.exists(f'{dir_prefix}/{novel_name}.txt'):
-        os.remove(f'{dir_prefix}/{novel_name}.txt')  # if a file with the same name exists, delete it and re-download
-    download_novel(novel_id, novel_name)
-    logging.info(f"Completed download for {novel_name}.\n")
+    file_str = f'{dir_prefix}/{novel_id}_{novel_name}.txt'
+    if os.path.exists(file_str):
+        os.remove(file_str)  # if a file with the same name exists, delete it and re-download
+    download_novel(novel_id, novel_name, file_str)
+    logging.info(f"Completed download for {novel_id}_{novel_name}.\n")
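For reference, the new check_content() helper (added to both downloader scripts in this commit) returns False when a response still carries the site watermark and True otherwise, e.g.:

    check_content("第一章 ……正文……")             # True  -> keep the chapter
    check_content("…… 2005-2024 疯情书库 ……")    # False -> log an error and skip this fetch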
@@ -7,6 +7,8 @@ import time
 import re
 import logging
 import config  # logging configuration
+from aabook_list import novel_map
+
 
 # Logging
 config.setup_logging()
@@ -16,9 +18,11 @@ base_url = 'https://aabook.xyz'
 list_url_template = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
 curr_novel_pages = 0
 
-list_file = 'aabook_list.txt'
-details_file = 'aabook_details.txt'
-down_list_file = 'aabook_down_list.txt'
+cursor_dir = 'cursor'
+
+list_file = f'{cursor_dir}/aabook_list.txt'
+details_file = f'{cursor_dir}/aabook_details.txt'
+down_list_file = f'{cursor_dir}/aabook_down_list.txt'
 
 # User-Agent list
 user_agents = [
@@ -208,6 +212,13 @@ def extract_content_url(soup, base_url, chapid):
     # If no matching script tag is found, return None
     return None
 
+# Check whether the content is polluted (watermarked)
+def check_content(content):
+    if '2005-2024 疯情书库' in content:
+        return False
+
+    return True
+
 # Counter
 def reset_novel_pages():
     global curr_novel_pages
@@ -223,7 +234,7 @@ def get_novel_pages():
 def download_novel(chapid, novel_name, dir_prefix='./aabook'):
     chapter_url = f'{base_url}/read-{chapid}.html'
 
-    novel_file = dir_prefix + '/' + novel_name + '.txt'
+    novel_file = f'{dir_prefix}/{chapid}_{novel_name}.txt'
     if os.path.exists(novel_file):
         os.remove(novel_file)  # if a file with the same name exists, delete it and re-download
 
@@ -251,10 +262,6 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
             time.sleep(2)
             continue
 
-        # Write the chapter title to the file
-        with open(novel_file, 'a', encoding='utf-8') as f:
-            f.write(chapter_title + '\n\n')
-
         # Request URL for extracting the chapter body
         content_url = extract_content_url(soup, base_url, chapid)
         if content_url:
@@ -263,9 +270,17 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
             # Fetch the chapter body
             content_response = get_page_content(content_url)
             if content_response:
+                if not check_content(content_response):
+                    logging.error(f'error response. dirty page [{novel_name}] {content_url}, retry...')
+                    continue
+
                 content_soup = BeautifulSoup(content_response, 'html.parser')
                 paragraphs = content_soup.find_all('p')
 
+                # Write the chapter title to the file
+                with open(novel_file, 'a', encoding='utf-8') as f:
+                    f.write(chapter_title + '\n\n')
+
                 # Write each paragraph to the file
                 with open(novel_file, 'a', encoding='utf-8') as f:
                     for paragraph in paragraphs:
@@ -319,6 +334,13 @@ def create_directory_if_not_exists(category_name):
 
 # Download novels, skipping those already downloaded
 def download_books():
+    if not os.path.isfile(details_file):
+        logging.error(f'input file {details_file} not exist!')
+        return
+
+    if not os.path.isfile(down_list_file):
+        logging.info(f'input file {down_list_file} not exist, use empty dict instead.')
+
     # Read the starting page id and title of already-downloaded books from aabook_down_list.txt
     downloaded_books = {}
     if os.path.exists(down_list_file):
@@ -361,10 +383,18 @@ def download_books():
                 down_list.write(f"{novel_id}\t{book_name}\n")
                 logging.info(f"Downloaded and recorded: ({book_name}) (ID: {novel_id}) total pages: {novel_pages} time cost: {elapsed_time} s")
 
+# Download the novels specified in novel_map
+def download_map():
+    # Iterate novel_map and download every novel
+    for novel_id, novel_name in novel_map.items():
+        logging.info(f"Starting download for {novel_name} (ID: {novel_id})")
+        download_novel(novel_id, novel_name, './aabook')
+        logging.info(f"Completed download for {novel_id}_{novel_name}.\n")
+
 def main():
     if len(sys.argv) != 2:
         print("Usage: python script.py <cmd>")
-        print("cmd: get_list, get_detail, get_all, download")
+        print("cmd: get_list, get_detail, get_all, download, download_map")
         sys.exit(1)
 
     cmd = sys.argv[1]
@@ -378,6 +408,8 @@ def main():
         get_detail()
     elif cmd == "download":
         download_books()  # download books
+    elif cmd == "download_map":
+        download_map()  # download the novels listed in novel_map
     else:
         print(f"Unknown command: {cmd}")
 
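With the new branch in main(), the map-driven download is started by passing download_map as the command (the script name below is the placeholder from the usage string; the actual filename is not shown in this view):

    python script.py download_map

download_map() walks novel_map and writes each title to ./aabook/<novel_id>_<novel_name>.txt, while the existing download command keeps working from the crawl outputs now kept under cursor/.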