modify aabook files.
This commit is contained in:
@ -7,6 +7,8 @@ import time
|
||||
import re
|
||||
import logging
|
||||
import config # 日志配置
|
||||
from aabook_list import novel_map
|
||||
|
||||
|
||||
# 日志
|
||||
config.setup_logging()
|
||||
@ -16,9 +18,11 @@ base_url = 'https://aabook.xyz'
|
||||
list_url_template = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
|
||||
curr_novel_pages = 0
|
||||
|
||||
list_file = 'aabook_list.txt'
|
||||
details_file = 'aabook_details.txt'
|
||||
down_list_file = 'aabook_down_list.txt'
|
||||
cursor_dir = 'cursor'
|
||||
|
||||
list_file = f'{cursor_dir}/aabook_list.txt'
|
||||
details_file = f'{cursor_dir}/aabook_details.txt'
|
||||
down_list_file = f'{cursor_dir}/aabook_down_list.txt'
|
||||
|
||||
# User-Agent 列表
|
||||
user_agents = [
|
||||
@ -208,6 +212,13 @@ def extract_content_url(soup, base_url, chapid):
|
||||
# 如果未找到匹配的 script 标签,则返回 None
|
||||
return None
|
||||
|
||||
# 判断内容是否被污染
|
||||
# Marker text whose presence means the response is a "dirty" page rather than
# chapter content. The end year is matched loosely so the check keeps working
# when the site bumps it (the original literal was '2005-2024 疯情书库').
_DIRTY_PAGE_MARKER = re.compile(r'2005-\d{4} 疯情书库')


def check_content(content):
    """Tell whether a fetched chapter body is usable.

    Args:
        content: Response text of the chapter-content request.

    Returns:
        False if the text contains the site marker (start year 2005, any
        four-digit end year, followed by the site name), which the caller
        treats as a dirty page and retries; True otherwise.
    """
    return _DIRTY_PAGE_MARKER.search(content) is None
|
||||
|
||||
# 计数器
|
||||
def reset_novel_pages():
|
||||
global curr_novel_pages
|
||||
@ -223,7 +234,7 @@ def get_novel_pages():
|
||||
def download_novel(chapid, novel_name, dir_prefix='./aabook'):
|
||||
chapter_url = f'{base_url}/read-{chapid}.html'
|
||||
|
||||
novel_file = dir_prefix + '/' + novel_name + '.txt'
|
||||
novel_file = f'{dir_prefix}/{chapid}_{novel_name}.txt'
|
||||
if os.path.exists(novel_file):
|
||||
os.remove(novel_file) # 如果存在同名文件,删除重新下载
|
||||
|
||||
@ -250,11 +261,7 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
|
||||
logging.error(f"Chapter title not found in {chapter_url}, retry...")
|
||||
time.sleep(2)
|
||||
continue
|
||||
|
||||
# 写入标题到文件
|
||||
with open(novel_file, 'a', encoding='utf-8') as f:
|
||||
f.write(chapter_title + '\n\n')
|
||||
|
||||
|
||||
# 提取正文内容的请求地址
|
||||
content_url = extract_content_url(soup, base_url, chapid)
|
||||
if content_url:
|
||||
@ -263,9 +270,17 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
|
||||
# 获取正文内容
|
||||
content_response = get_page_content(content_url)
|
||||
if content_response:
|
||||
if not check_content(content_response):
|
||||
logging.error(f'error response. dirty page [{novel_name}] {content_url}, retry...')
|
||||
continue
|
||||
|
||||
content_soup = BeautifulSoup(content_response, 'html.parser')
|
||||
paragraphs = content_soup.find_all('p')
|
||||
|
||||
# 写入标题到文件
|
||||
with open(novel_file, 'a', encoding='utf-8') as f:
|
||||
f.write(chapter_title + '\n\n')
|
||||
|
||||
# 写入每个段落内容到文件
|
||||
with open(novel_file, 'a', encoding='utf-8') as f:
|
||||
for paragraph in paragraphs:
|
||||
@ -319,6 +334,13 @@ def create_directory_if_not_exists(category_name):
|
||||
|
||||
# 下载小说,检查是否已经下载过
|
||||
def download_books():
|
||||
if not os.path.isfile(details_file):
|
||||
logging.error(f'input file {details_file} not exist!')
|
||||
return
|
||||
|
||||
if not os.path.isfile(down_list_file):
|
||||
logging.info(f'input file {down_list_file} not exist, use empty dict instead.')
|
||||
|
||||
# 读取 aabook_down_list.txt 中已下载书籍的起始页数字编号和书名
|
||||
downloaded_books = {}
|
||||
if os.path.exists(down_list_file):
|
||||
@ -361,10 +383,18 @@ def download_books():
|
||||
down_list.write(f"{novel_id}\t{book_name}\n")
|
||||
logging.info(f"Downloaded and recorded: ({book_name}) (ID: {novel_id}) total pages: {novel_pages} time cost: {elapsed_time} s")
|
||||
|
||||
# 下载指定的小说
|
||||
def download_map():
    """Download every novel listed in ``novel_map``.

    ``novel_map`` maps a novel's starting chapter id to its name. Each entry
    is passed to ``download_novel`` with ``./aabook`` as the output directory,
    and a log line is written before and after each download.
    """
    output_dir = './aabook'

    # NOTE(review): download_novel appears to open files under dir_prefix
    # without creating the directory; make sure it exists so a fresh checkout
    # does not fail on the first write. Harmless if it already exists.
    os.makedirs(output_dir, exist_ok=True)

    for novel_id, novel_name in novel_map.items():
        logging.info(f"Starting download for {novel_name} (ID: {novel_id})")
        download_novel(novel_id, novel_name, output_dir)
        logging.info(f"Completed download for {novel_id}_{novel_name}.\n")
|
||||
|
||||
def main():
|
||||
if len(sys.argv) != 2:
|
||||
print("Usage: python script.py <cmd>")
|
||||
print("cmd: get_list, get_detail, get_all, download")
|
||||
print("cmd: get_list, get_detail, get_all, download, download_map")
|
||||
sys.exit(1)
|
||||
|
||||
cmd = sys.argv[1]
|
||||
@ -378,6 +408,8 @@ def main():
|
||||
get_detail()
|
||||
elif cmd == "download":
|
||||
download_books() # 下载书籍功能
|
||||
elif cmd == "download_map":
|
||||
download_map() # 下载书籍功能
|
||||
else:
|
||||
print(f"Unknown command: {cmd}")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user