modify aabook files: move state files into cursor/, detect polluted pages, write titles only after the content check, and add a download_map command.

2024-11-04 10:56:43 +08:00
parent fcf6f8a945
commit 2c3b1b7cdf
4 changed files with 136 additions and 25 deletions


@@ -7,6 +7,8 @@ import time
import re
import logging
import config # logging configuration
from aabook_list import novel_map
# set up logging
config.setup_logging()
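The config module itself is not part of this diff; a minimal sketch of what its setup_logging() could look like, assuming a plain console-plus-file setup (the handler choices, log filename, and format string are assumptions):

import logging

def setup_logging():
    # route INFO-and-above records to both the console and a UTF-8 log file
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(message)s',
        handlers=[
            logging.StreamHandler(),
            logging.FileHandler('aabook.log', encoding='utf-8'),
        ],
    )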
@@ -16,9 +18,11 @@ base_url = 'https://aabook.xyz'
list_url_template = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
curr_novel_pages = 0
list_file = 'aabook_list.txt'
details_file = 'aabook_details.txt'
down_list_file = 'aabook_down_list.txt'
cursor_dir = 'cursor'
list_file = f'{cursor_dir}/aabook_list.txt'
details_file = f'{cursor_dir}/aabook_details.txt'
down_list_file = f'{cursor_dir}/aabook_down_list.txt'
# User-Agent list
user_agents = [
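Because the three state files now live under cursor/, that directory must exist before the first write; a one-line guard like the following would cover it (this call is an assumption, not part of the diff):

import os

os.makedirs(cursor_dir, exist_ok=True)  # create cursor/ if missing; no-op otherwise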
@@ -208,6 +212,13 @@ def extract_content_url(soup, base_url, chapid):
# return None if no matching script tag is found
return None
# check whether the content is polluted
def check_content(content):
if '2005-2024 疯情书库' in content:
return False
return True
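check_content() rejects any response that still carries the site's footer banner, i.e. an anti-scrape placeholder page rather than chapter text. If more such markers turn up, the same check generalizes to a tuple of bad substrings; a small sketch (any extra marker strings would be hypothetical):

DIRTY_MARKERS = ('2005-2024 疯情书库',)  # known banner strings from polluted pages

def check_content(content):
    # reject the page if any known pollution marker appears in the body
    return not any(marker in content for marker in DIRTY_MARKERS)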
# page counter
def reset_novel_pages():
global curr_novel_pages
@@ -223,7 +234,7 @@ def get_novel_pages():
def download_novel(chapid, novel_name, dir_prefix='./aabook'):
chapter_url = f'{base_url}/read-{chapid}.html'
novel_file = dir_prefix + '/' + novel_name + '.txt'
novel_file = f'{dir_prefix}/{chapid}_{novel_name}.txt'
if os.path.exists(novel_file):
os.remove(novel_file) # if a file with the same name exists, delete it and re-download
@@ -250,11 +261,7 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
logging.error(f"Chapter title not found in {chapter_url}, retry...")
time.sleep(2)
continue
# write the title to the file
with open(novel_file, 'a', encoding='utf-8') as f:
f.write(chapter_title + '\n\n')
# extract the request URL for the chapter body
content_url = extract_content_url(soup, base_url, chapid)
if content_url:
@@ -263,9 +270,17 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
# fetch the chapter body
content_response = get_page_content(content_url)
if content_response:
if not check_content(content_response):
logging.error(f'Polluted response for [{novel_name}] {content_url}, retrying...')
continue
content_soup = BeautifulSoup(content_response, 'html.parser')
paragraphs = content_soup.find_all('p')
# write the title to the file
with open(novel_file, 'a', encoding='utf-8') as f:
f.write(chapter_title + '\n\n')
# write each paragraph to the file
with open(novel_file, 'a', encoding='utf-8') as f:
for paragraph in paragraphs:
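Moving the title write below the pollution check is the point of this hunk: a retried chapter no longer leaves an orphaned title in the output file. The same guarantee can be made explicit by buffering the whole chapter and appending it in a single write; a sketch under that assumption (write_chapter is a hypothetical helper, not part of this diff):

def write_chapter(novel_file, chapter_title, paragraphs):
    # buffer the chapter in memory, then append it in one write,
    # so a failed or retried fetch never leaves a partial chapter behind
    lines = [chapter_title, '']
    lines.extend(p.get_text() for p in paragraphs)
    with open(novel_file, 'a', encoding='utf-8') as f:
        f.write('\n'.join(lines) + '\n\n')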
@@ -319,6 +334,13 @@ def create_directory_if_not_exists(category_name):
# download novels, checking whether each has already been downloaded
def download_books():
if not os.path.isfile(details_file):
logging.error(f'input file {details_file} does not exist!')
return
if not os.path.isfile(down_list_file):
logging.info(f'input file {down_list_file} does not exist; using an empty dict instead.')
# read the starting-page numeric IDs and titles of already-downloaded books from aabook_down_list.txt
downloaded_books = {}
if os.path.exists(down_list_file):
@@ -361,10 +383,18 @@ def download_books():
down_list.write(f"{novel_id}\t{book_name}\n")
logging.info(f"Downloaded and recorded: ({book_name}) (ID: {novel_id}) total pages: {novel_pages} time cost: {elapsed_time} s")
# download the specified novels
def download_map():
# iterate over novel_map and download every novel
for novel_id, novel_name in novel_map.items():
logging.info(f"Starting download for {novel_name} (ID: {novel_id})")
download_novel(novel_id, novel_name, './aabook')
logging.info(f"Completed download for {novel_id}_{novel_name}.\n")
def main():
if len(sys.argv) != 2:
print("Usage: python script.py <cmd>")
print("cmd: get_list, get_detail, get_all, download")
print("cmd: get_list, get_detail, get_all, download, download_map")
sys.exit(1)
cmd = sys.argv[1]
@@ -378,6 +408,8 @@ def main():
get_detail()
elif cmd == "download":
download_books() # download books
elif cmd == "download_map":
download_map() # download the novels listed in novel_map
else:
print(f"Unknown command: {cmd}")