Modify crawler files.

2024-10-23 11:47:53 +08:00
parent a2fb937b8f
commit cff542c61e
2 changed files with 8494 additions and 3 deletions

File diff suppressed because it is too large.


@@ -14,6 +14,7 @@ config.setup_logging()
 # Configure the base URL and the output files
 base_url = 'https://aabook.xyz'
 list_url_template = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
+curr_novel_pages = 0
 list_file = 'aabook_list.txt'
 details_file = 'aabook_details.txt'
@@ -207,6 +208,17 @@ def extract_content_url(soup, base_url, chapid):
     # If no matching script tag is found, return None
     return None
 
+# Page counter
+def reset_novel_pages():
+    global curr_novel_pages
+    curr_novel_pages = 0
+def add_novel_pages():
+    global curr_novel_pages
+    curr_novel_pages += 1
+def get_novel_pages():
+    global curr_novel_pages
+    return curr_novel_pages
+
 # Parse the chapter content and save it to a file
 def download_novel(chapid, novel_name, dir_prefix='./aabook'):
     chapter_url = f'{base_url}/read-{chapid}.html'
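Note on the hunk above: the counter is a single module-level integer, so the helpers that assign it must declare global curr_novel_pages (the global in get_novel_pages is harmless but only needed for writes). A minimal self-contained sketch of the intended call pattern; the three-iteration loop is a stand-in for the real chapter loop, not the crawler's code:

import logging
logging.basicConfig(level=logging.INFO)

curr_novel_pages = 0

def reset_novel_pages():
    global curr_novel_pages
    curr_novel_pages = 0

def add_novel_pages():
    global curr_novel_pages
    curr_novel_pages += 1

def get_novel_pages():
    return curr_novel_pages

reset_novel_pages()          # once per book, as in download_novel()
for _ in range(3):           # stand-in for the "while chapter_url" loop
    add_novel_pages()        # once per chapter page
logging.info("total pages: %d", get_novel_pages())  # -> total pages: 3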
@@ -215,6 +227,7 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
     if os.path.exists(novel_file):
         os.remove(novel_file)  # if a file with the same name exists, delete it and download again
+    reset_novel_pages()
 
     while chapter_url:
         logging.info(f"Processing: [{novel_name}] [{chapid}] {chapter_url}")
@@ -269,6 +282,8 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
             logging.info(f"Content URL not found in [{novel_name}] {chapter_url}, retry...")
             continue
 
+        # Increment the page count
+        add_novel_pages()
         # Find the link to the next chapter
         next_div = soup.find('div', class_='next_arrow')
         # Check whether the div containing the next-chapter link was found
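Note on the next-chapter lookup: it uses BeautifulSoup's find with a class filter. A minimal sketch under an assumed markup shape; the HTML string below is illustrative, not copied from the site:

from bs4 import BeautifulSoup

base_url = 'https://aabook.xyz'
html = '<div class="next_arrow"><a href="/read-124.html">next</a></div>'  # assumed markup
soup = BeautifulSoup(html, 'html.parser')

next_div = soup.find('div', class_='next_arrow')  # same lookup as in the diff
if next_div:
    link = next_div.find('a')
    if link and link.get('href'):
        print(base_url + link['href'])  # -> https://aabook.xyz/read-124.html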
@@ -309,7 +324,11 @@ def download_books():
     if os.path.exists(down_list_file):
         with open(down_list_file, 'r', encoding='utf-8') as f:
             for line in f:
-                novel_id, novel_name = line.strip().split('\t')
+                fields = line.strip().split('\t')
+                if len(fields) != 2:
+                    logging.info(f'invalid line data: {line}')
+                    continue
+                novel_id, novel_name = fields
                 downloaded_books[novel_id] = novel_name
 
     # Open aabook_details.txt to read the book information
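Note on the guard added above: a bare tuple unpack raises ValueError on any row that does not contain exactly one tab, so a single corrupted row used to abort the whole resume list. A quick illustration with made-up lines:

import logging
logging.basicConfig(level=logging.INFO)

downloaded_books = {}
sample_lines = ['123\tSome Novel\n', 'corrupted row without a tab\n']  # made-up data

for line in sample_lines:
    fields = line.strip().split('\t')
    if len(fields) != 2:                      # the unpack below would raise ValueError here
        logging.info(f'invalid line data: {line}')
        continue
    novel_id, novel_name = fields
    downloaded_books[novel_id] = novel_name

print(downloaded_books)  # {'123': 'Some Novel'}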
@@ -331,12 +350,16 @@ def download_books():
             create_directory_if_not_exists(down_dir)
 
             # Call the download function to download the book
+            start_time = time.time()  # capture the current time before the call
             download_novel(novel_id, book_name, down_dir)
+            end_time = time.time()  # capture the current time after the call
+            elapsed_time = int(end_time - start_time)  # compute the elapsed time in seconds
+            novel_pages = get_novel_pages()
 
             # After downloading, append the book info to aabook_down_list.txt
             with open(down_list_file, 'a', encoding='utf-8') as down_list:
                 down_list.write(f"{novel_id}\t{book_name}\n")
-            logging.info(f"Downloaded and recorded: {book_name} (ID: {novel_id})")
+            logging.info(f"Downloaded and recorded: ({book_name}) (ID: {novel_id}) total pages: {novel_pages} time cost: {elapsed_time} s")
 
 def main():
     if len(sys.argv) != 2:
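Note on the timing added in the last hunk: it is the plain wall-clock pattern with time.time(), truncated to whole seconds by int(). A minimal sketch, with time.sleep standing in for download_novel():

import time

start_time = time.time()                    # before the work
time.sleep(1.2)                             # stand-in for download_novel(...)
end_time = time.time()                      # after the work
elapsed_time = int(end_time - start_time)   # truncates to whole seconds

print(f"time cost: {elapsed_time} s")       # -> time cost: 1 s

For pure interval measurement, time.monotonic() is immune to system clock adjustments; time.time() is shown here only because it is what the diff uses.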