modify crawler files.
This commit is contained in:
8468
scripts/data/惊悚/虫使.txt
Normal file
8468
scripts/data/惊悚/虫使.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -14,6 +14,7 @@ config.setup_logging()
|
||||
# 配置基础URL和输出文件
|
||||
base_url = 'https://aabook.xyz'
|
||||
list_url_template = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
|
||||
curr_novel_pages = 0
|
||||
|
||||
list_file = 'aabook_list.txt'
|
||||
details_file = 'aabook_details.txt'
|
||||
@ -207,6 +208,17 @@ def extract_content_url(soup, base_url, chapid):
|
||||
# 如果未找到匹配的 script 标签,则返回 None
|
||||
return None
|
||||
|
||||
# 计数器
|
||||
def reset_novel_pages():
    """Reset the module-level page counter ``curr_novel_pages`` to zero."""
    # ``global`` is required because the function rebinds the module variable.
    global curr_novel_pages
    curr_novel_pages = 0
|
||||
def add_novel_pages():
    """Increment the module-level page counter ``curr_novel_pages`` by one."""
    # ``global`` is required because ``+=`` rebinds the module variable.
    global curr_novel_pages
    curr_novel_pages += 1
|
||||
def get_novel_pages():
    """Return the current value of the module-level ``curr_novel_pages`` counter."""
    # Reading a module-level name needs no ``global`` declaration.
    return curr_novel_pages
|
||||
|
||||
# 解析章节内容并保存到文件中
|
||||
def download_novel(chapid, novel_name, dir_prefix='./aabook'):
|
||||
chapter_url = f'{base_url}/read-{chapid}.html'
|
||||
@ -214,7 +226,8 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
|
||||
novel_file = dir_prefix + '/' + novel_name + '.txt'
|
||||
if os.path.exists(novel_file):
|
||||
os.remove(novel_file) # 如果存在同名文件,删除重新下载
|
||||
|
||||
|
||||
reset_novel_pages()
|
||||
while chapter_url:
|
||||
logging.info(f"Processing: [{novel_name}] [{chapid}] {chapter_url}")
|
||||
|
||||
@ -269,6 +282,8 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
|
||||
logging.info(f"Content URL not found in [{novel_name}] {chapter_url}, retry...")
|
||||
continue
|
||||
|
||||
# 页码数+1
|
||||
add_novel_pages()
|
||||
# 查找下一章的链接
|
||||
next_div = soup.find('div', class_='next_arrow')
|
||||
# 判断是否找到了包含下一章链接的 div 标签
|
||||
@ -309,7 +324,11 @@ def download_books():
|
||||
if os.path.exists(down_list_file):
|
||||
with open(down_list_file, 'r', encoding='utf-8') as f:
|
||||
for line in f:
|
||||
novel_id, novel_name = line.strip().split('\t')
|
||||
fields = line.strip().split('\t')
|
||||
if len(fields) != 2:
|
||||
logging.info(f'invalid line data: {line}')
|
||||
continue
|
||||
novel_id, novel_name = fields
|
||||
downloaded_books[novel_id] = novel_name
|
||||
|
||||
# 打开 aabook_details.txt 读取书籍信息
|
||||
@ -331,12 +350,16 @@ def download_books():
|
||||
create_directory_if_not_exists(down_dir)
|
||||
|
||||
# 调用下载函数下载书籍
|
||||
start_time = time.time() # 在函数执行前获取当前时间
|
||||
download_novel(novel_id, book_name, down_dir)
|
||||
end_time = time.time() # 在函数执行后获取当前时间
|
||||
elapsed_time = int(end_time - start_time) # 计算时间差,秒
|
||||
novel_pages = get_novel_pages()
|
||||
|
||||
# 下载后,将书籍信息追加写入 aabook_down_list.txt
|
||||
with open(down_list_file, 'a', encoding='utf-8') as down_list:
|
||||
down_list.write(f"{novel_id}\t{book_name}\n")
|
||||
logging.info(f"Downloaded and recorded: {book_name} (ID: {novel_id})")
|
||||
logging.info(f"Downloaded and recorded: ({book_name}) (ID: {novel_id}) total pages: {novel_pages} time cost: {elapsed_time} s")
|
||||
|
||||
def main():
|
||||
if len(sys.argv) != 2:
|
||||
|
||||
Reference in New Issue
Block a user