modify crawler files.
This commit is contained in:
8468
scripts/data/惊悚/虫使.txt
Normal file
8468
scripts/data/惊悚/虫使.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -14,6 +14,7 @@ config.setup_logging()
|
|||||||
# 配置基础URL和输出文件
|
# 配置基础URL和输出文件
|
||||||
base_url = 'https://aabook.xyz'
|
base_url = 'https://aabook.xyz'
|
||||||
list_url_template = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
|
list_url_template = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
|
||||||
|
curr_novel_pages = 0
|
||||||
|
|
||||||
list_file = 'aabook_list.txt'
|
list_file = 'aabook_list.txt'
|
||||||
details_file = 'aabook_details.txt'
|
details_file = 'aabook_details.txt'
|
||||||
@ -207,6 +208,17 @@ def extract_content_url(soup, base_url, chapid):
|
|||||||
# 如果未找到匹配的 script 标签,则返回 None
|
# 如果未找到匹配的 script 标签,则返回 None
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# 计数器
|
||||||
|
def reset_novel_pages():
    """Reset the module-level page counter ``curr_novel_pages`` to zero."""
    # Rebinding a module-level name from inside a function needs ``global``.
    global curr_novel_pages
    curr_novel_pages = 0
|
||||||
|
def add_novel_pages():
    """Increase the module-level page counter ``curr_novel_pages`` by one."""
    # The counter is rebound here, so it must be declared ``global``.
    global curr_novel_pages
    curr_novel_pages = curr_novel_pages + 1
|
||||||
|
def get_novel_pages():
    """Return the current value of the module-level ``curr_novel_pages``."""
    # Reading a module-level name needs no ``global`` declaration.
    return curr_novel_pages
|
||||||
|
|
||||||
# 解析章节内容并保存到文件中
|
# 解析章节内容并保存到文件中
|
||||||
def download_novel(chapid, novel_name, dir_prefix='./aabook'):
|
def download_novel(chapid, novel_name, dir_prefix='./aabook'):
|
||||||
chapter_url = f'{base_url}/read-{chapid}.html'
|
chapter_url = f'{base_url}/read-{chapid}.html'
|
||||||
@ -215,6 +227,7 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
|
|||||||
if os.path.exists(novel_file):
|
if os.path.exists(novel_file):
|
||||||
os.remove(novel_file) # 如果存在同名文件,删除重新下载
|
os.remove(novel_file) # 如果存在同名文件,删除重新下载
|
||||||
|
|
||||||
|
reset_novel_pages()
|
||||||
while chapter_url:
|
while chapter_url:
|
||||||
logging.info(f"Processing: [{novel_name}] [{chapid}] {chapter_url}")
|
logging.info(f"Processing: [{novel_name}] [{chapid}] {chapter_url}")
|
||||||
|
|
||||||
@ -269,6 +282,8 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
|
|||||||
logging.info(f"Content URL not found in [{novel_name}] {chapter_url}, retry...")
|
logging.info(f"Content URL not found in [{novel_name}] {chapter_url}, retry...")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# 页码数+1
|
||||||
|
add_novel_pages()
|
||||||
# 查找下一章的链接
|
# 查找下一章的链接
|
||||||
next_div = soup.find('div', class_='next_arrow')
|
next_div = soup.find('div', class_='next_arrow')
|
||||||
# 判断是否找到了包含下一章链接的 div 标签
|
# 判断是否找到了包含下一章链接的 div 标签
|
||||||
@ -309,7 +324,11 @@ def download_books():
|
|||||||
if os.path.exists(down_list_file):
|
if os.path.exists(down_list_file):
|
||||||
with open(down_list_file, 'r', encoding='utf-8') as f:
|
with open(down_list_file, 'r', encoding='utf-8') as f:
|
||||||
for line in f:
|
for line in f:
|
||||||
novel_id, novel_name = line.strip().split('\t')
|
fields = line.strip().split('\t')
|
||||||
|
if len(fields) != 2:
|
||||||
|
logging.info(f'invalid line data: {line}')
|
||||||
|
continue
|
||||||
|
novel_id, novel_name = fields
|
||||||
downloaded_books[novel_id] = novel_name
|
downloaded_books[novel_id] = novel_name
|
||||||
|
|
||||||
# 打开 aabook_details.txt 读取书籍信息
|
# 打开 aabook_details.txt 读取书籍信息
|
||||||
@ -331,12 +350,16 @@ def download_books():
|
|||||||
create_directory_if_not_exists(down_dir)
|
create_directory_if_not_exists(down_dir)
|
||||||
|
|
||||||
# 调用下载函数下载书籍
|
# 调用下载函数下载书籍
|
||||||
|
start_time = time.time() # 在函数执行前获取当前时间
|
||||||
download_novel(novel_id, book_name, down_dir)
|
download_novel(novel_id, book_name, down_dir)
|
||||||
|
end_time = time.time() # 在函数执行后获取当前时间
|
||||||
|
elapsed_time = int(end_time - start_time) # 计算时间差,秒
|
||||||
|
novel_pages = get_novel_pages()
|
||||||
|
|
||||||
# 下载后,将书籍信息追加写入 aabook_down_list.txt
|
# 下载后,将书籍信息追加写入 aabook_down_list.txt
|
||||||
with open(down_list_file, 'a', encoding='utf-8') as down_list:
|
with open(down_list_file, 'a', encoding='utf-8') as down_list:
|
||||||
down_list.write(f"{novel_id}\t{book_name}\n")
|
down_list.write(f"{novel_id}\t{book_name}\n")
|
||||||
logging.info(f"Downloaded and recorded: {book_name} (ID: {novel_id})")
|
logging.info(f"Downloaded and recorded: ({book_name}) (ID: {novel_id}) total pages: {novel_pages} time cost: {elapsed_time} s")
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
if len(sys.argv) != 2:
|
if len(sys.argv) != 2:
|
||||||
|
|||||||
Reference in New Issue
Block a user