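Modify crawler files: count the pages fetched per novel, log the download time, and skip malformed lines in the download list.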

2024-10-23 11:47:53 +08:00
parent a2fb937b8f
commit cff542c61e
2 changed files with 8494 additions and 3 deletions
--- a/scripts/data/惊悚/虫使.txt
+++ b/scripts/data/惊悚/虫使.txt
--- a/scripts/get_aabook_list.py
+++ b/scripts/get_aabook_list.py
@@ -14,6 +14,7 @@ config.setup_logging()
 # 配置基础URL和输出文件
 base_url = 'https://aabook.xyz'
 list_url_template = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
+curr_novel_pages = 0

 list_file = 'aabook_list.txt'
 details_file = 'aabook_details.txt'
@@ -207,6 +208,17 @@ def extract_content_url(soup, base_url, chapid):
    # 如果未找到匹配的 script 标签，则返回 None
    return None

+# Page counter helpers: wrap the module-level curr_novel_pages global.
+def reset_novel_pages():  # set the counter back to 0
+    global curr_novel_pages
+    curr_novel_pages = 0
+def add_novel_pages():  # increment the counter by one
+    global curr_novel_pages
+    curr_novel_pages += 1 
+def get_novel_pages():  # return the current counter value
+    global curr_novel_pages
+    return curr_novel_pages
+
 # 解析章节内容并保存到文件中
 def download_novel(chapid, novel_name, dir_prefix='./aabook'):
    chapter_url = f'{base_url}/read-{chapid}.html'
@@ -214,7 +226,8 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
    novel_file = dir_prefix + '/' + novel_name + '.txt'
    if os.path.exists(novel_file):
        os.remove(novel_file)  # 如果存在同名文件，删除重新下载
-    
+
+    reset_novel_pages()
    while chapter_url:
        logging.info(f"Processing: [{novel_name}] [{chapid}] {chapter_url}")
        
@@ -269,6 +282,8 @@ def download_novel(chapid, novel_name, dir_prefix='./aabook'):
            logging.info(f"Content URL not found in [{novel_name}] {chapter_url}, retry...")
            continue
        
+        # 页码数+1
+        add_novel_pages()
        # 查找下一章的链接
        next_div = soup.find('div', class_='next_arrow')   
        # 判断是否找到了包含下一章链接的 div 标签
@@ -309,7 +324,11 @@ def download_books():
    if os.path.exists(down_list_file):
        with open(down_list_file, 'r', encoding='utf-8') as f:
            for line in f:
-                novel_id, novel_name = line.strip().split('\t')
+                fields = line.strip().split('\t')
+                if len(fields) != 2:
+                    logging.info(f'invalid line data: {line}')
+                    continue
+                novel_id, novel_name = fields
                downloaded_books[novel_id] = novel_name

    # 打开 aabook_details.txt 读取书籍信息
@@ -331,12 +350,16 @@ def download_books():
            create_directory_if_not_exists(down_dir)
            
            # 调用下载函数下载书籍
+            start_time = time.time()  # 在函数执行前获取当前时间
            download_novel(novel_id, book_name, down_dir)
+            end_time = time.time()  # 在函数执行后获取当前时间
+            elapsed_time = int(end_time - start_time)  # 计算时间差,秒
+            novel_pages = get_novel_pages()
            
            # 下载后，将书籍信息追加写入 aabook_down_list.txt
            with open(down_list_file, 'a', encoding='utf-8') as down_list:
                down_list.write(f"{novel_id}\t{book_name}\n")
-                logging.info(f"Downloaded and recorded: {book_name} (ID: {novel_id})")
+                logging.info(f"Downloaded and recorded: ({book_name}) (ID: {novel_id}) total pages: {novel_pages} time cost: {elapsed_time} s")

 def main():
    if len(sys.argv) != 2: