modify

2025-03-20 09:53:00 +08:00
parent d7afa70e57
commit 57d140eb51
5 changed files with 204 additions and 75 deletions
--- a/aabook/src/alter_table.py
+++ b/aabook/src/alter_table.py
@ -0,0 +1,40 @@
 import sqlite3
 import json
 import config
 import utils
 import logging
 import sys
 from datetime import datetime
 # Connect to the SQLite database
 DB_PATH = config.global_sqlite_path  # replace with the path to your database file
 conn = sqlite3.connect(DB_PATH)
 cursor = conn.cursor()
 # Table names; chapter tables are named f'{tbl_name_chapters_prefix}_{i}'
 tbl_name_books = 'books'
 tbl_name_chapters_prefix = 'chapters'
 tbl_name_section = 'books_sections'
 def add_columns_to_table(table_name, db_conn=None):
    """Add the ``words`` and ``update_time`` columns to *table_name* if missing.

    The existing columns are read first, so running the migration again (or
    after a partial run) only adds what is still absent instead of failing on
    the first duplicate column and skipping the second one.

    Args:
        table_name: Name of the table to alter. It is interpolated into the
            SQL text, so it must be a trusted identifier.
        db_conn: Optional ``sqlite3`` connection to operate on. When omitted,
            the module-level ``conn`` / ``cursor`` are used.

    Returns:
        None. The outcome is reported with ``print``; ``sqlite3.Error`` (for
        example a missing table) is caught and printed, not raised.
    """
    connection = conn if db_conn is None else db_conn
    cur = cursor if db_conn is None else db_conn.cursor()
    new_columns = (
        ("words", "INTEGER DEFAULT 0"),
        ("update_time", "TEXT DEFAULT ('2000-01-01 00:00:00')"),
    )
    try:
        cur.execute(f"PRAGMA table_info({table_name})")
        # table_info rows are (cid, name, type, ...); index 1 is the column name.
        present = {info[1] for info in cur.fetchall()}
        for column, definition in new_columns:
            if column in present:
                continue
            cur.execute(f"ALTER TABLE {table_name} ADD COLUMN {column} {definition}")
        # Commit the transaction
        connection.commit()
        print(f"成功向表 {table_name} 中添加字段 words 和 update_time")
    except sqlite3.Error as e:
        print(f"添加字段时出现错误: {e}")
 # 使用示例
 if __name__ == "__main__":
    # Apply the schema change to every chapter table: indices 0 through 99.
    for shard_index in range(100):
        add_columns_to_table(f'{tbl_name_chapters_prefix}_{shard_index}')
--- a/aabook/src/fetch.py
+++ b/aabook/src/fetch.py
@ -102,10 +102,11 @@ def fetch_book_toc(url):
 # 获取小说的目录页，并插入到数据库
 def fetch_table_of_contents(): 
    total_updated_rows = 0
    while True:
        update_list = db_tools.query_books(is_latest=0, limit = 2 if debug else 100)
        if update_list is None or len(update_list) <1 :
-            logging.info(f'no more data need fecth.')
+            logging.info(f'no more data need fecth. updated chapters(table of contents): {total_updated_rows}')
            return
        for row in update_list:
@ -152,15 +153,17 @@ def fetch_table_of_contents():
                # 插入目录数据
                for chap in chapters:
-                    chap_row_id = db_tools.insert_chapter_data({
+                    chap_row_id, affected_rows = db_tools.insert_chapter_data({
-                        'book_id':   bookid,
+                        'book_id':      bookid,
-                        'chapter_id': chap['chapter_id'],
+                        'chapter_id':   chap['chapter_id'],
-                        'section_id': section_id,
+                        'section_id':   section_id,
-                        'title':     chap['title'],
+                        'title':        chap['title'],
-                        'href':     chap['href'],
+                        'href':         chap['href'],
-                        'content':  '',
+                        'words':        chap['words'],
-                        'has_content' : 0
+                        'update_time':  chap['update_time'],
                        'content':      ''
                    })
                    total_updated_rows = total_updated_rows + (affected_rows if affected_rows else 0)
                    if chap_row_id is None:
                        logging.warning(f'insert_chapter_data error. url: {toc_url}')
                        succ = 0
@ -195,81 +198,57 @@ def fetch_contents():
            content, next_url = fetch_chapter_content(url)
            if content and content['title'] and content['contents']:
                # 写入到数据表里
-                db_tools.insert_chapter_data({
+                row_id = db_tools.update_chapter_data({
                    'book_id':  row['book_id'],
                    'chapter_id': row['chapter_id'],
                    'section_id': row['section_id'],
                    'title':     row['title'],
                    'href':     url,
-                    'content':  '\n\n'.join(content['contents']),
+                    'content':  '\n\n'.join(content['contents'])
                    'has_content': 1
                })
                if row_id is None:
                    logging.warning(f"update chapter data error at {url} ")
            else:
                logging.warning(f'fetch content error. url: {url}')
        if debug:
            return
-'''
+# 更新小说目录页的一些信息（字数，时间），临时
-# 下载完整的小说
+def update_chapter_meta():
-def fetch_book_data():
+    toc_links = db_tools.query_toc_href()
-    update_list = db_tools.query_books(need_update=1, limit = 1)
+    for item in toc_links:
-    if update_list:
+        toc_url = item['table_of_contents_href']
-        for row in update_list:
+        bookid = item['id']
-            name = row['name']
+        logging.info(f'fetching page: {toc_url}')
-            href = row['href']
+        toc_data = fetch_book_toc(toc_url)
-            bookid = row['id']
+
-            # 先打开详情页
+        # 解析目录页
-            logging.info(f'----------fetching book {name}: {href}-------------')
+        if toc_data is None:
-            book_detail = fetch_book_detail(href)
+            logging.warning(f'fetch_book_toc error. url: {toc_url}')
-            if book_detail:
+            continue
-                # 获取内容页，然后循环读取内容
+
-                chapter_url = book_detail['start_page_href']
+        # 插入所有的目录数据
-                chapter_id = utils.extract_page_num(chapter_url)
+        for row in toc_data:
-                # 断点续传，从上次拉取的最后一页开始
+            chapters = row['chapters']
-                if not force:
+            # 插入目录数据
-                    last_chapter_url = db_tools.query_last_chapter_by_book(bookid)
+            for chap in chapters:
-                    if last_chapter_url:
+                chap_row_id = db_tools.update_toc_words_uptime({
-                        chapter_url = last_chapter_url
+                    'book_id':      bookid,
-                while chapter_url:
+                    'href':         chap['href'],
-                    logging.info(f'fetching page: {chapter_url}')
+                    'words':        chap['words'],
-                    content, next_url = fetch_chapter_content(chapter_url)
+                    'update_time':  chap['update_time']
-                    if content and content['title'] and content['contents']:
+                })   
-                        # 写入到数据表里
+                if chap_row_id is None:
-                        db_tools.insert_chapter_data({
+                    logging.warning(f'insert toc error.  url: {toc_url}')
                            'book_id':   bookid,
                            'chapter_id': chapter_id,
                            'title':     content['title'],
                            'href':     chapter_url,
                            'content':  '\n\n'.join(content['contents']),
                            'has_content': 1
                        })
                        if debug:
                            return
                    else:
                        logging.warning(f'fetch content error. url: {chapter_url}')
                    chapter_url = next_url
                # 读取完毕，更新列表
                row_id = db_tools.update_book_detail({
                    'href' : href,
                    **book_detail
                })
                if row_id:
                    logging.debug(f'update book succ. id: {row_id}, url: {href}')
                else:
                    logging.warning(f'update book failed. url: {href}')
            else:
                logging.warning(f'get book detail failed. url: {href}')
    else:
        logging.warning(f'get no data needed update.')   
 '''
 # Map command abbreviations to the functions they select
 function_map = {
    "list":     fetch_book_list,
    "toc" :     fetch_table_of_contents,
    "content":  fetch_contents,
    "update":  update_chapter_meta,
 }   
 # 主函数
--- a/aabook/src/scraper.py
+++ b/aabook/src/scraper.py
@ -188,10 +188,17 @@ def pase_chapter_list(soup, url):
            chap_list = sections[i].find_all("a")
            chap_data = []
            for chap in chap_list:
-                chap_title = chap.get_text().strip()
+                chap_title = chap.get_text().strip()        # 获取章节标题
-                chap_link = f"{host_url}/{chap['href']}"
+                chap_link = f"{host_url}/{chap['href']}"    # 获取章节链接
                chap_id = utils.extract_page_num(chap_link)
-                chap_data.append({'href': chap_link, 'title': chap_title, 'chapter_id': chap_id})
+                chap_words, chap_uptime = utils.extract_chapter_uptime_words(chap['title']) # 获取更新时间和字数
                chap_data.append({
                    'href': chap_link, 
                    'title': chap_title, 
                    'chapter_id': chap_id,
                    'words': chap_words,
                    'update_time' : chap_uptime,
                    })
            table_of_contents.append({'title': section_title, 'chapters': chap_data})
    return table_of_contents
@ -277,7 +284,14 @@ def parse_content_page(soup, url):
        for paragraph in paragraphs:
            cleaned_text = process_paragraph(paragraph)
            content.append(cleaned_text)
-    
+    else:
        # 某些页面，没有p标签，只有一个h1，要兼容此问题
        paragraphs = soup.find_all('h1')
        if paragraphs:
            for paragraph in paragraphs:
                cleaned_text = process_paragraph(paragraph)
                content.append(cleaned_text)
    return content
 # 通用的 HTML 结构验证器
--- a/aabook/src/sqlite_utils.py
+++ b/aabook/src/sqlite_utils.py
@ -231,13 +231,56 @@ def check_and_create_chapters_table(book_number):
 # 插入到数据表中
 def insert_chapter_data(data):
    """Insert a table-of-contents row for a chapter unless a current one exists.

    The chapter table is chosen by ``int(data['book_id']) % 100``. A chapter is
    skipped when a row with the same ``href`` already has an ``update_time``
    greater than or equal to ``data['update_time']``.

    Args:
        data: Chapter fields; must contain ``book_id``, ``href`` and
            ``update_time``. The dict itself is not modified.

    Returns:
        A ``(row_id, affected_rows)`` tuple: the existing id and ``0`` when
        nothing had to be written, the value returned by
        ``insert_or_update_common`` and ``1`` after a write, or ``(None, 0)``
        when the table is unavailable, the write yields no id, or SQLite
        raises.
    """
    try:
        tbl_num = int(data['book_id']) % 100
        tbl_name = check_and_create_chapters_table(tbl_num)
        if not tbl_name:
            # NOTE(review): assumes a falsy return means the table could not
            # be prepared, as the previous version of this function did.
            logging.error(f"chapters table unavailable for book_id: {data['book_id']}")
            return None, 0

        # Look for a row that is already at least as new as the incoming one.
        cursor.execute(f"SELECT id FROM {tbl_name} WHERE href = ? and update_time >= ?", (data['href'], data['update_time'], ))
        existing_record = cursor.fetchone()
        if existing_record:  # the chapter is already stored and up to date
            logging.debug(f"chapter {data['href']} already exist. id: {existing_record[0]}")
            return existing_record[0], 0

        # Missing or stale: write it with has_content reset, on a copy so the
        # caller's dict is left untouched.
        record = {**data, 'has_content': 0}
        row_id = insert_or_update_common(record, tbl_name)
        # Count the row only when the write produced an id; presumably None
        # signals failure, since the caller in fetch.py tests for it.
        return row_id, (0 if row_id is None else 1)
    except sqlite3.Error as e:
        logging.error(f"Error inserting or updating data: {e}")
        return None, 0
 # 更新章节内容到数据表中
 def update_chapter_data(data):
    """Update the chapter row identified by ``href`` and mark it as having content.

    Every key of *data* except ``id``, ``href`` and ``created_at`` is written
    to the row, together with ``has_content = 1`` and a fresh ``updated_at``.
    The chapter table is chosen by ``int(data['book_id']) % 100``.

    Args:
        data: Chapter fields; must contain ``book_id`` and ``href``. The dict
            itself is not modified.

    Returns:
        The ``id`` of the row matching ``href``, or ``None`` when the table is
        unavailable, no row matches, or SQLite raises.
    """
    try:
        tbl_num = int(data['book_id']) % 100
        tbl_name = check_and_create_chapters_table(tbl_num)
        if not tbl_name:
            # NOTE(review): assumes a falsy return means the table could not
            # be prepared — confirm against check_and_create_chapters_table.
            logging.error(f"chapters table unavailable for book_id: {data['book_id']}")
            return None

        # Work on a copy so the caller's dict is not changed.
        payload = {**data, 'has_content': 1}
        # Skip fields that must not be updated; write only what payload holds.
        fields_to_update = [field for field in payload if field not in ('id', 'href', 'created_at')]

        set_clause = ', '.join(f"{field} = ?" for field in fields_to_update)
        sql = f"UPDATE {tbl_name} SET {set_clause}, updated_at = datetime('now', 'localtime') WHERE href = ?"

        values = [payload[field] for field in fields_to_update]
        values.append(payload['href'])
        cursor.execute(sql, values)
        conn.commit()

        # Read back the id; an UPDATE that matched nothing leaves no row here.
        cursor.execute(f"SELECT id FROM {tbl_name} WHERE href = ?", (payload['href'],))
        matched = cursor.fetchone()
        if matched is None:
            logging.warning(f"no chapter row matches href: {payload['href']}")
            return None
        return matched[0]
    except sqlite3.Error as e:
        logging.error(f"Error inserting or updating data: {e}")
        return None
-    
+
 # 查询某本书最后的获取页码
 def query_last_chapter_by_book(bookid):
    tbl_num = int(bookid) % 100
@ -281,6 +324,46 @@ def query_no_content_chapters(limit = 100):
    return all_results
 # 更新目录页的特定字段（临时）
 def update_toc_words_uptime(data):
    """Update selected fields of the chapter row identified by ``href``.

    Every key of *data* except ``id``, ``href`` and ``created_at`` is written
    to the row, together with a fresh ``updated_at``. The chapter table is
    chosen by ``int(data['book_id']) % 100``.

    Args:
        data: Fields to write; must contain ``book_id`` and ``href``.

    Returns:
        The ``id`` of the row matching ``href``, or ``None`` when the table is
        unavailable, no row matches, or SQLite raises.
    """
    try:
        tbl_num = int(data['book_id']) % 100
        tbl_name = check_and_create_chapters_table(tbl_num)
        if not tbl_name:
            # NOTE(review): assumes a falsy return means the table could not
            # be prepared — confirm against check_and_create_chapters_table.
            logging.error(f"chapters table unavailable for book_id: {data['book_id']}")
            return None

        # Skip fields that must not be updated; write only what data holds.
        fields_to_update = [field for field in data if field not in ('id', 'href', 'created_at')]

        set_clause = ', '.join(f"{field} = ?" for field in fields_to_update)
        sql = f"UPDATE {tbl_name} SET {set_clause}, updated_at = datetime('now', 'localtime') WHERE href = ?"

        values = [data[field] for field in fields_to_update]
        values.append(data['href'])
        cursor.execute(sql, values)
        conn.commit()

        # Read back the id; an UPDATE that matched nothing leaves no row here.
        cursor.execute(f"SELECT id FROM {tbl_name} WHERE href = ?", (data['href'],))
        matched = cursor.fetchone()
        if matched is None:
            logging.warning(f"no chapter row matches href: {data['href']}")
            return None
        return matched[0]
    except sqlite3.Error as e:
        logging.error(f"Error inserting or updating data: {e}")
        return None
 # 获取所有的目录页
 def query_toc_href():
    """Return the id and table-of-contents URL of every row in the books table.

    Returns:
        A list of ``{'id': ..., 'table_of_contents_href': ...}`` dicts, or
        ``None`` when SQLite raises (the error is logged).
    """
    statement = f"SELECT id, table_of_contents_href FROM {tbl_name_books} "
    try:
        cursor.execute(statement)
        toc_entries = []
        for record in cursor.fetchall():
            toc_entries.append({'id': record[0], 'table_of_contents_href': record[1]})
        return toc_entries
    except sqlite3.Error as e:
        logging.error(f"查询 href 失败: {e}")
        return None
 # 插入书本的卷信息
 def insert_or_update_book_sections(data):
    """Insert or update a book-section row, keyed on ``bookid_section``.

    Delegates to ``insert_or_update_common`` against the sections table and
    returns whatever that helper returns.
    """
    section_result = insert_or_update_common(
        data,
        tbl_name_section,
        uniq_key='bookid_section',
    )
    return section_result
--- a/aabook/src/utils.py
+++ b/aabook/src/utils.py
@ -41,7 +41,20 @@ def extract_book_num(page_str, default_num = 0):
        return number
    else:
        return default_num
 # 目录页，获取更新时间和字数 
 def extract_chapter_uptime_words(input_str):
    """Extract the word count and update time from a table-of-contents title.

    Args:
        input_str: Text expected to contain ``字数：<digits>`` and
            ``更新时间：YYYY-MM-DD HH:MM:SS``. ``None`` or an empty string is
            treated as text with neither marker.

    Returns:
        A ``(words, update_time)`` tuple. ``words`` is always an ``int``
        (``0`` when the marker is absent). ``update_time`` is the matched
        timestamp string, or the current local time in the same format when
        the marker is absent.
    """
    text = input_str or ''

    words_pattern = r'字数：(\d+)'
    words_match = re.search(words_pattern, text)
    # Convert to int so both the matched and fallback paths return the same type.
    words = int(words_match.group(1)) if words_match else 0

    update_time_pattern = r'更新时间：(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'
    update_time_match = re.search(update_time_pattern, text)
    if update_time_match:
        update_time = update_time_match.group(1)
    else:
        update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    return words, update_time
 # 处理 [都市] 的方括号
 def remove_brackets_regex(input_str):
    pattern = r'\[(.*?)\]'