modify

2025-03-20 09:53:00 +08:00
parent d7afa70e57
commit 57d140eb51
5 changed files with 204 additions and 75 deletions
--- a/aabook/src/alter_table.py
+++ b/aabook/src/alter_table.py
@ -0,0 +1,40 @@
+import sqlite3
+import json
+import config
+import utils
+import logging
+import sys
+from datetime import datetime
+
+
+# Connect to the SQLite database
+DB_PATH = config.global_sqlite_path  # path of the database file, read from the project's config module
+conn = sqlite3.connect(DB_PATH)
+cursor = conn.cursor()
+
+# Table names; within this script only tbl_name_chapters_prefix is referenced
+tbl_name_books = 'books'
+tbl_name_chapters_prefix = 'chapters'
+tbl_name_section = 'books_sections'
+
+def add_columns_to_table(table_name):
+    """Add the ``words`` and ``update_time`` columns to ``table_name``.
+
+    Each column is handled independently and skipped when the table already
+    has it, so the script can be re-run after a partial failure instead of
+    aborting on the first "duplicate column name" error.
+
+    Args:
+        table_name: Name of the table to alter. It is interpolated into the
+            SQL text, so it must come from trusted code (in this script: the
+            names built in the ``__main__`` loop).
+
+    Returns:
+        None. sqlite3 errors (for example a missing table) are printed, not
+        propagated.
+    """
+    # (column name, column definition) pairs, added in this order
+    new_columns = (
+        ('words', "INTEGER DEFAULT 0"),
+        ('update_time', "TEXT DEFAULT ('2000-01-01 00:00:00')"),
+    )
+    try:
+        # PRAGMA table_info yields one row per existing column; field 1 is the column name
+        cursor.execute(f"PRAGMA table_info({table_name})")
+        existing = {row[1] for row in cursor.fetchall()}
+
+        added = []
+        for column, definition in new_columns:
+            if column in existing:
+                continue
+            cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN {column} {definition}")
+            added.append(column)
+
+        # Commit the transaction
+        conn.commit()
+        if added:
+            print(f"成功向表 {table_name} 中添加字段 {' 和 '.join(added)}")
+        else:
+            print(f"表 {table_name} 已包含字段 words 和 update_time，无需修改")
+    except sqlite3.Error as e:
+        print(f"添加字段时出现错误: {e}")
+
+# Usage example: run this file as a script
+if __name__ == "__main__":
+    # Apply the change to the 100 chapter tables, chapters_0 through chapters_99
+    for i in range(100):
+        table_name = f'{tbl_name_chapters_prefix}_{i}'
+        add_columns_to_table(table_name)
--- a/aabook/src/fetch.py
+++ b/aabook/src/fetch.py
@ -102,10 +102,11 @@ def fetch_book_toc(url):

 # 获取小说的目录页，并插入到数据库
 def fetch_table_of_contents(): 
+    total_updated_rows = 0
    while True:
        update_list = db_tools.query_books(is_latest=0, limit = 2 if debug else 100)
        if update_list is None or len(update_list) <1 :
-            logging.info(f'no more data need fecth.')
+            logging.info(f'no more data need fecth. updated chapters(table of contents): {total_updated_rows}')
            return
        
        for row in update_list:
@ -152,15 +153,17 @@ def fetch_table_of_contents():

                # 插入目录数据
                for chap in chapters:
-                    chap_row_id = db_tools.insert_chapter_data({
-                        'book_id':   bookid,
-                        'chapter_id': chap['chapter_id'],
-                        'section_id': section_id,
-                        'title':     chap['title'],
-                        'href':     chap['href'],
-                        'content':  '',
-                        'has_content' : 0
+                    chap_row_id, affected_rows = db_tools.insert_chapter_data({
+                        'book_id':      bookid,
+                        'chapter_id':   chap['chapter_id'],
+                        'section_id':   section_id,
+                        'title':        chap['title'],
+                        'href':         chap['href'],
+                        'words':        chap['words'],
+                        'update_time':  chap['update_time'],
+                        'content':      ''
                    })
+                    total_updated_rows = total_updated_rows + (affected_rows if affected_rows else 0)
                    if chap_row_id is None:
                        logging.warning(f'insert_chapter_data error. url: {toc_url}')
                        succ = 0
@ -195,81 +198,57 @@ def fetch_contents():
            content, next_url = fetch_chapter_content(url)
            if content and content['title'] and content['contents']:
                # 写入到数据表里
-                db_tools.insert_chapter_data({
+                row_id = db_tools.update_chapter_data({
                    'book_id':  row['book_id'],
                    'chapter_id': row['chapter_id'],
                    'section_id': row['section_id'],
                    'title':     row['title'],
                    'href':     url,
-                    'content':  '\n\n'.join(content['contents']),
-                    'has_content': 1
+                    'content':  '\n\n'.join(content['contents'])
                })
+                if row_id is None:
+                    logging.warning(f"update chapter data error at {url} ")
            else:
                logging.warning(f'fetch content error. url: {url}')
        if debug:
            return
        

-'''
-# 下载完整的小说
-def fetch_book_data():
-    update_list = db_tools.query_books(need_update=1, limit = 1)
-    if update_list:
-        for row in update_list:
-            name = row['name']
-            href = row['href']
-            bookid = row['id']
-            # 先打开详情页
-            logging.info(f'----------fetching book {name}: {href}-------------')
-            book_detail = fetch_book_detail(href)
-            if book_detail:
-                # 获取内容页，然后循环读取内容
-                chapter_url = book_detail['start_page_href']
-                chapter_id = utils.extract_page_num(chapter_url)
-                # 断点续传，从上次拉取的最后一页开始
-                if not force:
-                    last_chapter_url = db_tools.query_last_chapter_by_book(bookid)
-                    if last_chapter_url:
-                        chapter_url = last_chapter_url
-                while chapter_url:
-                    logging.info(f'fetching page: {chapter_url}')
-                    content, next_url = fetch_chapter_content(chapter_url)
-                    if content and content['title'] and content['contents']:
-                        # 写入到数据表里
-                        db_tools.insert_chapter_data({
-                            'book_id':   bookid,
-                            'chapter_id': chapter_id,
-                            'title':     content['title'],
-                            'href':     chapter_url,
-                            'content':  '\n\n'.join(content['contents']),
-                            'has_content': 1
-                        })
+# Temporary: refresh chapter metadata (word count, update time) from the table-of-contents pages
+def update_chapter_meta():
+    """Re-fetch each book's table-of-contents page and update chapter metadata.
+
+    For every item returned by ``db_tools.query_toc_href()`` the TOC page is
+    fetched with ``fetch_book_toc`` and each chapter's ``words`` and
+    ``update_time`` are written through ``db_tools.update_toc_words_uptime``.
+    Failures are logged and skipped.
+
+    Returns:
+        None.
+    """
+    toc_links = db_tools.query_toc_href()
+    if not toc_links:
+        # The lookup may hand back None (query failed) or an empty list; iterating None would raise
+        logging.warning('query_toc_href returned no data, nothing to update.')
+        return
+
+    for item in toc_links:
+        toc_url = item['table_of_contents_href']
+        bookid = item['id']
+        logging.info(f'fetching page: {toc_url}')
+        toc_data = fetch_book_toc(toc_url)
+
+        # Skip books whose TOC page could not be fetched or parsed
+        if toc_data is None:
+            logging.warning(f'fetch_book_toc error. url: {toc_url}')
+            continue
+
+        # Walk every TOC entry and update each of its chapters
+        for row in toc_data:
+            for chap in row['chapters']:
+                chap_row_id = db_tools.update_toc_words_uptime({
+                    'book_id':      bookid,
+                    'href':         chap['href'],
+                    'words':        chap['words'],
+                    'update_time':  chap['update_time']
+                })
+                if chap_row_id is None:
+                    logging.warning(f'insert toc error.  url: {toc_url}')

-                        if debug:
-                            return
-                    else:
-                        logging.warning(f'fetch content error. url: {chapter_url}')
-                    chapter_url = next_url
-                # 读取完毕，更新列表
-                row_id = db_tools.update_book_detail({
-                    'href' : href,
-                    **book_detail
-                })
-                if row_id:
-                    logging.debug(f'update book succ. id: {row_id}, url: {href}')
-                else:
-                    logging.warning(f'update book failed. url: {href}')
-            else:
-                logging.warning(f'get book detail failed. url: {href}')
-    else:
-        logging.warning(f'get no data needed update.')   
-'''

 # 建立缩写到函数的映射
 function_map = {
    "list":     fetch_book_list,
    "toc" :     fetch_table_of_contents,
    "content":  fetch_contents,
+    "update":  update_chapter_meta,
 }   

 # 主函数
--- a/aabook/src/scraper.py
+++ b/aabook/src/scraper.py
@ -188,10 +188,17 @@ def pase_chapter_list(soup, url):
            chap_list = sections[i].find_all("a")
            chap_data = []
            for chap in chap_list:
-                chap_title = chap.get_text().strip()
-                chap_link = f"{host_url}/{chap['href']}"
+                chap_title = chap.get_text().strip()        # 获取章节标题
+                chap_link = f"{host_url}/{chap['href']}"    # 获取章节链接
                chap_id = utils.extract_page_num(chap_link)
-                chap_data.append({'href': chap_link, 'title': chap_title, 'chapter_id': chap_id})
+                chap_words, chap_uptime = utils.extract_chapter_uptime_words(chap['title']) # 获取更新时间和字数
+                chap_data.append({
+                    'href': chap_link, 
+                    'title': chap_title, 
+                    'chapter_id': chap_id,
+                    'words': chap_words,
+                    'update_time' : chap_uptime,
+                    })
            table_of_contents.append({'title': section_title, 'chapters': chap_data})

    return table_of_contents
@ -277,7 +284,14 @@ def parse_content_page(soup, url):
        for paragraph in paragraphs:
            cleaned_text = process_paragraph(paragraph)
            content.append(cleaned_text)
-    
+    else:
+        # 某些页面，没有p标签，只有一个h1，要兼容此问题
+        paragraphs = soup.find_all('h1')
+        if paragraphs:
+            for paragraph in paragraphs:
+                cleaned_text = process_paragraph(paragraph)
+                content.append(cleaned_text)
+
    return content

 # 通用的 HTML 结构验证器
--- a/aabook/src/sqlite_utils.py
+++ b/aabook/src/sqlite_utils.py
@ -231,13 +231,56 @@ def check_and_create_chapters_table(book_number):

 # 插入到数据表中
 def insert_chapter_data(data):
-    tbl_num = int(data['book_id']) % 100
-    tbl_name = check_and_create_chapters_table(tbl_num)
-    if tbl_name :
-        return insert_or_update_common(data, tbl_name)
-    else:
+    """Insert a chapter's table-of-contents row unless an up-to-date one is stored.
+
+    Args:
+        data: dict with at least 'book_id', 'href' and 'update_time'; all keys
+            are passed on to ``insert_or_update_common`` together with
+            has_content = 0. The caller's dict is not modified.
+
+    Returns:
+        (row_id, affected_rows): the existing row id and 0 when a row with the
+        same href and an update_time not older than data['update_time'] is
+        already stored; otherwise the result of ``insert_or_update_common``
+        and 1; (None, 0) when no chapters table is available or sqlite3
+        raises.
+    """
+    try:
+        # The chapters table is selected by book_id modulo 100
+        tbl_num = int(data['book_id']) % 100
+        tbl_name = check_and_create_chapters_table(tbl_num)
+        if not tbl_name:
+            # Same guard the previous implementation had: no table, nothing to insert into
+            logging.error(f"chapters table unavailable for book_id {data['book_id']}")
+            return None, 0
+
+        # Look for a stored row that is at least as recent as the incoming one
+        cursor.execute(f"SELECT id FROM {tbl_name} WHERE href = ? and update_time >= ?", (data['href'], data['update_time'], ))
+        existing_record = cursor.fetchone()
+
+        if existing_record:  # chapter already stored and up to date
+            logging.debug(f"chapter {data['href']} already exist. id: {existing_record[0]}")
+            return existing_record[0], 0
+
+        # Missing or outdated: write it as a TOC-only row (has_content = 0) from a copy of the input
+        row = {**data, 'has_content': 0}
+        return insert_or_update_common(row, tbl_name), 1
+    except sqlite3.Error as e:
+        logging.error(f"Error inserting or updating data: {e}")
+        return None, 0
+
+
+# Store fetched chapter content in the chapters table
+def update_chapter_data(data):
+    """Update the chapter row identified by data['href'] and mark it as having content.
+
+    Every key of ``data`` except 'id', 'href' and 'created_at' is written to
+    the row, together with has_content = 1 and a refreshed ``updated_at``.
+    The caller's dict is not modified.
+
+    Args:
+        data: dict with at least 'book_id' and 'href'.
+
+    Returns:
+        The id of the updated row, or None when no chapters table is
+        available, no row has that href, or sqlite3 raises.
+    """
+    try:
+        # The chapters table is selected by book_id modulo 100
+        tbl_num = int(data['book_id']) % 100
+        tbl_name = check_and_create_chapters_table(tbl_num)
+        if not tbl_name:
+            logging.error(f"chapters table unavailable for book_id {data['book_id']}")
+            return None
+
+        # Write only the supplied fields (minus key/immutable ones), plus has_content = 1
+        row = {**data, 'has_content': 1}
+        fields_to_update = [field for field in row if field not in ('id', 'href', 'created_at')]
+
+        # Build the UPDATE statement
+        set_clause = ', '.join([f"{field} = ?" for field in fields_to_update])
+        sql = f"UPDATE {tbl_name} SET {set_clause}, updated_at = datetime('now', 'localtime') WHERE href = ?"
+
+        # Bind values in the same order as the SET clause, href last for the WHERE
+        values = [row[field] for field in fields_to_update]
+        values.append(data['href'])
+
+        cursor.execute(sql, values)
+        conn.commit()
+
+        # Read back the id of the row that was updated
+        cursor.execute(f"SELECT id FROM {tbl_name} WHERE href = ?", (data['href'],))
+        found = cursor.fetchone()
+        if found is None:
+            # The UPDATE matched nothing; fetchone() returns None and must not be indexed
+            logging.warning(f"no chapter row found for href {data['href']}")
+            return None
+        return found[0]
+    except sqlite3.Error as e:
+        logging.error(f"Error inserting or updating data: {e}")
         return None
-    
+
 # 查询某本书最后的获取页码
 def query_last_chapter_by_book(bookid):
    tbl_num = int(bookid) % 100
@ -281,6 +324,46 @@ def query_no_content_chapters(limit = 100):

    return all_results
    
+# Temporary: update selected table-of-contents fields of a chapter row
+def update_toc_words_uptime(data):
+    """Update the chapter row identified by data['href'] with the supplied fields.
+
+    Every key of ``data`` except 'id', 'href' and 'created_at' is written to
+    the row, together with a refreshed ``updated_at``.
+
+    Args:
+        data: dict with at least 'book_id' and 'href'; the visible caller also
+            passes 'words' and 'update_time'.
+
+    Returns:
+        The id of the updated row, or None when no chapters table is
+        available, no row has that href, or sqlite3 raises.
+    """
+    try:
+        # The chapters table is selected by book_id modulo 100
+        tbl_num = int(data['book_id']) % 100
+        tbl_name = check_and_create_chapters_table(tbl_num)
+        if not tbl_name:
+            logging.error(f"chapters table unavailable for book_id {data['book_id']}")
+            return None
+
+        # Write only the supplied fields, minus key/immutable ones
+        fields_to_update = [field for field in data if field not in ('id', 'href', 'created_at')]
+
+        # Build the UPDATE statement
+        set_clause = ', '.join([f"{field} = ?" for field in fields_to_update])
+        sql = f"UPDATE {tbl_name} SET {set_clause}, updated_at = datetime('now', 'localtime') WHERE href = ?"
+
+        # Bind values in the same order as the SET clause, href last for the WHERE
+        values = [data[field] for field in fields_to_update]
+        values.append(data['href'])
+
+        cursor.execute(sql, values)
+        conn.commit()
+
+        # Read back the id of the row that was updated
+        cursor.execute(f"SELECT id FROM {tbl_name} WHERE href = ?", (data['href'],))
+        found = cursor.fetchone()
+        if found is None:
+            # The UPDATE matched nothing; fetchone() returns None and must not be indexed
+            logging.warning(f"no chapter row found for href {data['href']}")
+            return None
+        return found[0]
+    except sqlite3.Error as e:
+        logging.error(f"Error inserting or updating data: {e}")
+        return None
+
+# Fetch the table-of-contents link of every book
+def query_toc_href():
+    """Return one {'id', 'table_of_contents_href'} dict per row of the books table.
+
+    Returns:
+        A list of dicts, or None (after logging the error) if sqlite3 raises.
+    """
+    try:
+        cursor.execute(f"SELECT id, table_of_contents_href FROM {tbl_name_books} ")
+
+        toc_links = []
+        for record in cursor.fetchall():
+            toc_links.append({'id': record[0], 'table_of_contents_href': record[1]})
+        return toc_links
+
+    except sqlite3.Error as e:
+        logging.error(f"查询 href 失败: {e}")
+        return None
+
 # 插入书本的卷信息
 def insert_or_update_book_sections(data):
    return insert_or_update_common(data, tbl_name_section, uniq_key='bookid_section')
--- a/aabook/src/utils.py
+++ b/aabook/src/utils.py
@ -41,7 +41,20 @@ def extract_book_num(page_str, default_num = 0):
        return number
    else:
        return default_num
+
+# Table-of-contents page: extract a chapter's word count and update time
+def extract_chapter_uptime_words(input_str):
+    """Parse the word count and update time out of a chapter's title text.
+
+    Args:
+        input_str: Text expected to contain '字数：<digits>' and
+            '更新时间：YYYY-MM-DD HH:MM:SS'. None is treated as an empty string.
+
+    Returns:
+        (words, update_time): ``words`` is an int (0 when not found);
+        ``update_time`` is the matched timestamp string, or the current local
+        time in the same format when not found.
+    """
+    input_str = input_str or ''
+
+    words_pattern = r'字数：(\d+)'
+    words_match = re.search(words_pattern, input_str)
+    # Convert to int so both branches return the same type (the no-match default is the int 0)
+    words = int(words_match.group(1)) if words_match else 0
     
+    update_time_pattern = r'更新时间：(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'
+    update_time_match = re.search(update_time_pattern, input_str)
+    update_time = update_time_match.group(1) if update_time_match else datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+    return words, update_time
+
 # 处理 [都市] 的方括号
 def remove_brackets_regex(input_str):
    pattern = r'\[(.*?)\]'