diff --git a/aabook/src/alter_table.py b/aabook/src/alter_table.py
new file mode 100644
index 0000000..72e37fb
--- /dev/null
+++ b/aabook/src/alter_table.py
@@ -0,0 +1,40 @@
+import sqlite3
+import json
+import config
+import utils
+import logging
+import sys
+from datetime import datetime
+
+
+# One-off migration script: open the SQLite database that holds the sharded chapter tables
+DB_PATH = config.global_sqlite_path  # path of the database file to migrate
+conn = sqlite3.connect(DB_PATH)
+cursor = conn.cursor()
+
+tbl_name_books = 'books'
+tbl_name_chapters_prefix = 'chapters'
+tbl_name_section = 'books_sections'
+
+def add_columns_to_table(table_name):
+    try:
+        # Add the words column (INTEGER, default 0); table_name is interpolated, so pass trusted names only
+        add_words_column_query = f"ALTER TABLE {table_name} ADD COLUMN words INTEGER DEFAULT 0"
+        cursor.execute(add_words_column_query)
+
+        # Add the update_time column (TEXT, default '2000-01-01 00:00:00')
+        add_update_time_column_query = f"ALTER TABLE {table_name} ADD COLUMN update_time TEXT DEFAULT ('2000-01-01 00:00:00')"
+        cursor.execute(add_update_time_column_query)
+
+        # Commit; NOTE(review): any sqlite3.Error on the first ALTER skips the second one for this table
+        conn.commit()
+        print(f"成功向表 {table_name} 中添加字段 words 和 update_time")
+    except sqlite3.Error as e:
+        print(f"添加字段时出现错误: {e}")
+
+# Script entry point
+if __name__ == "__main__":
+    # Migrate every chapter shard: chapters_0 through chapters_99 (range(100) stops at 99)
+    for i in range(100):
+        table_name = f'{tbl_name_chapters_prefix}_{i}'
+        add_columns_to_table(table_name)
diff --git a/aabook/src/fetch.py b/aabook/src/fetch.py
index 668560a..9838c4b 100644
--- a/aabook/src/fetch.py
+++ b/aabook/src/fetch.py
@@ -102,10 +102,11 @@ def fetch_book_toc(url):
 
 # 获取小说的目录页,并插入到数据库
 def fetch_table_of_contents():
+    total_updated_rows = 0
     while True:
         update_list = db_tools.query_books(is_latest=0, limit = 2 if debug else 100)
         if update_list is None or len(update_list) <1 :
-            logging.info(f'no more data need fecth.')
+            logging.info(f'no more data need fecth. updated chapters(table of contents): {total_updated_rows}')
             return
 
         for row in update_list:
@@ -152,15 +153,17 @@ def fetch_table_of_contents():
 
                 # 插入目录数据
                 for chap in chapters:
-                    chap_row_id = db_tools.insert_chapter_data({
-                        'book_id': bookid,
-                        'chapter_id': chap['chapter_id'],
-                        'section_id': section_id,
-                        'title': chap['title'],
-                        'href': chap['href'],
-                        'content': '',
-                        'has_content' : 0
+                    chap_row_id, affected_rows = db_tools.insert_chapter_data({
+                            'book_id': bookid,
+                            'chapter_id': chap['chapter_id'],
+                            'section_id': section_id,
+                            'title': chap['title'],
+                            'href': chap['href'],
+                            'words': chap['words'],
+                            'update_time': chap['update_time'],
+                            'content': ''
                     })
+                    total_updated_rows = total_updated_rows + (affected_rows if affected_rows else 0)
                     if chap_row_id is None:
                         logging.warning(f'insert_chapter_data error. url: {toc_url}')
                         succ = 0
@@ -195,81 +198,57 @@ def fetch_contents():
             content, next_url = fetch_chapter_content(url)
             if content and content['title'] and content['contents']:
                 # 写入到数据表里
-                db_tools.insert_chapter_data({
+                row_id = db_tools.update_chapter_data({
                     'book_id': row['book_id'],
                     'chapter_id': row['chapter_id'],
                     'section_id': row['section_id'],
                     'title': row['title'],
                     'href': url,
-                    'content': '\n\n'.join(content['contents']),
-                    'has_content': 1
+                    'content': '\n\n'.join(content['contents'])
                 })
+                if row_id is None:
+                    logging.warning(f"update chapter data error at {url} ")
             else:
                 logging.warning(f'fetch content error. url: {url}')
             if debug:
                 return
 
 
-'''
-# 下载完整的小说
-def fetch_book_data():
-    update_list = db_tools.query_books(need_update=1, limit = 1)
-    if update_list:
-        for row in update_list:
-            name = row['name']
-            href = row['href']
-            bookid = row['id']
-            # 先打开详情页
-            logging.info(f'----------fetching book {name}: {href}-------------')
-            book_detail = fetch_book_detail(href)
-            if book_detail:
-                # 获取内容页,然后循环读取内容
-                chapter_url = book_detail['start_page_href']
-                chapter_id = utils.extract_page_num(chapter_url)
-                # 断点续传,从上次拉取的最后一页开始
-                if not force:
-                    last_chapter_url = db_tools.query_last_chapter_by_book(bookid)
-                    if last_chapter_url:
-                        chapter_url = last_chapter_url
-                while chapter_url:
-                    logging.info(f'fetching page: {chapter_url}')
-                    content, next_url = fetch_chapter_content(chapter_url)
-                    if content and content['title'] and content['contents']:
-                        # 写入到数据表里
-                        db_tools.insert_chapter_data({
-                            'book_id': bookid,
-                            'chapter_id': chapter_id,
-                            'title': content['title'],
-                            'href': chapter_url,
-                            'content': '\n\n'.join(content['contents']),
-                            'has_content': 1
-                        })
+# Temporary backfill: re-fetch every book's table of contents and store each chapter's word count and update time
+def update_chapter_meta():
+    toc_links = db_tools.query_toc_href()
+    for item in toc_links or []:  # query_toc_href() returns None on a database error
+        toc_url = item['table_of_contents_href']
+        bookid = item['id']
+        logging.info(f'fetching page: {toc_url}')
+        toc_data = fetch_book_toc(toc_url)
+
+        # Skip this book when its table-of-contents page yielded no data
+        if toc_data is None:
+            logging.warning(f'fetch_book_toc error. url: {toc_url}')
+            continue
+
+        # Walk every section of the table of contents
+        for row in toc_data:
+            chapters = row['chapters']
+            # Update words / update_time for each chapter in this section (rows are matched by href)
+            for chap in chapters:
+                chap_row_id = db_tools.update_toc_words_uptime({
+                    'book_id': bookid,
+                    'href': chap['href'],
+                    'words': chap['words'],
+                    'update_time': chap['update_time']
+                })
+                if chap_row_id is None:
+                    logging.warning(f'insert toc error. url: {toc_url}')
 
-                        if debug:
-                            return
-                    else:
-                        logging.warning(f'fetch content error. url: {chapter_url}')
-                    chapter_url = next_url
-                # 读取完毕,更新列表
-                row_id = db_tools.update_book_detail({
-                    'href' : href,
-                    **book_detail
-                })
-                if row_id:
-                    logging.debug(f'update book succ. id: {row_id}, url: {href}')
-                else:
-                    logging.warning(f'update book failed. url: {href}')
-            else:
-                logging.warning(f'get book detail failed. url: {href}')
-    else:
-        logging.warning(f'get no data needed update.')
-'''
 
 # 建立缩写到函数的映射
 function_map = {
     "list": fetch_book_list,
     "toc" : fetch_table_of_contents,
     "content": fetch_contents,
+    "update": update_chapter_meta,
 }
 
 # 主函数
diff --git a/aabook/src/scraper.py b/aabook/src/scraper.py
index b0928f5..302690a 100644
--- a/aabook/src/scraper.py
+++ b/aabook/src/scraper.py
@@ -188,10 +188,17 @@ def pase_chapter_list(soup, url):
         chap_list = sections[i].find_all("a")
         chap_data = []
         for chap in chap_list:
-            chap_title = chap.get_text().strip()
-            chap_link = f"{host_url}/{chap['href']}"
+            chap_title = chap.get_text().strip()  # chapter title (anchor text)
+            chap_link = f"{host_url}/{chap['href']}"  # absolute chapter link
             chap_id = utils.extract_page_num(chap_link)
-            chap_data.append({'href': chap_link, 'title': chap_title, 'chapter_id': chap_id})
+            chap_words, chap_uptime = utils.extract_chapter_uptime_words(chap.get('title', ''))  # word count / update time from the title attribute; '' when the anchor has none
+            chap_data.append({
+                'href': chap_link,
+                'title': chap_title,
+                'chapter_id': chap_id,
+                'words': chap_words,
+                'update_time' : chap_uptime,
+            })
         table_of_contents.append({'title': section_title, 'chapters': chap_data})
 
     return table_of_contents
@@ -277,7 +284,14 @@ def parse_content_page(soup, url):
         for paragraph in paragraphs:
             cleaned_text = process_paragraph(paragraph)
             content.append(cleaned_text)
-    
+    else:
+        # Some pages have no <p> tags and carry their text in an <h1> instead; fall back to that
+        paragraphs = soup.find_all('h1')
+        if paragraphs:
+            for paragraph in paragraphs:
+                cleaned_text = process_paragraph(paragraph)
+                content.append(cleaned_text)
+
     return content
 
 # 通用的 HTML 结构验证器
diff --git a/aabook/src/sqlite_utils.py b/aabook/src/sqlite_utils.py
index deb4e16..8834ce0 100644
--- a/aabook/src/sqlite_utils.py
+++ b/aabook/src/sqlite_utils.py
@@ -231,13 +231,56 @@ def check_and_create_chapters_table(book_number):
 
 # 插入到数据表中
 def insert_chapter_data(data):
-    tbl_num = int(data['book_id']) % 100
-    tbl_name = check_and_create_chapters_table(tbl_num)
-    if tbl_name :
-        return insert_or_update_common(data, tbl_name)
-    else:
+    try:
+        # Returns (row_id, affected_rows). Skip the write when this href already has an update_time >= the incoming one
+        tbl_num = int(data['book_id']) % 100
+        tbl_name = check_and_create_chapters_table(tbl_num)
+
+        cursor.execute(f"SELECT id FROM {tbl_name} WHERE href = ? and update_time >= ?", (data['href'], data['update_time'], ))
+        existing_record = cursor.fetchone()
+
+        if existing_record:  # chapter is already stored and up to date
+            logging.debug(f"chapter {data['href']} already exist. id: {existing_record[0]}")
+            return existing_record[0], 0
+
+        # Missing or stale: (re)write the row and flag its content as not fetched yet
+        data['has_content'] = 0
+        return insert_or_update_common(data, tbl_name), 1
+    except sqlite3.Error as e:
+        logging.error(f"Error inserting or updating data: {e}")
+        return None, 0
+
+
+# Store fetched chapter content: update the row matching data['href'], set has_content = 1, return its id or None
+def update_chapter_data(data):
+    try:
+        data['has_content'] = 1
+
+        tbl_num = int(data['book_id']) % 100
+        tbl_name = check_and_create_chapters_table(tbl_num)
+
+        # Update only the keys present in data, never id / href / created_at
+        fields_to_update = [field for field in data if field not in ['id', 'href', 'created_at']]
+
+        # Build the UPDATE statement; values are bound as parameters, only table/column names are interpolated
+        set_clause = ', '.join([f"{field} = ?" for field in fields_to_update])
+        sql = f"UPDATE {tbl_name} SET {set_clause}, updated_at = datetime('now', 'localtime') WHERE href = ?"
+
+        # Bind the values in SET order, then the href for the WHERE clause
+        values = [data[field] for field in fields_to_update]
+        values.append(data['href'])
+
+        cursor.execute(sql, values)
+        conn.commit()
+
+        # Look up the id of the updated row; fetchone() is None when no row has this href, so report None instead of raising
+        cursor.execute(f"SELECT id FROM {tbl_name} WHERE href = ?", (data['href'],))
+        matched_row = cursor.fetchone()
+        return matched_row[0] if matched_row else None
+    except sqlite3.Error as e:
+        logging.error(f"Error inserting or updating data: {e}")
         return None
-    
+
 # 查询某本书最后的获取页码
 def query_last_chapter_by_book(bookid):
     tbl_num = int(bookid) % 100
@@ -281,6 +324,46 @@ def query_no_content_chapters(limit = 100):
 
     return all_results
 
+# Temporary backfill helper: update the given fields (words, update_time, ...) on the row matching data['href']; returns its id or None
+def update_toc_words_uptime(data):
+    try:
+        tbl_num = int(data['book_id']) % 100
+        tbl_name = check_and_create_chapters_table(tbl_num)
+
+        # Update only the keys present in data, never id / href / created_at
+        fields_to_update = [field for field in data if field not in ['id', 'href', 'created_at']]
+
+        # Build the UPDATE statement; values are bound as parameters, only table/column names are interpolated
+        set_clause = ', '.join([f"{field} = ?" for field in fields_to_update])
+        sql = f"UPDATE {tbl_name} SET {set_clause}, updated_at = datetime('now', 'localtime') WHERE href = ?"
+
+        # Bind the values in SET order, then the href for the WHERE clause
+        values = [data[field] for field in fields_to_update]
+        values.append(data['href'])
+
+        cursor.execute(sql, values)
+        conn.commit()
+
+        # Look up the id of the updated row; fetchone() is None when no row has this href, so report None instead of raising
+        cursor.execute(f"SELECT id FROM {tbl_name} WHERE href = ?", (data['href'],))
+        matched_row = cursor.fetchone()
+        return matched_row[0] if matched_row else None
+    except sqlite3.Error as e:
+        logging.error(f"Error inserting or updating data: {e}")
+        return None
+
+# Return [{'id': ..., 'table_of_contents_href': ...}, ...] for every row of the books table, or None on a database error
+def query_toc_href():
+    try:
+        sql = f"SELECT id, table_of_contents_href FROM {tbl_name_books} "
+        cursor.execute(sql)
+
+        return [{'id': row[0], 'table_of_contents_href': row[1]} for row in cursor.fetchall()]
+
+    except sqlite3.Error as e:
+        logging.error(f"查询 href 失败: {e}")
+        return None
+
 # 插入书本的卷信息
 def insert_or_update_book_sections(data):
     return insert_or_update_common(data, tbl_name_section, uniq_key='bookid_section')
diff --git a/aabook/src/utils.py b/aabook/src/utils.py
index 7c5f153..6db2052 100644
--- a/aabook/src/utils.py
+++ b/aabook/src/utils.py
@@ -41,7 +41,20 @@ def extract_book_num(page_str, default_num = 0):
         return number
     else:
         return default_num
+
+# Table-of-contents page: parse (words, update_time) out of a chapter link's title text
+def extract_chapter_uptime_words(input_str):
+    # words is always an int (0 when absent); NOTE(review): the update_time fallback needs `datetime` imported in this module - confirm
+    words_pattern = r'字数:(\d+)'
+    words_match = re.search(words_pattern, input_str)
+    words = int(words_match.group(1)) if words_match else 0
 
+    update_time_pattern = r'更新时间:(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'
+    update_time_match = re.search(update_time_pattern, input_str)
+    update_time = update_time_match.group(1) if update_time_match else datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+    return words, update_time
+
 # 处理 [都市] 的方括号
 def remove_brackets_regex(input_str):
     pattern = r'\[(.*?)\]'