This commit is contained in:
oscarz
2025-03-20 09:53:00 +08:00
parent d7afa70e57
commit 57d140eb51
5 changed files with 204 additions and 75 deletions

40
aabook/src/alter_table.py Normal file
View File

@ -0,0 +1,40 @@
import sqlite3
import json
import config
import utils
import logging
import sys
from datetime import datetime
# Connect to the SQLite database.
# NOTE: the connection and cursor are opened at import time and shared by
# every function in this script.
DB_PATH = config.global_sqlite_path # replace with the path to your database file
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# Table names. Chapter tables are sharded; the __main__ loop below builds
# their names as f'{tbl_name_chapters_prefix}_{i}'.
tbl_name_books = 'books'
tbl_name_chapters_prefix = 'chapters'
tbl_name_section = 'books_sections'
def add_columns_to_table(table_name):
    """Add the ``words`` and ``update_time`` columns to ``table_name``.

    The migration is idempotent: the table's current columns are read first
    and only the missing ones are added, so re-running the script (or running
    it against a table that already has one of the columns) no longer aborts
    half-way with a "duplicate column name" error.

    Args:
        table_name: Name of the table to alter. It is interpolated into the
            SQL text, so it must come from trusted code, never user input.

    Returns:
        None. The outcome is reported on stdout; SQLite errors (for example a
        missing table) are caught and printed rather than raised.
    """
    # Column name -> column definition, identical to the original migration.
    wanted_columns = {
        'words': "INTEGER DEFAULT 0",
        'update_time': "TEXT DEFAULT ('2000-01-01 00:00:00')",
    }
    try:
        # Row layout of PRAGMA table_info: (cid, name, type, notnull, dflt, pk).
        cursor.execute(f"PRAGMA table_info({table_name})")
        existing_columns = {row[1] for row in cursor.fetchall()}

        added = []
        for column, definition in wanted_columns.items():
            if column in existing_columns:
                continue
            cursor.execute(
                f"ALTER TABLE {table_name} ADD COLUMN {column} {definition}"
            )
            added.append(column)

        # Commit the schema change.
        conn.commit()
        if added:
            print(f"成功向表 {table_name} 中添加字段 {' 和 '.join(added)}")
        else:
            print(f"表 {table_name} 已存在字段 words 和 update_time,跳过")
    except sqlite3.Error as e:
        print(f"添加字段时出现错误: {e}")
# 使用示例
if __name__ == "__main__":
    # Chapter tables are sharded as chapters_0 .. chapters_99 (100 tables);
    # apply the migration to each of them.
    try:
        for i in range(100):
            add_columns_to_table(f'{tbl_name_chapters_prefix}_{i}')
    finally:
        # Release the module-level cursor and connection once the migration
        # has finished, even if one of the tables raised unexpectedly.
        cursor.close()
        conn.close()

View File

@ -102,10 +102,11 @@ def fetch_book_toc(url):
# 获取小说的目录页,并插入到数据库 # 获取小说的目录页,并插入到数据库
def fetch_table_of_contents(): def fetch_table_of_contents():
total_updated_rows = 0
while True: while True:
update_list = db_tools.query_books(is_latest=0, limit = 2 if debug else 100) update_list = db_tools.query_books(is_latest=0, limit = 2 if debug else 100)
if update_list is None or len(update_list) <1 : if update_list is None or len(update_list) <1 :
logging.info(f'no more data need fecth.') logging.info(f'no more data need fecth. updated chapters(table of contents): {total_updated_rows}')
return return
for row in update_list: for row in update_list:
@ -152,15 +153,17 @@ def fetch_table_of_contents():
# 插入目录数据 # 插入目录数据
for chap in chapters: for chap in chapters:
chap_row_id = db_tools.insert_chapter_data({ chap_row_id, affected_rows = db_tools.insert_chapter_data({
'book_id': bookid, 'book_id': bookid,
'chapter_id': chap['chapter_id'], 'chapter_id': chap['chapter_id'],
'section_id': section_id, 'section_id': section_id,
'title': chap['title'], 'title': chap['title'],
'href': chap['href'], 'href': chap['href'],
'content': '', 'words': chap['words'],
'has_content' : 0 'update_time': chap['update_time'],
'content': ''
}) })
total_updated_rows = total_updated_rows + (affected_rows if affected_rows else 0)
if chap_row_id is None: if chap_row_id is None:
logging.warning(f'insert_chapter_data error. url: {toc_url}') logging.warning(f'insert_chapter_data error. url: {toc_url}')
succ = 0 succ = 0
@ -195,81 +198,57 @@ def fetch_contents():
content, next_url = fetch_chapter_content(url) content, next_url = fetch_chapter_content(url)
if content and content['title'] and content['contents']: if content and content['title'] and content['contents']:
# 写入到数据表里 # 写入到数据表里
db_tools.insert_chapter_data({ row_id = db_tools.update_chapter_data({
'book_id': row['book_id'], 'book_id': row['book_id'],
'chapter_id': row['chapter_id'], 'chapter_id': row['chapter_id'],
'section_id': row['section_id'], 'section_id': row['section_id'],
'title': row['title'], 'title': row['title'],
'href': url, 'href': url,
'content': '\n\n'.join(content['contents']), 'content': '\n\n'.join(content['contents'])
'has_content': 1
}) })
if row_id is None:
logging.warning(f"update chapter data error at {url} ")
else: else:
logging.warning(f'fetch content error. url: {url}') logging.warning(f'fetch content error. url: {url}')
if debug: if debug:
return return
# Refresh per-chapter metadata (word count, update time) from each book's
# table-of-contents page. Temporary maintenance task.
def update_chapter_meta():
    """Re-fetch every book's TOC page and store each chapter's words/update_time.

    For every book returned by ``db_tools.query_toc_href()`` the TOC page is
    fetched with ``fetch_book_toc`` and each listed chapter is written back
    through ``db_tools.update_toc_words_uptime``. Failures are logged and the
    loop moves on to the next book/chapter.

    Returns:
        None.
    """
    toc_links = db_tools.query_toc_href()
    # query_toc_href() returns None when the query fails; iterating None
    # would raise TypeError, so bail out explicitly.
    if not toc_links:
        logging.warning('no table-of-contents links to update.')
        return

    for item in toc_links:
        toc_url = item['table_of_contents_href']
        bookid = item['id']
        # The query has no WHERE clause, so a book without a TOC URL can
        # appear here; skip it instead of fetching an empty URL.
        if not toc_url:
            logging.warning(f'book {bookid} has no table_of_contents_href, skipped.')
            continue

        logging.info(f'fetching page: {toc_url}')
        toc_data = fetch_book_toc(toc_url)

        # Fetching/parsing the TOC failed.
        if toc_data is None:
            logging.warning(f'fetch_book_toc error. url: {toc_url}')
            continue

        # Walk every section, then every chapter inside it.
        for row in toc_data:
            for chap in row['chapters']:
                chap_row_id = db_tools.update_toc_words_uptime({
                    'book_id': bookid,
                    'href': chap['href'],
                    'words': chap['words'],
                    'update_time': chap['update_time']
                })
                if chap_row_id is None:
                    logging.warning(f'insert toc error. url: {toc_url}')
# 建立缩写到函数的映射 # 建立缩写到函数的映射
function_map = { function_map = {
"list": fetch_book_list, "list": fetch_book_list,
"toc" : fetch_table_of_contents, "toc" : fetch_table_of_contents,
"content": fetch_contents, "content": fetch_contents,
"update": update_chapter_meta,
} }
# 主函数 # 主函数

View File

@ -188,10 +188,17 @@ def pase_chapter_list(soup, url):
chap_list = sections[i].find_all("a") chap_list = sections[i].find_all("a")
chap_data = [] chap_data = []
for chap in chap_list: for chap in chap_list:
chap_title = chap.get_text().strip() chap_title = chap.get_text().strip() # 获取章节标题
chap_link = f"{host_url}/{chap['href']}" chap_link = f"{host_url}/{chap['href']}" # 获取章节链接
chap_id = utils.extract_page_num(chap_link) chap_id = utils.extract_page_num(chap_link)
chap_data.append({'href': chap_link, 'title': chap_title, 'chapter_id': chap_id}) chap_words, chap_uptime = utils.extract_chapter_uptime_words(chap['title']) # 获取更新时间和字数
chap_data.append({
'href': chap_link,
'title': chap_title,
'chapter_id': chap_id,
'words': chap_words,
'update_time' : chap_uptime,
})
table_of_contents.append({'title': section_title, 'chapters': chap_data}) table_of_contents.append({'title': section_title, 'chapters': chap_data})
return table_of_contents return table_of_contents
@ -277,7 +284,14 @@ def parse_content_page(soup, url):
for paragraph in paragraphs: for paragraph in paragraphs:
cleaned_text = process_paragraph(paragraph) cleaned_text = process_paragraph(paragraph)
content.append(cleaned_text) content.append(cleaned_text)
else:
# 某些页面没有p标签只有一个h1要兼容此问题
paragraphs = soup.find_all('h1')
if paragraphs:
for paragraph in paragraphs:
cleaned_text = process_paragraph(paragraph)
content.append(cleaned_text)
return content return content
# 通用的 HTML 结构验证器 # 通用的 HTML 结构验证器

View File

@ -231,13 +231,56 @@ def check_and_create_chapters_table(book_number):
# 插入到数据表中 # 插入到数据表中
def insert_chapter_data(data):
    """Insert a chapter's TOC entry unless an up-to-date row already exists.

    The chapter table is selected by ``book_id % 100``. A row whose ``href``
    matches and whose ``update_time`` is not older than ``data['update_time']``
    is considered current and left untouched; otherwise the entry is written
    through ``insert_or_update_common`` with ``has_content`` reset to 0.

    Args:
        data: Chapter fields; must contain ``book_id``, ``href`` and
            ``update_time``. The dict is not modified.

    Returns:
        A ``(row_id, affected_rows)`` tuple. ``row_id`` is ``None`` on
        failure; ``affected_rows`` is 1 only when a row was actually written.
    """
    try:
        tbl_num = int(data['book_id']) % 100
        tbl_name = check_and_create_chapters_table(tbl_num)
        # The previous implementation guarded against a falsy table name,
        # so the helper can presumably fail; keep that guard.
        if not tbl_name:
            logging.error(f"Error inserting or updating data: no chapters table for book_id {data['book_id']}")
            return None, 0

        # Is there already a row that is at least as new as the incoming one?
        cursor.execute(
            f"SELECT id FROM {tbl_name} WHERE href = ? and update_time >= ?",
            (data['href'], data['update_time']),
        )
        existing_record = cursor.fetchone()
        if existing_record:  # chapter already present and up to date
            logging.debug(f"chapter {data['href']} already exist. id: {existing_record[0]}")
            return existing_record[0], 0

        # Missing or stale: write it, working on a copy so the caller's dict
        # is not changed behind its back.
        row = {**data, 'has_content': 0}
        row_id = insert_or_update_common(row, tbl_name)
        # Callers treat a None id as an error, so do not count it as written.
        return row_id, (1 if row_id is not None else 0)

    except sqlite3.Error as e:
        logging.error(f"Error inserting or updating data: {e}")
        return None, 0
# 更新章节内容到数据表中
def update_chapter_data(data):
    """Store fetched chapter content on the existing row identified by ``href``.

    Every key in ``data`` except ``id``, ``href`` and ``created_at`` is written
    to the chapter table chosen by ``book_id % 100``; ``has_content`` is set
    to 1 and ``updated_at`` to the current local time.

    Args:
        data: Chapter fields; must contain ``book_id`` and ``href``. The dict
            is not modified.

    Returns:
        The row id of the chapter, or ``None`` if the table is unavailable,
        no row matches ``href``, or SQLite reports an error.
    """
    try:
        tbl_num = int(data['book_id']) % 100
        tbl_name = check_and_create_chapters_table(tbl_num)
        if not tbl_name:
            logging.error(f"Error inserting or updating data: no chapters table for book_id {data['book_id']}")
            return None

        # Work on a copy so the caller's dict is left untouched.
        payload = {**data, 'has_content': 1}

        # Only update fields present in the payload, minus the immutable ones.
        fields_to_update = [field for field in payload if field not in ('id', 'href', 'created_at')]

        # Build the UPDATE statement.
        set_clause = ', '.join([f"{field} = ?" for field in fields_to_update])
        sql = f"UPDATE {tbl_name} SET {set_clause}, updated_at = datetime('now', 'localtime') WHERE href = ?"

        # Bind values in the same order as the SET clause, then the href.
        values = [payload[field] for field in fields_to_update]
        values.append(payload['href'])

        cursor.execute(sql, values)
        conn.commit()

        # Look up the id of the row that was updated.
        cursor.execute(f"SELECT id FROM {tbl_name} WHERE href = ?", (payload['href'],))
        found = cursor.fetchone()
        # fetchone() returns None when nothing matched; indexing it would
        # raise TypeError, which the sqlite3.Error handler does not catch.
        if found is None:
            logging.warning(f"no chapter row matches href {payload['href']} in {tbl_name}")
            return None
        return found[0]

    except sqlite3.Error as e:
        logging.error(f"Error inserting or updating data: {e}")
        return None
# 查询某本书最后的获取页码 # 查询某本书最后的获取页码
def query_last_chapter_by_book(bookid): def query_last_chapter_by_book(bookid):
tbl_num = int(bookid) % 100 tbl_num = int(bookid) % 100
@ -281,6 +324,46 @@ def query_no_content_chapters(limit = 100):
return all_results return all_results
# 更新目录页的特定字段(临时)
def update_toc_words_uptime(data):
    """Update selected TOC fields of the chapter row identified by ``href``.

    Every key in ``data`` except ``id``, ``href`` and ``created_at`` is written
    to the chapter table chosen by ``book_id % 100``; ``updated_at`` is set to
    the current local time. Temporary maintenance helper.

    Args:
        data: Fields to write; must contain ``book_id`` and ``href``.

    Returns:
        The row id of the chapter, or ``None`` if the table is unavailable,
        no row matches ``href``, or SQLite reports an error.
    """
    try:
        tbl_num = int(data['book_id']) % 100
        tbl_name = check_and_create_chapters_table(tbl_num)
        if not tbl_name:
            logging.error(f"Error inserting or updating data: no chapters table for book_id {data['book_id']}")
            return None

        # Only update fields present in data, minus the immutable ones.
        fields_to_update = [field for field in data if field not in ('id', 'href', 'created_at')]

        # Build the UPDATE statement.
        set_clause = ', '.join([f"{field} = ?" for field in fields_to_update])
        sql = f"UPDATE {tbl_name} SET {set_clause}, updated_at = datetime('now', 'localtime') WHERE href = ?"

        # Bind values in the same order as the SET clause, then the href.
        values = [data[field] for field in fields_to_update]
        values.append(data['href'])

        cursor.execute(sql, values)
        conn.commit()

        # Look up the id of the row that was updated.
        cursor.execute(f"SELECT id FROM {tbl_name} WHERE href = ?", (data['href'],))
        found = cursor.fetchone()
        # fetchone() returns None when the chapter is not in the table yet;
        # indexing it would raise TypeError, which is not a sqlite3.Error.
        if found is None:
            logging.warning(f"no chapter row matches href {data['href']} in {tbl_name}")
            return None
        return found[0]

    except sqlite3.Error as e:
        logging.error(f"Error inserting or updating data: {e}")
        return None
# 获取所有的目录页
def query_toc_href():
    """Return the id and table-of-contents URL of every row in the books table.

    Returns:
        A list of ``{'id': ..., 'table_of_contents_href': ...}`` dicts, one
        per book, or ``None`` if the query fails (the error is logged).
    """
    sql = f"SELECT id, table_of_contents_href FROM {tbl_name_books} "
    try:
        cursor.execute(sql)
        records = cursor.fetchall()
    except sqlite3.Error as e:
        logging.error(f"查询 href 失败: {e}")
        return None

    books = []
    for book_id, toc_href in records:
        books.append({'id': book_id, 'table_of_contents_href': toc_href})
    return books
# 插入书本的卷信息 # 插入书本的卷信息
def insert_or_update_book_sections(data): def insert_or_update_book_sections(data):
return insert_or_update_common(data, tbl_name_section, uniq_key='bookid_section') return insert_or_update_common(data, tbl_name_section, uniq_key='bookid_section')

View File

@ -41,7 +41,20 @@ def extract_book_num(page_str, default_num = 0):
return number return number
else: else:
return default_num return default_num
# 目录页,获取更新时间和字数
def extract_chapter_uptime_words(input_str):
    """Extract the word count and update time from a TOC link's title text.

    Args:
        input_str: Text expected to contain ``字数:<digits>`` and
            ``更新时间:YYYY-MM-DD HH:MM:SS``. ``None`` is treated as empty.

    Returns:
        A ``(words, update_time)`` tuple. ``words`` is always an ``int``
        (0 when absent). ``update_time`` is the matched timestamp string, or
        the current local time in the same format when absent.
    """
    text = input_str or ''

    words_match = re.search(r'字数:(\d+)', text)
    # Convert to int so the value has the same type whether or not it matched
    # (previously a str on match but the int 0 otherwise).
    words = int(words_match.group(1)) if words_match else 0

    update_time_match = re.search(
        r'更新时间:(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', text
    )
    if update_time_match:
        update_time = update_time_match.group(1)
    else:
        # No timestamp on the page: fall back to "now".
        update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    return words, update_time
# 处理 [都市] 的方括号 # 处理 [都市] 的方括号
def remove_brackets_regex(input_str): def remove_brackets_regex(input_str):
pattern = r'\[(.*?)\]' pattern = r'\[(.*?)\]'