This commit is contained in:
oscarz
2025-03-20 09:53:00 +08:00
parent d7afa70e57
commit 57d140eb51
5 changed files with 204 additions and 75 deletions

40
aabook/src/alter_table.py Normal file
View File

@ -0,0 +1,40 @@
import sqlite3
import json
import config
import utils
import logging
import sys
from datetime import datetime
# Open the SQLite database that this migration script alters.
DB_PATH = config.global_sqlite_path # taken from the project config; point it at your database file
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# Table names. Chapter tables are addressed as '<prefix>_<n>' by the __main__ loop below.
tbl_name_books = 'books'
tbl_name_chapters_prefix = 'chapters'
tbl_name_section = 'books_sections'
def add_columns_to_table(table_name):
    """Add the ``words`` and ``update_time`` columns to ``table_name``.

    Each column is added only when the table does not already have it, so the
    migration can be re-run safely: a table that already got ``words`` on a
    previous (partially failed) run still receives ``update_time``.

    Args:
        table_name: Name of the table to alter. It is interpolated into the
            SQL text (identifiers cannot be bound as parameters), so it must
            come from trusted code, never from user input.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    new_columns = (
        ("words", "INTEGER DEFAULT 0"),
        ("update_time", "TEXT DEFAULT ('2000-01-01 00:00:00')"),
    )
    try:
        # PRAGMA table_info yields one row per column (name at index 1) and
        # no rows at all when the table does not exist.
        cursor.execute(f"PRAGMA table_info({table_name})")
        existing_columns = {row[1] for row in cursor.fetchall()}
        if not existing_columns:
            print(f"添加字段时出现错误: no such table: {table_name}")
            return

        added = []
        for column, definition in new_columns:
            if column in existing_columns:
                continue
            cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN {column} {definition}")
            added.append(column)

        # Commit whatever was added.
        conn.commit()
        if added:
            print(f"成功向表 {table_name} 中添加字段 {' 和 '.join(added)}")
        else:
            print(f"表 {table_name} 已包含字段 words 和 update_time,无需修改")
    except sqlite3.Error as e:
        print(f"添加字段时出现错误: {e}")
# Example usage: run the migration over every sharded chapter table.
if __name__ == "__main__":
    # range(100) produces the suffixes 0 through 99, i.e. chapters_0 .. chapters_99.
    shard_tables = (f'{tbl_name_chapters_prefix}_{i}' for i in range(100))
    for table_name in shard_tables:
        add_columns_to_table(table_name)

View File

@ -102,10 +102,11 @@ def fetch_book_toc(url):
# 获取小说的目录页,并插入到数据库
def fetch_table_of_contents():
total_updated_rows = 0
while True:
update_list = db_tools.query_books(is_latest=0, limit = 2 if debug else 100)
if update_list is None or len(update_list) <1 :
logging.info(f'no more data need fecth.')
logging.info(f'no more data need fecth. updated chapters(table of contents): {total_updated_rows}')
return
for row in update_list:
@ -152,15 +153,17 @@ def fetch_table_of_contents():
# 插入目录数据
for chap in chapters:
chap_row_id = db_tools.insert_chapter_data({
'book_id': bookid,
'chapter_id': chap['chapter_id'],
'section_id': section_id,
'title': chap['title'],
'href': chap['href'],
'content': '',
'has_content' : 0
chap_row_id, affected_rows = db_tools.insert_chapter_data({
'book_id': bookid,
'chapter_id': chap['chapter_id'],
'section_id': section_id,
'title': chap['title'],
'href': chap['href'],
'words': chap['words'],
'update_time': chap['update_time'],
'content': ''
})
total_updated_rows = total_updated_rows + (affected_rows if affected_rows else 0)
if chap_row_id is None:
logging.warning(f'insert_chapter_data error. url: {toc_url}')
succ = 0
@ -195,81 +198,57 @@ def fetch_contents():
content, next_url = fetch_chapter_content(url)
if content and content['title'] and content['contents']:
# 写入到数据表里
db_tools.insert_chapter_data({
row_id = db_tools.update_chapter_data({
'book_id': row['book_id'],
'chapter_id': row['chapter_id'],
'section_id': row['section_id'],
'title': row['title'],
'href': url,
'content': '\n\n'.join(content['contents']),
'has_content': 1
'content': '\n\n'.join(content['contents'])
})
if row_id is None:
logging.warning(f"update chapter data error at {url} ")
else:
logging.warning(f'fetch content error. url: {url}')
if debug:
return
'''
# 下载完整的小说
def fetch_book_data():
update_list = db_tools.query_books(need_update=1, limit = 1)
if update_list:
for row in update_list:
name = row['name']
href = row['href']
bookid = row['id']
# 先打开详情
logging.info(f'----------fetching book {name}: {href}-------------')
book_detail = fetch_book_detail(href)
if book_detail:
# 获取内容页,然后循环读取内容
chapter_url = book_detail['start_page_href']
chapter_id = utils.extract_page_num(chapter_url)
# 断点续传,从上次拉取的最后一页开始
if not force:
last_chapter_url = db_tools.query_last_chapter_by_book(bookid)
if last_chapter_url:
chapter_url = last_chapter_url
while chapter_url:
logging.info(f'fetching page: {chapter_url}')
content, next_url = fetch_chapter_content(chapter_url)
if content and content['title'] and content['contents']:
# 写入到数据表里
db_tools.insert_chapter_data({
'book_id': bookid,
'chapter_id': chapter_id,
'title': content['title'],
'href': chapter_url,
'content': '\n\n'.join(content['contents']),
'has_content': 1
})
# Temporary task: refresh chapter metadata (word count, update time) from each book's table-of-contents page.
def update_chapter_meta():
    """Re-fetch every book's table of contents and store each chapter's words/update_time.

    Books come from ``db_tools.query_toc_href()``. For each one the TOC page is
    fetched with ``fetch_book_toc`` and every chapter found is passed to
    ``db_tools.update_toc_words_uptime``. Failures are logged and skipped.

    Returns:
        None.
    """
    toc_links = db_tools.query_toc_href()
    # query_toc_href returns None when the query fails; iterating None would raise TypeError.
    if not toc_links:
        logging.warning('no table of contents links found, nothing to update.')
        return

    for item in toc_links:
        toc_url = item['table_of_contents_href']
        bookid = item['id']

        # The query selects every book without filtering, so the link may be empty.
        if not toc_url:
            logging.warning(f'empty table of contents href. book id: {bookid}')
            continue

        logging.info(f'fetching page: {toc_url}')
        toc_data = fetch_book_toc(toc_url)

        # Parsing/fetching the table of contents failed.
        if toc_data is None:
            logging.warning(f'fetch_book_toc error. url: {toc_url}')
            continue

        # Walk every section, then every chapter inside it.
        for row in toc_data:
            for chap in row['chapters']:
                chap_row_id = db_tools.update_toc_words_uptime({
                    'book_id': bookid,
                    'href': chap['href'],
                    'words': chap['words'],
                    'update_time': chap['update_time']
                })
                if chap_row_id is None:
                    logging.warning(f'insert toc error. url: {toc_url}')

        # NOTE(review): the original nesting of this check could not be recovered
        # (indentation was lost); it is assumed to stop after the first book in
        # debug mode -- confirm against the real file.
        if debug:
            return
else:
logging.warning(f'fetch content error. url: {chapter_url}')
chapter_url = next_url
# 读取完毕,更新列表
row_id = db_tools.update_book_detail({
'href' : href,
**book_detail
})
if row_id:
logging.debug(f'update book succ. id: {row_id}, url: {href}')
else:
logging.warning(f'update book failed. url: {href}')
else:
logging.warning(f'get book detail failed. url: {href}')
else:
logging.warning(f'get no data needed update.')
'''
# Map short names (abbreviations) to the functions they select.
function_map = {
    "list": fetch_book_list,
    "toc" : fetch_table_of_contents,
    "content": fetch_contents,
    "update": update_chapter_meta,
}
# 主函数

View File

@ -188,10 +188,17 @@ def pase_chapter_list(soup, url):
chap_list = sections[i].find_all("a")
chap_data = []
for chap in chap_list:
chap_title = chap.get_text().strip()
chap_link = f"{host_url}/{chap['href']}"
chap_title = chap.get_text().strip() # 获取章节标题
chap_link = f"{host_url}/{chap['href']}" # 获取章节链接
chap_id = utils.extract_page_num(chap_link)
chap_data.append({'href': chap_link, 'title': chap_title, 'chapter_id': chap_id})
chap_words, chap_uptime = utils.extract_chapter_uptime_words(chap['title']) # 获取更新时间和字数
chap_data.append({
'href': chap_link,
'title': chap_title,
'chapter_id': chap_id,
'words': chap_words,
'update_time' : chap_uptime,
})
table_of_contents.append({'title': section_title, 'chapters': chap_data})
return table_of_contents
@ -277,7 +284,14 @@ def parse_content_page(soup, url):
for paragraph in paragraphs:
cleaned_text = process_paragraph(paragraph)
content.append(cleaned_text)
else:
# 某些页面没有p标签只有一个h1要兼容此问题
paragraphs = soup.find_all('h1')
if paragraphs:
for paragraph in paragraphs:
cleaned_text = process_paragraph(paragraph)
content.append(cleaned_text)
return content
# 通用的 HTML 结构验证器

View File

@ -231,13 +231,56 @@ def check_and_create_chapters_table(book_number):
# Insert a chapter's table-of-contents entry into its sharded chapters table.
def insert_chapter_data(data):
    """Insert or refresh a chapter row unless an up-to-date one already exists.

    A row counts as up to date when one with the same ``href`` has an
    ``update_time`` greater than or equal to the incoming one. Otherwise the
    row is written through ``insert_or_update_common`` with ``has_content``
    reset to 0.

    Args:
        data: Dict of column values. Must contain ``book_id`` (the shard is
            ``book_id % 100``), ``href`` and ``update_time``. The dict is not
            modified.

    Returns:
        A ``(row_id, affected_rows)`` tuple: ``(existing_id, 0)`` when the
        stored row is already current, ``(row_id, 1)`` after a successful
        write, and ``(None, 0)`` on failure.
    """
    try:
        tbl_num = int(data['book_id']) % 100
        tbl_name = check_and_create_chapters_table(tbl_num)
        if not tbl_name:
            logging.error(f"chapters table unavailable for book_id: {data['book_id']}")
            return None, 0

        # Is there already a row for this chapter that is at least as new?
        cursor.execute(
            f"SELECT id FROM {tbl_name} WHERE href = ? and update_time >= ?",
            (data['href'], data['update_time']),
        )
        existing_record = cursor.fetchone()
        if existing_record:  # the chapter already exists and is current
            logging.debug(f"chapter {data['href']} already exist. id: {existing_record[0]}")
            return existing_record[0], 0

        # Missing or outdated: write it, working on a copy so the caller's
        # dict is left untouched.
        row = {**data, 'has_content': 0}
        row_id = insert_or_update_common(row, tbl_name)
        # Only count the row as affected when the write produced an id.
        return row_id, (1 if row_id is not None else 0)

    except sqlite3.Error as e:
        logging.error(f"Error inserting or updating data: {e}")
        return None, 0
# Store fetched chapter content in the chapter's sharded table.
def update_chapter_data(data):
    """Update the chapter row identified by ``data['href']`` and mark it as having content.

    Every key in ``data`` except ``id``, ``href`` and ``created_at`` is written
    to the column of the same name, together with ``has_content = 1`` and a
    fresh ``updated_at``. Keys are interpolated into the SQL as column names,
    so ``data`` must be built by trusted code.

    Args:
        data: Dict of column values. Must contain ``book_id`` (the shard is
            ``book_id % 100``) and ``href``. The dict is not modified.

    Returns:
        The ``id`` of the updated row, or ``None`` when the table is
        unavailable, no row matches ``href``, or SQLite reports an error.
    """
    try:
        tbl_num = int(data['book_id']) % 100
        tbl_name = check_and_create_chapters_table(tbl_num)
        if not tbl_name:
            logging.error(f"chapters table unavailable for book_id: {data['book_id']}")
            return None

        # Work on a copy so the caller's dict is left untouched.
        row = {**data, 'has_content': 1}

        # Only update the fields present in the data, minus the protected ones.
        fields_to_update = [field for field in row if field not in ('id', 'href', 'created_at')]
        set_clause = ', '.join(f"{field} = ?" for field in fields_to_update)
        sql = f"UPDATE {tbl_name} SET {set_clause}, updated_at = datetime('now', 'localtime') WHERE href = ?"

        values = [row[field] for field in fields_to_update]
        values.append(row['href'])

        cursor.execute(sql, values)
        conn.commit()

        # Look up the id of the row that was just updated.
        cursor.execute(f"SELECT id FROM {tbl_name} WHERE href = ?", (row['href'],))
        found = cursor.fetchone()
        if found is None:
            # The UPDATE matched nothing; indexing None would raise TypeError.
            logging.warning(f"no chapter row matches href: {row['href']}")
            return None
        return found[0]

    except sqlite3.Error as e:
        logging.error(f"Error inserting or updating data: {e}")
        return None
# 查询某本书最后的获取页码
def query_last_chapter_by_book(bookid):
tbl_num = int(bookid) % 100
@ -281,6 +324,46 @@ def query_no_content_chapters(limit = 100):
return all_results
# Update selected table-of-contents fields of a chapter row (temporary helper).
def update_toc_words_uptime(data):
    """Update the chapter row identified by ``data['href']`` with the given fields.

    Every key in ``data`` except ``id``, ``href`` and ``created_at`` is written
    to the column of the same name, together with a fresh ``updated_at``. Keys
    are interpolated into the SQL as column names, so ``data`` must be built by
    trusted code.

    Args:
        data: Dict of column values. Must contain ``book_id`` (the shard is
            ``book_id % 100``) and ``href``.

    Returns:
        The ``id`` of the updated row, or ``None`` when the table is
        unavailable, no row matches ``href``, or SQLite reports an error.
    """
    try:
        tbl_num = int(data['book_id']) % 100
        tbl_name = check_and_create_chapters_table(tbl_num)
        if not tbl_name:
            logging.error(f"chapters table unavailable for book_id: {data['book_id']}")
            return None

        # Only update the fields present in the data, minus the protected ones.
        fields_to_update = [field for field in data if field not in ('id', 'href', 'created_at')]
        set_clause = ', '.join(f"{field} = ?" for field in fields_to_update)
        sql = f"UPDATE {tbl_name} SET {set_clause}, updated_at = datetime('now', 'localtime') WHERE href = ?"

        values = [data[field] for field in fields_to_update]
        values.append(data['href'])

        cursor.execute(sql, values)
        conn.commit()

        # Look up the id of the row that was just updated.
        cursor.execute(f"SELECT id FROM {tbl_name} WHERE href = ?", (data['href'],))
        found = cursor.fetchone()
        if found is None:
            # The UPDATE matched nothing; indexing None would raise TypeError.
            logging.warning(f"no chapter row matches href: {data['href']}")
            return None
        return found[0]

    except sqlite3.Error as e:
        logging.error(f"Error inserting or updating data: {e}")
        return None
# Fetch the table-of-contents link of every book.
def query_toc_href():
    """Return one ``{'id', 'table_of_contents_href'}`` dict per row of the books table.

    Returns:
        A list of dicts, or ``None`` when SQLite reports an error (the error
        is logged).
    """
    sql = f"SELECT id, table_of_contents_href FROM {tbl_name_books} "
    try:
        cursor.execute(sql)
        records = cursor.fetchall()
    except sqlite3.Error as e:
        logging.error(f"查询 href 失败: {e}")
        return None

    toc_links = []
    for record in records:
        toc_links.append({'id': record[0], 'table_of_contents_href': record[1]})
    return toc_links
# Insert or update a book's section (volume) record.
def insert_or_update_book_sections(data):
    """Write ``data`` to the sections table, using ``bookid_section`` as the unique key.

    Returns:
        Whatever ``insert_or_update_common`` returns for this write.
    """
    section_result = insert_or_update_common(
        data,
        tbl_name_section,
        uniq_key='bookid_section',
    )
    return section_result

View File

@ -41,7 +41,20 @@ def extract_book_num(page_str, default_num = 0):
return number
else:
return default_num
# Table-of-contents page: extract a chapter's word count and update time.
def extract_chapter_uptime_words(input_str):
    """Parse the word count and update time out of a chapter's title text.

    Args:
        input_str: Text that may contain ``字数:<digits>`` and
            ``更新时间:<YYYY-MM-DD HH:MM:SS>``. ``None`` or an empty string is
            treated as text with neither field.

    Returns:
        A ``(words, update_time)`` tuple. ``words`` is always an ``int``
        (0 when absent). ``update_time`` is the matched timestamp string, or
        the current local time in the same format when absent.
    """
    text = input_str or ''

    words_pattern = r'字数:(\d+)'
    words_match = re.search(words_pattern, text)
    # Convert to int so the value has one type whether or not it matched.
    words = int(words_match.group(1)) if words_match else 0

    update_time_pattern = r'更新时间:(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'
    update_time_match = re.search(update_time_pattern, text)
    if update_time_match:
        update_time = update_time_match.group(1)
    else:
        # No timestamp on the page: fall back to "now" in the same format.
        update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    return words, update_time
# 处理 [都市] 的方括号
def remove_brackets_regex(input_str):
pattern = r'\[(.*?)\]'