modify
This commit is contained in:
40
aabook/src/alter_table.py
Normal file
40
aabook/src/alter_table.py
Normal file
@ -0,0 +1,40 @@
|
||||
import sqlite3
|
||||
import json
|
||||
import config
|
||||
import utils
|
||||
import logging
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
# Connect to the SQLite database.
DB_PATH = config.global_sqlite_path # replace with the path to your database file
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

# Table names. Only the chapters prefix is used in the visible part of this
# script, where it is combined with a numeric suffix ("chapters_<n>").
tbl_name_books = 'books'
tbl_name_chapters_prefix = 'chapters'
tbl_name_section = 'books_sections'
|
||||
|
||||
def add_columns_to_table(table_name):
    """Add the ``words`` and ``update_time`` columns to ``table_name``.

    Runs on the module-level ``cursor``/``conn``. Columns that already exist
    are skipped, so the migration can be re-run after a partial failure
    without the first ``ALTER TABLE`` aborting the rest with a
    "duplicate column name" error.

    Args:
        table_name: Name of the table to alter. It is interpolated into the
            SQL text (identifiers cannot be bound as parameters), so it must
            come from trusted code, never from user input.

    Returns:
        None. The outcome is reported on stdout; SQLite errors (for example a
        missing table) are caught and printed rather than raised.
    """
    # Column name -> definition placed after "ADD COLUMN <name>".
    new_columns = {
        'words': "INTEGER DEFAULT 0",
        'update_time': "TEXT DEFAULT ('2000-01-01 00:00:00')",
    }
    try:
        # PRAGMA table_info returns one row per column; index 1 is its name.
        cursor.execute(f"PRAGMA table_info({table_name})")
        existing_columns = {row[1] for row in cursor.fetchall()}

        for column, definition in new_columns.items():
            if column in existing_columns:
                continue
            cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN {column} {definition}")

        # Commit the schema change.
        conn.commit()
        print(f"成功向表 {table_name} 中添加字段 words 和 update_time")
    except sqlite3.Error as e:
        print(f"添加字段时出现错误: {e}")
|
||||
|
||||
# Usage example: apply the migration to every chapter table.
if __name__ == "__main__":
    # Suffixes run from 0 through 99 inclusive (range(100) stops before 100).
    chapter_tables = [f'{tbl_name_chapters_prefix}_{n}' for n in range(100)]
    for table_name in chapter_tables:
        add_columns_to_table(table_name)
|
||||
@ -102,10 +102,11 @@ def fetch_book_toc(url):
|
||||
|
||||
# 获取小说的目录页,并插入到数据库
|
||||
def fetch_table_of_contents():
|
||||
total_updated_rows = 0
|
||||
while True:
|
||||
update_list = db_tools.query_books(is_latest=0, limit = 2 if debug else 100)
|
||||
if update_list is None or len(update_list) <1 :
|
||||
logging.info(f'no more data need fecth.')
|
||||
logging.info(f'no more data need fecth. updated chapters(table of contents): {total_updated_rows}')
|
||||
return
|
||||
|
||||
for row in update_list:
|
||||
@ -152,15 +153,17 @@ def fetch_table_of_contents():
|
||||
|
||||
# 插入目录数据
|
||||
for chap in chapters:
|
||||
chap_row_id = db_tools.insert_chapter_data({
|
||||
chap_row_id, affected_rows = db_tools.insert_chapter_data({
|
||||
'book_id': bookid,
|
||||
'chapter_id': chap['chapter_id'],
|
||||
'section_id': section_id,
|
||||
'title': chap['title'],
|
||||
'href': chap['href'],
|
||||
'content': '',
|
||||
'has_content' : 0
|
||||
'words': chap['words'],
|
||||
'update_time': chap['update_time'],
|
||||
'content': ''
|
||||
})
|
||||
total_updated_rows = total_updated_rows + (affected_rows if affected_rows else 0)
|
||||
if chap_row_id is None:
|
||||
logging.warning(f'insert_chapter_data error. url: {toc_url}')
|
||||
succ = 0
|
||||
@ -195,81 +198,57 @@ def fetch_contents():
|
||||
content, next_url = fetch_chapter_content(url)
|
||||
if content and content['title'] and content['contents']:
|
||||
# 写入到数据表里
|
||||
db_tools.insert_chapter_data({
|
||||
row_id = db_tools.update_chapter_data({
|
||||
'book_id': row['book_id'],
|
||||
'chapter_id': row['chapter_id'],
|
||||
'section_id': row['section_id'],
|
||||
'title': row['title'],
|
||||
'href': url,
|
||||
'content': '\n\n'.join(content['contents']),
|
||||
'has_content': 1
|
||||
'content': '\n\n'.join(content['contents'])
|
||||
})
|
||||
if row_id is None:
|
||||
logging.warning(f"update chapter data error at {url} ")
|
||||
else:
|
||||
logging.warning(f'fetch content error. url: {url}')
|
||||
if debug:
|
||||
return
|
||||
|
||||
|
||||
'''
|
||||
# 下载完整的小说
|
||||
def fetch_book_data():
|
||||
update_list = db_tools.query_books(need_update=1, limit = 1)
|
||||
if update_list:
|
||||
for row in update_list:
|
||||
name = row['name']
|
||||
href = row['href']
|
||||
bookid = row['id']
|
||||
# 先打开详情页
|
||||
logging.info(f'----------fetching book {name}: {href}-------------')
|
||||
book_detail = fetch_book_detail(href)
|
||||
if book_detail:
|
||||
# 获取内容页,然后循环读取内容
|
||||
chapter_url = book_detail['start_page_href']
|
||||
chapter_id = utils.extract_page_num(chapter_url)
|
||||
# 断点续传,从上次拉取的最后一页开始
|
||||
if not force:
|
||||
last_chapter_url = db_tools.query_last_chapter_by_book(bookid)
|
||||
if last_chapter_url:
|
||||
chapter_url = last_chapter_url
|
||||
while chapter_url:
|
||||
logging.info(f'fetching page: {chapter_url}')
|
||||
content, next_url = fetch_chapter_content(chapter_url)
|
||||
if content and content['title'] and content['contents']:
|
||||
# 写入到数据表里
|
||||
db_tools.insert_chapter_data({
|
||||
'book_id': bookid,
|
||||
'chapter_id': chapter_id,
|
||||
'title': content['title'],
|
||||
'href': chapter_url,
|
||||
'content': '\n\n'.join(content['contents']),
|
||||
'has_content': 1
|
||||
})
|
||||
# 更新小说目录页的一些信息(字数,时间),临时
|
||||
def update_chapter_meta():
|
||||
toc_links = db_tools.query_toc_href()
|
||||
for item in toc_links:
|
||||
toc_url = item['table_of_contents_href']
|
||||
bookid = item['id']
|
||||
logging.info(f'fetching page: {toc_url}')
|
||||
toc_data = fetch_book_toc(toc_url)
|
||||
|
||||
if debug:
|
||||
return
|
||||
else:
|
||||
logging.warning(f'fetch content error. url: {chapter_url}')
|
||||
chapter_url = next_url
|
||||
# 读取完毕,更新列表
|
||||
row_id = db_tools.update_book_detail({
|
||||
'href' : href,
|
||||
**book_detail
|
||||
# 解析目录页
|
||||
if toc_data is None:
|
||||
logging.warning(f'fetch_book_toc error. url: {toc_url}')
|
||||
continue
|
||||
|
||||
# 插入所有的目录数据
|
||||
for row in toc_data:
|
||||
chapters = row['chapters']
|
||||
# 插入目录数据
|
||||
for chap in chapters:
|
||||
chap_row_id = db_tools.update_toc_words_uptime({
|
||||
'book_id': bookid,
|
||||
'href': chap['href'],
|
||||
'words': chap['words'],
|
||||
'update_time': chap['update_time']
|
||||
})
|
||||
if row_id:
|
||||
logging.debug(f'update book succ. id: {row_id}, url: {href}')
|
||||
else:
|
||||
logging.warning(f'update book failed. url: {href}')
|
||||
else:
|
||||
logging.warning(f'get book detail failed. url: {href}')
|
||||
else:
|
||||
logging.warning(f'get no data needed update.')
|
||||
'''
|
||||
if chap_row_id is None:
|
||||
logging.warning(f'insert toc error. url: {toc_url}')
|
||||
|
||||
|
||||
# Map each short name (abbreviation) to the function it selects.
function_map = {
    "list": fetch_book_list,
    "toc" : fetch_table_of_contents,
    "content": fetch_contents,
    "update": update_chapter_meta,
}
|
||||
|
||||
# 主函数
|
||||
|
||||
@ -188,10 +188,17 @@ def pase_chapter_list(soup, url):
|
||||
chap_list = sections[i].find_all("a")
|
||||
chap_data = []
|
||||
for chap in chap_list:
|
||||
chap_title = chap.get_text().strip()
|
||||
chap_link = f"{host_url}/{chap['href']}"
|
||||
chap_title = chap.get_text().strip() # 获取章节标题
|
||||
chap_link = f"{host_url}/{chap['href']}" # 获取章节链接
|
||||
chap_id = utils.extract_page_num(chap_link)
|
||||
chap_data.append({'href': chap_link, 'title': chap_title, 'chapter_id': chap_id})
|
||||
chap_words, chap_uptime = utils.extract_chapter_uptime_words(chap['title']) # 获取更新时间和字数
|
||||
chap_data.append({
|
||||
'href': chap_link,
|
||||
'title': chap_title,
|
||||
'chapter_id': chap_id,
|
||||
'words': chap_words,
|
||||
'update_time' : chap_uptime,
|
||||
})
|
||||
table_of_contents.append({'title': section_title, 'chapters': chap_data})
|
||||
|
||||
return table_of_contents
|
||||
@ -277,6 +284,13 @@ def parse_content_page(soup, url):
|
||||
for paragraph in paragraphs:
|
||||
cleaned_text = process_paragraph(paragraph)
|
||||
content.append(cleaned_text)
|
||||
else:
|
||||
# 某些页面,没有p标签,只有一个h1,要兼容此问题
|
||||
paragraphs = soup.find_all('h1')
|
||||
if paragraphs:
|
||||
for paragraph in paragraphs:
|
||||
cleaned_text = process_paragraph(paragraph)
|
||||
content.append(cleaned_text)
|
||||
|
||||
return content
|
||||
|
||||
|
||||
@ -231,11 +231,54 @@ def check_and_create_chapters_table(book_number):
|
||||
|
||||
# 插入到数据表中
|
||||
def insert_chapter_data(data):
    """Insert or refresh one table-of-contents entry for a chapter.

    The chapter table is chosen by ``int(data['book_id']) % 100``. When a row
    with the same ``href`` already has an ``update_time`` at or after
    ``data['update_time']`` nothing is written. Otherwise the row is written
    through ``insert_or_update_common`` with ``has_content`` reset to 0.

    Args:
        data: Chapter fields. Must contain ``book_id``, ``href`` and
            ``update_time``. The dict itself is not modified.

    Returns:
        A ``(row_id, affected_rows)`` tuple. ``affected_rows`` is 0 when the
        existing row was already up to date or when the write failed, and 1
        when a row was written. ``row_id`` is None on failure.
    """
    try:
        tbl_num = int(data['book_id']) % 100
        tbl_name = check_and_create_chapters_table(tbl_num)
        if not tbl_name:
            # Without a table name the query below could only fail with a
            # confusing "no such table" error; report the real cause instead.
            logging.error(f"chapters table for book {data['book_id']} is not available")
            return None, 0

        # Is there already a row for this chapter that is at least as new?
        cursor.execute(
            f"SELECT id FROM {tbl_name} WHERE href = ? and update_time >= ?",
            (data['href'], data['update_time']),
        )
        existing_record = cursor.fetchone()

        if existing_record:  # chapter already stored and up to date
            logging.debug(f"chapter {data['href']} already exist. id: {existing_record[0]}")
            return existing_record[0], 0

        # Missing or stale: write it. Work on a copy so the caller's dict is
        # left untouched; has_content is reset to 0 on the stored row.
        row = {**data, 'has_content': 0}
        row_id = insert_or_update_common(row, tbl_name)
        # NOTE(review): assumes insert_or_update_common returns None on
        # failure, as the caller's "is None" check suggests - confirm.
        return row_id, (1 if row_id is not None else 0)
    except sqlite3.Error as e:
        logging.error(f"Error inserting or updating data: {e}")
        return None, 0
|
||||
|
||||
|
||||
# 更新章节内容到数据表中
|
||||
def update_chapter_data(data):
    """Store fetched chapter content on the existing row matched by ``href``.

    The chapter table is chosen by ``int(data['book_id']) % 100``. Every key
    in ``data`` except ``id``, ``href`` and ``created_at`` is written, along
    with ``has_content = 1`` and a fresh ``updated_at`` timestamp.

    Args:
        data: Chapter fields. Must contain ``book_id`` and ``href``. Keys are
            used as column names in the SQL text, so they must come from
            trusted code. The dict itself is not modified.

    Returns:
        The ``id`` of the row with this ``href``, or None when an SQLite
        error occurred or no such row exists.
    """
    try:
        # Work on a copy so the caller's dict is not mutated.
        row = {**data, 'has_content': 1}

        tbl_num = int(row['book_id']) % 100
        tbl_name = check_and_create_chapters_table(tbl_num)

        # Update only the fields present in the data, minus the key columns.
        fields_to_update = [field for field in row if field not in ('id', 'href', 'created_at')]

        # Build the UPDATE statement; values are bound as parameters.
        set_clause = ', '.join(f"{field} = ?" for field in fields_to_update)
        sql = f"UPDATE {tbl_name} SET {set_clause}, updated_at = datetime('now', 'localtime') WHERE href = ?"

        values = [row[field] for field in fields_to_update]
        values.append(row['href'])

        cursor.execute(sql, values)
        conn.commit()

        # Look up the id of the row that was just updated.
        cursor.execute(f"SELECT id FROM {tbl_name} WHERE href = ?", (row['href'],))
        found = cursor.fetchone()
        if found is None:
            # The UPDATE matched nothing. Indexing None would raise a
            # TypeError that the handler below does not catch.
            logging.warning(f"no chapter row found for href: {row['href']}")
            return None
        return found[0]
    except sqlite3.Error as e:
        logging.error(f"Error inserting or updating data: {e}")
        return None
|
||||
|
||||
# 查询某本书最后的获取页码
|
||||
@ -281,6 +324,46 @@ def query_no_content_chapters(limit = 100):
|
||||
|
||||
return all_results
|
||||
|
||||
# 更新目录页的特定字段(临时)
|
||||
def update_toc_words_uptime(data):
    """Update table-of-contents fields on the chapter row matched by ``href``.

    The chapter table is chosen by ``int(data['book_id']) % 100``. Every key
    in ``data`` except ``id``, ``href`` and ``created_at`` is written, along
    with a fresh ``updated_at`` timestamp. The source marks this function as
    temporary.

    Args:
        data: Fields to write. Must contain ``book_id`` and ``href``. Keys
            are used as column names in the SQL text, so they must come from
            trusted code.

    Returns:
        The ``id`` of the row with this ``href``, or None when an SQLite
        error occurred or no such row exists.
    """
    try:
        tbl_num = int(data['book_id']) % 100
        tbl_name = check_and_create_chapters_table(tbl_num)

        # Update only the fields present in the data, minus the key columns.
        fields_to_update = [field for field in data if field not in ('id', 'href', 'created_at')]

        # Build the UPDATE statement; values are bound as parameters.
        set_clause = ', '.join(f"{field} = ?" for field in fields_to_update)
        sql = f"UPDATE {tbl_name} SET {set_clause}, updated_at = datetime('now', 'localtime') WHERE href = ?"

        values = [data[field] for field in fields_to_update]
        values.append(data['href'])

        cursor.execute(sql, values)
        conn.commit()

        # Look up the id of the row that was just updated.
        cursor.execute(f"SELECT id FROM {tbl_name} WHERE href = ?", (data['href'],))
        found = cursor.fetchone()
        if found is None:
            # The UPDATE matched nothing. Indexing None would raise a
            # TypeError that the handler below does not catch.
            logging.warning(f"no chapter row found for href: {data['href']}")
            return None
        return found[0]
    except sqlite3.Error as e:
        logging.error(f"Error inserting or updating data: {e}")
        return None
|
||||
|
||||
# 获取所有的目录页
|
||||
def query_toc_href():
    """Return the id and table-of-contents URL of every row in the books table.

    Returns:
        A list of ``{'id': ..., 'table_of_contents_href': ...}`` dicts, one
        per row, or None when the query raises an SQLite error (which is
        logged).
    """
    query = f"SELECT id, table_of_contents_href FROM {tbl_name_books} "
    try:
        cursor.execute(query)
        toc_entries = []
        for record in cursor.fetchall():
            toc_entries.append({'id': record[0], 'table_of_contents_href': record[1]})
        return toc_entries
    except sqlite3.Error as e:
        logging.error(f"查询 href 失败: {e}")
        return None
|
||||
|
||||
# 插入书本的卷信息
|
||||
def insert_or_update_book_sections(data):
|
||||
return insert_or_update_common(data, tbl_name_section, uniq_key='bookid_section')
|
||||
|
||||
@ -42,6 +42,19 @@ def extract_book_num(page_str, default_num = 0):
|
||||
else:
|
||||
return default_num
|
||||
|
||||
# 目录页,获取更新时间和字数
|
||||
def extract_chapter_uptime_words(input_str):
    """Extract the word count and update time from a table-of-contents entry.

    Searches ``input_str`` for ``字数:<digits>`` and
    ``更新时间:<YYYY-MM-DD HH:MM:SS>`` (both with a full-width colon).

    Args:
        input_str: Text to search. ``None`` or an empty string is treated as
            text containing neither field.

    Returns:
        A ``(words, update_time)`` tuple. ``words`` is always an ``int``
        (0 when no word count is found). ``update_time`` is the matched
        timestamp string, or the current local time formatted as
        ``%Y-%m-%d %H:%M:%S`` when none is found.
    """
    text = input_str or ''

    words_match = re.search(r'字数:(\d+)', text)
    # Convert to int so both branches return the same type.
    words = int(words_match.group(1)) if words_match else 0

    update_time_match = re.search(r'更新时间:(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', text)
    if update_time_match:
        update_time = update_time_match.group(1)
    else:
        # No timestamp in the text: fall back to "now".
        update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    return words, update_time
|
||||
|
||||
# 处理 [都市] 的方括号
|
||||
def remove_brackets_regex(input_str):
|
||||
pattern = r'\[(.*?)\]'
|
||||
|
||||
Reference in New Issue
Block a user