modify scripts
aabook/bak/aabook_fetch.py (Normal file, 484 lines added)
@@ -0,0 +1,484 @@
import requests
from bs4 import BeautifulSoup
import os
import sys
import random
import time
import re
import logging
from datetime import datetime
from datetime import date

import config  # logging configuration
from down_list import novel_map
import utils


# Logging
config.setup_logging()

# Base URL and output files
base_url = 'https://aabook.xyz'
list_url_wordcount = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=wordcount'
list_url_update = 'https://aabook.xyz/category.html?pageNum={}&pageSize=30&catId=-1&size=-1&isFinish=-1&updT=-1&orderBy=update'
curr_novel_pages = 0

meta_dir = f'{config.global_host_data_dir}/aabook/meta'
novel_dir = f'{config.global_host_data_dir}/aabook/data'

list_file = f'{meta_dir}/list.txt'
details_file = f'{meta_dir}/details.txt'
down_list_file = f'{meta_dir}/down_list.txt'

# User-Agent pool
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0",
    "Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36"
]
# Fetch a page's content, with retries.
def get_page_content(url, max_retries=100, sleep_time=5, default_timeout=10):
    retries = 0
    # Pick a random User-Agent
    headers = {
        'User-Agent': random.choice(user_agents)
    }

    while retries < max_retries:
        try:
            response = requests.get(url, headers=headers, timeout=default_timeout, stream=True)
            response.raise_for_status()
            return response.text  # request succeeded, return the body
        except requests.RequestException as e:
            retries += 1
            logging.info(f"Warn fetching page {url}: {e}. Retrying {retries}/{max_retries}...")
            if retries >= max_retries:
                logging.error(f"Failed to fetch page {url} after {max_retries} retries.")
                return None
            time.sleep(sleep_time)  # sleep for the configured time, then retry
# Fetch the ranking list.
def get_list(write_list_file=list_file, list_url=list_url_wordcount, start_date='2000-01-01', order_by_date=False):
    page_num = 1
    start_time = datetime.strptime(f'{start_date} 00:00:00', "%Y-%m-%d %H:%M:%S")
    page_url = list_url.format(page_num)
    with open(write_list_file, 'w', encoding='utf-8') as f:
        while True:
            # Request the current list page
            logging.info(f"Fetching page [{page_num}] {page_url}")

            content = get_page_content(page_url)
            if content is None:
                logging.info("Empty page content. retry...")
                continue
            soup = BeautifulSoup(content, 'html.parser')

            # Locate the book list
            list_main = soup.find('div', class_='list_main')
            if not list_main:
                logging.info("No list_main found. retry...")
                continue

            tbody = list_main.find('tbody')
            if not tbody:
                logging.info("No tbody found. retry...")
                continue

            # Basic info for each book: ranking, category, title, author, monthly tickets,
            # and the last column (total word count when ordering by word count,
            # last update time when ordering by update date).
            for tr in tbody.find_all('tr'):
                tds = tr.find_all('td')
                if len(tds) < 6:
                    logging.info("Invalid tr format.")
                    continue
                ranking = tds[0].text.strip()
                category = tds[1].text.strip()
                book_link_tag = tds[2].find('a')
                book_name = book_link_tag.text.strip()
                book_link = base_url + '/' + book_link_tag['href']
                author = tds[3].text.strip()
                monthly_tickets = tds[4].text.strip()
                update_time = tds[5].text.strip()  # word count or last update time, depending on the ordering

                # Stop once a row is older than start_date (only meaningful when ordering by update date)
                if order_by_date:
                    up_time = datetime.strptime(update_time, "%Y-%m-%d %H:%M:%S")
                    if start_time > up_time:
                        return

                # Write one line per book to the list file:
                # ranking  category  title  author  monthly tickets  word count (or update time)  book link
                f.write(f"{ranking}\t{category}\t{book_name}\t{author}\t{monthly_tickets}\t{update_time}\t{book_link}\n")
                f.flush()

            # Find the next-page link
            next_page_tag = soup.find('a', title='下一页')
            if next_page_tag:
                page_url = base_url + next_page_tag['href']
                page_num += 1
            else:
                logging.info("No next page, stopping.")
                break

            time.sleep(3)

# Fetch a detail page and validate it before returning.
def fetch_detail_and_check(url, book_name):
    while True:
        content = get_page_content(url)
        if content is None:
            logging.info(f"Empty detail page for {book_name}, retry...")
            continue
        soup = BeautifulSoup(content, 'html.parser')

        # Parse the book details block
        book_info_tag = soup.find('li', class_='zuopinxinxi')
        if not book_info_tag:
            logging.info(f"No details found for {book_name}, retry...")
            continue

        book_info_lis = book_info_tag.find_all('li')
        if len(book_info_lis) < 4:
            logging.info(f"Invalid book info for {book_name}. retry...")
            continue

        return content

# Fetch details for every book in the list file.
def get_detail(write_list_file=list_file, write_details_file=details_file):
    # Load the links of books whose details have already been fetched
    if os.path.exists(write_details_file):
        with open(write_details_file, 'r', encoding='utf-8') as f:
            completed_links = set(line.split('\t')[4] for line in f.readlines())
    else:
        completed_links = set()

    with open(write_list_file, 'r', encoding='utf-8') as f_list, open(write_details_file, 'a', encoding='utf-8') as f_details:
        for line in f_list:
            fields = line.strip().split('\t')
            if len(fields) < 7:
                continue
            book_link = fields[6]
            book_name = fields[2]

            if book_link in completed_links:
                logging.info(f"Skipping {book_name} {book_link}, already processed.")
                continue

            # Fetch the book's detail page
            logging.info(f"Fetching details for {book_name} {book_link}")
            content = fetch_detail_and_check(book_link, book_name)
            soup = BeautifulSoup(content, 'html.parser')

            # Parse the book details block
            book_info_tag = soup.find('li', class_='zuopinxinxi')
            if not book_info_tag:
                logging.info(f"No details found for {book_name}, skipping.")
                continue

            book_info_lis = book_info_tag.find_all('li')
            if len(book_info_lis) < 4:
                logging.info(f"Invalid book info for {book_name}")
                continue
            book_category = book_info_lis[0].find('span').text.strip()
            book_status = book_info_lis[1].find('span').text.strip()
            total_word_count = book_info_lis[2].find('span').text.strip()
            total_clicks = book_info_lis[3].find('span').text.strip()
            # Keep only the leading digits, dropping the trailing characters
            total_word_count = int(re.search(r'\d+', total_word_count).group())

            # Read the creation time
            creation_time_tag = soup.find('li', class_='update_time')
            creation_time = creation_time_tag.text.strip() if creation_time_tag else 'N/A'

            # First chapter link and its numeric id
            start_page_tag = soup.find('ul', class_='gezhonganniu').find_all('li')[0].find('a')
            start_page_link = base_url + '/' + start_page_tag['href']
            start_page_number = start_page_link.split('-')[-1].replace('.html', '')

            # Write to the details file:
            # ranking  category  title  author  book link  start page link  start page id
            # status  total word count  total clicks  update time (or word count)  creation time
            f_details.write(f"{fields[0]}\t{book_category}\t{fields[2]}\t{fields[3]}\t{book_link}\t"
                            f"{start_page_link}\t{start_page_number}\t{book_status}\t{total_word_count}\t"
                            f"{total_clicks}\t{fields[5]}\t{creation_time}\n")
            f_details.flush()

            time.sleep(5)


# Strip watermark fragments from chapter content.
def clean_watermarks(html):
    """
    Remove any tag that carries a class attribute (the watermark spans) together with
    its inner content, while keeping the rest of the markup intact.
    """
    # Regex: drop every HTML tag that has a class attribute, including its content
    cleaned_html = re.sub(r'<[^>]+class="[^"]+">.*?</[^>]+>', '', html, flags=re.DOTALL)
    return cleaned_html


def process_paragraph(paragraph):
    # Work on the paragraph's full HTML rather than get_text()
    paragraph_html = str(paragraph)

    # Remove the watermark tags
    cleaned_html = clean_watermarks(paragraph_html)

    # Re-parse the cleaned HTML and extract the plain text
    soup = BeautifulSoup(cleaned_html, 'html.parser')
    cleaned_text = soup.get_text().strip()

    return cleaned_text


# Extract the content_url from the page's <script> tags.
def extract_content_url(soup, base_url, chapid):
    # Find all <script> tags
    script_tags = soup.find_all('script')

    # Look for the one that requests ./_getcontent.php
    for script_tag in script_tags:
        script_content = script_tag.string
        if script_content and re.search(r'\.get\("./_getcontent\.php', script_content):
            # Found it; extract the _getcontent.php URL template
            match = re.search(r'\.get\("\./_getcontent\.php\?id="\+\s*chapid\s*\+\s*"&v=([^"]+)"', script_content)
            if match:
                # Pull out the v parameter
                v_value = match.group(1)
                # Build the full content_url
                content_url = f"{base_url}/_getcontent.php?id={chapid}&v={v_value}"
                return content_url

    # No matching script tag found
    return None


# Check whether the returned content is polluted (anti-scraping placeholder page).
def check_content(content):
    if '2005-2024 疯情书库' in content or '2005-2025 疯情书库' in content:
        return False

    return True

# Page counters
def reset_novel_pages():
    global curr_novel_pages
    curr_novel_pages = 0

def add_novel_pages():
    global curr_novel_pages
    curr_novel_pages += 1

def get_novel_pages():
    return curr_novel_pages


# Download all chapters of a novel and save them to a text file (and an epub).
def download_novel(chapid, novel_name, dir_prefix=novel_dir):
    chapter_url = f'{base_url}/read-{chapid}.html'

    novel_file = f'{dir_prefix}/{chapid}_{novel_name}.txt'
    if os.path.exists(novel_file):
        os.remove(novel_file)  # if a file with the same name exists, delete it and download again

    # Chapters collected for the epub output
    chapters = []
    reset_novel_pages()
    while chapter_url:
        logging.info(f"Processing: [{novel_name}] [{chapid}] {chapter_url}")

        # Fetch the chapter page
        html_content = get_page_content(chapter_url)
        if html_content is None:
            logging.error(f"Get page error {chapter_url}, retry...")
            time.sleep(2)
            continue

        # Parse the chapter page
        soup = BeautifulSoup(html_content, 'html.parser')

        # Chapter title
        chapter_title_tag = soup.find('h1', class_='chapter_title')
        if chapter_title_tag:
            chapter_title = chapter_title_tag.get_text().strip()
            logging.info(f"Processing: [{novel_name}] [{chapid}] Chapter Title: {chapter_title}")
        else:
            logging.error(f"Chapter title not found in {chapter_url}, retry...")
            time.sleep(2)
            continue

        # URL that serves the chapter body
        content_url = extract_content_url(soup, base_url, chapid)
        if content_url:
            logging.info(f"Fetching content from: {content_url}")

            # Fetch the chapter body
            content_response = get_page_content(content_url)
            if content_response:
                if not check_content(content_response):
                    logging.error(f'error response. dirty page [{novel_name}] {content_url}, retry...')
                    continue

                content_soup = BeautifulSoup(content_response, 'html.parser')
                paragraphs = content_soup.find_all('p')

                # Write the title and the cleaned paragraphs to the text file
                content = ''
                with open(novel_file, 'a', encoding='utf-8') as f:
                    f.write(chapter_title + '\n\n')
                    for paragraph in paragraphs:
                        cleaned_text = process_paragraph(paragraph)
                        f.write(cleaned_text + '\n\n')
                        content = content + '<p>' + cleaned_text + '</p>'  # in the epub, html tags separate paragraphs
                logging.info(f"Writing content to file. [{novel_name}] [{chapid}] [{chapter_title}]")
                chapters.append((chapter_title, content))
            else:
                logging.info(f"Fetching content error: [{novel_name}] {content_url}, retry...")
                continue
        else:
            logging.info(f"Content URL not found in [{novel_name}] {chapter_url}, retry...")
            continue

        # One more page done
        add_novel_pages()
        # Find the link to the next chapter
        next_div = soup.find('div', class_='next_arrow')
        if next_div:
            next_page_tag = next_div.find('a', href=True, title=re.compile(r'下一章'))
            if next_page_tag:
                next_page_url = next_page_tag['href']

                # Extract the numeric chapter id
                chapid_match = re.search(r'read-(\d+)\.html', next_page_url)
                if chapid_match:
                    chapid = chapid_match.group(1)  # next chapter id
                    chapter_url = f"{base_url}/{next_page_url}"
                    logging.debug(f"Next chapter URL: {chapter_url}, chapid: {chapid}")
                else:
                    logging.info(f"Failed to extract chapid from next_page_url: {next_page_url}")
                    break
            else:
                logging.info(f"No next page found. Ending download for {novel_name}.")
                break
        else:
            logging.info(f"No 'next_arrow' div found in {chapter_url}. Ending download.")
            break

        time.sleep(3)
    # All chapters fetched, generate the epub file
    utils.generate_epub(novel_name, 'nobody', chapters, dir_prefix)


# Create a directory if it does not exist yet.
def create_directory_if_not_exists(category_name):
    if not os.path.exists(category_name):
        os.makedirs(category_name)
        logging.info(f"Created directory: {category_name}")


# Download novels, skipping the ones that were already downloaded.
def download_books(need_down_list_file=details_file, cursor_file=down_list_file):
    if not os.path.isfile(need_down_list_file):
        logging.error(f'input file {need_down_list_file} not exist!')
        return

    if not os.path.isfile(cursor_file):
        logging.info(f'input file {cursor_file} not exist, use empty dict instead.')

    # Load the id -> title map of already downloaded books from the cursor file
    downloaded_books = {}
    if os.path.exists(cursor_file):
        with open(cursor_file, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    logging.info(f'invalid line data: {line}')
                    continue
                novel_id, novel_name = fields
                downloaded_books[novel_id] = novel_name

    # Read book info from the details file
    with open(need_down_list_file, 'r', encoding='utf-8') as details:
        for line in details:
            fields = line.strip().split('\t')
            if len(fields) != 12:
                logging.info(f'invalid line data. {line}')
                continue  # skip incomplete rows
            ranking, category, book_name, author, book_link, start_page_link, novel_id, status, total_word_count, total_clicks, update_time, creation_time = fields

            # Skip books that were already downloaded
            if novel_id in downloaded_books:
                logging.info(f"Skipping already downloaded novel: {book_name} (ID: {novel_id})")
                continue

            # Create the category directory
            down_dir = f'{novel_dir}/{category}'
            create_directory_if_not_exists(down_dir)

            # Download the book and time it
            start_time = time.time()
            download_novel(novel_id, book_name, down_dir)
            end_time = time.time()
            elapsed_time = int(end_time - start_time)  # seconds
            novel_pages = get_novel_pages()

            # Record the finished book in the cursor file
            with open(cursor_file, 'a', encoding='utf-8') as down_list:
                down_list.write(f"{novel_id}\t{book_name}\n")
            logging.info(f"Downloaded and recorded: ({book_name}) (ID: {novel_id}) total pages: {novel_pages} time cost: {elapsed_time} s")

# Download the novels listed in novel_map.
def download_map():
    # Walk novel_map and download every novel in it
    for novel_id, novel_name in novel_map.items():
        logging.info(f"Starting download for {novel_name} (ID: {novel_id})")
        download_novel(novel_id, novel_name, novel_dir)
        logging.info(f"Completed download for {novel_id}_{novel_name}.\n")


# Fetch the update list since start_date and optionally download it.
def get_update(start_date, fetch_all=False):
    today_str = date.today().strftime("%Y-%m-%d")
    update_file = f'{meta_dir}/{today_str}_list_{start_date}.txt'
    details_file = f'{meta_dir}/{today_str}_details_{start_date}.txt'
    cursor_file = f'{meta_dir}/{today_str}_down_list_{start_date}.txt'
    logging.info(f"\n\nFetching novel list by update time from {start_date} \n\n")
    get_list(update_file, list_url_update, start_date, True)
    logging.info(f"\n\nFetching novel details by update time from {start_date} \n\n")
    get_detail(update_file, details_file)

    if fetch_all:
        logging.info(f"\n\nDownloading novel lists by update time from {start_date} \n\n")
        download_books(details_file, cursor_file)


def main():
    if len(sys.argv) < 2:
        print("Usage: python script.py <cmd>")
        print("cmd: get_list, get_detail, get_all, get_update, get_update_all, download, download_map")
        sys.exit(1)

    # Make sure the output directories exist
    create_directory_if_not_exists(meta_dir)
    create_directory_if_not_exists(novel_dir)

    cmd = sys.argv[1]

    if cmd == "get_list":
        get_list()  # fetch the ranking list
    elif cmd == "get_detail":
        get_detail()  # fetch the details
    elif cmd == "get_all":
        get_list()
        get_detail()
    elif cmd == "download":
        download_books()  # download books from the details file
    elif cmd == "download_map":
        download_map()  # download the books listed in novel_map
    elif cmd == "get_update" or cmd == "get_update_all":
        fetch_all = (cmd == "get_update_all")
        start_date = '2000-01-01'
        if len(sys.argv) == 3:
            start_date = sys.argv[2]
        get_update(start_date, fetch_all)  # fetch the update list and optionally download it
    else:
        print(f"Unknown command: {cmd}")


if __name__ == '__main__':
    main()
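
main() only dispatches on sys.argv; a minimal sketch of driving the same flow from another script (the module name aabook_fetch and the start date are illustrative assumptions, not part of this commit):

# Hypothetical usage sketch, assuming this file is importable as aabook_fetch
# and that config.global_host_data_dir points at a writable location.
import aabook_fetch

aabook_fetch.create_directory_if_not_exists(aabook_fetch.meta_dir)
aabook_fetch.create_directory_if_not_exists(aabook_fetch.novel_dir)
# Fetch list + details for books updated since 2025-01-01 without downloading them
aabook_fetch.get_update('2025-01-01', fetch_all=False)
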
aabook/bak/config.py (Normal file, 27 lines added)
@@ -0,0 +1,27 @@
import logging
import os
import inspect
from datetime import datetime

home_dir = os.path.expanduser("~")
global_host_data_dir = f'{home_dir}/hostdir/scripts_data'
global_share_data_dir = f'{home_dir}/sharedata'

# Configure logging for the calling script.
def setup_logging(log_filename=None):
    # If no log_filename is given, derive one from the calling script's name
    if log_filename is None:
        # Name of the script that called setup_logging
        caller_frame = inspect.stack()[1]
        caller_filename = os.path.splitext(os.path.basename(caller_frame.filename))[0]

        # Current date as yyyymmdd
        current_date = datetime.now().strftime('%Y%m%d')
        # Append the date to the log file name, before the extension
        log_filename = f'./log/{caller_filename}_{current_date}.log'

    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s',
                        handlers=[
                            logging.FileHandler(log_filename),
                            logging.StreamHandler()
                        ])
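
logging.FileHandler does not create missing directories, so setup_logging with the default filename assumes ./log already exists; a minimal caller-side sketch (the os.makedirs call illustrates how a caller could guarantee this, it is not part of the committed code):

import os
import config

os.makedirs('./log', exist_ok=True)  # FileHandler raises FileNotFoundError if ./log is missing
config.setup_logging()               # defaults to ./log/<caller-script>_<yyyymmdd>.log
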
aabook/bak/down_list.py (Normal file, 126 lines added)
@@ -0,0 +1,126 @@
# Novel id -> title map (duplicate entries removed)
novel_map_new = {
    138219: '我的将军生涯',
    6548: '我和我哥们的女友的女友的故事',
    605: '我的支书生涯',
    203144: '我的校长生涯',
}

# Novel id -> title map (used by download_map)
novel_map = {
    371300: '临时夫妻',
}


novel_map_done = {
    5479: '倚天屠龙记(成人版)',
    269: '雪域往事',
    156643: '都市偷心龙爪手',
    85227: '明星潜规则之皇',
    88155: '娇娇师娘(与爱同行)',
    12829: '女人四十一枝花',
    116756: '风雨里的罂粟花',
    320500: '豪门浪荡史',
    329495: '女市长迷途沉沦:权斗',
    159927: '豪乳老师刘艳',
    308650: '山里的女人',
    163322: '翁媳乱情',
    103990: '欲望之门',
    59793: '红尘都市',
    231646: '那山,那人,那情',
    61336: '妻欲:欲望迷城(H 版)',
    104929: '都市奇缘',
    239682: '叶辰风流',
    261481: '我本风流',
    171107: '爱与欲的升华',
    171029: '亲爱的不要离开我',
    5049: '红楼春梦',
    71468: '襄阳战记',
    29242: '仙剑淫女传',
    237271: '新倚天行',
    231192: '神雕侠绿',
    31882: '新编蜗居H版',
    230877: '黄蓉的改变',
    187150: '黄蓉襄阳淫史',
    316162: '洛玉衡的堕落(大奉打更人H)',
    7678: '射雕别记(黄蓉的故事)',
    185302: '天地之间(精修版)',
    54344: '情欲两极 (情和欲的两极)',
    2072: '父女情',
    214610: '大黄的故事',
    2211: '隔墙有眼',
    221453: '当维修工的日子',
    153792: '荒村红杏',
    186052: '食色男女',
    68: '童年+静静的辽河',
    322665: '乡村活寡美人沟',
    160528: '我和我的母亲(改写寄印传奇)',
    23228: '风流人生',
    181617: '红楼遗秘',
    219454: '寻秦记(全本改编版)',
    49051: '情色搜神记',
    5860: '天若有情(一家之主)',
    161497: '步步高升',
    51870: '母爱的光辉',
    258388: '露从今夜白',
    202281: '异地夫妻',
    1960: '北方的天空',
    164544: '少妇的悲哀',
    158872: '我的极品老婆',
    3975: '出轨的诱惑',
    26442: '爱满江城',
    7776: '小城乱事',
    179710: '淫男乱女(小雄性事)',
    79161: '情迷芦苇荡:山乡艳事',
    99885: '江南第一风流才子(唐伯虎淫传)',
    54426: '水浒潘金莲',
    327794: '枕瑶钗([清]东涧老人)',
    161243: '我的青年岁月',
    137885: '破碎的命运',
    159266: '我的好儿媳(极品好儿媳)',
    166534: '女友与睡在隔壁的兄弟',
    40646: '女子医院的男医生',
    61535: '魅骨少妇(苏樱的暧昧情事)',
    13166: '青春性事:一个八零后的情欲往事',
    21563: '幸福的借种经历',
    51916: '乱情家庭',
    26787: '少妇人妻的欲望',
    59610: '金瓶梅(崇祯原本)',
    322155: '少年阿宾',
    89532: '宋家湾那些事儿',
    297078: '熟透了的村妇',
    350314: '多情村妇',
    53823: '蛮荒小村的风流韵事',
    82570: '潭河峪的那些事儿',
    72429: '杨家将外传_薛家将秘史',
    410: '农村的妞',
    37443: '山里人家',
    28478: '追忆平凡年代的全家故事',
    199014: '风流岁月',
    59737: '丝之恋-我与一对母女的故事',
    14733: '乡村乱情|奇思妙想',
    43: '空空幻',
    3858: '绿头巾',
    13483: '乡野欲潮:绝色村嫂的泛滥春情',
    67423: '欲海沉沦:一个换妻经历者的良心忏悔',
    51776: '我成了父亲与妻子的月老',
    54192: '郝叔和他的女人',
    68339: '和护士后妈生活的日子',
    15168: '妻子的会客厅:高官的秘密',
    7064: '男欢女爱',
    50555: '人生得意须纵欢',
    67114: '潜色官迹:小所长孽欲涅盘',
    1487: '神雕风流',
    4951: '合租情缘(出租屋里的真实换妻记录)',
    4701: '艰难的借种经历',
    162845: '人妻牌坊——我和人妻的故事',
    183692: '幸福家庭背后的隐私',
    140605: '东北大炕',
    24344: '淫乱一家亲(超级乱伦家庭)',
    25154: '全家人互爱共乐的日子',
    16941: '平凡的激情',
    70767: '合家欢',
}
aabook/bak/utils.py (Normal file, 122 lines added)
@@ -0,0 +1,122 @@
import requests
from bs4 import BeautifulSoup
from ebooklib import epub
import re
import os
import json
import time
import csv
import logging
from datetime import datetime
import config


# Extract the timestamp from a string like "创建时间 2025-03-08 13:57:00".
def extract_create_time(input_str):
    pattern = r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'
    match = re.search(pattern, input_str)
    if match:
        datetime_str = match.group(0)
        return datetime_str
    else:
        return input_str


# Extract the numeric id from a string like "read-374864.html".
def extract_page_num(page_str, default_num=0):
    pattern = r'read-(\d+)\.html'
    match = re.search(pattern, page_str)
    if match:
        number = match.group(1)
        return number
    else:
        return default_num


# Extract the numeric id from a string like "book-5549.html".
def extract_book_num(page_str, default_num=0):
    pattern = r'book-(\d+)\.html'
    match = re.search(pattern, page_str)
    if match:
        number = match.group(1)
        return number
    else:
        return default_num


# Strip the square brackets from a category like "[都市]".
def remove_brackets_regex(input_str):
    pattern = r'\[(.*?)\]'
    match = re.match(pattern, input_str)
    if match:
        return match.group(1)
    return input_str


# Fetch the content of one novel chapter.
def fetch_chapter(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # The selector must be adjusted to the actual page structure
        chapter_content = soup.find('div', class_='chapter-content').get_text()
        return chapter_content
    except requests.RequestException as e:
        print(f"Request error: {e}")
        return None


# Generate an EPUB file from (title, content) chapter tuples.
def generate_epub(title, author, chapters, path):
    book = epub.EpubBook()
    book.set_title(title)
    book.set_language('zh')
    book.add_author(author)

    epub_chapters = []
    for chapter_title, chapter_content in chapters:
        c = epub.EpubHtml(title=chapter_title, file_name=f'{chapter_title}.xhtml', lang='zh')
        c.content = f'<h1>{chapter_title}</h1><p>{chapter_content}</p>'
        book.add_item(c)
        epub_chapters.append(c)

    # Table of contents
    book.toc = tuple(epub_chapters)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # Basic stylesheet
    style = 'body { font-family: Times, serif; }'
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
    book.add_item(nav_css)

    # Reading order (spine)
    book.spine = ['nav'] + epub_chapters

    # Write the EPUB file
    epub.write_epub(f'{path}/{title}.epub', book, {})


# Example usage
if __name__ == "__main__":
    # Replace these with real chapter links
    chapter_info = [
        ('第一章', 'https://example.com/chapter1'),
        ('第二章', 'https://example.com/chapter2')
    ]
    title = '小说标题'
    author = '小说作者'

    chapters = []
    for chapter_title, url in chapter_info:
        content = fetch_chapter(url)
        if content:
            chapters.append((chapter_title, content))

    if chapters:
        generate_epub(title, author, chapters, '.')  # write the EPUB into the current directory
        print(f'{title}.epub generated successfully.')
    else:
        print('No valid chapter content fetched; EPUB not generated.')