From 0cfd87b55514989b4200ee497a96f779f7696335 Mon Sep 17 00:00:00 2001
From: oscarz
Date: Sun, 23 Mar 2025 10:59:35 +0800
Subject: [PATCH] modify scripts

---
Review notes (this section is ignored by `git am`):

* The previous parse_content_page() is kept under the new name
  parse_content_page2(); its body is not changed by this patch.
* A new parse_content_page(soup, url) is added. It collects the result of
  process_paragraph() for every <p> and <h1> element, keeping only truthy
  results. If that yields nothing, it runs process_paragraph() on the whole
  soup, re-parses the returned markup with BeautifulSoup's 'html.parser' and
  returns the document's non-empty stripped strings.
* The in-code comment says the fallback also removes watermarks; that
  presumably happens inside process_paragraph(), which is not part of this
  diff -- confirm.
* NOTE(review): `url` is unused by the new function, the `if paragraphs:`
  guard is redundant before the `for` loop, and `stripped_strings` already
  yields stripped, non-empty text, so the extra `.strip()` calls are no-ops.
* Line breaks and indentation were restored from a flattened copy of this
  patch; blank-line placement and 4-space indentation follow the hunk line
  counts but should be verified against the target file before applying.

 aabook/src/scraper.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/aabook/src/scraper.py b/aabook/src/scraper.py
index 3b84a45..99982cc 100644
--- a/aabook/src/scraper.py
+++ b/aabook/src/scraper.py
@@ -277,7 +277,7 @@ def process_paragraph(paragraph):
     return cleaned_text
 
 # 解析内容页
-def parse_content_page(soup, url):
+def parse_content_page2(soup, url):
     content = []
     paragraphs = soup.find_all('p')
     if paragraphs:
@@ -298,6 +298,26 @@ def parse_content_page(soup, url):
     return content
 
 
+def parse_content_page(soup, url):
+    content = []
+
+    # 提取所有 p 标签和 h1 标签
+    paragraphs = soup.find_all(['p', 'h1'])
+    if paragraphs:
+        for paragraph in paragraphs:
+            cleaned_text = process_paragraph(paragraph)
+            if cleaned_text:
+                content.append(cleaned_text)
+
+    # 如果没有找到 p 或 h1,再兜底提取所有文本,同时移除水印
+    if not content:
+        cleaned_html = process_paragraph(soup)
+        cleaned_soup = BeautifulSoup(cleaned_html, 'html.parser')
+        content = [block.strip() for block in cleaned_soup.stripped_strings if block.strip()]
+
+    return content
+
+
 # 通用的 HTML 结构验证器
 def generic_validator(soup, tag, identifier, attr_type="id"):
     if attr_type == "id":