modify scripts
@@ -277,7 +277,7 @@ def process_paragraph(paragraph):
     return cleaned_text
 
 # Parse the content page
-def parse_content_page(soup, url):
+def parse_content_page2(soup, url):
     content = []
     paragraphs = soup.find_all('p')
     if paragraphs:
@@ -298,6 +298,26 @@ def parse_content_page(soup, url):
 
     return content
 
+def parse_content_page(soup, url):
+    content = []
+
+    # Extract all p and h1 tags
+    paragraphs = soup.find_all(['p', 'h1'])
+    if paragraphs:
+        for paragraph in paragraphs:
+            cleaned_text = process_paragraph(paragraph)
+            if cleaned_text:
+                content.append(cleaned_text)
+
+    # If no p or h1 tags were found, fall back to extracting all text while also removing the watermark
+    if not content:
+        cleaned_html = process_paragraph(soup)
+        cleaned_soup = BeautifulSoup(cleaned_html, 'html.parser')
+        content = [block.strip() for block in cleaned_soup.stripped_strings if block.strip()]
+
+    return content
+
+
 # Generic HTML structure validator
 def generic_validator(soup, tag, identifier, attr_type="id"):
     if attr_type == "id":
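For reference, a minimal sketch of how the reworked parse_content_page might be driven end to end. The requests-based fetch, the fetch_and_parse helper name, and the example URL are illustrative assumptions and are not part of this commit; the sketch assumes it lives in the same module as parse_content_page and process_paragraph.

    # Illustrative usage sketch -- the fetch logic, helper name, and URL below
    # are assumptions for demonstration, not code from this repository.
    import requests
    from bs4 import BeautifulSoup

    def fetch_and_parse(url):
        # Download the page and pass the parsed soup to parse_content_page,
        # which prefers <p>/<h1> text and falls back to all stripped strings.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return parse_content_page(soup, url)

    if __name__ == '__main__':
        blocks = fetch_and_parse('https://example.com/page')  # placeholder URL
        print('\n'.join(blocks))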