From 5f835443e879e18e3f4319f0552a74cecca6449c Mon Sep 17 00:00:00 2001 From: oscarz Date: Sun, 23 Mar 2025 10:25:25 +0800 Subject: [PATCH] modify scripts --- aabook/src/scraper.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/aabook/src/scraper.py b/aabook/src/scraper.py index 302690a..3b84a45 100644 --- a/aabook/src/scraper.py +++ b/aabook/src/scraper.py @@ -292,6 +292,10 @@ def parse_content_page(soup, url): cleaned_text = process_paragraph(paragraph) content.append(cleaned_text) + # 某些页面只有 <br> 标签,soup.stripped_strings:返回去除空白后的所有文本节点。 + if len(content) == 0: + content = [block.strip() for block in soup.stripped_strings if block.strip()] + return content # 通用的 HTML 结构验证器