diff --git a/aabook/src/scraper.py b/aabook/src/scraper.py index 302690a..3b84a45 100644 --- a/aabook/src/scraper.py +++ b/aabook/src/scraper.py @@ -292,6 +292,10 @@ def parse_content_page(soup, url): cleaned_text = process_paragraph(paragraph) content.append(cleaned_text) + # 某些页面只有 <br> 标签,soup.stripped_strings:返回去除空白后的所有文本节点。 + if len(content) == 0: + content = [block.strip() for block in soup.stripped_strings if block.strip()] + return content # 通用的 HTML 结构验证器