modify scripts

This commit is contained in:
oscarz
2025-03-23 10:25:25 +08:00
parent 37b82e5e5c
commit 5f835443e8

View File

@@ -292,6 +292,10 @@ def parse_content_page(soup, url):
cleaned_text = process_paragraph(paragraph) cleaned_text = process_paragraph(paragraph)
content.append(cleaned_text) content.append(cleaned_text)
# 某些页面只有 <br> 标签；soup.stripped_strings 返回去除空白后的所有文本节点。
if len(content) == 0:
content = [block.strip() for block in soup.stripped_strings if block.strip()]
return content return content
# 通用的 HTML 结构验证器 # 通用的 HTML 结构验证器