modify scripts
This commit is contained in:
@ -292,6 +292,10 @@ def parse_content_page(soup, url):
|
||||
cleaned_text = process_paragraph(paragraph)
|
||||
content.append(cleaned_text)
|
||||
|
||||
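# Some pages contain only <br> tags, so nothing was collected above; fall back to soup.stripped_strings, which yields every text node with surrounding whitespace removed.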
|
||||
if len(content) == 0:
|
||||
content = [block.strip() for block in soup.stripped_strings if block.strip()]
|
||||
|
||||
return content
|
||||
|
||||
# Generic HTML structure validator
|
||||
|
||||
Reference in New Issue
Block a user