modify scripts
This commit is contained in:
@@ -292,6 +292,10 @@ def parse_content_page(soup, url):
|
|||||||
cleaned_text = process_paragraph(paragraph)
|
cleaned_text = process_paragraph(paragraph)
|
||||||
content.append(cleaned_text)
|
content.append(cleaned_text)
|
||||||
|
|
||||||
|
# 某些页面只有<br>标签,soup.stripped_strings:返回去除空白后的所有文本节点。
|
||||||
|
if len(content) == 0:
|
||||||
|
content = [block.strip() for block in soup.stripped_strings if block.strip()]
|
||||||
|
|
||||||
return content
|
return content
|
||||||
|
|
||||||
# 通用的 HTML 结构验证器
|
# 通用的 HTML 结构验证器
|
||||||
|
|||||||
Reference in New Issue
Block a user