modify scripts
This commit is contained in:
@ -277,7 +277,7 @@ def process_paragraph(paragraph):
|
|||||||
return cleaned_text
|
return cleaned_text
|
||||||
|
|
||||||
# 解析内容页
|
# 解析内容页
|
||||||
def parse_content_page(soup, url):
|
def parse_content_page2(soup, url):
|
||||||
content = []
|
content = []
|
||||||
paragraphs = soup.find_all('p')
|
paragraphs = soup.find_all('p')
|
||||||
if paragraphs:
|
if paragraphs:
|
||||||
@ -298,6 +298,26 @@ def parse_content_page(soup, url):
|
|||||||
|
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
def parse_content_page(soup, url):
    """Extract readable text content from a parsed article page.

    Collects cleaned text from every ``<p>`` and ``<h1>`` tag via
    ``process_paragraph``. If no usable text is found that way, falls back
    to cleaning the whole document (which also removes watermarks) and
    extracting all remaining text blocks.

    Args:
        soup: BeautifulSoup document for the content page.
        url: Page URL (kept for interface compatibility; not used here).

    Returns:
        list[str]: Non-empty cleaned text blocks, in document order.
    """
    content = []

    # Primary pass: extract all <p> and <h1> tags.
    paragraphs = soup.find_all(['p', 'h1'])
    if paragraphs:
        for paragraph in paragraphs:
            cleaned_text = process_paragraph(paragraph)
            if cleaned_text:
                content.append(cleaned_text)

    # Fallback: if no <p>/<h1> text was found, clean the whole document
    # (removing watermarks) and pull every remaining text block.
    if not content:
        cleaned_html = process_paragraph(soup)
        cleaned_soup = BeautifulSoup(cleaned_html, 'html.parser')
        # stripped_strings already yields stripped, non-empty strings,
        # so no extra .strip()/filter pass is needed.
        content = list(cleaned_soup.stripped_strings)

    return content
|
||||||
|
|
||||||
|
|
||||||
# 通用的 HTML 结构验证器
|
# 通用的 HTML 结构验证器
|
||||||
def generic_validator(soup, tag, identifier, attr_type="id"):
|
def generic_validator(soup, tag, identifier, attr_type="id"):
|
||||||
if attr_type == "id":
|
if attr_type == "id":
|
||||||
|
|||||||
Reference in New Issue
Block a user