modify scripts
@@ -277,7 +277,7 @@ def process_paragraph(paragraph):
     return cleaned_text
 
 # Parse the content page
-def parse_content_page(soup, url):
+def parse_content_page2(soup, url):
     content = []
     paragraphs = soup.find_all('p')
     if paragraphs:
@@ -298,6 +298,26 @@ def parse_content_page(soup, url):
 
     return content
 
+def parse_content_page(soup, url):
+    content = []
+
+    # Extract all p and h1 tags
+    paragraphs = soup.find_all(['p', 'h1'])
+    if paragraphs:
+        for paragraph in paragraphs:
+            cleaned_text = process_paragraph(paragraph)
+            if cleaned_text:
+                content.append(cleaned_text)
+
+    # If no p or h1 tags were found, fall back to extracting all text while also removing the watermark
+    if not content:
+        cleaned_html = process_paragraph(soup)
+        cleaned_soup = BeautifulSoup(cleaned_html, 'html.parser')
+        content = [block.strip() for block in cleaned_soup.stripped_strings if block.strip()]
+
+    return content
+
+
 # Generic HTML structure validator
 def generic_validator(soup, tag, identifier, attr_type="id"):
     if attr_type == "id":
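For reference, a minimal sketch of how the reworked parse_content_page might be driven end to end. The requests-based fetch, the fetch_and_parse helper name, and the example URL are illustrative assumptions and are not part of this commit; the sketch assumes it lives in the same module as parse_content_page and process_paragraph.

    # Illustrative usage sketch -- the fetch logic, helper name, and URL below
    # are assumptions for demonstration, not code from this repository.
    import requests
    from bs4 import BeautifulSoup

    def fetch_and_parse(url):
        # Download the page and pass the parsed soup to parse_content_page,
        # which prefers <p>/<h1> text and falls back to all stripped strings.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return parse_content_page(soup, url)

    if __name__ == '__main__':
        blocks = fetch_and_parse('https://example.com/page')  # placeholder URL
        print('\n'.join(blocks))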