modify scripts

This commit is contained in:
oscarz
2025-03-23 10:59:35 +08:00
parent 5f835443e8
commit 0cfd87b555

View File

@@ -277,7 +277,7 @@ def process_paragraph(paragraph):
return cleaned_text
# 解析内容页
def parse_content_page(soup, url):
def parse_content_page2(soup, url):
content = []
paragraphs = soup.find_all('p')
if paragraphs:
@@ -298,6 +298,26 @@ def parse_content_page(soup, url):
return content
def parse_content_page(soup, url):
    """Collect the cleaned text blocks of a content page.

    Every ``<p>`` and ``<h1>`` element found in ``soup`` is passed through
    ``process_paragraph``; results that come back falsy are dropped. When
    that yields nothing, the whole document is cleaned instead and its
    text is split into individual stripped strings.

    Args:
        soup: Parsed document to search (must support ``find_all``).
        url: Address of the page. Not used inside this function.

    Returns:
        A list of non-empty text blocks in document order.
    """
    # Primary path: gather every <p> and <h1> tag and keep the non-empty
    # cleaned results.
    content = [
        text
        for text in map(process_paragraph, soup.find_all(['p', 'h1']))
        if text
    ]
    if content:
        return content

    # Fallback: nothing usable came from <p>/<h1>, so clean the entire
    # document and re-parse the result to pull out all remaining text.
    # NOTE(review): the original comment says this step also removes
    # watermarks -- presumably done by process_paragraph; confirm there.
    fallback_soup = BeautifulSoup(process_paragraph(soup), 'html.parser')
    return [
        piece.strip()
        for piece in fallback_soup.stripped_strings
        if piece.strip()
    ]
# 通用的 HTML 结构验证器
def generic_validator(soup, tag, identifier, attr_type="id"):
if attr_type == "id":