From 7a46b1bc4c0a6f8810cf531a6c2d9d4cb0580365 Mon Sep 17 00:00:00 2001
From: oscarz
Date: Wed, 2 Jul 2025 19:19:18 +0800
Subject: [PATCH] modify scripts

---
Review notes (free-form text after the three-dash line; not part of the
commit message):
  * Adds a module-level helper, extract_title(), and makes U001Spider fill
    item['title'] through it instead of reading a::attr(title) directly.
  * extract_title() tries, in order: the stripped title attribute, the
    stripped result of XPath string(.), the stripped '::text' fragments
    joined with single spaces, and finally the literal '无标题'.
  * NOTE(review): the helper receives the result of
    row.css('td:nth-child(2) a'); presumably a parsel SelectorList that may
    hold more than one <a> -- confirm which element .attrib and .get() read
    in that case.
  * NOTE(review): the comments in the added lines are written in English;
    the post-image blob id on the index line below was not recomputed.

 .../scrapy_proj/spiders/u3c3_spider.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py b/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py
index 6bf0f9a..ae87757 100644
--- a/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py
+++ b/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py
@@ -2,6 +2,26 @@ import scrapy
 from scrapy_proj.items import U001Item
 from scrapy_proj.utils.size_converter import parse_size
 
+def extract_title(element):
+    """Return the text of an <a> element, preferring a non-empty title attribute."""
+    # Use the title attribute when it is present and not blank
+    title_attr = element.attrib.get('title', '').strip()
+    if title_attr:
+        return title_attr
+
+    # Otherwise use XPath string(.) to collect the text of all descendant nodes
+    full_text = element.xpath('string(.)').get(default='').strip()
+
+    # If that is empty, fall back to the individual text fragments, stripped and joined
+    if not full_text:
+        text_parts = element.css('::text').getall()
+        # Strip every text fragment
+        stripped_parts = [part.strip() for part in text_parts]
+        # Drop empty strings and join the rest with single spaces
+        full_text = ' '.join(filter(None, stripped_parts))
+
+    return full_text or '无标题'  # Always return at least the placeholder "无标题"
+
 class U001Spider(scrapy.Spider):
     name = "u3c3"
     allowed_domains = ["u001.25img.com"]
@@ -11,7 +31,7 @@ class U001Spider(scrapy.Spider):
         for row in response.css('table.torrent-list tbody tr'):
             item = U001Item()
             item['category'] = row.css('td:nth-child(1) a::attr(title)').get()
-            item['title'] = row.css('td:nth-child(2) a::attr(title)').get()
+            item['title'] = extract_title(row.css('td:nth-child(2) a'))
             item['url'] = response.urljoin(row.css('td:nth-child(2) a::attr(href)').get())
 
             links = row.css('td:nth-child(3) a::attr(href)').getall()