modify scripts
@@ -2,6 +2,26 @@ import scrapy
 from scrapy_proj.items import U001Item
 from scrapy_proj.utils.size_converter import parse_size
+
+
+def extract_title(element):
+    """Extract the text content of an <a> tag, preferring a non-empty title attribute."""
+    # Check whether the title attribute exists and is not an empty string
+    title_attr = element.attrib.get('title', '').strip()
+    if title_attr:
+        return title_attr
+
+    # Otherwise use XPath's string(.) to get the text of all descendant nodes
+    full_text = element.xpath('string(.)').get(default='').strip()
+
+    # If the result is empty, collect all text fragments, strip each, and join them
+    if not full_text:
+        text_parts = element.css('::text').getall()
+        # Strip each text fragment
+        stripped_parts = [part.strip() for part in text_parts]
+        # Filter out empty strings and concatenate
+        full_text = ' '.join(filter(None, stripped_parts))
+
+    return full_text or '无标题'  # Make sure at least "无标题" ("untitled") is returned
+
+
 class U001Spider(scrapy.Spider):
     name = "u3c3"
     allowed_domains = ["u001.25img.com"]
@@ -11,7 +31,7 @@ class U001Spider(scrapy.Spider):
         for row in response.css('table.torrent-list tbody tr'):
             item = U001Item()
             item['category'] = row.css('td:nth-child(1) a::attr(title)').get()
-            item['title'] = row.css('td:nth-child(2) a::attr(title)').get()
+            item['title'] = extract_title(row.css('td:nth-child(2) a'))
             item['url'] = response.urljoin(row.css('td:nth-child(2) a::attr(href)').get())
 
             links = row.css('td:nth-child(3) a::attr(href)').getall()
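Usage sketch (not part of the commit): the snippet below is a minimal, hypothetical demonstration of how the new extract_title() helper resolves a title, using scrapy.Selector on made-up HTML modeled on one row of the torrent-list table. The helper body is an abridged copy of the function added above; the HTML, URLs, and printed results are illustrative assumptions.

# Standalone sketch, not part of the commit: it exercises extract_title() on
# invented HTML shaped like one <tr> of the torrent-list table the spider parses.
# scrapy.Selector is the same selector class that backs response.css()/xpath().
from scrapy.selector import Selector


def extract_title(element):
    # Abridged copy of the helper added in this commit.
    title_attr = element.attrib.get('title', '').strip()
    if title_attr:
        return title_attr
    full_text = element.xpath('string(.)').get(default='').strip()
    if not full_text:
        parts = element.css('::text').getall()
        full_text = ' '.join(filter(None, (p.strip() for p in parts)))
    return full_text or '无标题'


# Invented row: the first <a> has a usable title attribute, the second has an
# empty one and keeps its visible text split across child nodes.
html = '''
<table class="torrent-list"><tbody><tr>
  <td><a href="/cat/1" title="Some category">icon</a></td>
  <td><a href="/view/1" title="">  Truncated <b>display</b> title </a></td>
</tr></tbody></table>
'''
row = Selector(text=html).css('table.torrent-list tbody tr')

# First cell: the non-empty title attribute wins.
print(extract_title(row.css('td:nth-child(1) a')))   # Some category
# Second cell: empty title attribute, so string(.) over the descendants is used.
print(extract_title(row.css('td:nth-child(2) a')))   # Truncated display title

Compared to the old `::attr(title)` selector, the helper keeps working when the title attribute is missing or blank, falling back to the tag's own text before finally returning the placeholder string.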