This repository has been archived on 2026-01-07. You can view files and clone it, but cannot push or open issues or pull requests.
Files
resources/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py
2025-07-02 19:19:18 +08:00

52 lines
2.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import scrapy
from scrapy_proj.items import U001Item
from scrapy_proj.utils.size_converter import parse_size
def extract_title(element):
    """Return display text for an <a> selector, preferring a non-empty title attribute.

    Falls back to the aggregated descendant text, then to joining individual
    stripped text fragments; always returns a non-empty string.
    """
    # A non-empty title attribute wins outright.
    attr_title = element.attrib.get('title', '').strip()
    if attr_title:
        return attr_title
    # Otherwise aggregate all descendant text via XPath's string(.) function.
    aggregated = element.xpath('string(.)').get(default='').strip()
    if not aggregated:
        # Last resort: strip each text fragment and join the non-empty ones.
        fragments = (piece.strip() for piece in element.css('::text').getall())
        aggregated = ' '.join(piece for piece in fragments if piece)
    return aggregated or '无标题'  # guarantee a placeholder title ("untitled")
class U001Spider(scrapy.Spider):
    """Spider for the u001.25img.com torrent listing.

    Walks the paginated torrent table, yielding one U001Item per row, and
    follows ``?p=N`` pagination while the inline ``totalPages`` count allows.
    """

    name = "u3c3"
    allowed_domains = ["u001.25img.com"]
    start_urls = ["https://u001.25img.com/?p=1"]

    def parse(self, response):
        """Yield a U001Item for each table row, then request the next page."""
        for row in response.css('table.torrent-list tbody tr'):
            item = U001Item()
            item['category'] = row.css('td:nth-child(1) a::attr(title)').get()
            item['title'] = extract_title(row.css('td:nth-child(2) a'))
            item['url'] = response.urljoin(row.css('td:nth-child(2) a::attr(href)').get())
            # First link is the .torrent download; second (if present) the magnet URI.
            links = row.css('td:nth-child(3) a::attr(href)').getall()
            item['torrent_url'] = response.urljoin(links[0]) if links else ''
            item['magnet_url'] = links[1] if len(links) > 1 else ''
            size_text = row.css('td:nth-child(4)::text').get(default='').strip()
            item['size_text'] = size_text
            item['size_gb'] = parse_size(size_text)
            item['update_date'] = row.css('td:nth-child(5)::text').get(default='').strip()
            yield item

        # Pagination: current page comes from the URL's ?p= value; the total
        # page count from an inline script containing "totalPages: N".
        try:
            current_page = int(response.url.rsplit('=', 1)[-1])
        except ValueError:
            # URL without a numeric ?p= suffix (e.g. the bare domain): page 1.
            current_page = 1
        total_pages_raw = response.css('script:contains("totalPages")').re_first(
            r'totalPages:\s*(\d+)'
        )
        # BUG FIX: the original passed re_first()'s result straight to int(),
        # which raises TypeError when no matching script exists (re_first
        # returns None). Skip pagination instead of crashing the callback.
        if total_pages_raw is not None and current_page < int(total_pages_raw):
            yield response.follow(f"?p={current_page + 1}", self.parse)