This repository has been archived on 2026-01-07. You can view files and clone it, but cannot push or open issues or pull requests.
Files
resources/scrapy_proj/scrapy_proj/spiders/u3c3_spider.py
2025-07-02 19:19:18 +08:00

52 lines
2.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import scrapy
from scrapy_proj.items import U001Item
from scrapy_proj.utils.size_converter import parse_size
def extract_title(element):
    """Return display text for an <a> selector, preferring a non-empty title attribute.

    Falls back to the aggregated descendant text, then to joining individual
    stripped text fragments; always returns a non-empty string.
    """
    # A non-empty title attribute wins outright.
    attr_title = element.attrib.get('title', '').strip()
    if attr_title:
        return attr_title
    # Otherwise aggregate all descendant text via XPath's string(.) function.
    aggregated = element.xpath('string(.)').get(default='').strip()
    if not aggregated:
        # Last resort: strip each text fragment and join the non-empty ones.
        fragments = (piece.strip() for piece in element.css('::text').getall())
        aggregated = ' '.join(piece for piece in fragments if piece)
    return aggregated or '无标题'  # guarantee a placeholder title ("untitled")
class U001Spider(scrapy.Spider):
    """Spider for the u001.25img.com torrent listing.

    Walks the paginated torrent table, yielding one U001Item per row, and
    follows ``?p=N`` pagination while the inline ``totalPages`` count allows.
    """

    name = "u3c3"
    allowed_domains = ["u001.25img.com"]
    start_urls = ["https://u001.25img.com/?p=1"]

    def parse(self, response):
        """Yield a U001Item for each table row, then request the next page."""
        for row in response.css('table.torrent-list tbody tr'):
            item = U001Item()
            item['category'] = row.css('td:nth-child(1) a::attr(title)').get()
            item['title'] = extract_title(row.css('td:nth-child(2) a'))
            item['url'] = response.urljoin(row.css('td:nth-child(2) a::attr(href)').get())
            # First link is the .torrent download; second (if present) the magnet URI.
            links = row.css('td:nth-child(3) a::attr(href)').getall()
            item['torrent_url'] = response.urljoin(links[0]) if links else ''
            item['magnet_url'] = links[1] if len(links) > 1 else ''
            size_text = row.css('td:nth-child(4)::text').get(default='').strip()
            item['size_text'] = size_text
            item['size_gb'] = parse_size(size_text)
            item['update_date'] = row.css('td:nth-child(5)::text').get(default='').strip()
            yield item

        # Pagination: current page comes from the URL's ?p= value; the total
        # page count from an inline script containing "totalPages: N".
        try:
            current_page = int(response.url.rsplit('=', 1)[-1])
        except ValueError:
            # URL without a numeric ?p= suffix (e.g. the bare domain): page 1.
            current_page = 1
        total_pages_raw = response.css('script:contains("totalPages")').re_first(
            r'totalPages:\s*(\d+)'
        )
        # BUG FIX: the original passed re_first()'s result straight to int(),
        # which raises TypeError when no matching script exists (re_first
        # returns None). Skip pagination instead of crashing the callback.
        if total_pages_raw is not None and current_page < int(total_pages_raw):
            yield response.follow(f"?p={current_page + 1}", self.parse)