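"""Scrapy spider for the u001.25img.com listing site.

Walks the paginated index (?p=N), extracts one item per row of the
table.torrent-list table (category, title, page URL, torrent/magnet links,
size and update date), and stops paginating once every row on a page is
older than the optional `begin` date.
"""
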
import scrapy

from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
from scrapy_proj.items import U001Item
from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime


class U001Spider(BaseSpider):
    name = "u3c3"
    allowed_domains = ["u001.25img.com"]
    start_urls = ["https://u001.25img.com/?p=1"]

    def __init__(self, debug='False', begin=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Treat "true"/"1" (case-insensitive) as enabling debug mode.
        self.debug = str(debug).lower() in ('true', '1')
        self.logger.info(f"debug mode: {self.debug}")

        # Earliest update date to keep; rows older than this stop pagination.
        self.begin = parse_date_to_datetime(begin) if begin else None
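
    # Example invocation (a sketch; assumes the standard Scrapy CLI and that
    # parse_date_to_datetime accepts a date string such as "2024-01-01" --
    # adjust the format to whatever that helper actually supports):
    #   scrapy crawl u3c3 -a debug=true -a begin=2024-01-01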

    # Entry point; triggered by a method of the base class.
    def _parse(self, response):
        need_next = False
        for row in response.css('table.torrent-list tbody tr'):
            item = U001Item()
            item['category'] = row.css('td:nth-child(1) a::attr(title)').get()
            item['title'] = extract_text_from_element(row.css('td:nth-child(2) a'), use_title=True)
            item['url'] = response.urljoin(row.css('td:nth-child(2) a::attr(href)').get())

            # The third column holds the torrent file link and the magnet link.
            links = row.css('td:nth-child(3) a::attr(href)').getall()
            item['torrent_url'] = response.urljoin(links[0]) if links else ''
            item['magnet_url'] = links[1] if len(links) > 1 else ''

            size_text = row.css('td:nth-child(4)::text').get(default='').strip()
            item['size_text'] = size_text
            item['size_gb'] = parse_size(size_text)

            item['update_date'] = row.css('td:nth-child(5)::text').get(default='').strip()

            # Decide whether to keep paginating: stop only when every row on the
            # page is older than the begin date.
            up_date = parse_date_to_datetime(item['update_date'])
            if up_date and self.begin and up_date < self.begin:
                self.logger.debug("row is older than the begin date; skipping.")
            else:
                need_next = True
                yield item

        if need_next:
            # Pagination: the page number comes from the ?p=N query parameter.
            current_page = int(response.url.split('=')[-1])
            # totalPages is embedded in an inline script (e.g. "totalPages: 42");
            # fall back to the current page (i.e. stop) if it cannot be found.
            total_pages_text = response.css('script:contains("totalPages")').re_first(r'totalPages:\s*(\d+)')
            total_pages = int(total_pages_text) if total_pages_text else current_page
            if current_page < total_pages:
                if self.debug and current_page >= 5:
                    self.logger.info(f"debug mode: stopping pagination at page {current_page}.")
                else:
                    yield response.follow(f"?p={current_page + 1}", self._parse)
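

# A minimal programmatic entry point for local testing (a sketch: it assumes the
# scrapy_proj package is importable from the working directory and relies only on
# Scrapy's standard CrawlerProcess API, which forwards keyword arguments to the
# spider's __init__; the begin format shown is an assumption, since it depends on
# parse_date_to_datetime). The usual way to run the spider is still `scrapy crawl u3c3`.
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(U001Spider, debug='true', begin='2024-01-01')
    process.start()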