resources/scrapy_proj/scrapy_proj/extensions/failure_monitor.py
# extensions/failure_monitor.py
import time

from scrapy import signals
from scrapy.exceptions import NotConfigured


class FailureMonitorExtension:
    def __init__(self, crawler, max_consecutive_failures, failure_rate_threshold, time_window, min_requests):
        self.crawler = crawler
        self.max_consecutive_failures = max_consecutive_failures
        self.failure_rate_threshold = failure_rate_threshold
        self.time_window = time_window  # seconds
        self.min_requests = min_requests
        self.consecutive_failures = 0
        self.total_requests = 0
        self.failed_requests = 0
        # (timestamp, failed) records inside the time window, used to compute the failure rate
        self.request_times = []
    @classmethod
    def from_crawler(cls, crawler):
        # Read the thresholds from the crawler settings
        max_consecutive = crawler.settings.getint('EXT_FAIL_MONI_MAX_CONSECUTIVE_FAILURES', 100)
        failure_rate = crawler.settings.getfloat('EXT_FAIL_MONI_RATE_THRESHOLD', 0.5)
        time_window = crawler.settings.getint('EXT_FAIL_MONI_FAILURE_TIME_WINDOW', 60)
        min_requests = crawler.settings.getint('EXT_FAIL_MONI_MIN_REQUESTS', 10)
        if max_consecutive <= 0 and failure_rate <= 0:
            raise NotConfigured
        ext = cls(crawler, max_consecutive, failure_rate, time_window, min_requests)
        # Register the signal handlers
        crawler.signals.connect(ext.request_succeeded, signal=signals.response_received)
        crawler.signals.connect(ext.request_dropped, signal=signals.request_dropped)
        crawler.signals.connect(ext.spider_error, signal=signals.spider_error)
        return ext
    def request_succeeded(self, response, request, spider):
        self.consecutive_failures = 0  # reset the consecutive-failure counter
        self.request_times.append((time.time(), False))
        self._cleanup_old_requests()  # drop records outside the time window

    def request_dropped(self, request, spider):
        '''Sent when a Request, scheduled by the engine to be downloaded later, is rejected by the scheduler.'''
        spider.logger.warning(f"request_dropped on url {request.url}")
        self.calculate_failure(spider)
    def spider_error(self, failure, response, spider):
        '''
        Sent when a spider callback generates an error (i.e. raises an exception).
        https://docs.scrapy.org/en/latest/topics/signals.html#request-failed
        '''
        # Core filter: ignore "failures" raised while handling redirect responses
        if response.status in [301, 302, 307, 308]:
            spider.logger.info(f"Ignoring {response.status} redirect on {response.url}")
            return  # do not count this as a failure
        self.calculate_failure(spider)
    def calculate_failure(self, spider):
        self.consecutive_failures += 1
        self.request_times.append((time.time(), True))
        self._cleanup_old_requests()
        # Check the consecutive-failure limit
        if self.max_consecutive_failures > 0 and self.consecutive_failures >= self.max_consecutive_failures:
            spider.logger.error(f"Consecutive failure limit reached ({self.consecutive_failures}/{self.max_consecutive_failures}); closing spider")
            self.crawler.engine.close_spider(spider, 'consecutive_failures_exceeded')
        # Check the failure rate within the time window
        if self.total_requests >= self.min_requests and self.failure_rate_threshold > 0:
            current_failure_rate = self.failed_requests / self.total_requests
            if current_failure_rate >= self.failure_rate_threshold:
                spider.logger.error(f"Failure rate above threshold ({current_failure_rate:.2%} >= {self.failure_rate_threshold:.2%}); closing spider")
                self.crawler.engine.close_spider(spider, 'failure_rate_exceeded')
    def _cleanup_old_requests(self):
        """Drop records that fall outside the time window and recompute the windowed counters."""
        cutoff_time = time.time() - self.time_window
        self.request_times = [(t, failed) for t, failed in self.request_times if t >= cutoff_time]
        # Recompute both counters so the failure rate reflects only the current window
        self.total_requests = len(self.request_times)
        self.failed_requests = sum(1 for _, failed in self.request_times if failed)
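
For context, a minimal sketch of how an extension like this one is typically enabled and tuned in the project's settings.py. The EXTENSIONS entry and the dotted module path scrapy_proj.extensions.failure_monitor are assumptions based on the file path above; the EXT_FAIL_MONI_* setting names and the values shown are the defaults read in from_crawler.

# settings.py (sketch; the module path below is assumed from the repository layout)
EXTENSIONS = {
    "scrapy_proj.extensions.failure_monitor.FailureMonitorExtension": 500,
}

# Thresholds read in from_crawler; the values shown match its defaults.
EXT_FAIL_MONI_MAX_CONSECUTIVE_FAILURES = 100  # consecutive failures before the spider is closed (<= 0 disables this check)
EXT_FAIL_MONI_RATE_THRESHOLD = 0.5            # windowed failure rate that closes the spider (<= 0 disables this check)
EXT_FAIL_MONI_FAILURE_TIME_WINDOW = 60        # sliding window length, in seconds
EXT_FAIL_MONI_MIN_REQUESTS = 10               # minimum requests in the window before the rate check applies

Setting either threshold to zero turns off only that check; from_crawler raises NotConfigured, disabling the extension entirely, only when both are disabled.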