# extensions/failure_monitor.py
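"""Scrapy extension that stops a crawl when failures pile up.

Two independent checks, both configurable via the EXT_FAIL_MONI_* settings:
a consecutive-failure limit, and a failure-rate threshold computed over a
sliding time window (only applied once a minimum number of requests has
been seen).
"""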
from scrapy import signals
from scrapy.exceptions import NotConfigured
import time


class FailureMonitorExtension:
    def __init__(self, crawler, max_consecutive_failures, failure_rate_threshold, time_window, min_requests):
        self.crawler = crawler
        self.max_consecutive_failures = max_consecutive_failures
        self.failure_rate_threshold = failure_rate_threshold
        self.time_window = time_window  # seconds
        self.min_requests = min_requests

        self.consecutive_failures = 0
        self.total_requests = 0
        self.failed_requests = 0
        # (timestamp, failed) pairs, used to compute the failure rate
        # over the sliding time window
        self.request_times = []

    @classmethod
    def from_crawler(cls, crawler):
        # Read the thresholds from the crawler settings
        max_consecutive = crawler.settings.getint('EXT_FAIL_MONI_MAX_CONSECUTIVE_FAILURES', 100)
        failure_rate = crawler.settings.getfloat('EXT_FAIL_MONI_RATE_THRESHOLD', 0.5)
        time_window = crawler.settings.getint('EXT_FAIL_MONI_FAILURE_TIME_WINDOW', 60)
        min_requests = crawler.settings.getint('EXT_FAIL_MONI_MIN_REQUESTS', 10)

        # Disable the extension when both checks are turned off
        if max_consecutive <= 0 and failure_rate <= 0:
            raise NotConfigured

        ext = cls(crawler, max_consecutive, failure_rate, time_window, min_requests)

        # Register the signal handlers
        crawler.signals.connect(ext.request_succeeded, signal=signals.response_received)
        crawler.signals.connect(ext.request_dropped, signal=signals.request_dropped)
        crawler.signals.connect(ext.spider_error, signal=signals.spider_error)

        return ext

    def request_succeeded(self, response, request, spider):
        self.consecutive_failures = 0  # reset the consecutive-failure counter
        self.total_requests += 1
        self.request_times.append((time.time(), False))
        self._cleanup_old_requests()  # drop records outside the time window

    def request_dropped(self, request, spider):
        '''Sent when a Request, scheduled by the engine to be downloaded
        later, is rejected by the scheduler.'''
        spider.logger.warning(f"request_dropped on url {request.url}")
        self.calculate_failure(spider)

    def spider_error(self, failure, response, spider):
        '''
        Sent when a spider callback generates an error (i.e. raises an exception).
        https://docs.scrapy.org/en/latest/topics/signals.html#request-failed
        '''
        # Ignore "failures" triggered by redirect responses (core filtering logic)
        if response.status in (301, 302, 307, 308):
            spider.logger.info(f"Ignoring redirect: {response.url}")
            return  # return immediately; do not count this as a failure
        self.calculate_failure(spider)

    def calculate_failure(self, spider):
        self.consecutive_failures += 1
        self.failed_requests += 1
        self.total_requests += 1
        self.request_times.append((time.time(), True))
        self._cleanup_old_requests()

        # Check the consecutive-failure limit
        if self.max_consecutive_failures > 0 and self.consecutive_failures >= self.max_consecutive_failures:
            spider.logger.error(f"Consecutive-failure limit reached ({self.consecutive_failures}/{self.max_consecutive_failures}), closing spider")
            self.crawler.engine.close_spider(spider, 'consecutive_failures_exceeded')

        # Check the failure rate over the time window
        if self.total_requests >= self.min_requests and self.failure_rate_threshold > 0:
            current_failure_rate = self.failed_requests / self.total_requests
            if current_failure_rate >= self.failure_rate_threshold:
                spider.logger.error(f"Failure rate above threshold ({current_failure_rate:.2%} >= {self.failure_rate_threshold:.2%}), closing spider")
                self.crawler.engine.close_spider(spider, 'failure_rate_exceeded')

    def _cleanup_old_requests(self):
        """Drop request records outside the time window and recompute the
        counters, so the failure rate reflects only the recent window."""
        cutoff_time = time.time() - self.time_window
        self.request_times = [(t, failed) for t, failed in self.request_times if t >= cutoff_time]

        # Recompute both counters from the surviving records; pruning only
        # the total while leaving failed_requests untouched would let the
        # computed failure rate drift above 100%
        self.total_requests = len(self.request_times)
        self.failed_requests = sum(1 for _, failed in self.request_times if failed)
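
# --- Usage sketch (not part of the extension itself) ---
# A minimal example of enabling this extension from a project's settings.py,
# assuming the module is importable as extensions.failure_monitor (per the
# path at the top of this file). The extension priority (500) and the
# threshold values below are illustrative defaults, not prescriptive.
#
# EXTENSIONS = {
#     'extensions.failure_monitor.FailureMonitorExtension': 500,
# }
#
# EXT_FAIL_MONI_MAX_CONSECUTIVE_FAILURES = 100  # <= 0 disables this check
# EXT_FAIL_MONI_RATE_THRESHOLD = 0.5            # <= 0 disables this check
# EXT_FAIL_MONI_FAILURE_TIME_WINDOW = 60        # sliding window, in seconds
# EXT_FAIL_MONI_MIN_REQUESTS = 10               # minimum sample size for the rate check
#
# With these values, the spider closes after 100 consecutive failures, or
# once at least 10 requests fall inside the 60-second window and half or
# more of them have failed.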