From a377e361daf7a4b81789e3d84b3268e690bbe9cb Mon Sep 17 00:00:00 2001
From: sophon
Date: Wed, 30 Jul 2025 09:43:11 +0800
Subject: [PATCH] modify scripts

---
 scrapy_proj/scrapy_proj/custom_scheduler.py    | 25 ++++
 .../scrapy_proj/extensions/failure_monitor.py  |  4 +-
 scrapy_proj/scrapy_proj/settings.py            |  4 +-
 scrapy_proj/scrapy_proj/tools/check_cached.py  | 49 +++++++++++++++++++
 4 files changed, 79 insertions(+), 3 deletions(-)
 create mode 100644 scrapy_proj/scrapy_proj/custom_scheduler.py
 create mode 100644 scrapy_proj/scrapy_proj/tools/check_cached.py

diff --git a/scrapy_proj/scrapy_proj/custom_scheduler.py b/scrapy_proj/scrapy_proj/custom_scheduler.py
new file mode 100644
index 0000000..ce44ead
--- /dev/null
+++ b/scrapy_proj/scrapy_proj/custom_scheduler.py
@@ -0,0 +1,25 @@
+from scrapy.core.scheduler import Scheduler
+from scrapy import signals
+
+class CustomScheduler(Scheduler):
+
+    def enqueue_request(self, request):
+        added = super().enqueue_request(request)
+
+        if not added:
+            is_duplicate = self.df.request_seen(request)
+            reason = "duplicate filtered" if is_duplicate else "queue rejected"
+
+            # ✅ Set meta so the extension can read the drop reason
+            request.meta['_dropreason'] = reason
+
+            # ✅ Send the dropped signal
+            self.crawler.signals.send_catch_log(
+                signal=signals.request_dropped,
+                request=request,
+                spider=self.crawler.spider,
+                reason=reason
+            )
+            self.stats.inc_value("scheduler/dropped")
+
+        return added
\ No newline at end of file
diff --git a/scrapy_proj/scrapy_proj/extensions/failure_monitor.py b/scrapy_proj/scrapy_proj/extensions/failure_monitor.py
index d27d55b..cf9e105 100644
--- a/scrapy_proj/scrapy_proj/extensions/failure_monitor.py
+++ b/scrapy_proj/scrapy_proj/extensions/failure_monitor.py
@@ -43,9 +43,9 @@ class FailureMonitorExtension:
         self._cleanup_old_requests()  # remove requests outside the time window
 
     '''Sent when a Request, scheduled by the engine to be downloaded later, is rejected by the scheduler.'''
-    def request_dropped(self, request, spider):
+    def request_dropped(self, request, spider, reason=None):
         # Get the drop reason from request.meta
-        drop_reason = request.meta.get('_dropreason', 'unknown reason')
+        drop_reason = reason if reason else request.meta.get('_dropreason', 'unknown reason')
         spider.logger.warning(f"request_dropped on url: {request.url} | reason: {drop_reason}")
 
         self.calculate_failure(spider)
diff --git a/scrapy_proj/scrapy_proj/settings.py b/scrapy_proj/scrapy_proj/settings.py
index 7f55951..1a4e466 100644
--- a/scrapy_proj/scrapy_proj/settings.py
+++ b/scrapy_proj/scrapy_proj/settings.py
@@ -169,11 +169,13 @@ FEED_EXPORT_ENCODING = "utf-8"
 JOBDIR = 'crawl_state'
 
 # Scheduler configuration
-SCHEDULER = "scrapy.core.scheduler.Scheduler"
+#SCHEDULER = "scrapy.core.scheduler.Scheduler"
+SCHEDULER = "scrapy_proj.custom_scheduler.CustomScheduler"
 SCHEDULER_PERSIST = True  # whether to persist scheduler state (avoids losing requests on interruption)
 SCHEDULER_MEMORY_QUEUE = "scrapy.squeues.FifoMemoryQueue"
 SCHEDULER_DISK_QUEUE = "scrapy.squeues.PickleFifoDiskQueue"  # disk queue (FIFO)
+DUPEFILTER_DEBUG = True
 
 # If LIFO (stack-style scheduling) is needed
 # SCHEDULER_MEMORY_QUEUE = "scrapy.squeues.LifoMemoryQueue"
 # SCHEDULER_DISK_QUEUE = "scrapy.squeues.PickleLifoDiskQueue"
diff --git a/scrapy_proj/scrapy_proj/tools/check_cached.py b/scrapy_proj/scrapy_proj/tools/check_cached.py
new file mode 100644
index 0000000..00c421e
--- /dev/null
+++ b/scrapy_proj/scrapy_proj/tools/check_cached.py
@@ -0,0 +1,49 @@
+import os
+import sys
+from scrapy.http import Request
+from scrapy.utils.request import RequestFingerprinter
+
+home_dir = os.path.expanduser("~")
+CACHE_DIR = f"{home_dir}/sharedata/scrapy_cached"
+
+def check_cache(url):
+    # Initialize without arguments, using the default configuration
+    fingerprinter = RequestFingerprinter()
+
+    # Compute the request fingerprint
+    req = Request(url)
+    fp = fingerprinter.fingerprint(req).hex()
+    prefix_path = os.path.join(fp[0:2], fp)
+
+    # Look up the cache
+    cache_path = None
+    for domain in os.listdir(CACHE_DIR):
+        candidate = os.path.join(CACHE_DIR, domain, prefix_path)
+        print(candidate)
+        if os.path.exists(candidate):
+            cache_path = candidate
+            break
+
+    print(f"URL: {url}")
+    print(f"Fingerprint: {fp}")
+    if cache_path:
+        print(f"✅ Found cached response at: {cache_path}")
+        files = os.listdir(cache_path)
+        print("Cached files:", files)
+
+        # If a response_body file exists, read the first 500 bytes
+        body_file = os.path.join(cache_path, "response_body")
+        if os.path.exists(body_file):
+            with open(body_file, "rb") as f:
+                snippet = f.read(500).decode(errors="ignore")
+            print("Content preview:\n", snippet)
+    else:
+        print("❌ Not found in cache directory.")
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print(f"Usage: python {sys.argv[0]} <url>")
+        sys.exit(1)
+
+    url = sys.argv[1]
+    check_cache(url)
\ No newline at end of file
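
Usage sketch for the new tools/check_cached.py helper, assuming HTTPCACHE is enabled with Scrapy's FilesystemCacheStorage and HTTPCACHE_DIR resolves to the hard-coded CACHE_DIR (~/sharedata/scrapy_cached); the URL below is only a placeholder:

    python scrapy_proj/scrapy_proj/tools/check_cached.py "https://example.com/some/page"

The script prints the request fingerprint, each candidate cache directory it probes, and, when an entry is found, the cached file names plus a 500-byte preview of response_body.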