modify scripts: emit a drop reason from a custom scheduler, log it in the failure monitor, and add a cache lookup helper

2025-07-30 09:43:11 +08:00
parent a7c60d8b50
commit a377e361da
4 changed files with 79 additions and 3 deletions

View File: scrapy_proj/custom_scheduler.py

@@ -0,0 +1,25 @@
from scrapy.core.scheduler import Scheduler
from scrapy import signals

class CustomScheduler(Scheduler):
    def enqueue_request(self, request):
        added = super().enqueue_request(request)
        if not added:
            is_duplicate = self.df.request_seen(request)
            reason = "duplicate filtered" if is_duplicate else "queue rejected"
            # ✅ set the reason in meta so extensions can read it
            request.meta['_dropreason'] = reason
            # ✅ send the request_dropped signal with the reason attached
            self.crawler.signals.send_catch_log(
                signal=signals.request_dropped,
                request=request,
                spider=self.crawler.spider,
                reason=reason,
            )
            self.stats.inc_value("scheduler/dropped")
        return added
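
For context, the stock Scrapy Scheduler returns False from enqueue_request only when the dupefilter rejects the request, which is why re-asking request_seen is enough to recover the reason here. A simplified sketch of the upstream logic (paraphrased from scrapy/core/scheduler.py, not the verbatim source):

def enqueue_request(self, request) -> bool:
    # the duplicate check is the only built-in path that returns False
    if not request.dont_filter and self.df.request_seen(request):
        self.df.log(request, self.spider)
        return False
    # otherwise push to the disk queue, falling back to memory
    if not self._dqpush(request):
        self._mqpush(request)
    self.stats.inc_value("scheduler/enqueued", spider=self.spider)
    return True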

View File

@@ -43,9 +43,9 @@ class FailureMonitorExtension:
         self._cleanup_old_requests()  # remove requests that fall outside the time window

     '''Sent when a Request, scheduled by the engine to be downloaded later, is rejected by the scheduler.'''
-    def request_dropped(self, request, spider):
+    def request_dropped(self, request, spider, reason=None):
         # prefer the reason passed with the signal, falling back to request.meta
-        drop_reason = request.meta.get('_dropreason', 'unknown reason')
+        drop_reason = reason if reason else request.meta.get('_dropreason', 'unknown reason')
         spider.logger.warning(f"request_dropped on url: {request.url} | reason: {drop_reason}")
         self.calculate_failure(spider)
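
The new reason keyword reaches the handler because send_catch_log forwards extra keyword arguments to every connected receiver whose signature accepts them. A minimal sketch of how the extension is presumably wired up (the connect call is assumed; it is not shown in this diff):

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        # receivers connected to request_dropped now also get reason=...
        crawler.signals.connect(ext.request_dropped, signal=signals.request_dropped)
        return ext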

View File: scrapy_proj/settings.py

@@ -169,11 +169,13 @@ FEED_EXPORT_ENCODING = "utf-8"
 JOBDIR = 'crawl_state'
 # scheduler configuration
-SCHEDULER = "scrapy.core.scheduler.Scheduler"
+#SCHEDULER = "scrapy.core.scheduler.Scheduler"
+SCHEDULER = "scrapy_proj.custom_scheduler.CustomScheduler"
 SCHEDULER_PERSIST = True  # persist scheduler state (avoid losing requests on interruption)
 SCHEDULER_MEMORY_QUEUE = "scrapy.squeues.FifoMemoryQueue"
 SCHEDULER_DISK_QUEUE = "scrapy.squeues.PickleFifoDiskQueue"  # FIFO disk queue
+DUPEFILTER_DEBUG = True
 # for LIFO (stack-style) scheduling:
 # SCHEDULER_MEMORY_QUEUE = "scrapy.squeues.LifoMemoryQueue"
 # SCHEDULER_DISK_QUEUE = "scrapy.squeues.PickleLifoDiskQueue"
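
With DUPEFILTER_DEBUG enabled, Scrapy logs every filtered duplicate rather than only the first one. To see the counter the custom scheduler increments, the stats can be read when the crawl finishes; a minimal sketch inside any spider, using Scrapy's built-in closed callback:

    def closed(self, reason):
        # scheduler/dropped is incremented by CustomScheduler above
        dropped = self.crawler.stats.get_value("scheduler/dropped", 0)
        self.logger.info(f"requests dropped by scheduler: {dropped}")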

View File

@@ -0,0 +1,49 @@
import os
import sys
from scrapy.http import Request
from scrapy.utils.request import RequestFingerprinter

home_dir = os.path.expanduser("~")
CACHE_DIR = f"{home_dir}/sharedata/scrapy_cached"

def check_cache(url):
    # no-argument init uses the default fingerprinting configuration
    fingerprinter = RequestFingerprinter()
    # compute the fingerprint
    req = Request(url)
    fp = fingerprinter.fingerprint(req).hex()
    prefix_path = os.path.join(fp[0:2], fp)
    # look for the cache entry under each spider directory
    cache_path = None
    for domain in os.listdir(CACHE_DIR):
        candidate = os.path.join(CACHE_DIR, domain, prefix_path)
        print(candidate)
        if os.path.exists(candidate):
            cache_path = candidate
            break
    print(f"URL: {url}")
    print(f"Fingerprint: {fp}")
    if cache_path:
        print(f"✅ Found cached response at: {cache_path}")
        files = os.listdir(cache_path)
        print("Cached files:", files)
        # if a response_body file exists, read the first 500 bytes
        body_file = os.path.join(cache_path, "response_body")
        if os.path.exists(body_file):
            with open(body_file, "rb") as f:
                snippet = f.read(500).decode(errors="ignore")
            print("Content preview:\n", snippet)
    else:
        print("❌ Not found in cache directory.")

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print(f"Usage: python {sys.argv[0]} <URL>")
        sys.exit(1)
    url = sys.argv[1]
    check_cache(url)
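
The lookup assumes Scrapy's FilesystemCacheStorage layout, <cache dir>/<spider name>/<first two hex chars of the fingerprint>/<fingerprint>/, where each entry holds files such as response_body and response_headers. For the script to find anything, the crawling project presumably caches with settings along these lines (the directory value is an assumption carried over from CACHE_DIR above):

import os

HTTPCACHE_ENABLED = True
HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# must resolve to the same directory the script scans
HTTPCACHE_DIR = os.path.expanduser("~/sharedata/scrapy_cached")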