modify scripts
scrapy_proj/scrapy_proj/custom_scheduler.py (new file, +25 lines)
@@ -0,0 +1,25 @@
from scrapy.core.scheduler import Scheduler
from scrapy import signals

class CustomScheduler(Scheduler):

    def enqueue_request(self, request):
        added = super().enqueue_request(request)

        if not added:
            is_duplicate = self.df.request_seen(request)
            reason = "duplicate filtered" if is_duplicate else "queue rejected"

            # ✅ store the reason in meta so the extension can read it
            request.meta['_dropreason'] = reason

            # ✅ emit the request_dropped signal
            self.crawler.signals.send_catch_log(
                signal=signals.request_dropped,
                request=request,
                spider=self.crawler.spider,
                reason=reason
            )
            self.stats.inc_value("scheduler/dropped")

        return added
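For context, request_dropped receivers are registered through the crawler's signal manager, and send_catch_log() forwards the extra reason keyword to any receiver that accepts it. Note that Scrapy's engine also emits request_dropped (without a reason keyword) when enqueue_request() returns False, which is presumably why the extension below keeps a request.meta fallback. A minimal sketch of such a receiver (an illustration only, not the project's FailureMonitorExtension):

from scrapy import signals

class DropReceiver:
    """Minimal example extension that listens for request_dropped."""

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        # subscribe to the signal emitted by CustomScheduler above
        crawler.signals.connect(ext.request_dropped, signal=signals.request_dropped)
        return ext

    def request_dropped(self, request, spider, reason=None):
        # reason is filled in by CustomScheduler; the engine-sent signal omits it
        spider.logger.warning(f"dropped {request.url}: {reason or request.meta.get('_dropreason', 'unknown')}")

Like any extension, it would be enabled via the EXTENSIONS setting in settings.py.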
@@ -43,9 +43,9 @@ class FailureMonitorExtension:
         self._cleanup_old_requests()  # drop requests outside the time window
 
     '''Sent when a Request, scheduled by the engine to be downloaded later, is rejected by the scheduler.'''
-    def request_dropped(self, request, spider):
+    def request_dropped(self, request, spider, reason=None):
         # read the drop reason from request.meta
-        drop_reason = request.meta.get('_dropreason', 'unknown reason')
+        drop_reason = reason if reason else request.meta.get('_dropreason', 'unknown reason')
         spider.logger.warning(f"request_dropped on url: {request.url} | reason: {drop_reason}")
         self.calculate_failure(spider)
 
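A hypothetical way to exercise this path end to end: re-yielding a URL the dupefilter has already seen makes enqueue_request() return False, so CustomScheduler emits request_dropped with reason="duplicate filtered" and the handler above logs it.

import scrapy

class DemoSpider(scrapy.Spider):
    # toy spider for illustration; name and URL are placeholders
    name = "demo"
    start_urls = ["https://example.com/"]

    def parse(self, response):
        # same URL again, dont_filter left at its default (False) -> dropped as duplicate
        yield scrapy.Request(response.url, callback=self.parse)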
@@ -169,11 +169,13 @@ FEED_EXPORT_ENCODING = "utf-8"
 JOBDIR = 'crawl_state'
 
 # scheduler configuration
-SCHEDULER = "scrapy.core.scheduler.Scheduler"
+#SCHEDULER = "scrapy.core.scheduler.Scheduler"
+SCHEDULER = "scrapy_proj.custom_scheduler.CustomScheduler"
 SCHEDULER_PERSIST = True  # persist scheduler state (so pending requests survive an interruption)
 SCHEDULER_MEMORY_QUEUE = "scrapy.squeues.FifoMemoryQueue"
 SCHEDULER_DISK_QUEUE = "scrapy.squeues.PickleFifoDiskQueue"  # disk queue (FIFO)
 
+DUPEFILTER_DEBUG = True
 # if LIFO (stack-style) scheduling is needed:
 # SCHEDULER_MEMORY_QUEUE = "scrapy.squeues.LifoMemoryQueue"
 # SCHEDULER_DISK_QUEUE = "scrapy.squeues.PickleLifoDiskQueue"
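A quick, hypothetical way to confirm the setting resolves to the new class (run from the project directory; load_object is the same helper Scrapy uses to instantiate the scheduler from the setting string):

from scrapy.utils.misc import load_object
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
scheduler_cls = load_object(settings["SCHEDULER"])
print(scheduler_cls)  # expected: <class 'scrapy_proj.custom_scheduler.CustomScheduler'>

With DUPEFILTER_DEBUG = True, the dupefilter logs every filtered request instead of only the first one, which makes the "duplicate filtered" drops easier to trace.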
scrapy_proj/scrapy_proj/tools/check_cached.py (new file, +49 lines)
@@ -0,0 +1,49 @@
import os
import sys
from scrapy.http import Request
from scrapy.utils.request import RequestFingerprinter

home_dir = os.path.expanduser("~")
CACHE_DIR = f"{home_dir}/sharedata/scrapy_cached"

def check_cache(url):
    # no-argument initialisation, uses the default configuration
    fingerprinter = RequestFingerprinter()

    # compute the request fingerprint
    req = Request(url)
    fp = fingerprinter.fingerprint(req).hex()
    prefix_path = os.path.join(fp[0:2], fp)

    # look up the cache entry
    cache_path = None
    for domain in os.listdir(CACHE_DIR):
        candidate = os.path.join(CACHE_DIR, domain, prefix_path)
        print(candidate)
        if os.path.exists(candidate):
            cache_path = candidate
            break

    print(f"URL: {url}")
    print(f"Fingerprint: {fp}")
    if cache_path:
        print(f"✅ Found cached response at: {cache_path}")
        files = os.listdir(cache_path)
        print("Cached files:", files)

        # if a response_body file exists, read the first 500 bytes
        body_file = os.path.join(cache_path, "response_body")
        if os.path.exists(body_file):
            with open(body_file, "rb") as f:
                snippet = f.read(500).decode(errors="ignore")
                print("Content preview:\n", snippet)
    else:
        print("❌ Not found in cache directory.")

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print(f"Usage: python {sys.argv[0]} <URL>")
        sys.exit(1)

    url = sys.argv[1]
    check_cache(url)
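The layout the script walks matches Scrapy's default filesystem HTTP cache, which stores one directory per request fingerprint, grouped by spider name, under HTTPCACHE_DIR. A sketch of the settings it assumes (not part of this commit; the path mirrors CACHE_DIR above):

# settings.py (sketch)
import os
HTTPCACHE_ENABLED = True
HTTPCACHE_DIR = os.path.join(os.path.expanduser("~"), "sharedata", "scrapy_cached")
HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"  # the default storage

Usage: python check_cached.py https://example.com/ prints the fingerprint, the candidate paths probed, and a preview of response_body when the page is cached. The fingerprints only line up if the project uses Scrapy's default request fingerprinter.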