resources/scrapy_proj/scrapy_proj/settings.py

# Scrapy settings for scrapy_proj project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import os
from datetime import datetime
# Create the log directory
LOG_DIR = './log'
os.makedirs(LOG_DIR, exist_ok=True)
log_date = datetime.now().strftime('%Y%m%d')
# Global logging configuration
LOG_LEVEL = 'INFO'  # log at INFO level
LOG_FILE = os.path.join(LOG_DIR, f'scrapy_{log_date}.log')  # log file path
# Logging format
LOG_FORMAT = '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S'
BOT_NAME = "scrapy_proj"
SPIDER_MODULES = ["scrapy_proj.spiders"]
NEWSPIDER_MODULE = "scrapy_proj.spiders"
ADDONS = {}
# Concurrency settings
CONCURRENT_REQUESTS = 10
CONCURRENT_REQUESTS_PER_DOMAIN = 5
CONCURRENT_ITEMS = 1000
# Download delay
DOWNLOAD_DELAY = 0.3
# Enable item pipelines
ITEM_PIPELINES = {
    'scrapy_proj.pipelines.SQLitePipeline': 300,
}
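# The SQLitePipeline registered above is assumed to live in
# scrapy_proj/pipelines.py (not shown here). A minimal sketch of what an
# SQLite-backed pipeline could look like (table name and schema are
# hypothetical; this is not the project's actual implementation):
#
# import json
# import sqlite3
# from itemadapter import ItemAdapter
#
# class SQLitePipeline:
#     def open_spider(self, spider):
#         self.conn = sqlite3.connect('scrapy_items.db')
#         self.conn.execute('CREATE TABLE IF NOT EXISTS items (data TEXT)')
#
#     def process_item(self, item, spider):
#         self.conn.execute('INSERT INTO items (data) VALUES (?)',
#                           (json.dumps(ItemAdapter(item).asdict(), default=str),))
#         self.conn.commit()
#         return item
#
#     def close_spider(self, spider):
#         self.conn.close()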
# User-agent pool
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    # more user agents ...
]
# Random user-agent middleware
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_useragents.downloadermiddlewares.useragents.UserAgentsMiddleware': None,
    'scrapy_proj.middlewares.CloudScraperMiddleware': 543,
}
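# With the stock UserAgentMiddleware disabled above, user-agent rotation is
# assumed to be handled by the custom middleware (CloudScraperMiddleware in
# scrapy_proj/middlewares.py, not shown here). For illustration only, a minimal
# middleware that rotates through USER_AGENT_LIST could look like:
#
# import random
#
# class RandomUserAgentMiddleware:
#     def __init__(self, user_agents):
#         self.user_agents = user_agents
#
#     @classmethod
#     def from_crawler(cls, crawler):
#         return cls(crawler.settings.getlist('USER_AGENT_LIST'))
#
#     def process_request(self, request, spider):
#         if self.user_agents:
#             request.headers['User-Agent'] = random.choice(self.user_agents)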
# Custom extensions
EXTENSIONS = {
    'scrapy_proj.extensions.stats_extension.StatsExtension': 500,
    'scrapy_proj.extensions.failure_monitor.FailureMonitorExtension': 500,
}
# Failure-monitor settings: detect repeated failures and stop the job
EXT_FAIL_MONI_MAX_CONSECUTIVE_FAILURES = 100  # exit after 100 consecutive failures
EXT_FAIL_MONI_RATE_THRESHOLD = 0.6  # exit when the failure rate exceeds 60%
EXT_FAIL_MONI_FAILURE_TIME_WINDOW = 300  # 300-second sliding time window
EXT_FAIL_MONI_MIN_REQUESTS = 10  # minimum requests in the window before the failure rate is checked, so a single failure cannot trigger shutdown
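# FailureMonitorExtension (enabled in EXTENSIONS above) is assumed to consume
# the EXT_FAIL_MONI_* settings. A rough sketch of how such an extension could
# read them and close the spider on repeated failures (hypothetical; the
# failure-rate / time-window bookkeeping is omitted for brevity):
#
# from scrapy import signals
#
# class FailureMonitorExtension:
#     def __init__(self, crawler):
#         s = crawler.settings
#         self.max_consecutive = s.getint('EXT_FAIL_MONI_MAX_CONSECUTIVE_FAILURES', 100)
#         self.rate_threshold = s.getfloat('EXT_FAIL_MONI_RATE_THRESHOLD', 0.6)
#         self.time_window = s.getint('EXT_FAIL_MONI_FAILURE_TIME_WINDOW', 300)
#         self.min_requests = s.getint('EXT_FAIL_MONI_MIN_REQUESTS', 10)
#         self.crawler = crawler
#         self.consecutive_failures = 0
#
#     @classmethod
#     def from_crawler(cls, crawler):
#         ext = cls(crawler)
#         crawler.signals.connect(ext.response_received, signal=signals.response_received)
#         return ext
#
#     def response_received(self, response, request, spider):
#         if response.status >= 400:
#             self.consecutive_failures += 1
#             if self.consecutive_failures >= self.max_consecutive:
#                 self.crawler.engine.close_spider(spider, reason='too_many_failures')
#         else:
#             self.consecutive_failures = 0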
# Block-detection and retry settings
BASE_SPIDER_MIN_CONTENT_LENGTH = 1000
BASE_SPIDER_BLOCKED_KEYWORDS = [
]
BASE_SPIDER_MAX_RETRIES = 5
BASE_SPIDER_RETRY_DELAY = 5
BASE_SPIDER_CLOSE_ON_MAX_RETRIES = False
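# The BASE_SPIDER_* settings above are assumed to be read by a shared base
# spider class (not included in this file). For illustration, a block check
# along these lines could use them (hypothetical helper name):
#
# def looks_blocked(self, response):
#     settings = self.crawler.settings
#     if len(response.text) < settings.getint('BASE_SPIDER_MIN_CONTENT_LENGTH', 1000):
#         return True
#     keywords = settings.getlist('BASE_SPIDER_BLOCKED_KEYWORDS')
#     return any(keyword in response.text for keyword in keywords)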
# Stats-export settings
STATS_PUSH_MSG = True
STATS_EXPORT_INTERVAL = 1800  # export every 30 minutes
STATS_EXPORT_SCRIPT = 'scrapy_proj/extensions/push_to_wecom.sh'  # relative path to the local shell script
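# StatsExtension (enabled in EXTENSIONS above) is assumed to push crawl stats
# by invoking the shell script configured here. A minimal sketch of such a call
# (hypothetical helper; the script's expected arguments are an assumption):
#
# import json
# import subprocess
#
# def push_stats(script_path, stats):
#     payload = json.dumps(stats, default=str)
#     subprocess.run(['bash', script_path, payload], check=False, timeout=30)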
TWISTED_REACTOR = 'twisted.internet.epollreactor.EPollReactor'  # Linux only (epoll reactor)
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "scrapy_proj (+http://www.yourdomain.com)"
# Obey robots.txt rules
#ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "scrapy_proj.middlewares.ScrapyProjSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "scrapy_proj.middlewares.ScrapyProjDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# "scrapy_proj.pipelines.ScrapyProjPipeline": 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"