modify scripts
scrapy_proj/scrapy.cfg (new file, 11 lines)
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = scrapy_proj.settings

[deploy]
#url = http://localhost:6800/
project = scrapy_proj
scrapy_proj/scrapy_proj/__init__.py (new empty file, 0 lines)
scrapy_proj/scrapy_proj/extensions/push_to_wecom.sh (new executable file, 33 lines)
@@ -0,0 +1,33 @@
#!/bin/bash

: << 'EOF'
Run this local script to monitor task status.
The remote machine hosts the notification (WeCom) script that sends the result out.
EOF

# Color definitions
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
NC='\033[0m' # no color

REMOTE_SERVER="101.33.230.186"
REMOTE_USER="root"
SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=10"

# Main function
main() {
    # Check whether a command-line argument was provided
    if [ $# -eq 0 ]; then
        result='test'  # default value when no argument is given
    else
        result=$1  # use the first argument as the result
    fi

    # Call the remote script and pass the result along
    ssh $SSH_OPTS $REMOTE_USER@$REMOTE_SERVER "cd /root/projects/devops/tools; python3 ./send_to_wecom.py '$result'"
    return $?  # return the exit status of the remote command
}

# Run the main function
main "$@"
scrapy_proj/scrapy_proj/extensions/stats_extension.py (new file, 116 lines)
@@ -0,0 +1,116 @@
import subprocess
import time
import logging
from datetime import datetime
from scrapy import signals
from scrapy.exceptions import NotConfigured
from twisted.internet import task

logger = logging.getLogger()  # note: use the root logger


class StatsExtension:
    def __init__(self, stats, interval, script_path=None):
        self.stats = stats
        self.interval = interval
        self.script_path = script_path
        self.spider_name = None
        self.loop = None  # handle for the periodic LoopingCall

    @classmethod
    def from_crawler(cls, crawler):
        interval = crawler.settings.getint('STATS_EXPORT_INTERVAL', 600)
        script_path = crawler.settings.get('STATS_EXPORT_SCRIPT')

        if interval <= 0:
            raise NotConfigured

        ext = cls(crawler.stats, interval, script_path)
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def spider_opened(self, spider):
        self.spider_name = spider.name
        logger.info(f"Spider {spider.name} opened - StatsExtension initialized")
        #self._export_stats(spider)

        # Create and start the periodic task
        self.loop = task.LoopingCall(self._export_stats, spider)
        self.loop.start(self.interval)  # run once every `interval` seconds

    def spider_closed(self, spider, reason):
        # Stop the periodic task
        if self.loop and self.loop.running:
            self.loop.stop()

        self._export_stats(spider)
        logger.info(f"Spider {spider.name} closed - reason: {reason}")

    def _export_stats(self, spider):
        # Grab the current stats
        stats = self.stats.get_stats()

        # Compute how long the spider has been running
        start_time = stats.get('start_time')
        if start_time:
            # Convert the datetime object to a timestamp
            start_timestamp = start_time.timestamp()
            uptime = time.time() - start_timestamp
        else:
            uptime = 0

        # Build the stats summary
        stats_summary = {
            't': datetime.now().strftime('%H:%M:%S'),
            'spider': self.spider_name,
            'interval(s)': int(uptime),
            'recv_cnt': stats.get('response_received_count', 0),
            'total_req': stats.get('downloader/request_count', 0),
            '200_cnt': stats.get('downloader/response_status_count/200', 0),
            '404_cnt': stats.get('downloader/response_status_count/404', 0),
            'log_err_cnt': stats.get('log_count/ERROR', 0)
        }

        # Log the stats summary
        logger.info(f"Stats Summary: {stats_summary}")

        # If a shell script is configured, call it
        if self.script_path:
            self._call_shell_script_async(stats_summary)

    def _call_shell_script(self, stats):
        try:
            # Serialize the stats to JSON and pass it to the shell script as an argument
            import json
            stats_json = json.dumps(stats)

            # Run the shell script with subprocess (blocking)
            result = subprocess.run(
                [self.script_path, stats_json],
                capture_output=True,
                text=True,
                check=True
            )

            logger.info(f"Shell script executed successfully: {result.stdout}")
        except subprocess.CalledProcessError as e:
            logger.error(f"Error executing shell script: {e.stderr}")
        except Exception as e:
            logger.error(f"Unexpected error calling shell script: {e}")

    def _call_shell_script_async(self, stats):
        try:
            import json
            stats_json = json.dumps(stats)

            # Run the shell script without blocking
            subprocess.Popen(
                [self.script_path, stats_json],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True
            )

            logger.info("Shell script started in background")
        except Exception as e:
            logger.error(f"Error starting shell script: {e}")
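For reference, a hedged sketch of the payload that _call_shell_script_async hands to push_to_wecom.sh; the field names come from _export_stats above, while every value is purely illustrative:

import json

# Field names mirror the stats_summary dict built in _export_stats;
# the values below are made up for illustration only.
example_summary = {
    't': '12:30:00',
    'spider': 'u3c3',
    'interval(s)': 1800,
    'recv_cnt': 120,
    'total_req': 130,
    '200_cnt': 118,
    '404_cnt': 2,
    'log_err_cnt': 0,
}
payload = json.dumps(example_summary)  # this string arrives as $1 in push_to_wecom.sh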
scrapy_proj/scrapy_proj/items.py (new file, 22 lines)
@@ -0,0 +1,22 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

# items.py
import scrapy


class U001Item(scrapy.Item):
    category = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
    torrent_url = scrapy.Field()
    magnet_url = scrapy.Field()
    size_text = scrapy.Field()
    size_gb = scrapy.Field()
    update_date = scrapy.Field()


class Sis001Item(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    plate_name = scrapy.Field()
scrapy_proj/scrapy_proj/middlewares.py (new file, 100 lines)
@@ -0,0 +1,100 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class ScrapyProjSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    async def process_start(self, start):
        # Called with an async iterator over the spider start() method or the
        # matching method of an earlier spider middleware.
        async for item_or_request in start:
            yield item_or_request

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class ScrapyProjDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
scrapy_proj/scrapy_proj/pipelines.py (new file, 209 lines)
@@ -0,0 +1,209 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
#from itemadapter import ItemAdapter
#class ScrapyProjPipeline:
#    def process_item(self, item, spider):
#        return item


import os
import sqlite3
import logging
from datetime import datetime
from scrapy_proj.items import U001Item, Sis001Item

home_dir = os.path.expanduser("~")
global_share_data_dir = f'{home_dir}/sharedata'
default_dbpath = f"{global_share_data_dir}/sqlite/scrapy.db"


# Database base class that wraps the common operations.
class SQLiteDBHandler:
    def __init__(self, db_path=None):
        # Use the supplied db_path, or fall back to the default path
        self.DB_PATH = db_path or default_dbpath

        # Make sure the parent directory exists (optional)
        if db_path and not os.path.exists(os.path.dirname(db_path)):
            os.makedirs(os.path.dirname(db_path))

        self.conn = sqlite3.connect(self.DB_PATH, check_same_thread=False)
        self.cursor = self.conn.cursor()

        # Check the SQLite version (native UPSERT needs 3.24.0+)
        self.lower_sqlite_version = False
        sqlite_version = sqlite3.sqlite_version_info
        if sqlite_version < (3, 24, 0):
            self.lower_sqlite_version = True

    def get_table_columns_and_defaults(self, tbl_name):
        try:
            self.cursor.execute(f"PRAGMA table_info({tbl_name})")
            columns = self.cursor.fetchall()
            column_info = {}
            for col in columns:
                col_name = col[1]
                default_value = col[4]
                column_info[col_name] = default_value
            return column_info
        except sqlite3.Error as e:
            logging.error(f"Error getting table columns: {e}")
            return None

    def check_and_process_data(self, data, tbl_name):
        column_info = self.get_table_columns_and_defaults(tbl_name)
        if column_info is None:
            return None
        processed_data = {}
        for col, default in column_info.items():
            if col == 'id' or col == 'created_at':  # auto-increment key and creation date use the table defaults
                continue
            if col == 'updated_at':  # maintained here with the current timestamp
                processed_data[col] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            if col in data:
                processed_data[col] = data[col]

        return processed_data

    def insert_or_update_common(self, data, tbl_name, uniq_key='url'):
        if self.lower_sqlite_version:
            return self.insert_or_update_common_lower(data, tbl_name, uniq_key)

        try:
            processed_data = self.check_and_process_data(data, tbl_name)
            if processed_data is None:
                return None

            columns = ', '.join(processed_data.keys())
            values = list(processed_data.values())
            placeholders = ', '.join(['?' for _ in values])
            update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key])

            sql = f'''
                INSERT INTO {tbl_name} ({columns})
                VALUES ({placeholders})
                ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}
            '''
            self.cursor.execute(sql, values)
            self.conn.commit()

            # Fetch the id of the inserted or updated row
            self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
            record_id = self.cursor.fetchone()[0]
            return record_id
        except sqlite3.Error as e:
            logging.error(f"Error inserting or updating data: {e}")
            return None

    def insert_or_update_common_lower(self, data, tbl_name, uniq_key='url'):
        try:
            processed_data = self.check_and_process_data(data, tbl_name)
            if processed_data is None:
                return None

            columns = ', '.join(processed_data.keys())
            values = list(processed_data.values())
            placeholders = ', '.join(['?' for _ in values])

            # Try a plain insert first
            try:
                sql = f'''
                    INSERT INTO {tbl_name} ({columns})
                    VALUES ({placeholders})
                '''
                self.cursor.execute(sql, values)
                self.conn.commit()
            except sqlite3.IntegrityError:  # unique-key conflict: fall back to an update
                update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key])
                update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key]
                update_values.append(data[uniq_key])
                sql = f"UPDATE {tbl_name} SET {update_clause} WHERE {uniq_key} = ?"
                self.cursor.execute(sql, update_values)
                self.conn.commit()

            # Fetch the id of the inserted or updated row
            self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],))
            record_id = self.cursor.fetchone()[0]
            return record_id
        except sqlite3.Error as e:
            logging.error(f"Error inserting or updating data: {e}")
            return None

    def get_id_by_key(self, tbl, uniq_key, val):
        self.cursor.execute(f"SELECT id FROM {tbl} WHERE {uniq_key} = ?", (val,))
        row = self.cursor.fetchone()
        return row[0] if row else None

    def close(self):
        self.cursor.close()
        self.conn.close()


class SQLitePipeline(SQLiteDBHandler):
    def __init__(self, db_path=None):
        super().__init__(db_path)
        self.tbl_name_u3c3 = 'u3c3'
        self.tbl_name_sis = 'sis'
        self._create_tables()

    def _create_tables(self):
        # Create the u3c3 table (U001Item rows)
        self.cursor.execute(f'''
            CREATE TABLE IF NOT EXISTS {self.tbl_name_u3c3} (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                category TEXT,
                title TEXT,
                url TEXT UNIQUE,
                torrent_url TEXT,
                magnet_url TEXT,
                size_text TEXT,
                size_gb REAL,
                update_date TEXT,
                created_at TEXT DEFAULT (datetime('now', 'localtime')),
                updated_at TEXT DEFAULT (datetime('now', 'localtime'))
            )
        ''')

        # Create the sis table (Sis001Item rows)
        self.cursor.execute(f'''
            CREATE TABLE IF NOT EXISTS {self.tbl_name_sis} (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                plate_name TEXT,
                title TEXT,
                url TEXT UNIQUE,
                size_text TEXT,
                size_gb REAL,
                update_date TEXT,
                created_at TEXT DEFAULT (datetime('now', 'localtime')),
                updated_at TEXT DEFAULT (datetime('now', 'localtime'))
            )
        ''')
        self.conn.commit()

    def process_item(self, item, spider):
        if isinstance(item, U001Item):
            self._process_u001_item(item)
        elif isinstance(item, Sis001Item):
            self._process_sis001_item(item)
        return item

    def _process_u001_item(self, item):
        return self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url')

    def _process_sis001_item(self, item):
        # Insert into the sis table created above (duplicate URLs are ignored)
        self.cursor.execute(f'''
            INSERT OR IGNORE INTO {self.tbl_name_sis}
            (title, url, plate_name)
            VALUES (?,?,?)
        ''', (
            item.get('title'),
            item.get('url'),
            item.get('plate_name')
        ))
        self.conn.commit()

    def close_spider(self, spider):
        self.conn.close()
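As a hedged usage sketch of the upsert helper: the /tmp path and all field values below are illustrative assumptions, not part of this commit.

# Hypothetical standalone use of SQLitePipeline.insert_or_update_common;
# assumes the scrapy_proj package is importable and /tmp is writable.
from scrapy_proj.pipelines import SQLitePipeline

pipeline = SQLitePipeline(db_path="/tmp/scrapy_demo.db")  # also creates the u3c3 and sis tables
row_id = pipeline.insert_or_update_common(
    {
        "category": "example",
        "title": "demo entry",
        "url": "https://example.com/item/1",  # uniq_key: a second call with the same url updates this row
        "size_text": "1.5GB",
        "size_gb": 1.5,
        "update_date": "2024-01-01",
    },
    tbl_name="u3c3",
    uniq_key="url",
)
pipeline.close()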
scrapy_proj/scrapy_proj/settings.py (new file, 140 lines)
@@ -0,0 +1,140 @@
# Scrapy settings for scrapy_proj project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import os
from datetime import datetime

# Create the log directory
LOG_DIR = './log'
os.makedirs(LOG_DIR, exist_ok=True)
log_date = datetime.now().strftime('%Y%m%d')
# Global logging configuration
LOG_LEVEL = 'INFO'  # log at INFO level
LOG_FILE = os.path.join(LOG_DIR, f'scrapy_{log_date}.log')  # log file path
# Log format
LOG_FORMAT = '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] (%(funcName)s) - %(message)s'
LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S'

BOT_NAME = "scrapy_proj"

SPIDER_MODULES = ["scrapy_proj.spiders"]
NEWSPIDER_MODULE = "scrapy_proj.spiders"

ADDONS = {}

# Concurrency settings
CONCURRENT_REQUESTS = 1
CONCURRENT_ITEMS = 100

# Download delay
DOWNLOAD_DELAY = 1

# Enable the item pipeline
ITEM_PIPELINES = {
    'scrapy_proj.pipelines.SQLitePipeline': 300,
}

# User-agent pool
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    # more UAs ...
]

# Random user-agent middleware (both entries are currently disabled with None)
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_useragents.downloadermiddlewares.useragents.UserAgentsMiddleware': None,
}

# settings.py
EXTENSIONS = {
    'scrapy_proj.extensions.stats_extension.StatsExtension': 500,
}

# Stats export configuration
STATS_EXPORT_INTERVAL = 1800  # export stats every 1800 seconds (30 minutes)
STATS_EXPORT_SCRIPT = '/root/projects/resources/scrapy_proj/scrapy_proj/extensions/push_to_wecom.sh'  # path of the local shell script

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "scrapy_proj (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "scrapy_proj.middlewares.ScrapyProjSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "scrapy_proj.middlewares.ScrapyProjDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "scrapy_proj.pipelines.ScrapyProjPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"
scrapy_proj/scrapy_proj/spiders/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
scrapy_proj/scrapy_proj/spiders/sis_spider.py (new file, 20 lines)
@@ -0,0 +1,20 @@
import scrapy
from scrapy_proj.items import Sis001Item


class Sis001Spider(scrapy.Spider):
    name = "sis"
    allowed_domains = ["sis001.com"]
    start_urls = ["https://sis001.com/forum/forum-25-1.html"]

    def parse(self, response):
        for row in response.css('table[id="forum_25"] tbody[id^="normalthread_"] tr'):
            item = Sis001Item()
            item['title'] = row.css('td a::text').get()
            item['url'] = response.urljoin(row.css('td a::attr(href)').get())
            item['plate_name'] = '亚无转帖'
            yield item

        # Pagination
        next_page = response.css('a.nxt::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)
scrapy_proj/scrapy_proj/spiders/u3c3_spider.py (new file, 32 lines)
@@ -0,0 +1,32 @@
import scrapy
from scrapy_proj.items import U001Item
from scrapy_proj.utils.size_converter import parse_size


class U001Spider(scrapy.Spider):
    name = "u3c3"
    allowed_domains = ["u001.25img.com"]
    start_urls = ["https://u001.25img.com/?p=1"]

    def parse(self, response):
        for row in response.css('table.torrent-list tbody tr'):
            item = U001Item()
            item['category'] = row.css('td:nth-child(1) a::attr(title)').get()
            item['title'] = row.css('td:nth-child(2) a::attr(title)').get()
            item['url'] = response.urljoin(row.css('td:nth-child(2) a::attr(href)').get())

            links = row.css('td:nth-child(3) a::attr(href)').getall()
            item['torrent_url'] = response.urljoin(links[0]) if links else ''
            item['magnet_url'] = links[1] if len(links) > 1 else ''

            size_text = row.css('td:nth-child(4)::text').get(default='').strip()
            item['size_text'] = size_text
            item['size_gb'] = parse_size(size_text)

            item['update_date'] = row.css('td:nth-child(5)::text').get(default='').strip()
            yield item

        # Pagination
        current_page = int(response.url.split('=')[-1])
        # default='0' stops pagination gracefully if totalPages cannot be found in the page
        total_pages = int(response.css('script:contains("totalPages")').re_first(r'totalPages:\s*(\d+)', default='0'))
        if current_page < total_pages:
            yield response.follow(f"?p={current_page + 1}", self.parse)
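A hedged sketch of running this spider programmatically, roughly equivalent to invoking "scrapy crawl u3c3" from the project root; the runner script itself is not part of this commit.

# Hypothetical runner, assuming it is executed from the Scrapy project root
# so that get_project_settings() picks up scrapy_proj/settings.py.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from scrapy_proj.spiders.u3c3_spider import U001Spider

process = CrawlerProcess(get_project_settings())
process.crawl(U001Spider)
process.start()  # blocks until the crawl finishes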
scrapy_proj/scrapy_proj/utils/size_converter.py (new file, 19 lines)
@@ -0,0 +1,19 @@
import re


def parse_size(size_text):
    try:
        match = re.search(r'(\d+\.\d+|\d+)\s*([A-Za-z]+)', size_text)
        if not match:
            return 0.0
        value, unit = match.groups()
        value = float(value)
        if unit.lower() == 'mb':
            return round(value / 1024, 2)
        elif unit.lower() == 'kb':
            return round(value / 1024 / 1024, 2)
        elif unit.lower() == 'gb':
            return round(value, 2)
        else:
            return 0.0
    except Exception:
        return 0.0
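For illustration, a few hedged example calls (the input strings are chosen here, not taken from the site):

from scrapy_proj.utils.size_converter import parse_size

# parse_size normalises a human-readable size string to gigabytes.
print(parse_size("512MB"))    # 0.5  (512 / 1024, rounded to 2 decimals)
print(parse_size("2.5GB"))    # 2.5
print(parse_size("no size"))  # 0.0  (unparseable input falls back to 0.0)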