# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

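# For reference, enabling these pipelines in settings.py looks roughly like
# the snippet below; the priority values are illustrative assumptions, not
# values taken from this project:
#
# ITEM_PIPELINES = {
#     "scrapy_proj.pipelines.SQLitePipeline": 300,
#     "scrapy_proj.pipelines.MysqlPipeline": 400,
# }
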
# useful for handling different item types with a single interface
#from itemadapter import ItemAdapter

#class ScrapyProjPipeline:
#    def process_item(self, item, spider):
#        return item

import json
import scrapy

from scrapy_proj.items import U001Item, Sis001Item, PBoxStuItem
from scrapy_proj.db_wapper.spider_db_handler import spider_handler_registry, U3C3DBHandler, SisDBHandler, IAFDDBHandler, PboxDBHandler
from scrapy_proj.db_wapper.mysql_handler import mysql_handler_registry, U3C3MysqlHandler

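# A minimal sketch of the contract the pipelines below assume from
# spider_handler_registry / mysql_handler_registry: a dict mapping a
# lowercased spider name to a handler class that takes no constructor
# arguments and exposes insert_item() and close(). The real registration
# mechanism lives in scrapy_proj.db_wapper and may differ; the names
# register_handler and DemoDBHandler here are hypothetical:
#
# def register_handler(spider_name, registry=spider_handler_registry):
#     def decorator(cls):
#         registry[spider_name.lower()] = cls  # store the class, instantiate later
#         return cls
#     return decorator
#
# @register_handler("demo")
# class DemoDBHandler:
#     def insert_item(self, item): ...  # persist one scraped item
#     def close(self): ...              # closed elsewhere (stats middleware)
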
class SQLitePipeline:

    def __init__(self):
        # One handler instance per spider name, created in open_spider
        self.db_handlers = {}

    def open_spider(self, spider):
        spider_name = spider.name.lower()
        handler_class = spider_handler_registry.get(spider_name)
        if not handler_class:
            #raise ValueError(f"No database handler class registered for spider {spider_name}")
            spider.logger.warning(f"No SQLite database handler class registered for spider {spider_name}; skipping database operations")
            return
        self.db_handlers[spider_name] = handler_class()

    def close_spider(self, spider):
        spider_name = spider.name.lower()
        handler = self.db_handlers.pop(spider_name, None)
        if handler:
            # Do not close the handler here; the stats middleware is
            # responsible for closing it.
            #handler.close()
            pass

    def process_item(self, item, spider):
        spider_name = spider.name.lower()
        handler = self.db_handlers.get(spider_name)

        # Debug aid: dump the item as single-line JSON
        #item_json = json.dumps(dict(item), ensure_ascii=False, separators=(',', ':'))
        #spider.logger.debug(f"spider name: {spider_name}, item: {item_json}")

        if not handler:
            #raise ValueError(f"No database handler found for spider {spider_name}")
            spider.logger.debug(f"No database handler found for spider {spider_name}; skipping database operations")
            return item

        handler.insert_item(item)
        return item

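# Scrapy calls open_spider when the spider starts, process_item once per
# scraped item (in ITEM_PIPELINES priority order), and close_spider when the
# spider finishes. MysqlPipeline below follows the same lifecycle as
# SQLitePipeline, consulting mysql_handler_registry instead.
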
class MysqlPipeline:

    def __init__(self):
        # One handler instance per spider name, created in open_spider
        self.db_handlers = {}

    def open_spider(self, spider):
        spider_name = spider.name.lower()
        handler_class = mysql_handler_registry.get(spider_name)
        if not handler_class:
            #raise ValueError(f"No database handler class registered for spider {spider_name}")
            spider.logger.warning(f"No MySQL database handler class registered for spider {spider_name}; skipping database operations")
            return
        self.db_handlers[spider_name] = handler_class()

    def close_spider(self, spider):
        spider_name = spider.name.lower()
        handler = self.db_handlers.pop(spider_name, None)
        if handler:
            # Do not close the handler here; the stats middleware is
            # responsible for closing it.
            #handler.close()
            pass

    def process_item(self, item, spider):
        spider_name = spider.name.lower()
        handler = self.db_handlers.get(spider_name)

        # Debug aid: dump the item as single-line JSON
        #item_json = json.dumps(dict(item), ensure_ascii=False, separators=(',', ':'))
        #spider.logger.debug(f"spider name: {spider_name}, item: {item_json}")

        if not handler:
            #raise ValueError(f"No database handler found for spider {spider_name}")
            spider.logger.warning(f"No database handler found for spider {spider_name}; skipping database operations")
            return item

        handler.insert_item(item)
        return item
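
# The two pipelines above differ only in which registry they consult. If
# deduplication were wanted, a shared base class along these lines could
# work; this is a hedged sketch under that assumption, not part of the
# project:
#
# class RegistryPipeline:
#     registry = {}  # overridden by each subclass
#
#     def __init__(self):
#         self.db_handlers = {}
#
#     def open_spider(self, spider):
#         handler_class = self.registry.get(spider.name.lower())
#         if handler_class:
#             self.db_handlers[spider.name.lower()] = handler_class()
#
# class SQLitePipeline(RegistryPipeline):
#     registry = spider_handler_registry
#
# class MysqlPipeline(RegistryPipeline):
#     registry = mysql_handler_registry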