resources/scrapy_proj/scrapy_proj/pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# Useful for handling different item types with a single interface:
# from itemadapter import ItemAdapter
#
# class ScrapyProjPipeline:
#     def process_item(self, item, spider):
#         return item

import json  # used by the commented-out debug serialization below

from scrapy_proj.items import U001Item, Sis001Item, PBoxStuItem
# The concrete handler classes are imported alongside the registries,
# presumably so their module-level registration runs even though only the
# registries are referenced directly in this file.
from scrapy_proj.db_wapper.spider_db_handler import spider_handler_registry, U3C3DBHandler, SisDBHandler, IAFDDBHandler, PboxDBHandler
from scrapy_proj.db_wapper.mysql_handler import mysql_handler_registry, U3C3MysqlHandler


class SQLitePipeline:
    def __init__(self):
        self.db_handlers = {}

    def open_spider(self, spider):
        spider_name = spider.name.lower()
        handler_class = spider_handler_registry.get(spider_name)
        if not handler_class:
            # raise ValueError(f"No database handler class registered for spider {spider_name}")
            spider.logger.warning(f"No SQLite database handler class registered for spider {spider_name}; skipping database operations")
            return
        self.db_handlers[spider_name] = handler_class()

    def close_spider(self, spider):
        spider_name = spider.name.lower()
        handler = self.db_handlers.pop(spider_name, None)
        if handler:
            pass
            # handler.close()  # Intentionally not closed here; the stats middleware closes it

    def process_item(self, item, spider):
        spider_name = spider.name.lower()
        handler = self.db_handlers.get(spider_name)
        # Serialize to single-line JSON for debugging:
        # item_json = json.dumps(dict(item), ensure_ascii=False, separators=(',', ':'))
        # spider.logger.debug(f"spider name: {spider_name}, item: {item_json}")
        if not handler:
            # raise ValueError(f"No database handler found for spider {spider_name}")
            spider.logger.debug(f"No database handler found for spider {spider_name}; skipping database operations")
            return item
        handler.insert_item(item)
        return item

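
# For reference: the pipelines in this file assume only a minimal contract
# from the handler classes in scrapy_proj.db_wapper — a registry mapping
# lower-cased spider names to handler classes, a no-argument constructor, an
# insert_item() method, and a close() method (called elsewhere by the stats
# middleware). A hypothetical handler satisfying that contract might look
# like the sketch below; the names, filename, and registration line are
# illustrative assumptions, not the project's actual code:
#
#     import sqlite3
#
#     class ExampleDBHandler:
#         def __init__(self):
#             # One connection per spider run, opened when the spider opens.
#             self.conn = sqlite3.connect("spider_data.db")
#
#         def insert_item(self, item):
#             # Persist the item's fields; dict(item) works for Scrapy items.
#             ...
#
#         def close(self):
#             self.conn.close()
#
#     spider_handler_registry["example_spider"] = ExampleDBHandler
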
class MysqlPipeline:
    def __init__(self):
        self.db_handlers = {}

    def open_spider(self, spider):
        spider_name = spider.name.lower()
        handler_class = mysql_handler_registry.get(spider_name)
        if not handler_class:
            # raise ValueError(f"No database handler class registered for spider {spider_name}")
            spider.logger.warning(f"No database handler class registered for spider {spider_name}; skipping database operations")
            return
        self.db_handlers[spider_name] = handler_class()

    def close_spider(self, spider):
        spider_name = spider.name.lower()
        handler = self.db_handlers.pop(spider_name, None)
        if handler:
            pass
            # handler.close()  # Intentionally not closed here; the stats middleware closes it

    def process_item(self, item, spider):
        spider_name = spider.name.lower()
        handler = self.db_handlers.get(spider_name)
        # Serialize to single-line JSON for debugging:
        # item_json = json.dumps(dict(item), ensure_ascii=False, separators=(',', ':'))
        # spider.logger.debug(f"spider name: {spider_name}, item: {item_json}")
        if not handler:
            # raise ValueError(f"No database handler found for spider {spider_name}")
            spider.logger.warning(f"No database handler found for spider {spider_name}; skipping database operations")
            return item
        handler.insert_item(item)
        return item
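
# As the header comment notes, these pipelines must be registered in the
# project's settings.py via ITEM_PIPELINES before Scrapy will run them.
# ITEM_PIPELINES maps pipeline paths to integer priorities (0–1000, lower
# runs first); the values below are illustrative, not the project's actual
# configuration:
#
#     ITEM_PIPELINES = {
#         "scrapy_proj.pipelines.SQLitePipeline": 300,
#         "scrapy_proj.pipelines.MysqlPipeline": 400,
#     }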