diff --git a/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py b/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py index 50184c9..b349e91 100644 --- a/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py +++ b/scrapy_proj/scrapy_proj/db_wapper/sqlite_base.py @@ -60,9 +60,9 @@ class SQLiteDBHandler: return processed_data - def insert_or_update_common(self, data, tbl_name, uniq_key='url'): + def insert_or_update_common(self, data, tbl_name, uniq_key='url', exists_do_nothing=False): if self.lower_sqlite_version: - return self.insert_or_update_common_lower(data, tbl_name, uniq_key) + return self.insert_or_update_common_lower(data, tbl_name, uniq_key, exists_do_nothing) try: processed_data = self.check_and_process_data(data, tbl_name) @@ -72,12 +72,17 @@ class SQLiteDBHandler: columns = ', '.join(processed_data.keys()) values = list(processed_data.values()) placeholders = ', '.join(['?' for _ in values]) - update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key]) + + if exists_do_nothing: + conflict_clause = f'ON CONFLICT ({uniq_key}) DO NOTHING' + else: + update_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in processed_data.keys() if col != uniq_key]) + conflict_clause = f"ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause}" sql = f''' INSERT INTO {tbl_name} ({columns}) - VALUES ({placeholders}) - ON CONFLICT ({uniq_key}) DO UPDATE SET {update_clause} + VALUES ({placeholders}) + {conflict_clause} ''' self.cursor.execute(sql, values) self.conn.commit() @@ -90,7 +95,7 @@ class SQLiteDBHandler: logging.error(f"Error inserting or updating data: {e}") return None - def insert_or_update_common_lower(self, data, tbl_name, uniq_key='url'): + def insert_or_update_common_lower(self, data, tbl_name, uniq_key='url', exists_do_nothing=False): try: processed_data = self.check_and_process_data(data, tbl_name) if processed_data is None: @@ -109,12 +114,13 @@ class SQLiteDBHandler: self.cursor.execute(sql, values) self.conn.commit() except sqlite3.IntegrityError: # 唯一键冲突,执行更新操作 - update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key]) - update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key] - update_values.append(data[uniq_key]) - sql = f"UPDATE {tbl_name} SET {update_clause} WHERE {uniq_key} = ?" - self.cursor.execute(sql, update_values) - self.conn.commit() + if not exists_do_nothing: + update_clause = ', '.join([f"{col}=?" for col in processed_data.keys() if col != uniq_key]) + update_values = [processed_data[col] for col in processed_data.keys() if col != uniq_key] + update_values.append(data[uniq_key]) + sql = f"UPDATE {tbl_name} SET {update_clause} WHERE {uniq_key} = ?" + self.cursor.execute(sql, update_values) + self.conn.commit() # 获取插入或更新后的记录 ID self.cursor.execute(f"SELECT id FROM {tbl_name} WHERE {uniq_key} = ?", (data[uniq_key],)) diff --git a/scrapy_proj/scrapy_proj/pipelines.py b/scrapy_proj/scrapy_proj/pipelines.py index 77015b3..b39769d 100644 --- a/scrapy_proj/scrapy_proj/pipelines.py +++ b/scrapy_proj/scrapy_proj/pipelines.py @@ -76,7 +76,7 @@ class SQLitePipeline(SQLiteDBHandler): def _process_u001_item(self, item, spider): logging.debug(f"insert one item. href:{spider.name}") - return self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url') + return self.insert_or_update_common(item, tbl_name=self.tbl_name_u3c3, uniq_key='url', exists_do_nothing=True) def _process_sis001_item(self, item, spider): self.cursor.execute('''