Skip to content

Instantly share code, notes, and snippets.

@mbourgon
Created May 5, 2015 18:59
Show Gist options
  • Save mbourgon/d390aca43cf9252a402f to your computer and use it in GitHub Desktop.
Save mbourgon/d390aca43cf9252a402f to your computer and use it in GitHub Desktop.
Basic dynamic CSV ripper - complete implementation.
USE tempdb
/*
2015/04/22 - mdb - 2.00 - pull raw CSV, shred via function, pivot into a virtual table, then insert into
actual table based on existing fields. Practically: forwards/backwards compatibility.
Add new fields to your target table, and as they show up in the new file they'll be added.
Practically, they'll have to rename the file when they do this, so that we know the
names of the new fields. But that's just a parameter change in the job.
2015/05/05 - mdb - 2.1 - very basic implementation, complete code. Just fill out the parameters!
Downside: SLOW?! A 5mb file, which has 106k rows, takes 7 seconds.
PREREQs: Jeff Moden's 8k CSV parser, or something like it.
*/
--BULK INSERT requires dynamic SQL
--@sql is NVARCHAR(MAX): the generated pivot statement grows with the column count,
--and NVARCHAR(4000) would silently truncate it for wide files.
DECLARE @filename VARCHAR(500), @sql NVARCHAR(MAX), @base_filename VARCHAR(500), @servername VARCHAR(255) --@servername currently unused; kept for job plumbing
DECLARE @Columns VARCHAR (MAX), @Columns_Short VARCHAR (MAX), @Columns_Insert VARCHAR(MAX)
DECLARE @target_table sysname
DECLARE @ErrorMessage NVARCHAR(4000);
DECLARE @ErrorSeverity INT;
DECLARE @ErrorState INT;
--Parameters: source file and destination table.
SELECT @filename = 'c:\temp\blah.csv'
--Bare file name = text after the last backslash. Guard the no-backslash case:
--CHARINDEX returns 0 there, and RIGHT(@filename, -1) raises "Invalid length parameter".
SELECT @base_filename = CASE WHEN CHARINDEX('\', @filename) = 0
                             THEN @filename
                             ELSE RIGHT(@filename, CHARINDEX('\', REVERSE(@filename)) - 1)
                        END
SELECT @target_table = 'blahtemp'
--the code has to be run in database with the table, since I use nondynamic sql for the columns
-------------------------------
--Creating tables for process--
-------------------------------
--Raw BULK INSERT landing table for the header row. Kept ID-free so the bulk
--load sees a single varchar column.
IF OBJECT_ID('tempdb..#csv_shred_stage_headers_insert') IS NOT NULL
    DROP TABLE #csv_shred_stage_headers_insert
CREATE TABLE #csv_shred_stage_headers_insert
    (
      [resultant] [varchar](4000)
    )

--Raw BULK INSERT landing table for the data rows (same reason: no ID column).
IF OBJECT_ID('tempdb..#csv_shred_insert') IS NOT NULL
    DROP TABLE #csv_shred_insert
CREATE TABLE #csv_shred_insert
    (
      [resultant] [varchar](4000)
    )

--Second copy of the data carrying an IDENTITY, so every source row gets a
--stable id for the PIVOT step.
IF OBJECT_ID('tempdb..#csv_shred_stage') IS NOT NULL
    DROP TABLE #csv_shred_stage
CREATE TABLE #csv_shred_stage
    (
      ID INT IDENTITY,
      [resultant] [varchar](4000)
    )

--Holds the shredded (row id, field position, field value) keypairs.
IF OBJECT_ID('tempdb..#csv_shred_split') IS NOT NULL
    DROP TABLE #csv_shred_split
CREATE TABLE #csv_shred_split
    (
      [id] [int] NOT NULL,
      [ItemNumber] [int] NULL,
      [Item] [varchar](500)
    )
CREATE UNIQUE CLUSTERED INDEX ucidx__csv_shred_split ON #csv_shred_split ([id], [ItemNumber])
--------------------------
--Import JUST the Header--
--------------------------
--Load only row 1 (the column-name line) into the header staging table.
PRINT CONVERT(VARCHAR(30),GETDATE(),120) + ' BULK INSERT of header begins for ' + @filename
SET @sql =
'BULK INSERT #csv_shred_stage_headers_insert
FROM ''' + @filename + '''
WITH
(
LASTROW = 1,
FIELDTERMINATOR = ''\0'' --the \0 is "null terminator"; needed to make sure it doesnt try and parse
)'
BEGIN TRY
EXEC sp_executesql @sql
END TRY
BEGIN CATCH
PRINT 'File either locked, Does Not Exist, or format has changed; see error message for more details'
SELECT @ErrorMessage = ERROR_MESSAGE(),
@ErrorSeverity = ERROR_SEVERITY(),
@ErrorState = ERROR_STATE();
IF @@TRANCOUNT > 0
ROLLBACK TRANSACTION;
PRINT 'Error Severity: ' + CONVERT(VARCHAR(30), @ErrorSeverity)
PRINT 'Error State: ' + CONVERT(VARCHAR(30), @ErrorState)
--bug fix: this line printed the message text under an "Error Severity" label
PRINT 'Error Message: ' + @ErrorMessage
--commented this out so we can run it without files.
--RAISERROR (@ErrorMessage, -- Message text.
-- @ErrorSeverity, -- Severity.
-- @ErrorState -- State.
-- );
END CATCH
---------------
--Import Data--
---------------
--Load everything after the header line (FIRSTROW = 2) into the raw data table.
PRINT CONVERT(VARCHAR(30),GETDATE(),120) + ' BULK INSERT of data begins for ' + @filename
--inserts into view in order to add the ID column so the PIVOT works. insertview is just a select *, minus the ID
SET @sql =
'BULK INSERT #csv_shred_insert
FROM ''' + @filename + '''
WITH
(
FIRSTROW = 2,
FIELDTERMINATOR = ''\0'' --the \0 is "null terminator"; needed to make sure it doesnt try and parse
)'
BEGIN TRY
EXEC sp_executesql @sql
END TRY
BEGIN CATCH
PRINT 'File either locked, Does Not Exist, or format has changed; see error message for more details'
SELECT @ErrorMessage = ERROR_MESSAGE(),
@ErrorSeverity = ERROR_SEVERITY(),
@ErrorState = ERROR_STATE();
IF @@TRANCOUNT > 0
ROLLBACK TRANSACTION;
PRINT 'Error Severity: ' + CONVERT(VARCHAR(30), @ErrorSeverity)
PRINT 'Error State: ' + CONVERT(VARCHAR(30), @ErrorState)
--bug fix: this line printed the message text under an "Error Severity" label
PRINT 'Error Message: ' + @ErrorMessage
--commented this out so we can run it without files.
--RAISERROR (@ErrorMessage, -- Message text.
-- @ErrorSeverity, -- Severity.
-- @ErrorState -- State.
-- );
END CATCH
PRINT CONVERT(VARCHAR(30),GETDATE(),120) + ' BULK INSERT ends for ' + @filename
--Copy the raw rows into the staging table so each row picks up an IDENTITY id.
--(With a permanent table + view we could skip this intermediate copy.)
INSERT INTO #csv_shred_stage (resultant)
SELECT resultant
FROM #csv_shred_insert
--===== Shred every row's CSV into (id, position, value) keypairs via CROSS APPLY.
--NULLIF converts blank values ('') to NULL as they land; left as '' they later
--fail with "error converting varchar to decimal" against typed target columns.
INSERT INTO #csv_shred_split
(id, ItemNumber, Item)
SELECT stg.id, kv.ItemNumber, NULLIF(kv.Item, '')
FROM #csv_shred_stage stg
CROSS APPLY dbo.DelimitedSplit8K(stg.resultant, ',') kv
----------------------
--Building the PIVOT--
----------------------
--(I think we) have to do this part here, and not earlier - we need to match up all the tables
-- (though I wonder if we could fake the "split" and just self-join)
--Builds three comma lists from the intersection of (file header names) and
--(columns that actually exist on @target_table), keyed by field position:
--  @Columns        : '[1] as [colA],[2] as [colB]' -- pivot output -> real names
--  @Columns_Short  : '[1],[2]'                     -- the PIVOT IN () list
--  @Columns_Insert : '[colA],[colB]'               -- the INSERT/SELECT list
--The COALESCE(@v + ',','') trick seeds the list on the first row, appends after.
--NOTE(review): SELECT @v = @v + ... with ORDER BY is not a documented-guaranteed
--ordering in SQL Server; it works in practice on single-stream plans but a
--STRING_AGG/FOR XML PATH rewrite would be safer - confirm before relying on order.
SELECT @Columns=COALESCE(@Columns + ',','') + QUOTENAME(ItemNumber) + ' as ' + QUOTENAME(item)
, @Columns_Short = COALESCE(@Columns_Short + ',','') + QUOTENAME(ItemNumber)
, @Columns_Insert = COALESCE(@Columns_Insert + ',','') + QUOTENAME(item)
FROM
(
--Distinct (position, header-name) pairs...
SELECT DISTINCT split.ItemNumber, headers.item
From #csv_shred_split split
INNER JOIN
(
--...taken from the shredded header row...
SELECT split.ItemNumber, split.item
FROM #csv_shred_stage_headers_insert headers
CROSS APPLY dbo.DelimitedSplit8K(resultant,',') split
)
headers
ON headers.ItemNumber = split.ItemNumber
--...filtered to header names that match an existing column on the target table;
--this is what makes new/renamed file fields a no-op until the column is added.
INNER JOIN INFORMATION_SCHEMA.COLUMNS
ON TABLE_NAME = @target_table
AND columns.COLUMN_NAME = headers.item
) AS B
ORDER BY B.ItemNumber
--We need the CTE so that we can calculate the import_datetime more easily.
-- (import_datetime not included in this stripped down version)
--Pivot the keypairs back into columns by field position, then insert only the
--columns that matched the target table.
SET @SQL='
;with CTE_Import as
(
SELECT ' + @Columns
+ ' FROM
(
SELECT id, ItemNumber, item FROM
#csv_shred_split ) AS source
PIVOT
(max(item) FOR source.ItemNumber IN (' + @Columns_Short + ')
)AS pvt
)
insert into ' + QUOTENAME(@target_table) + ' (' + @Columns_Insert + ')
select ' + @Columns_Insert + '
from CTE_Import'
--QUOTENAME on @target_table: the raw name was concatenated into dynamic SQL,
--which breaks on odd names and is an injection vector if the parameter ever
--comes from outside. (Assumes an unqualified, single-part table name.)
PRINT @sql
--sp_executesql instead of EXEC(): consistent with the BULK INSERT calls above
--and plan-cache friendly.
EXEC sp_executesql @sql
PRINT CONVERT(VARCHAR(30),GETDATE(),120) + ' insert into _main ends for ' + @filename
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment