@potatoqualitee
Last active February 24, 2021 15:11
Insanely fast CSV to SQL Server imports using the FAF technique (batched datasets and in-line multithreading)
<#
PowerShell, SqlBulkCopy, and Runspaces Speed Test
-- For speed testing with million.csv --
-- Drops and recreates the specified database if it exists --
This script will:
- Download the million-row dataset automatically if it doesn't exist
- Drop the specified database if it exists
- Recreate it with import-optimized settings
- Create a table suitable for the dataset
- Import the rows really quickly
#>
# Set variables
$sqlserver = "sql2012-super"
$database = "pssqlbulkcopy"
$table = "longsandlats"
$csvfile = "$([Environment]::GetFolderPath('MyDocuments'))\million.csv"
# Check for the CSV; download and extract it if it's missing
if ((Test-Path $csvfile) -eq $false) {
    Write-Output "Going to grab the 20MB CSV zip file from OneDrive"
    Add-Type -Assembly "System.IO.Compression.FileSystem"
    $zipfile = "$([Environment]::GetFolderPath('MyDocuments'))\million.zip"
    Invoke-WebRequest -Uri http://1drv.ms/1QZybEo -OutFile $zipfile
    [IO.Compression.ZipFile]::ExtractToDirectory($zipfile, [Environment]::GetFolderPath('MyDocuments'))
    Remove-Item $zipfile
}
<#
SQL section: drops and recreates the database
#>
# Build the SQL Server Connection
$sqlconn = New-Object System.Data.SqlClient.SqlConnection
$sqlconn.ConnectionString = "Data Source=$sqlserver;Integrated Security=true;Initial Catalog=master;"
$sqlconn.Open()
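# The connection targets master so the script can drop and recreate the
# working database (you can't drop a database you're connected to).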
$dbsql = "IF EXISTS (SELECT name FROM master.dbo.sysdatabases WHERE name = N'$database')
BEGIN
    ALTER DATABASE [$database] SET SINGLE_USER WITH ROLLBACK IMMEDIATE
    DROP DATABASE [$database]
END
CREATE DATABASE [$database]
ALTER DATABASE [$database] MODIFY FILE ( NAME = N'$database', SIZE = 4GB )
ALTER DATABASE [$database] SET RECOVERY SIMPLE WITH NO_WAIT
ALTER DATABASE [$database] SET PAGE_VERIFY NONE
ALTER DATABASE [$database] SET AUTO_UPDATE_STATISTICS OFF
ALTER DATABASE [$database] SET AUTO_CREATE_STATISTICS OFF
"
Write-Output "Recreating database"
$dbcmd = New-Object System.Data.SqlClient.SqlCommand
$dbcmd.CommandTimeout = 60
$dbcmd.Connection = $sqlconn
$dbcmd.CommandText = $dbsql
$dbcmd.ExecuteNonQuery() > $null
Write-Output "Creating table"
$tablesql = "CREATE TABLE [dbo].[$table](
    [GeoNameId] [int],
    [Name] [nvarchar](200),
    [AsciiName] [nvarchar](200),
    [AlternateNames] [nvarchar](max),
    [Latitude] [float],
    [Longitude] [float],
    [FeatureClass] [char](1),
    [FeatureCode] [varchar](10),
    [CountryCode] [char](2),
    [Cc2] [varchar](255),
    [Admin1Code] [varchar](20),
    [Admin2Code] [varchar](80),
    [Admin3Code] [varchar](20),
    [Admin4Code] [varchar](20),
    [Population] [bigint],
    [Elevation] [varchar](255),
    [Dem] [int],
    [Timezone] [varchar](40),
    [ModificationDate] [smalldatetime]
)
"
$sqlconn.ChangeDatabase($database)
$tablecmd = New-Object System.Data.SqlClient.SqlCommand
$tablecmd.Connection = $sqlconn
$tablecmd.CommandText = $tablesql
$tablecmd.ExecuteNonQuery() > $null
$sqlconn.Close()
$sqlconn.Dispose()
<#
Data processing section
#>
# Set some vars
$delimiter = "`t"
$batchsize = 2000
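# 2000 rows per batch is just a starting point; a larger batch means fewer
# handoffs but a bigger in-memory copy per runspace, so tune for your box.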
# Set up a DataTable, since SqlBulkCopy.WriteToServer can consume one
$datatable = New-Object System.Data.DataTable
$columns = (Get-Content $csvfile -First 1).Split($delimiter)
foreach ($column in $columns) {
    $null = $datatable.Columns.Add()
}
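# The columns are added untyped (so they default to strings); SqlBulkCopy
# converts each value to the destination column's type during WriteToServer.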
# Set up the runspace pool and the scriptblock that runs inside each runspace
$pool = [RunspaceFactory]::CreateRunspacePool(1,5)
$pool.Open()
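# CreateRunspacePool(1,5) caps concurrency at five simultaneous bulk inserts;
# raise the maximum if the server has headroom.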
$jobs = @()
$scriptblock = {
    Param (
        [string]$connectionString,
        [string]$table,
        [object]$dtbatch
    )
    $bulkcopy = New-Object Data.SqlClient.SqlBulkCopy($connectionString, @("TableLock", "KeepNulls"))
    $bulkcopy.DestinationTableName = $table
    $bulkcopy.BatchSize = 2000
    $bulkcopy.WriteToServer($dtbatch)
    $bulkcopy.Close()
    $dtbatch.Clear()
}
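# Each runspace builds its own SqlBulkCopy (and thus its own connection), so
# batches insert in parallel while the main thread keeps reading the file.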
Write-Output "Starting insert"
$elapsed = [System.Diagnostics.Stopwatch]::StartNew()
# Use a StreamReader to process the CSV file, efficiently adding each row to the datatable.
# Once batchsize is reached, hand the batch off to a runspace for processing, then empty the datatable.
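# Handing off $datatable.Copy() gives each runspace its own snapshot, so the
# live table can be cleared and refilled while earlier batches are still inserting.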
$reader = New-Object System.IO.StreamReader($csvfile)
$connectionString = "Data Source=$sqlserver;Integrated Security=true;Initial Catalog=$database;"
while (($line = $reader.ReadLine()) -ne $null) {
    $null = $datatable.Rows.Add($line.Split($delimiter))
    if ($datatable.Rows.Count % $batchsize -eq 0) {
        $job = [PowerShell]::Create()
        $null = $job.AddScript($scriptblock)
        $null = $job.AddArgument($connectionString)
        $null = $job.AddArgument($table)
        $null = $job.AddArgument($datatable.Copy())
        $job.RunspacePool = $pool
        $jobs += [PSCustomObject]@{ Status = $job.BeginInvoke() }
        $datatable.Clear()
    }
}
$reader.Close()
# Process any remaining rows
if ($datatable.Rows.Count -gt 0) {
    $bulkcopy = New-Object Data.SqlClient.SqlBulkCopy($connectionString, [System.Data.SqlClient.SqlBulkCopyOptions]::TableLock)
    $bulkcopy.DestinationTableName = $table
    $bulkcopy.BulkCopyTimeout = 0
    $bulkcopy.WriteToServer($datatable)
    $bulkcopy.Close()
    $datatable.Clear()
}
# Wait for all runspaces to complete, then release the pool
while ($jobs.Status.IsCompleted -contains $false) {}
$pool.Close()
$pool.Dispose()
$secs = $elapsed.Elapsed.TotalSeconds
# Write out stats for million row csv file
$rs = "{0:N0}" -f [int](1000000 / $secs)
$rm = "{0:N0}" -f [int](1000000 / $secs * 60)
$mill = "{0:N0}" -f 1000000
Write-Output "$mill rows imported in $([math]::round($secs,2)) seconds ($rs rows/sec and $rm rows/min)"
@DanOBrien

Oh my god! It's full of stars!

@potatoqualitee
Author

You didn't even leave a star :(

@VijayGanjiDevOps

Good work, and I am learning a lot from it. Thanks!
