Skip to content

Instantly share code, notes, and snippets.

@gitfvb
Last active April 10, 2019 09:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gitfvb/0e2b134eccf4b7354443b0c298a36a1a to your computer and use it in GitHub Desktop.
Save gitfvb/0e2b134eccf4b7354443b0c298a36a1a to your computer and use it in GitHub Desktop.
merge large files via powershell and load it via sqlite
################################################
#
# PREPARATION / ASSEMBLIES
#
################################################
# Load scriptpath
if ($MyInvocation.MyCommand.CommandType -eq "ExternalScript") {
$scriptPath = Split-Path -Parent -Path $MyInvocation.MyCommand.Definition
} else {
$scriptPath = Split-Path -Parent -Path ([Environment]::GetCommandLineArgs()[0])
}
$preloadDir = ( Get-Item -Path $scriptPath ).FullName
# change dir to data path
$dataDir = ( Get-Item -Path "$( $scriptPath )\..\data" ).FullName
# set the skipper to 0
$skipFirstLine = $false
cd $dataDir
$targetFilename = "output.csv"
# load and combine data
Measure-Command {
Get-ChildItem -path "$( $dataDir )\source" -Filter *.csv | ForEach {
#https://stackoverflow.com/questions/9668427/remove-top-line-of-a-huge-300m-text-file-with-powershell
#$reader = [IO.File]::OpenText($_.FullName)
$reader = New-Object System.IO.StreamReader($_.FullName, [System.Text.Encoding]::ASCII)
$writer = New-Object System.IO.StreamWriter("$( $dataDir )\$( $targetFilename )", $true, [System.Text.Encoding]::UTF8) # the true means to "append"
if ($skipFirstLine -eq $true) {
$reader.ReadLine() > $null # Skip first line.
}
while ($reader.Peek() -ge 0) {
$writer.writeline($reader.ReadLine())
}
$reader.Close()
$writer.Close()
$skipFirstLine = $true
}
}
$targetTablename = "Umsatz"
$targetFile = Get-Item -Path $targetFilename
'.mode csv','.separator ";"',".import $( $targetFile.FullName -replace "\\", "/" ) $( $targetTablename )" | ..\preload\sqlite3.exe "$( $dataDir )\DAT_Umsatz.sqlite"
# this is very fast and efficent way to scan through a csv file without parsing it as csv
Measure-Command {
Get-ChildItem -path "tabelle.csv" | ForEach {
$reader = New-Object System.IO.StreamReader($_.FullName, [System.Text.Encoding]::UTF8)
$encodingObject = New-Object System.Text.UTF8Encoding $false # use new object instead of [System.Text.UTF8Encoding]::new($false) to ensure better compatibility
$writer = New-Object System.IO.StreamWriter("$( $exportDir )\$( $filename ).fast", $true, $encodingObject) # the true means to "append"; the false in the encoding does not use the BOM at the beginning
# https://stackoverflow.com/questions/3268622/regex-to-split-line-csv-file
while ($reader.Peek() -ge 0) {
$line = $reader.ReadLine() # Read line
$line = [Regex]::Replace($line,'"', "") # Remove quotes
$items = $line.Split(",") # String -> Array by delimiter
$line = $items -join ";" # Join items together again
#$items \w
#$items = [Regex]::Split($line,",")
#$line = [String]::Join($items,";")
#[String]:: $line
#$line = [Regex]::Replace($line,",", ";")
$writer.writeline($line)
}
$reader.Close()
$writer.Close()
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment