Last active
April 10, 2019 09:02
-
-
Save gitfvb/0e2b134eccf4b7354443b0c298a36a1a to your computer and use it in GitHub Desktop.
merge large files via powershell and load it via sqlite
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################ | |
# | |
# PREPARATION / ASSEMBLIES | |
# | |
################################################ | |
# Load scriptpath | |
if ($MyInvocation.MyCommand.CommandType -eq "ExternalScript") { | |
$scriptPath = Split-Path -Parent -Path $MyInvocation.MyCommand.Definition | |
} else { | |
$scriptPath = Split-Path -Parent -Path ([Environment]::GetCommandLineArgs()[0]) | |
} | |
$preloadDir = ( Get-Item -Path $scriptPath ).FullName | |
# change dir to data path | |
$dataDir = ( Get-Item -Path "$( $scriptPath )\..\data" ).FullName | |
# set the skipper to 0 | |
$skipFirstLine = $false | |
cd $dataDir | |
$targetFilename = "output.csv" | |
# load and combine data | |
Measure-Command { | |
Get-ChildItem -path "$( $dataDir )\source" -Filter *.csv | ForEach { | |
#https://stackoverflow.com/questions/9668427/remove-top-line-of-a-huge-300m-text-file-with-powershell | |
#$reader = [IO.File]::OpenText($_.FullName) | |
$reader = New-Object System.IO.StreamReader($_.FullName, [System.Text.Encoding]::ASCII) | |
$writer = New-Object System.IO.StreamWriter("$( $dataDir )\$( $targetFilename )", $true, [System.Text.Encoding]::UTF8) # the true means to "append" | |
if ($skipFirstLine -eq $true) { | |
$reader.ReadLine() > $null # Skip first line. | |
} | |
while ($reader.Peek() -ge 0) { | |
$writer.writeline($reader.ReadLine()) | |
} | |
$reader.Close() | |
$writer.Close() | |
$skipFirstLine = $true | |
} | |
} | |
$targetTablename = "Umsatz" | |
$targetFile = Get-Item -Path $targetFilename | |
'.mode csv','.separator ";"',".import $( $targetFile.FullName -replace "\\", "/" ) $( $targetTablename )" | ..\preload\sqlite3.exe "$( $dataDir )\DAT_Umsatz.sqlite" | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# this is very fast and efficent way to scan through a csv file without parsing it as csv | |
Measure-Command { | |
Get-ChildItem -path "tabelle.csv" | ForEach { | |
$reader = New-Object System.IO.StreamReader($_.FullName, [System.Text.Encoding]::UTF8) | |
$encodingObject = New-Object System.Text.UTF8Encoding $false # use new object instead of [System.Text.UTF8Encoding]::new($false) to ensure better compatibility | |
$writer = New-Object System.IO.StreamWriter("$( $exportDir )\$( $filename ).fast", $true, $encodingObject) # the true means to "append"; the false in the encoding does not use the BOM at the beginning | |
# https://stackoverflow.com/questions/3268622/regex-to-split-line-csv-file | |
while ($reader.Peek() -ge 0) { | |
$line = $reader.ReadLine() # Read line | |
$line = [Regex]::Replace($line,'"', "") # Remove quotes | |
$items = $line.Split(",") # String -> Array by delimiter | |
$line = $items -join ";" # Join items together again | |
#$items \w | |
#$items = [Regex]::Split($line,",") | |
#$line = [String]::Join($items,";") | |
#[String]:: $line | |
#$line = [Regex]::Replace($line,",", ";") | |
$writer.writeline($line) | |
} | |
$reader.Close() | |
$writer.Close() | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment