Created
March 31, 2015 08:41
-
-
Save potatoqualitee/9be7f3b2767e371afd62 to your computer and use it in GitHub Desktop.
Quickly Find Duplicates in Large CSV Files using PowerShell
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# From https://blog.netnerds.net/2015/01/quickly-find-duplicates-from-csv-using-powershell/ | |
# The Text OleDB driver is only available in PowerShell x86. Start x86 shell if using x64. | |
# This has to be the first check this script performs. | |
if ($env:Processor_Architecture -ne "x86") { | |
Write-Warning "Switching to x86 shell" | |
&"$env:windir\syswow64\windowspowershell\v1.0\powershell.exe" "$PSCommandPath $args"; return | |
} | |
# Change to your CSV file name, must end in .csv or .tsv | |
$csvfile = "C:\temp\million-commas.txt" | |
# Does the first row contain column names? | |
$firstRowColumns = $false | |
# What's the delimiter? Use `t for tabbed. | |
$csvdelimter = "," | |
# By default, OleDbconnection columns are named F1, F2, F3, etc unless $firstRowColumns = $true | |
# Alternatively, you could make it check all rows. I'll add that to the script later and post it. | |
$checkColumns = "F2, F3" | |
################### No need to modify anything below ################### | |
$datasource = Split-Path $csvfile | |
$tablename = (Split-Path $csvfile -leaf).Replace(".","#") | |
switch ($firstRowColumns) { | |
$true { $firstRowColumns = "Yes" } | |
$false { $firstRowColumns = "No" } | |
} | |
$elapsed = [System.Diagnostics.Stopwatch]::StartNew() | |
[void][Reflection.Assembly]::LoadWithPartialName("System.Data") | |
# Setup OleDB using Microsoft Text Driver. | |
$connstring = "Provider=Microsoft.Jet.OLEDB.4.0;Data Source=$datasource;Extended Properties='text;HDR=$firstRowColumns;FMT=Delimited($csvdelimter)';" | |
$conn = New-Object System.Data.OleDb.OleDbconnection | |
$conn.ConnectionString = $connstring | |
$conn.Open() | |
$cmd = New-Object System.Data.OleDB.OleDBCommand | |
$cmd.Connection = $conn | |
# Perform select on CSV file, then add results to a datatable using ExecuteReader | |
$sql = "SELECT $checkColumns, COUNT(*) as DupeCount FROM [$tablename] GROUP BY $checkColumns HAVING COUNT(*) > 1" | |
$cmd.CommandText = $sql | |
$dt = New-Object System.Data.DataTable | |
$dt.BeginLoadData() | |
$dt.Load($cmd.ExecuteReader([System.Data.CommandBehavior]::CloseConnection)) | |
$dt.EndLoadData() | |
$totaltime = [math]::Round($elapsed.Elapsed.TotalSeconds,2) | |
# Get Total Row Count | |
$conn.Open() | |
$cmd.CommandText = "SELECT COUNT(*) as TotalRows FROM [$tablename]" | |
$totalrows = $cmd.ExecuteScalar() | |
$conn.Close() | |
# Output some stats | |
$dupecount = $dt.Rows.Count | |
Write-Host "Total Elapsed Time: $totaltime seconds. $dupecount duplicates found out of $totalrows total rows. You can access these dupes using `$dt." -ForegroundColor Green |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
how if there are more than one file csv ?