# potatoqualitee/Get-DupesinCSV.ps1

## Get-DupesinCSV.ps1
# From https://blog.netnerds.net/2015/01/quickly-find-duplicates-from-csv-using-powershell/
# The Text OleDB driver is only available in PowerShell x86. Start x86 shell if using x64.
# This has to be the first check this script performs.
# Re-launch this script under the 32-bit PowerShell host when the current
# process is not x86, then stop the current (non-x86) run.
# -File plus splatted @args hands the script path and every original argument
# to the child as separate tokens, so paths or arguments containing spaces
# survive (a single "$PSCommandPath $args" string is parsed as a command line).
if ($env:Processor_Architecture -ne "x86") {
    Write-Warning "Switching to x86 shell"
    & "$env:windir\syswow64\windowspowershell\v1.0\powershell.exe" -File $PSCommandPath @args
    return
}

# Full path to the delimited text file to scan for duplicates.
# NOTE(review): the original note says the name must end in .csv or .tsv, yet
# the sample path below ends in .txt -- confirm which extensions the text
# driver actually accepts.
$csvfile = "C:\temp\million-commas.txt"

# Does the first row contain column names? ($true / $false; translated to the
# Yes/No value of the HDR connection-string property further down.)
$firstRowColumns = $false

# What's the delimiter? Use `t for tabbed. The value is inserted into the
# FMT=Delimited(...) part of the connection string.
$csvdelimter = ","

# Comma-separated list of the columns that are compared to detect duplicates;
# it is placed verbatim into the SELECT and GROUP BY clauses of the query.
# By default, OleDbconnection columns are named F1, F2, F3, etc unless $firstRowColumns = $true
# Alternatively, you could make it check all rows. I'll add that to the script later and post it.
$checkColumns = "F2, F3"

################### No need to modify anything below ###################
# Find duplicate rows in $csvfile by querying it through the Jet text provider,
# leave the duplicate groups in $dt, and print a one-line summary.
#
# The connection uses the file's folder as Data Source and addresses the file
# as a table whose name is the file name with every "." replaced by "#".
$datasource = Split-Path $csvfile
$tablename = (Split-Path $csvfile -Leaf).Replace(".","#")

# Fail early with a clear message instead of a provider error further down.
if (-not (Test-Path -LiteralPath $csvfile -PathType Leaf)) {
    throw "CSV file not found: $csvfile"
}

# Translate the boolean setting into the Yes/No text used for HDR=.
switch ($firstRowColumns) {
    $true { $firstRowColumns = "Yes" }
    $false { $firstRowColumns = "No" }
}

$elapsed = [System.Diagnostics.Stopwatch]::StartNew()
# Add-Type replaces the obsolete [Reflection.Assembly]::LoadWithPartialName call.
Add-Type -AssemblyName System.Data

# Setup OleDB using Microsoft Text Driver.
$connstring = "Provider=Microsoft.Jet.OLEDB.4.0;Data Source=$datasource;Extended Properties='text;HDR=$firstRowColumns;FMT=Delimited($csvdelimter)';"

$conn = New-Object System.Data.OleDb.OleDbConnection
$conn.ConnectionString = $connstring
$cmd = New-Object System.Data.OleDb.OleDbCommand
$cmd.Connection = $conn
$dt = New-Object System.Data.DataTable
$reader = $null

# One connection serves both queries; the finally block releases the reader,
# command and connection even when a query throws.
try {
    $conn.Open()

    # Group on the selected columns; every group with more than one row is a
    # duplicate. The result set is loaded into $dt.
    $sql = "SELECT $checkColumns, COUNT(*) as DupeCount FROM [$tablename] GROUP BY $checkColumns HAVING COUNT(*) > 1"
    $cmd.CommandText = $sql
    $reader = $cmd.ExecuteReader()
    $dt.BeginLoadData()
    $dt.Load($reader)
    $dt.EndLoadData()
    # Make sure the reader is closed before the command is reused below.
    $reader.Close()

    # The reported time covers the duplicate query only; the row count that
    # follows is not included.
    $totaltime = [math]::Round($elapsed.Elapsed.TotalSeconds,2)

    # Get Total Row Count
    $cmd.CommandText = "SELECT COUNT(*) as TotalRows FROM [$tablename]"
    $totalrows = $cmd.ExecuteScalar()
}
finally {
    if ($null -ne $reader) { $reader.Dispose() }
    $cmd.Dispose()
    $conn.Dispose()
}

# Output some stats
$dupecount = $dt.Rows.Count
Write-Host "Total Elapsed Time: $totaltime seconds. $dupecount duplicates found out of $totalrows total rows. You can access these dupes using `$dt." -ForegroundColor Green
	# From https://blog.netnerds.net/2015/01/quickly-find-duplicates-from-csv-using-powershell/
	# The Text OleDB driver is only available in PowerShell x86. Start x86 shell if using x64.
	# This has to be the first check this script performs.
	# Re-launch this script under the 32-bit PowerShell host when the current
	# process is not x86, then stop the current (non-x86) run.
	# -File plus splatted @args hands the script path and every original argument
	# to the child as separate tokens, so paths or arguments containing spaces
	# survive (a single "$PSCommandPath $args" string is parsed as a command line).
	if ($env:Processor_Architecture -ne "x86") {
		Write-Warning "Switching to x86 shell"
		& "$env:windir\syswow64\windowspowershell\v1.0\powershell.exe" -File $PSCommandPath @args
		return
	}

	# Full path to the delimited text file to scan for duplicates.
	# NOTE(review): the original note says the name must end in .csv or .tsv, yet
	# the sample path below ends in .txt -- confirm which extensions the text
	# driver actually accepts.
	$csvfile = "C:\temp\million-commas.txt"

	# Does the first row contain column names? ($true / $false; translated to the
	# Yes/No value of the HDR connection-string property further down.)
	$firstRowColumns = $false

	# What's the delimiter? Use `t for tabbed. The value is inserted into the
	# FMT=Delimited(...) part of the connection string.
	$csvdelimter = ","

	# Comma-separated list of the columns that are compared to detect duplicates;
	# it is placed verbatim into the SELECT and GROUP BY clauses of the query.
	# By default, OleDbconnection columns are named F1, F2, F3, etc unless $firstRowColumns = $true
	# Alternatively, you could make it check all rows. I'll add that to the script later and post it.
	$checkColumns = "F2, F3"

	################### No need to modify anything below ###################
	# Find duplicate rows in $csvfile by querying it through the Jet text provider,
	# leave the duplicate groups in $dt, and print a one-line summary.
	# NOTE(review): this tab-indented section appears to be a second copy of the
	# script that precedes it in this file -- consider keeping only one copy.
	#
	# The connection uses the file's folder as Data Source and addresses the file
	# as a table whose name is the file name with every "." replaced by "#".
	$datasource = Split-Path $csvfile
	$tablename = (Split-Path $csvfile -Leaf).Replace(".","#")

	# Fail early with a clear message instead of a provider error further down.
	if (-not (Test-Path -LiteralPath $csvfile -PathType Leaf)) {
		throw "CSV file not found: $csvfile"
	}

	# Translate the boolean setting into the Yes/No text used for HDR=.
	switch ($firstRowColumns) {
		$true { $firstRowColumns = "Yes" }
		$false { $firstRowColumns = "No" }
	}

	$elapsed = [System.Diagnostics.Stopwatch]::StartNew()
	# Add-Type replaces the obsolete [Reflection.Assembly]::LoadWithPartialName call.
	Add-Type -AssemblyName System.Data

	# Setup OleDB using Microsoft Text Driver.
	$connstring = "Provider=Microsoft.Jet.OLEDB.4.0;Data Source=$datasource;Extended Properties='text;HDR=$firstRowColumns;FMT=Delimited($csvdelimter)';"

	$conn = New-Object System.Data.OleDb.OleDbConnection
	$conn.ConnectionString = $connstring
	$cmd = New-Object System.Data.OleDb.OleDbCommand
	$cmd.Connection = $conn
	$dt = New-Object System.Data.DataTable
	$reader = $null

	# One connection serves both queries; the finally block releases the reader,
	# command and connection even when a query throws.
	try {
		$conn.Open()

		# Group on the selected columns; every group with more than one row is a
		# duplicate. COUNT(*) needs the asterisk -- a bare COUNT() is not valid SQL.
		$sql = "SELECT $checkColumns, COUNT(*) as DupeCount FROM [$tablename] GROUP BY $checkColumns HAVING COUNT(*) > 1"
		$cmd.CommandText = $sql
		$reader = $cmd.ExecuteReader()
		$dt.BeginLoadData()
		$dt.Load($reader)
		$dt.EndLoadData()
		# Make sure the reader is closed before the command is reused below.
		$reader.Close()

		# The reported time covers the duplicate query only; the row count that
		# follows is not included.
		$totaltime = [math]::Round($elapsed.Elapsed.TotalSeconds,2)

		# Get Total Row Count
		$cmd.CommandText = "SELECT COUNT(*) as TotalRows FROM [$tablename]"
		$totalrows = $cmd.ExecuteScalar()
	}
	finally {
		if ($null -ne $reader) { $reader.Dispose() }
		$cmd.Dispose()
		$conn.Dispose()
	}

	# Output some stats
	$dupecount = $dt.Rows.Count
	Write-Host "Total Elapsed Time: $totaltime seconds. $dupecount duplicates found out of $totalrows total rows. You can access these dupes using `$dt." -ForegroundColor Green