@AMMullan
Created August 16, 2022 08:47
Finding Duplicate Files Fast
# https://powershell.one/tricks/filesystem/finding-duplicate-files
function Find-PSOneDuplicateFile
{
<#
.SYNOPSIS
Identifies files with duplicate content
.DESCRIPTION
Returns a hashtable with the hashes that have at least two files (duplicates)
.EXAMPLE
$Path = [Environment]::GetFolderPath('MyDocuments')
Find-PSOneDuplicateFile -Path $Path
Find duplicate files in the user documents folder
.EXAMPLE
Find-PSOneDuplicateFile -Path c:\windows -Filter *.log
Find log files in the Windows folder with duplicate content
.LINK
https://powershell.one
#>
param
(
# Path of folder to recursively search
[String]
[Parameter(Mandatory)]
$Path,
# Filter to apply. Default is '*' (all Files)
[String]
$Filter = '*'
)
# get a hashtable of all files of size greater than 0
# grouped by their length
# ENUMERATE ALL FILES RECURSIVELY
# call scriptblocks directly and pipe them together
# this is by far the fastest way and much faster than
# using Foreach-Object:
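# a minimal illustration of the difference (kept as a comment so it does not
# run as part of this function):
#   1..3 | & { process { $_ * 2 } }    # direct scriptblock with a process block
#   1..3 | ForEach-Object { $_ * 2 }   # same output, but with per-item overhead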
& {
try
{
# try and use the fast API way of enumerating files recursively
# this FAILS whenever there is any "Access Denied" errors
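# (GetFiles() throws on the first folder it cannot read and the whole
# enumeration is lost, whereas Get-ChildItem -ErrorAction Ignore simply
# skips unreadable folders - hence the fallback in the catch block)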
Write-Progress -Activity 'Acquiring Files' -Status 'Fast Method'
[IO.DirectoryInfo]::new($Path).GetFiles($Filter, 'AllDirectories')
}
catch
{
# use PowerShell's own (slow) way of enumerating files if any error occurs:
Write-Progress -Activity 'Acquiring Files' -Status 'Falling Back to Slow Method'
Get-ChildItem -Path $Path -Filter $Filter -File -Recurse -ErrorAction Ignore
}
} |
# EXCLUDE EMPTY FILES:
# use direct process blocks with IF (which is much faster than Where-Object):
& {
process
{
# if the file has content...
if ($_.Length -gt 0)
{
# let it pass through:
$_
}
}
} |
# GROUP FILES BY LENGTH, AND RETURN ONLY FILES WHERE THERE IS AT LEAST ONE
# OTHER FILE WITH THE SAME SIZE
# use direct scriptblocks with own hashtable (which is much faster than Group-Object)
& {
begin
{
# start with an empty hashtable
$hash = @{}
}
process
{
# group files by their length
# (use "length" as hashtable key)
$file = $_
$key = $file.Length.toString()
# if we see this key for the first time, create a generic
# list to hold group items, and store FileInfo objects in this list
# (specialized generic lists are faster than ArrayList):
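# (for illustration: a hypothetical key such as $hash['1048576'] would end up
# holding every file that is exactly 1,048,576 bytes long)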
if ($hash.ContainsKey($key) -eq $false)
{
$hash[$key] = [Collections.Generic.List[System.IO.FileInfo]]::new()
}
# add file to appropriate hashtable key:
$hash[$key].Add($file)
}
end
{
# return only the files from groups with at least two files
# (if there is only one file with a given length, then it
# cannot have any duplicates for sure):
foreach($pile in $hash.Values)
{
# are there at least 2 files in this pile?
if ($pile.Count -gt 1)
{
# yes, add it to the candidates
$pile
}
}
}
} |
# CALCULATE THE NUMBER OF FILES TO HASH
# collect all files and hand them over en bloc
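# the leading comma in ,@($input) wraps the collected array in a one-element
# array, so the next stage receives the whole file list as a single object
# (and can read its .Count for the progress bar) instead of one file at a time.
# a tiny illustration of the idea, kept as a comment:
#   1..5 | & { end { ,@($input) } } | & { process { $_.Count } }   # outputs 5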
& {
end { ,@($input) }
} |
# GROUP FILES BY HASH, AND RETURN ONLY HASHES THAT HAVE AT LEAST TWO FILES:
# use a direct scriptblock call with a hashtable (much faster than Group-Object):
& {
begin
{
# start with an empty hashtable
$hash = @{}
# since this is a lengthy procedure, a progress bar is in order
# keep a counter of processed files:
$c = 0
}
process
{
$totalNumber = $_.Count
foreach($file in $_)
{
# count the processed files for the progress bar
$c++
# update progress bar every 20 files:
if ($c % 20 -eq 0)
{
$percentComplete = $c * 100 / $totalNumber
Write-Progress -Activity 'Hashing File Content' -Status $file.Name -PercentComplete $percentComplete
}
# use the file hash of this file PLUS file length as a key to the hashtable
# use SHA1: it is fast, and sufficient for detecting identical content
$result = Get-FileHash -Path $file.FullName -Algorithm SHA1
$key = '{0}:{1}' -f $result.Hash, $file.Length
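# (for illustration, a key might look like '6FD2...:4096' - the SHA1 hash,
# then the length; including the length keeps two same-hash files of
# different sizes from ever landing in the same group)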
# if we see this key the first time, add a generic list to this key:
if ($hash.ContainsKey($key) -eq $false)
{
$hash.Add($key, [Collections.Generic.List[System.IO.FileInfo]]::new())
}
# add the file to the appropriate group:
$hash[$key].Add($file)
}
}
end
{
# remove all hashtable keys with only one file in them
# first, CLONE the list of hashtable keys
# (we cannot remove hashtable keys while enumerating the live
# keys list):
$keys = @($hash.Keys).Clone()
# enumerate all keys...
foreach($key in $keys)
{
# ...if key has only one file, remove it:
if ($hash[$key].Count -eq 1)
{
$hash.Remove($key)
}
}
# return the hashtable with only duplicate files left:
$hash
}
}
}
# get path to personal documents folder
$Path = [Environment]::GetFolderPath('MyDocuments')
# check for duplicate files:
$result = Find-PSOneDuplicateFile -Path $Path
# output duplicates
& { foreach($key in $result.Keys)
{
foreach($file in $result[$key])
{
$file |
Add-Member -MemberType NoteProperty -Name Hash -Value $key -PassThru |
Select-Object Hash, Length, FullName
}
}
} | Format-Table -GroupBy Hash -Property FullName
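# OPTIONAL follow-up - a rough sketch, assuming $result from above is still
# around: estimate how much space the duplicates waste by keeping one copy
# per group and counting the remaining copies.
$wasted = 0
foreach($key in $result.Keys)
{
$files = $result[$key]
# every file in a group shares the same length, so charge (Count - 1) copies:
$wasted += ($files.Count - 1) * $files[0].Length
}
'{0:N1} MB could be reclaimed by removing duplicate copies.' -f ($wasted / 1MB)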