@AMMullan
Created August 16, 2022 08:47
Finding Duplicate Files Fast
# https://powershell.one/tricks/filesystem/finding-duplicate-files
function Find-PSOneDuplicateFile
{
<#
.SYNOPSIS
Identifies files with duplicate content
.DESCRIPTION
Returns a hashtable with the hashes that have at least two files (duplicates)
.EXAMPLE
$Path = [Environment]::GetFolderPath('MyDocuments')
Find-PSOneDuplicateFile -Path $Path
Find duplicate files in the user documents folder
.EXAMPLE
Find-PSOneDuplicateFile -Path c:\windows -Filter *.log
Find log files in the Windows folder with duplicate content
.LINK
https://powershell.one
#>
param
(
# Path of folder to recursively search
[String]
[Parameter(Mandatory)]
$Path,
# Filter to apply. Default is '*' (all Files)
[String]
$Filter = '*'
)
# get a hashtable of all files of size greater than 0
# grouped by their length
# ENUMERATE ALL FILES RECURSIVELY
# call scriptblocks directly and pipe them together
# this is by far the fastest way and much faster than
# using Foreach-Object:
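# a minimal illustration of the difference (kept as a comment so it does not
# run as part of this function):
#   1..3 | & { process { $_ * 2 } }    # direct scriptblock with a process block
#   1..3 | ForEach-Object { $_ * 2 }   # same output, but with per-item overhead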
& {
try
{
# try and use the fast API way of enumerating files recursively
# this FAILS whenever there is any "Access Denied" errors
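# (GetFiles() throws on the first folder it cannot read and the whole
# enumeration is lost, whereas Get-ChildItem -ErrorAction Ignore simply
# skips unreadable folders - hence the fallback in the catch block)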
Write-Progress -Activity 'Acquiring Files' -Status 'Fast Method'
[IO.DirectoryInfo]::new($Path).GetFiles($Filter, 'AllDirectories')
}
catch
{
# use PowerShell's own (slow) way of enumerating files if any error occurs:
Write-Progress -Activity 'Acquiring Files' -Status 'Falling Back to Slow Method'
Get-ChildItem -Path $Path -Filter $Filter -File -Recurse -ErrorAction Ignore
}
} |
# EXCLUDE EMPTY FILES:
# use direct process blocks with IF (which is much faster than Where-Object):
& {
process
{
# if the file has content...
if ($_.Length -gt 0)
{
# let it pass through:
$_
}
}
} |
# GROUP FILES BY LENGTH, AND RETURN ONLY FILES WHERE THERE IS AT LEAST ONE
# OTHER FILE WITH THE SAME SIZE
# use direct scriptblocks with own hashtable (which is much faster than Group-Object)
& {
begin
{
# start with an empty hashtable
$hash = @{}
}
process
{
# group files by their length
# (use "length" as hashtable key)
$file = $_
$key = $file.Length.toString()
# if we see this key for the first time, create a generic
# list to hold group items, and store FileInfo objects in this list
# (specialized generic lists are faster than ArrayList):
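# (for illustration: a hypothetical key such as $hash['1048576'] would end up
# holding every file that is exactly 1,048,576 bytes long)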
if ($hash.ContainsKey($key) -eq $false)
{
$hash[$key] = [Collections.Generic.List[System.IO.FileInfo]]::new()
}
# add file to appropriate hashtable key:
$hash[$key].Add($file)
}
end
{
# return only the files from groups with at least two files
# (if there is only one file with a given length, then it
# cannot have any duplicates for sure):
foreach($pile in $hash.Values)
{
# are there at least 2 files in this pile?
if ($pile.Count -gt 1)
{
# yes, add it to the candidates
$pile
}
}
}
} |
# CALCULATE THE NUMBER OF FILES TO HASH
# collect all files and hand them over en bloc
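# the leading comma in ,@($input) wraps the collected array in a one-element
# array, so the next stage receives the whole file list as a single object
# (and can read its .Count for the progress bar) instead of one file at a time.
# a tiny illustration of the idea, kept as a comment:
#   1..5 | & { end { ,@($input) } } | & { process { $_.Count } }   # outputs 5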
& {
end { ,@($input) }
} |
# GROUP FILES BY HASH, AND RETURN ONLY HASHES THAT HAVE AT LEAST TWO FILES:
# use a direct scriptblock call with a hashtable (much faster than Group-Object):
& {
begin
{
# start with an empty hashtable
$hash = @{}
# since this is a lengthy procedure, a progress bar is in order
# keep a counter of processed files:
$c = 0
}
process
{
$totalNumber = $_.Count
foreach($file in $_)
{
# count the processed files for the progress bar
$c++
# update progress bar every 20 files:
if ($c % 20 -eq 0)
{
$percentComplete = $c * 100 / $totalNumber
Write-Progress -Activity 'Hashing File Content' -Status $file.Name -PercentComplete $percentComplete
}
# use the file hash of this file PLUS file length as a key to the hashtable
# use SHA1: it is fast, and sufficient for detecting identical content
$result = Get-FileHash -Path $file.FullName -Algorithm SHA1
$key = '{0}:{1}' -f $result.Hash, $file.Length
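# (for illustration, a key might look like '6FD2...:4096' - the SHA1 hash,
# then the length; including the length keeps two same-hash files of
# different sizes from ever landing in the same group)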
# if we see this key the first time, add a generic list to this key:
if ($hash.ContainsKey($key) -eq $false)
{
$hash.Add($key, [Collections.Generic.List[System.IO.FileInfo]]::new())
}
# add the file to the appropriate group:
$hash[$key].Add($file)
}
}
end
{
# remove all hashtable keys with only one file in them
# first, CLONE the list of hashtable keys
# (we cannot remove hashtable keys while enumerating the live
# keys list):
$keys = @($hash.Keys).Clone()
# enumerate all keys...
foreach($key in $keys)
{
# ...if key has only one file, remove it:
if ($hash[$key].Count -eq 1)
{
$hash.Remove($key)
}
}
# return the hashtable with only duplicate files left:
$hash
}
}
}
# get path to personal documents folder
$Path = [Environment]::GetFolderPath('MyDocuments')
# check for duplicate files:
$result = Find-PSOneDuplicateFile -Path $Path
# output duplicates
& { foreach($key in $result.Keys)
{
foreach($file in $result[$key])
{
$file |
Add-Member -MemberType NoteProperty -Name Hash -Value $key -PassThru |
Select-Object Hash, Length, FullName
}
}
} | Format-Table -GroupBy Hash -Property FullName
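# OPTIONAL follow-up - a rough sketch, assuming $result from above is still
# around: estimate how much space the duplicates waste by keeping one copy
# per group and counting the remaining copies.
$wasted = 0
foreach($key in $result.Keys)
{
$files = $result[$key]
# every file in a group shares the same length, so charge (Count - 1) copies:
$wasted += ($files.Count - 1) * $files[0].Length
}
'{0:N1} MB could be reclaimed by removing duplicate copies.' -f ($wasted / 1MB)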