Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@Hashbrown777
Last active March 9, 2021 23:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Hashbrown777/7f7ab4831951dfabf563ebee51b9a718 to your computer and use it in GitHub Desktop.
Save Hashbrown777/7f7ab4831951dfabf563ebee51b9a718 to your computer and use it in GitHub Desktop.
Deduplicates identical files, using an ordered list of filename preferences to decide which of the duplicate names to keep.
# Source root(s) to scan; a single path or an array of paths.
$dirs='.'
# Start every run from an empty 'dedupe' output folder (relative to the current
# location). The folder is created when missing: without it, listing it fails
# and the later "Copy-Item -Destination 'dedupe'" would write a *file* called
# 'dedupe' instead of copying into a directory.
if (Test-Path -LiteralPath 'dedupe' -PathType Container) {
    # -Force so hidden or read-only leftovers from an earlier run are cleared too.
    Get-ChildItem -LiteralPath 'dedupe' -Force | Remove-Item -Recurse -Force
}
else {
    New-Item -Path 'dedupe' -ItemType Directory | Out-Null
}
# Resolve every root to its absolute path.
$dirs = $dirs | %{ ($_ | Get-Item).FullName }
# Regex matching any of the roots at the start of a path; used further down to
# turn a file's directory into a path relative to its root.
$matchDirs = '^(' + (($dirs | %{ [Regex]::Escape($_) }) -join '|') + ')'
# Ordered filename-preference rules used to decide which name survives when
# several byte-identical files carry different names. Each entry is a pair of
# regular expressions:
#   [0] pattern describing the preferred name,
#   [1] pattern describing the name to drop.
# A rule applies only when, among the duplicate names, at least one matches [0]
# and at least one matches [1]; every name matching [1] is then discarded.
# Rules are tried top to bottom and the first applicable one wins, so the order
# of the entries matters. Patterns are evaluated with -match / -notmatch, which
# are case-insensitive by default.
$preferences = (
# Variants of the 'Act <year> [No <n>] - <d.m.yyyy>.pdf' naming form.
('Act \d{4}( No \d+| \(\d{4} No[. ]\d+\))? - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', 'Act,( \d{4},?)?( No \d+| \(\d{4} No[. ]\d+\))? - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' ),
('Act \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' , 'Act( No \d+)? - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' ),
('Act \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' , 'Act,? \d{4},? No\.\d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' ),
('Act \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' , 'Act,? \d{4},? No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' ),
('Act \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' , '[^,t ] \d{4},? No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' ),
# NOTE(review): 'N0' below is written with a digit zero; presumably it targets a
# misspelt filename rather than being a typo in the rule - confirm.
('Act \d{4} No \d+ ?- \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' , 'Act \d{4}( N0 \d+| |) - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' ),
('Act \d{4} No \d+( - \d{1,2}\.\d{1,2}\.\d{4})?\.pdf$' , 'Act \d{4}\.pdf$' ),
('Act \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' , 'Act \d{4} No \d+\.pdf$' ),
('Act \d{4}( No \d+)? - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' , 'Act \d{4}\.pdf$' ),
('Act(,| of|) \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' , 'Act(,| of|) \d{4} - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' ),
('Act \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' , 'Act of \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' ),
('Act \d{4}( No \d+)? - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' , 'Act, \d{4} - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' ),
('Act \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' , 'Act \d{4} No\d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' ),
('Act \d{4}( No \d+)? - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' , 'Act \d{4} No - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' ),
('Act(,| of|) \(?\d{4}\)? - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' , 'Act(,| of|) \d{4} No [a-z]+ ?- \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' ),
('\) Act \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' , 'Act \d{4}\) No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' ),
('Act \d{4}( No ?\d+| .\d{4} No \d+.|) - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' , '^[^ ]+\.pdf$' ),
# Prefer names free of U+2019 (right single quotation mark) and U+2026
# (horizontal ellipsis).
('^[^\u2019]+$' , '\u2019' ),
('^[^\u2026]+$' , '\u2026' ),
# Drop names ending in 'moved to sharepoint', ' (<digit>)', ' (no date entry)',
# 'START CHECKING HERE' or a bare ' - <4 digits>' when a cleaner name exists.
('[^t]\.pdf$' , 'moved to sharepoint\.pdf$' ),
('[^)]\.pdf$' , ' \(\d\)\.pdf$' ),
('[^)E]\.pdf$' , ' (\(no date entry\)|START CHECKING HERE)\.pdf$' ),
('- \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' , ' - \d{4}\.pdf$' ),
# Drop names containing '<4 digits>-<2 to 4 digits>' (presumably a year range)
# in favour of the single-year 'Act <year>' form.
('Act \d{4} No \d+ ?- \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' , '\d{4} {0,2}- ?\d{2,4} No \d+ ?- \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' ),
('Act \d{4} No \d+( ?- \d{1,2}\.\d{1,2}\.\d{4})?\.pdf$' , '\d{4} {0,2}- ?\d{2,4} No \d+\.pdf$' ),
('Act \d{4}( No \d+)? ?- \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' , '\d{4} {0,2}- ?\d{2,4}( ?- \d{1,2}\.\d{1,2}\.\d{4})?\.pdf$' ),
('Act \d{4}( No \d+)?( ?- \d{1,2}\.\d{1,2}\.\d{4})?\.pdf$' , '\d{4} {0,2}- ?\d{2,4}\.pdf$' ),
# Rules aimed at individual titles, interleaved with a few remaining generic ones.
('^The Presbyterian Church .New South Wales. Property Trust Act 1936' , '^Presbyterian Church .New South Wales. Property Trust Act 1936' ),
('^Administrative Decisions Tribunal Act 1997 No 76 - 10.2.2009' , '^Administrative Decisions Review Act 1997 No 76 - 10.2.2009' ),
('^Baptist Union Incorporation Act 1919 - 29.7.1985' , '^The Baptist Union Incorporation Act 1919 - 29.7.1985' ),
('^Local Government Act 1919 No 41 Part [^ ]+ \(s( |s [\dA-Z]+-)[\dA-Z]+\)', '^Local Government Act 1919 No 41 Part [^ ]+ (\(ss?\. |-)' ),
('Act \d{4}(, \d+ Vic No \d+)? - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' , 'Act \d{4} No \d+[a-z]* - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$' ),
('^Necropolis Act 1901 .1902 No 20. - 28.10.1991' , '^Necropolis Act 1901 - 28.10.1991' ),
('^Public Authorities .Financial Arrangements. Act 1987 No 33 - 14.4.1994' , '^Public Authorities .Financial Arrangements. Act 1987 No 33 - 1.8.1996 - 14.4.1994'),
('^Marine Administration Act 1989 No 93 - 21.10.1991' , '^Marine Administration Act 1989 No 23 - 21.10.1991.pdf' ),
('(.+Act|Act.+)\.pdf$' , '^Act.pdf$' ),
('^Administration of Justice Act 1924 No 42 - 18.6.1975.pdf$' , '^Administration of Justice Act 1924 No 42 - 18.6.1925.pdf$' ),
('^Stamp Duties Act 1920 No 47 - 2.6.1970.pdf$' , '^Stamp Duties Act 1970 No 47 - 2.6.1970.pdf$' )
)
# Gather every PDF below the source roots and bucket the files by byte size.
# NOTE: $dirs is re-used - from here on it holds the size groups, not the roots.
$dirs = Get-ChildItem -Path $dirs -File -Recurse -Filter '*.pdf' |
    Group-Object -Property 'Length'
# Stream the size buckets in $dirs through three stages and copy exactly one
# file per distinct content into 'dedupe', mirroring the source sub-folders.
$count = 0
$dirs |
    ForEach-Object {
        # Stage 1: report progress, then pass unique-size files straight through
        # and split buckets of equally sized files into groups by SHA1 hash.
        ++$count
        $progress = @{
            Activity        = "Comparing $count of $($dirs.Length)"
            Status          = "$([math]::Round($_.Name / 1024 / 1024, 2))MB files"
            PercentComplete = $count * 100 / $dirs.Length
        }
        Write-Progress @progress

        if ($_.Count -ge 2) {
            $_.Group | Group-Object -Property { ($_ | Get-FileHash -Algorithm SHA1).Hash }
        }
        else {
            $_.Group
        }
    } |
    ForEach-Object {
        # Stage 2: reduce each hash group to a single file.
        # Plain files (unique size) and one-member hash groups need no decision.
        if ($_ -is [System.IO.FileInfo]) {
            return $_
        }
        if ($_.Count -lt 2) {
            return $_.Group
        }

        # Identical content under possibly different names: group by name and
        # eliminate names rule by rule until only one name is left.
        $byName = $_.Group | Group-Object -Property 'Name'
        # Debug aid kept from the original:
        #if ($byName | ?{ $_.Name -match '^Baptist Union Incorporation Act 1919 - 26.9.1979' }) {
        #    $_.Group.Name | Out-Host
        #}
        while ($byName.Length -gt 1) {
            # Find the first rule for which both a preferred and a droppable
            # name are present among the remaining candidates.
            $rule = $null
            foreach ($candidate in $preferences) {
                $keepers = $byName | Where-Object { $_.Name -match $candidate[0] }
                if (-not $keepers) {
                    continue
                }
                $losers = $byName | Where-Object { $_.Name -match $candidate[1] }
                if ($losers) {
                    $rule = $candidate
                    break
                }
            }

            if ($null -eq $rule) {
                # No rule can choose between these names: show them and stop
                # the script so a new preference can be added.
                $byName.Name | Out-Host
                exit
            }

            $byName = $byName | Where-Object { $_.Name -notmatch $rule[1] }
        }

        if ($byName.Length -lt 1) {
            # A rule removed every candidate; show the names involved and abort
            # with the current bucket number.
            $_.Group.Name | Out-Host
            throw $count
        }

        ($byName.Group)[0]
    } |
    ForEach-Object {
        # Stage 3: copy the surviving file into 'dedupe', keeping its directory
        # relative to the source root.
        if ($_ -isnot [System.IO.FileInfo]) {
            throw $_
        }

        # Strip the root prefix, then any leading/trailing path separator.
        $relative = ($_.DirectoryName -replace $matchDirs,'') -replace '^[/\\]|[/\\]$',''
        if (-not $relative) {
            $target = 'dedupe'
        }
        else {
            $newFolder = @{
                Force = $true
                Path  = 'dedupe'
                Name  = $relative
                Type  = 'Directory'
            }
            $target = New-Item @newFolder
        }

        $_ | Copy-Item -Destination $target
    }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment