Last active
March 9, 2021 23:05
-
-
Save Hashbrown777/7f7ab4831951dfabf563ebee51b9a718 to your computer and use it in GitHub Desktop.
Deduplicates identical files, using filename-preference rules to decide which of the duplicate names to keep.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Root directories to scan for duplicate PDFs (resolved to absolute paths below).
$dirs = '.'

# Start from an empty 'dedupe' output directory. Creating it first (New-Item
# -Force hands back the existing directory when there is one) avoids the
# Get-ChildItem error on a first run, and guarantees that a later
# `Copy-Item -Destination 'dedupe'` fills a directory instead of writing a
# file that is itself named 'dedupe'.
New-Item -Force -Path 'dedupe' -Type Directory | Out-Null
Get-ChildItem -Path 'dedupe' | Remove-Item -Recurse

# Resolve every root to its absolute path.
$dirs = $dirs | %{ ($_ | Get-Item).FullName }

# Regex that matches any of the roots as a leading path prefix; it is used when
# copying to turn a file's directory into a path relative to its root.
$matchDirs = '^(' + (($dirs | %{ [Regex]::Escape($_) }) -join '|') + ')'
# Ordered list of filename-preference rules, each a pair of regexes:
#   [0] pattern of the name that should be kept
#   [1] pattern of the name(s) that should be dropped
# A rule applies to a set of byte-identical files only when at least one name
# matches [0] AND at least one name matches [1]; every name matching [1] is then
# removed from the candidates. Rules are tried top to bottom and the first one
# that applies wins, so order matters. Matching uses -match / -notmatch, i.e.
# case-insensitive and unanchored unless the pattern anchors itself.
#
# Notes on individual patterns:
#   * '\u2019' and '\u2026' are the code points U+2019 (right single quotation
#     mark) and U+2026 (horizontal ellipsis); names without them are preferred.
#   * 'N0' in one pattern is written with a digit zero.
#     NOTE(review): presumably this targets misnamed files - confirm.
#   * Several literal names/dates contain unescaped '.', which matches any
#     character; they are kept exactly as written.
$preferences = (
    ('Act \d{4}( No \d+| \(\d{4} No[. ]\d+\))? - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', 'Act,( \d{4},?)?( No \d+| \(\d{4} No[. ]\d+\))? - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$'),
    ('Act \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', 'Act( No \d+)? - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$'),
    ('Act \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', 'Act,? \d{4},? No\.\d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$'),
    ('Act \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', 'Act,? \d{4},? No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$'),
    ('Act \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', '[^,t ] \d{4},? No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$'),
    ('Act \d{4} No \d+ ?- \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', 'Act \d{4}( N0 \d+| |) - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$'),
    ('Act \d{4} No \d+( - \d{1,2}\.\d{1,2}\.\d{4})?\.pdf$', 'Act \d{4}\.pdf$'),
    ('Act \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', 'Act \d{4} No \d+\.pdf$'),
    ('Act \d{4}( No \d+)? - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', 'Act \d{4}\.pdf$'),
    ('Act(,| of|) \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', 'Act(,| of|) \d{4} - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$'),
    ('Act \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', 'Act of \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$'),
    ('Act \d{4}( No \d+)? - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', 'Act, \d{4} - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$'),
    ('Act \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', 'Act \d{4} No\d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$'),
    ('Act \d{4}( No \d+)? - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', 'Act \d{4} No - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$'),
    ('Act(,| of|) \(?\d{4}\)? - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', 'Act(,| of|) \d{4} No [a-z]+ ?- \d{1,2}\.\d{1,2}\.\d{4}\.pdf$'),
    ('\) Act \d{4} No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', 'Act \d{4}\) No \d+ - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$'),
    ('Act \d{4}( No ?\d+| .\d{4} No \d+.|) - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', '^[^ ]+\.pdf$'),
    ('^[^\u2019]+$', '\u2019'),
    ('^[^\u2026]+$', '\u2026'),
    ('[^t]\.pdf$', 'moved to sharepoint\.pdf$'),
    ('[^)]\.pdf$', ' \(\d\)\.pdf$'),
    ('[^)E]\.pdf$', ' (\(no date entry\)|START CHECKING HERE)\.pdf$'),
    ('- \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', ' - \d{4}\.pdf$'),
    ('Act \d{4} No \d+ ?- \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', '\d{4} {0,2}- ?\d{2,4} No \d+ ?- \d{1,2}\.\d{1,2}\.\d{4}\.pdf$'),
    ('Act \d{4} No \d+( ?- \d{1,2}\.\d{1,2}\.\d{4})?\.pdf$', '\d{4} {0,2}- ?\d{2,4} No \d+\.pdf$'),
    ('Act \d{4}( No \d+)? ?- \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', '\d{4} {0,2}- ?\d{2,4}( ?- \d{1,2}\.\d{1,2}\.\d{4})?\.pdf$'),
    ('Act \d{4}( No \d+)?( ?- \d{1,2}\.\d{1,2}\.\d{4})?\.pdf$', '\d{4} {0,2}- ?\d{2,4}\.pdf$'),
    ('^The Presbyterian Church .New South Wales. Property Trust Act 1936', '^Presbyterian Church .New South Wales. Property Trust Act 1936'),
    ('^Administrative Decisions Tribunal Act 1997 No 76 - 10.2.2009', '^Administrative Decisions Review Act 1997 No 76 - 10.2.2009'),
    ('^Baptist Union Incorporation Act 1919 - 29.7.1985', '^The Baptist Union Incorporation Act 1919 - 29.7.1985'),
    ('^Local Government Act 1919 No 41 Part [^ ]+ \(s( |s [\dA-Z]+-)[\dA-Z]+\)', '^Local Government Act 1919 No 41 Part [^ ]+ (\(ss?\. |-)'),
    ('Act \d{4}(, \d+ Vic No \d+)? - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$', 'Act \d{4} No \d+[a-z]* - \d{1,2}\.\d{1,2}\.\d{4}\.pdf$'),
    ('^Necropolis Act 1901 .1902 No 20. - 28.10.1991', '^Necropolis Act 1901 - 28.10.1991'),
    ('^Public Authorities .Financial Arrangements. Act 1987 No 33 - 14.4.1994', '^Public Authorities .Financial Arrangements. Act 1987 No 33 - 1.8.1996 - 14.4.1994'),
    ('^Marine Administration Act 1989 No 93 - 21.10.1991', '^Marine Administration Act 1989 No 23 - 21.10.1991.pdf'),
    ('(.+Act|Act.+)\.pdf$', '^Act.pdf$'),
    ('^Administration of Justice Act 1924 No 42 - 18.6.1975.pdf$', '^Administration of Justice Act 1924 No 42 - 18.6.1925.pdf$'),
    ('^Stamp Duties Act 1920 No 47 - 2.6.1970.pdf$', '^Stamp Duties Act 1970 No 47 - 2.6.1970.pdf$')
)
# Collect every PDF under the roots (recursively) and bucket the files by byte
# size, producing one group per distinct Length.
# NOTE: $dirs is reused - from here on it holds the size groups, not directory
# paths. Lines are continued with a trailing pipe rather than a backtick, which
# silently breaks as soon as anything follows it on the line.
$dirs = $dirs |
    Get-ChildItem -File -Recurse -Filter '*.pdf' |
    Group-Object -Property 'Length'
# Number of size groups processed so far; drives the progress display and is
# thrown as the error value if name selection ends up with no candidate.
$count = 0

$dirs | %{
    # Stage 1 - input: one size group. Output: the file itself when its size is
    # unique, otherwise one SHA1-hash group per distinct content in that size.
    $progress = @{
        Activity        = "Comparing $((++$count)) of $($dirs.Length)"
        Status          = "$([math]::Round($_.Name / 1024 / 1024, 2))MB files"
        PercentComplete = ($count * 100 / $dirs.Length)
    }
    Write-Progress @progress

    if ($_.Count -lt 2) {
        # No other file has this size, so there is nothing to hash.
        $_.Group
    }
    else {
        $_.Group | Group-Object -Property {
            ($_ | Get-FileHash -Algorithm SHA1).Hash
        }
    }
} | %{
    # Stage 2 - input: a FileInfo (already unique) or a hash group.
    # Output: exactly one FileInfo per distinct content.
    if ($_ -is [System.IO.FileInfo]) {
        $_
        return
    }
    if ($_.Count -lt 2) {
        $_.Group
        return
    }

    # Identical files may live under several names; group them by name and
    # whittle the names down with the preference rules until one remains.
    $group = $_.Group | Group-Object -Property 'Name'

    #if ($group | ?{ $_.Name -match '^Baptist Union Incorporation Act 1919 - 26.9.1979' }) {
    #    $_.Group.Name | Out-Host
    #}

    # .Length rather than .Count: on a single GroupInfo, Count is that group's
    # own file count, whereas Length is the number of name groups left.
    while ($group.Length -gt 1) {
        $index = 0
        foreach ($preference in $preferences) {
            if (
                ($group | ?{ $_.Name -match $preference[0] }) -and
                ($group | ?{ $_.Name -match $preference[1] })
            ) {
                # A preferred and a dispreferred name coexist: drop the latter.
                $group = $group | ?{ $_.Name -notmatch $preference[1] }
                break
            }
            ++$index
        }
        if ($index -eq $preferences.Length) {
            # No rule can decide between the remaining names: show them and
            # stop the script so a new rule can be added.
            $group.Name | Out-Host
            exit
        }
    }
    if ($group.Length -lt 1) {
        # A rule removed every candidate name.
        $_.Group.Name | Out-Host
        throw $count
    }
    ($group.Group)[0]
} | %{
    # Stage 3 - copy each surviving file into 'dedupe', keeping its directory
    # path relative to the scanned root.
    if ($_ -isnot [System.IO.FileInfo]) {
        throw $_
    }

    # Strip the root prefix, then any leading/trailing path separator.
    $destination = $_.DirectoryName -replace $matchDirs,'' -replace '^[/\\]|[/\\]$',''
    if ($destination) {
        $destination = New-Item -Force -Path 'dedupe' -Name $destination -Type Directory
    }
    else {
        # Make sure the output root exists as a directory: copying to a
        # non-existent 'dedupe' would create a file with that name instead.
        $destination = New-Item -Force -Path 'dedupe' -Type Directory
    }
    $_ | Copy-Item -Destination $destination.FullName
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment