Skip to content

Instantly share code, notes, and snippets.

@crazy4groovy
Last active June 16, 2018 19:06
Show Gist options
  • Save crazy4groovy/2040997 to your computer and use it in GitHub Desktop.
Save crazy4groovy/2040997 to your computer and use it in GitHub Desktop.
Duplicate file finder (and deleter) via MD5 hash
import java.util.regex.Pattern
import java.security.MessageDigest
import groovy.io.FileType
import groovy.transform.*
// Interactive configuration. Every setting is read from the console; an empty
// answer selects the default shown in [brackets] in the prompt.
if (System.console() == null) {
    // System.console() returns null when the JVM has no interactive console
    // (e.g. redirected stdin/stdout); fail with a clear message instead of an NPE.
    throw new IllegalStateException("An interactive console is required to run this script")
}
def input = System.console().&readLine
// Size limits in KB, kept as script fields so the file-visitor closure can read them.
@Field minFileSize
@Field maxFileSize
// One or more root directories, comma-separated (split on ',' by the main loop).
String ROOT_DIRS = ( input("Enter the root directory [./] ") ?: "./" )
def SELECT_FILENAMES_WITH_REGEX = Pattern.compile( input("Enter a regex filename filter [.*] ") ?: ".*" )
// readLine returns null at end of stream; treat a missing answer as "no".
boolean waitForOK = (input("Approve each file delete? [y/N] ") ?: "").toLowerCase().contains("y")
// Trim before parsing so stray whitespace around the number is accepted.
int minSize = ((input("Min file size (KB)? [0] ") ?: "").trim() ?: "0").toInteger()
int maxSize = ((input("Max file size (KB)? [-1] ") ?: "").trim() ?: "-1").toInteger()
minFileSize = minSize
maxFileSize = maxSize   // -1 means "no upper limit" (see the check in the file visitor)
////UTILS////
/**
 * Computes the MD5 digest of a file's contents.
 *
 * @param file object providing withInputStream (a java.io.File in this script)
 * @return the digest as a 32-character lowercase hexadecimal string
 */
String generateMD5(final file) {
    MessageDigest digest = MessageDigest.getInstance("MD5")
    file.withInputStream { is ->
        byte[] buffer = new byte[8192]
        int read = 0
        // Feed the file to the digest in 8 KB chunks until the stream is exhausted.
        while ((read = is.read(buffer)) > 0) {
            digest.update(buffer, 0, read)
        }
    }
    byte[] md5sum = digest.digest()
    // BigInteger.toString(16) drops leading zeros, so pad back to the full
    // 32 hex digits to always return a canonical MD5 string.
    return new BigInteger(1, md5sum).toString(16).padLeft(32, '0')
}
// Visitor called for each file found by the directory walk.
// Files outside the configured size range are ignored (and not counted).
// Otherwise the file is hashed; in approval mode (waitForOK) every path is
// recorded under its hash for later review, while in automatic mode the first
// file per hash is recorded and any later file with the same hash is deleted
// on the spot. Prints "." for a recorded file and "x" for a deleted one.
def foundClosure = { file ->
    // Apply the size filter first so out-of-range files are never read and hashed.
    int size = file.length() / 1024 // bytes -> KB
    if (size < minFileSize || (maxFileSize > -1 && size > maxFileSize)) return
    String md5 = generateMD5(file)
    //println md5+file.absolutePath
    // md5FilePaths supplies (and stores) an empty list for an unseen hash.
    List knownPaths = md5FilePaths[md5]
    if (waitForOK || !knownPaths) {
        knownPaths << file.absolutePath
        print "."
    }
    else {
        // Automatic mode and the hash was already seen: remove the duplicate now.
        file.delete()
        print "x"
    }
    fileCnt++
}
// Recursively visits every regular file under `filepath` whose name matches
// `nameFilter`, calling `onFind` for each one and `onEnd` once the walk is done.
// An unusable root is reported on stdout instead of aborting the script.
def walkFiles = {filepath, nameFilter, onFind, onEnd = {} ->
    File f = new File(filepath)
    // traverse() throws IllegalArgumentException for a path that exists but is
    // not a directory; report that case the same way as a missing directory.
    if (!f.isDirectory()) {
        println "ERROR: invalid file/directory"
        return
    }
    try {
        f.traverse([type:FileType.FILES, nameFilter:nameFilter], onFind)
        onEnd()
    }
    catch (FileNotFoundException e) { println "ERROR: invalid file/directory"}
}
////MAIN////
// Script-level state shared with the file-visitor closure.
@Field Map md5FilePaths = [:].withDefault{[]}   // hash -> list of absolute paths with that hash
@Field int fileCnt = 0                           // files counted for the current root directory
def ts = System.currentTimeMillis()
def totalDelCount = 0
// Process each comma-separated root on its own; tolerate spaces around the
// commas and skip empty entries.
ROOT_DIRS.split(',').collect { it.trim() }.findAll { it }.each { dir ->
    md5FilePaths = [:].withDefault{[]}
    fileCnt = 0
    walkFiles(dir, SELECT_FILENAMES_WITH_REGEX, foundClosure)
    println "\nTotal: ${fileCnt}\nUnique: ${md5FilePaths.keySet().size()}"
    if (!waitForOK) {
        // Automatic mode: the visitor already deleted duplicates during the walk.
        totalDelCount += (fileCnt - md5FilePaths.keySet().size())
        return
    }
    // Write the pending duplicates (all but the first path per hash) to a
    // scratch file; it is removed again once this root has been processed.
    File resultsFile = new File('dupFound.bak.txt')
    resultsFile.delete()
    md5FilePaths.values().each {
        // Braces matter: only hashes that actually have duplicates get a line.
        if (it.size() > 1) {
            resultsFile << it[1..-1].join(';')
            resultsFile << '\n'
        }
    }
    println "***DELETING***"
    String isOK
    int delCount = 0
    md5FilePaths.each { k,v ->
        if (waitForOK && v.size() > 1)
            println "Comparison file: ${v[0]}"
        while (v.size() > 1) {
            // Always take the LAST path. List.pop() removes the last element
            // before Groovy 2.5 but the first one from 2.5 on, which would
            // delete the comparison file that was just announced.
            String f = v.remove(v.size() - 1)
            if (waitForOK) {
                // Console.readLine treats its first argument as a format string,
                // so pass the prompt as data: a '%' in a file name must not be
                // interpreted. A null answer (end of input) counts as "no".
                isOK = input('%s', "OK to delete ${f}? [y/n/a] ".toString()) ?: ''
                if (isOK.toLowerCase().contains("a"))
                    waitForOK = false   // "all": stop asking from here on
            }
            else {
                println f
                isOK = "y"
            }
            if (!waitForOK || isOK.toLowerCase().contains("y")) {
                // Count only deletions that actually succeeded.
                if (new File(f).delete())
                    delCount++
                else
                    println "FAILED to delete: ${f}"
            }
            else
                println "Skipped: ${f}"
        }
    }
    resultsFile.delete()
    println "***DELETED $delCount FILES***"
    totalDelCount += delCount
}
// Elapsed time in minutes, rounded to two decimals.
def time = ((System.currentTimeMillis() - ts) / 1000)
time = Math.round((time * 100) / 60 ) / 100
println "total time: ${time} min @ ${(new Date()).toString()}"
println "total deleted: ${totalDelCount}"
println "*" * 40
System.console().readLine('Press a key to quit: ')
import java.util.regex.Pattern
import java.security.MessageDigest
import groovy.io.FileType
import groovy.transform.*
// Interactive configuration. Every setting is read from the console; an empty
// answer selects the default shown in [brackets] in the prompt.
if (System.console() == null) {
    // System.console() returns null when the JVM has no interactive console
    // (e.g. redirected stdin/stdout); fail with a clear message instead of an NPE.
    throw new IllegalStateException("An interactive console is required to run this script")
}
def input = System.console().&readLine
// One or more root directories, comma-separated (split on ',' by the main loop).
String ROOT_DIRS = ( input("Enter the root directory [./] ") ?: "./" )
def SELECT_FILENAMES_WITH_REGEX = Pattern.compile( input("Enter a file filter [.*] ") ?: ".*" )
// readLine returns null at end of stream; treat a missing answer as "no".
boolean waitForOK = (input("Approve each file delete? [y/N] ") ?: "").toLowerCase().contains("y")
/**
 * Computes the MD5 digest of a file's contents.
 *
 * @param file object providing withInputStream (a java.io.File in this script)
 * @return the digest as a 32-character lowercase hexadecimal string
 */
String generateMD5(final file) {
    MessageDigest digest = MessageDigest.getInstance("MD5")
    file.withInputStream { is ->
        byte[] buffer = new byte[8192]
        int read = 0
        // Feed the file to the digest in 8 KB chunks until the stream is exhausted.
        while ((read = is.read(buffer)) > 0) {
            digest.update(buffer, 0, read)
        }
    }
    byte[] md5sum = digest.digest()
    // BigInteger.toString(16) drops leading zeros, so pad back to the full
    // 32 hex digits to always return a canonical MD5 string.
    return new BigInteger(1, md5sum).toString(16).padLeft(32, '0')
}
// Visitor called for each file found by the directory walk: hashes the file,
// then either records its path under that hash (approval mode, or first time
// the hash is seen) or deletes it as a duplicate. Prints "." for a recorded
// file and "x" for a deleted one, and bumps the running file counter.
def foundClosure = { file ->
    String digest = generateMD5(file)
    // The map hands back (and stores) an empty list for a hash it has not seen.
    def pathsForDigest = md5FilePaths[digest]
    boolean keepFile = waitForOK || !pathsForDigest
    if (keepFile) {
        pathsForDigest << file.absolutePath
        print "."
    } else {
        // Automatic mode and the hash is already known: drop the duplicate.
        new File(file.absolutePath).delete()
        print "x"
    }
    fileCnt++
}
// Recursively visits every regular file under `filepath` whose name matches
// `filterOnly`, calling `onFind` for each one and `onEnd` once the walk is done.
// An unusable root is reported on stdout instead of aborting the script.
def walkFiles = {filepath, filterOnly, onFind, onEnd = {} ->
    File f = new File(filepath)
    // traverse() throws IllegalArgumentException for a path that exists but is
    // not a directory; report that case the same way as a missing directory.
    if (!f.isDirectory()) {
        println "ERROR: invalid file/directory"
        return
    }
    try {
        f.traverse([type:FileType.FILES, nameFilter:filterOnly], onFind)
        onEnd()
    }
    catch (FileNotFoundException e) { println "ERROR: invalid file/directory"}
}
////MAIN////
// Script-level state shared with the file-visitor closure.
@Field Map md5FilePaths = [:].withDefault{[]}   // hash -> list of absolute paths with that hash
@Field int fileCnt = 0                           // files counted for the current root directory
def ts = System.currentTimeMillis()
def totalDelCount = 0
// Process each comma-separated root on its own; tolerate spaces around the
// commas and skip empty entries.
ROOT_DIRS.split(',').collect { it.trim() }.findAll { it }.each { dir ->
    md5FilePaths = [:].withDefault{[]}
    fileCnt = 0
    walkFiles(dir, SELECT_FILENAMES_WITH_REGEX, foundClosure)
    println "\nTotal: ${fileCnt}\nUnique: ${md5FilePaths.keySet().size()}"
    if (!waitForOK) {
        // Automatic mode: the visitor already deleted duplicates during the walk.
        totalDelCount += (fileCnt - md5FilePaths.keySet().size())
        return
    }
    // Write the pending duplicates (all but the first path per hash) to a
    // scratch file; it is removed again once this root has been processed.
    File resultsFile = new File('dupFound.bak.txt')
    resultsFile.delete()
    md5FilePaths.values().each {
        // Braces matter: only hashes that actually have duplicates get a line.
        if (it.size() > 1) {
            resultsFile << it[1..-1].join(';')
            resultsFile << '\n'
        }
    }
    println "***DELETING***"
    String isOK
    int delCount = 0
    md5FilePaths.each { k,v ->
        if (waitForOK && v.size() > 1)
            println "Comparison file: ${v[0]}"
        while (v.size() > 1) {
            // Always take the LAST path. List.pop() removes the last element
            // before Groovy 2.5 but the first one from 2.5 on, which would
            // delete the comparison file that was just announced.
            String f = v.remove(v.size() - 1)
            if (waitForOK) {
                // Console.readLine treats its first argument as a format string,
                // so pass the prompt as data: a '%' in a file name must not be
                // interpreted. A null answer (end of input) counts as "no".
                isOK = input('%s', "OK to delete ${f}? [y/n/a] ".toString()) ?: ''
                if (isOK.toLowerCase().contains("a"))
                    waitForOK = false   // "all": stop asking from here on
            }
            else {
                println f
                isOK = "y"
            }
            if (!waitForOK || isOK.toLowerCase().contains("y")) {
                // Count only deletions that actually succeeded.
                if (new File(f).delete())
                    delCount++
                else
                    println "FAILED to delete: ${f}"
            }
            else
                println "Skipped: ${f}"
        }
    }
    resultsFile.delete()
    println "***DELETED $delCount FILES***"
    totalDelCount += delCount
}
// Elapsed time in minutes, rounded to two decimals.
def time = ((System.currentTimeMillis() - ts) / 1000)
time = Math.round((time * 100) / 60 ) / 100
println "total time: ${time} min @ ${(new Date()).toString()}"
println "total deleted: ${totalDelCount}"
println "*" * 40
System.console().readLine('Press a key to quit: ')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment