ianatha/duplicate_files.kt

## duplicate_files.kt
import java.io.RandomAccessFile
import java.nio.file.Files
import java.nio.file.Path
import java.nio.file.Paths
import java.security.MessageDigest
import java.security.DigestInputStream
import kotlin.streams.toList

/**
 * Renders this byte array as an uppercase hexadecimal string, two digits per byte
 * and with no separator between bytes.
 *
 * @return the hex representation; an empty string for an empty array.
 */
fun ByteArray.toHexString(): String =
    joinToString(separator = "") { byte -> String.format("%02X", byte) }

/**
 * Computes a digest over the complete contents of the file at this path.
 *
 * @param algorithm name handed to [MessageDigest.getInstance]; defaults to "MD5".
 * @return the digest rendered by [toHexString].
 * @throws IllegalArgumentException if this path is not a regular file.
 */
fun Path.contentsToHash(algorithm: String = "MD5"): String {
    require(Files.isRegularFile(this)) { "can't hash a directory" }

    val digest = MessageDigest.getInstance(algorithm)
    val chunk = ByteArray(1024)

    Files.newInputStream(this).use { source ->
        // Feed the file into the digest chunk by chunk until end of stream (-1).
        var bytesRead = source.read(chunk)
        while (bytesRead > -1) {
            digest.update(chunk, 0, bytesRead)
            bytesRead = source.read(chunk)
        }
    }

    return digest.digest().toHexString()
}

/**
 * Computes a cheap fingerprint of this file from its length plus a digest of samples
 * taken from the start and, for files longer than [sampleSize], the end of the file.
 *
 * Equal fingerprints only mean two files *may* be identical; the middle of the file
 * is never read, so confirm candidates with [contentsToHash].
 *
 * @param algorithm name handed to [MessageDigest.getInstance]; defaults to "MD5".
 * @param sampleSize number of bytes sampled from each end of the file; must not be negative.
 * @return the file length in bytes, a dash, then the digest rendered by [toHexString].
 * @throws IllegalArgumentException if this path is not a regular file or [sampleSize] is negative.
 */
fun Path.contentsToQuickHash(algorithm: String = "MD5", sampleSize: Int = 1024): String {
    require(Files.isRegularFile(this)) { "can't hash a directory" }
    require(sampleSize >= 0) { "sampleSize must not be negative: $sampleSize" }

    val hash = MessageDigest.getInstance(algorithm)

    val fileLength = RandomAccessFile(this.toFile(), "r" /* read only mode */).use { file ->
        // Query the length once so every decision below is based on the same value.
        val length = file.length()
        val buffer = ByteArray(sampleSize)

        // read() may legally return fewer bytes than requested, which would make the
        // fingerprint of identical files differ. readFully() keeps reading until the
        // requested count is reached (or throws EOFException if the file shrank).
        val headSize = minOf(length, sampleSize.toLong()).toInt()
        file.readFully(buffer, 0, headSize)
        // The whole buffer is hashed: for files shorter than sampleSize the remainder is
        // zero padding. The length prefix in the result keeps such files distinguishable.
        hash.update(buffer)

        if (length > sampleSize) {
            // Sample the tail too; it overlaps the head when length < 2 * sampleSize.
            file.seek(length - sampleSize)
            file.readFully(buffer)
            hash.update(buffer)
        }

        length
    }

    return "$fileLength-${hash.digest().toHexString()}"
}

/**
 * Finds files with identical content under a directory tree and prints them.
 *
 * Works in two passes: every regular file is first fingerprinted with
 * [contentsToQuickHash]; only files whose fingerprint is shared with another file
 * are then fully hashed with [contentsToHash] to confirm the match.
 *
 * @param args optional; the first argument is the directory to scan. When absent,
 *   the built-in default path is used.
 */
fun main(args: Array<String>) {
    val startPath = args.firstOrNull() ?: "/Users/user/my_files"

    // Files.walk keeps directory handles open until the stream is closed, so it is
    // wrapped in `use` to release them once the results are collected.
    val pathHashes = Files.walk(Paths.get(startPath)).use { paths ->
        paths.parallel()
            .filter { Files.isRegularFile(it) }
            .map { file -> file to file.contentsToQuickHash() }
            .toList()
    }

    // Keep only fingerprints shared by two or more files.
    val duplicateFileCandidates = pathHashes
        .groupBy({ it.second }, { it.first })
        .filter { it.value.size > 1 }

    println("Narrowing down ${duplicateFileCandidates.size} instances of potentially duplicated content.")

    // Second pass: hash the full content of each candidate to rule out false positives.
    val actuallyDuplicateFiles = duplicateFileCandidates.values
        .flatten()
        .parallelStream()
        .map { file -> file to file.contentsToHash() }
        .toList()
        .groupBy({ it.second }, { it.first })
        .filter { it.value.size > 1 }

    println("Found ${actuallyDuplicateFiles.size} instances of duplicated content.")

    actuallyDuplicateFiles.forEach { (hash, files) ->
        println("${files.size} copies of $hash content were found:")
        files.forEach { file ->
            println(" * $file")
        }
    }
}
	import java.io.RandomAccessFile
	import java.nio.file.Files
	import java.nio.file.Path
	import java.nio.file.Paths
	import java.security.MessageDigest
	import java.security.DigestInputStream
	import kotlin.streams.toList

	/**
	 * Renders this byte array as an uppercase hexadecimal string, two digits per byte
	 * and with no separator between bytes.
	 *
	 * @return the hex representation; an empty string for an empty array.
	 */
	fun ByteArray.toHexString(): String =
	    joinToString(separator = "") { byte -> String.format("%02X", byte) }

	/**
	 * Computes a digest over the complete contents of the file at this path.
	 *
	 * @param algorithm name handed to [MessageDigest.getInstance]; defaults to "MD5".
	 * @return the digest rendered by [toHexString].
	 * @throws IllegalArgumentException if this path is not a regular file.
	 */
	fun Path.contentsToHash(algorithm: String = "MD5"): String {
	    require(Files.isRegularFile(this)) { "can't hash a directory" }

	    val digest = MessageDigest.getInstance(algorithm)
	    val chunk = ByteArray(1024)

	    Files.newInputStream(this).use { source ->
	        // Feed the file into the digest chunk by chunk until end of stream (-1).
	        var bytesRead = source.read(chunk)
	        while (bytesRead > -1) {
	            digest.update(chunk, 0, bytesRead)
	            bytesRead = source.read(chunk)
	        }
	    }

	    return digest.digest().toHexString()
	}

	/**
	 * Computes a cheap fingerprint of this file from its length plus a digest of samples
	 * taken from the start and, for files longer than [sampleSize], the end of the file.
	 *
	 * Equal fingerprints only mean two files *may* be identical; the middle of the file
	 * is never read, so confirm candidates with [contentsToHash].
	 *
	 * @param algorithm name handed to [MessageDigest.getInstance]; defaults to "MD5".
	 * @param sampleSize number of bytes sampled from each end of the file; must not be negative.
	 * @return the file length in bytes, a dash, then the digest rendered by [toHexString].
	 * @throws IllegalArgumentException if this path is not a regular file or [sampleSize] is negative.
	 */
	fun Path.contentsToQuickHash(algorithm: String = "MD5", sampleSize: Int = 1024): String {
	    require(Files.isRegularFile(this)) { "can't hash a directory" }
	    require(sampleSize >= 0) { "sampleSize must not be negative: $sampleSize" }

	    val hash = MessageDigest.getInstance(algorithm)

	    val fileLength = RandomAccessFile(this.toFile(), "r" /* read only mode */).use { file ->
	        // Query the length once so every decision below is based on the same value.
	        val length = file.length()
	        val buffer = ByteArray(sampleSize)

	        // read() may legally return fewer bytes than requested, which would make the
	        // fingerprint of identical files differ. readFully() keeps reading until the
	        // requested count is reached (or throws EOFException if the file shrank).
	        val headSize = minOf(length, sampleSize.toLong()).toInt()
	        file.readFully(buffer, 0, headSize)
	        // The whole buffer is hashed: for files shorter than sampleSize the remainder is
	        // zero padding. The length prefix in the result keeps such files distinguishable.
	        hash.update(buffer)

	        if (length > sampleSize) {
	            // Sample the tail too; it overlaps the head when length < 2 * sampleSize.
	            file.seek(length - sampleSize)
	            file.readFully(buffer)
	            hash.update(buffer)
	        }

	        length
	    }

	    return "$fileLength-${hash.digest().toHexString()}"
	}

	/**
	 * Finds files with identical content under a directory tree and prints them.
	 *
	 * Works in two passes: every regular file is first fingerprinted with
	 * [contentsToQuickHash]; only files whose fingerprint is shared with another file
	 * are then fully hashed with [contentsToHash] to confirm the match.
	 *
	 * @param args optional; the first argument is the directory to scan. When absent,
	 *   the built-in default path is used.
	 */
	fun main(args: Array<String>) {
	    val startPath = args.firstOrNull() ?: "/Users/user/my_files"

	    // Files.walk keeps directory handles open until the stream is closed, so it is
	    // wrapped in `use` to release them once the results are collected.
	    val pathHashes = Files.walk(Paths.get(startPath)).use { paths ->
	        paths.parallel()
	            .filter { Files.isRegularFile(it) }
	            .map { file -> file to file.contentsToQuickHash() }
	            .toList()
	    }

	    // Keep only fingerprints shared by two or more files.
	    val duplicateFileCandidates = pathHashes
	        .groupBy({ it.second }, { it.first })
	        .filter { it.value.size > 1 }

	    println("Narrowing down ${duplicateFileCandidates.size} instances of potentially duplicated content.")

	    // Second pass: hash the full content of each candidate to rule out false positives.
	    val actuallyDuplicateFiles = duplicateFileCandidates.values
	        .flatten()
	        .parallelStream()
	        .map { file -> file to file.contentsToHash() }
	        .toList()
	        .groupBy({ it.second }, { it.first })
	        .filter { it.value.size > 1 }

	    println("Found ${actuallyDuplicateFiles.size} instances of duplicated content.")

	    actuallyDuplicateFiles.forEach { (hash, files) ->
	        println("${files.size} copies of $hash content were found:")
	        files.forEach { file ->
	            println(" * $file")
	        }
	    }
	}