Finds duplicate files (i.e., with the same content) in a given set of directories.
package net.stoerr.dirdupfinder

import java.nio.ByteBuffer
import java.nio.file.{FileSystems, Files, Path, StandardOpenOption}
import java.security.MessageDigest
import java.util.Base64

import scala.collection.JavaConverters._

/**
 * Simple file duplicate finder: scans all directories given as arguments
 * for files that have the same content. It prints the duplicates in CSV-ish
 * format ordered by descending size. <p>
 *
 * Call as: <code>scala FileDupFinder.scala dir1 dir2 ...</code>
 * where dir1 dir2 ... are the directories you want searched.
 *
 * @author <a href="http://www.stoerr.net/">Hans-Peter Stoerr</a>
 * @since 02.09.2014
 */
object FileDupFinder {

  val sha1 = MessageDigest.getInstance("SHA-1")
  val b64 = Base64.getEncoder
  val buf = ByteBuffer.allocate(1024 * 1024)

  /** Sorted iterator over all files of a directory; treats links etc. as normal files. */
  def fileIterator(path: Path): Iterator[Path] = {
    if (!Files.isDirectory(path)) return Iterator(path)
    val children = Files.newDirectoryStream(path).iterator().asScala.toArray
    children.sorted.toIterator.flatMap(fileIterator)
  }

  /** Returns the Base64-encoded SHA-1 digest of the file contents, read in 1 MB chunks. */
  def digest(path: Path): String = {
    sha1.reset()
    // shorter, but reads the whole file into memory:
    // return b64.encodeToString(sha1.digest(Files.readAllBytes(path)))
    val channel = Files.newByteChannel(path, StandardOpenOption.READ)
    buf.clear()
    var bytesRead: Int = channel.read(buf)
    while (bytesRead > 0) {
      buf.flip()
      sha1.update(buf)
      buf.clear()
      bytesRead = channel.read(buf)
    }
    channel.close()
    b64.encodeToString(sha1.digest())
  }

  /** Digests all files, groups them by digest and prints every group of
    * duplicates, largest files first. */
  def printFileDuplicates(files: Iterator[Path]) = {
    val filesAndContents = files.map(f => (f, digest(f))).toList
    val groupedDuplicates = filesAndContents.groupBy(_._2).filter(_._2.size > 1).values
    groupedDuplicates.toArray.sortBy(p => -Files.size(p(0)._1)) foreach { paths =>
      paths foreach { case (path, md) =>
        println(md + " , " + Files.size(path) + " , " + path)
      }
      println()
    }
  }

  def main(args: Array[String]): Unit = {
    if (args.isEmpty) sys.error("Please give some directories in which I should look for duplicate files as arguments.")
    val files = args.sorted.map(f => fileIterator(FileSystems.getDefault.getPath(f))).toIterator.reduce(_ ++ _)
    printFileDuplicates(files)
  }
}
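For illustration, here is a hypothetical run (the directory names, file names and digest value below are invented). Each output line is digest, size in bytes and path; duplicate groups are separated by a blank line and ordered by descending file size:

    scala FileDupFinder.scala /home/user/pics /backup/pics

    nFl1rWWkxixBE9Ixi2ZmDH0bBUU= , 2097152 , /backup/pics/holiday.jpg
    nFl1rWWkxixBE9Ixi2ZmDH0bBUU= , 2097152 , /home/user/pics/holiday.jpg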
There is obvious room for improvement: save each file's size and compute the digest lazily, only when two files have the same size. I haven't needed that so far, so I didn't do it yet.
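A minimal sketch of that optimization, reusing the members of FileDupFinder above (the helper name sizeAwareDuplicates is invented): since files can only be duplicates if they have the same size, group by Files.size first and digest only the groups with more than one member:

    /** Sketch: digests only files whose size occurs more than once. */
    def sizeAwareDuplicates(files: Iterator[Path]): Unit = {
      // Files with a unique size cannot have duplicates, so the (expensive)
      // digest is computed only for files that share their size with another.
      val sameSizeGroups = files.toList.groupBy(f => Files.size(f)).filter(_._2.size > 1)
      printFileDuplicates(sameSizeGroups.values.flatten.toIterator)
    }

The output format stays unchanged, because the remaining candidates still go through printFileDuplicates for the actual content comparison.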