Skip to content

Instantly share code, notes, and snippets.

@janosgyerik
Created May 30, 2015 19:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save janosgyerik/22e3868b3b162294a895 to your computer and use it in GitHub Desktop.
Save janosgyerik/22e3868b3b162294a895 to your computer and use it in GitHub Desktop.

Don't you hate it when the same file exists at multiple places. I do so, I wrote a simple utility to help clean them up.

The program takes a directory name on the command line, and traverses the filesystem tree recursively, collecting all files. Since comparing content can be expensive, to minimize the number of comparisons, I thought a merge sort logic would be effective.

As output, the program prints sets of duplicate files, with an empty line between sets. Another script or utility can be built on top of this with a friendly user interface asking how to deal with the duplicates, but that's for another day. The program could use some rich command line options, that's also for another day.

I'm looking for a review of all aspects of this code. I have the nagging feeling that some (or all) of this might be reinventing the wheel, or overcomplicating things. The utility is on GitHub.

The main class: DuplicateFilesFinder

import java.io.File;
import java.io.IOException;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Set;

public class DuplicateFilesFinder {

    private DuplicateFilesFinder() {
        // utility class, forbidden constructor
    }

    public static void main(String[] args) throws IOException {
        for (String arg : args) {
            findDuplicateFiles(new File(arg)).forEach(set -> {
                set.forEach(System.out::println);
                System.out.println();
            });
        }
    }

    private static class Finder extends SimpleFileVisitor<Path> {
        private final List<File> files = new ArrayList<>();

        @Override
        public FileVisitResult visitFile(Path path, BasicFileAttributes attrs) throws IOException {
            files.add(path.toFile());
            return FileVisitResult.CONTINUE;
        }
    }

    public static Collection<Set<File>> findDuplicateFiles(File basedir) throws IOException {
        List<File> filesList = findFiles(basedir);
        File[] files = filesList.toArray(new File[filesList.size()]);

        DuplicateTracker<File> tracker = new DuplicateTracker<>();
        mergeSort(files, 0, files.length, tracker);

        return tracker.getDuplicates();
    }

    private static List<File> findFiles(File basedir) throws IOException {
        Finder finder = new Finder();
        Files.walkFileTree(Paths.get(basedir.getAbsolutePath()), finder);
        return finder.files;
    }

    private static void mergeSort(File[] files, int start, int end, DuplicateTracker<File> tracker) {
        int diff = end - start;
        if (diff < 2) {
            return;
        }

        int mid = start + diff / 2;
        mergeSort(files, start, mid, tracker);
        mergeSort(files, mid, end, tracker);
        merge(files, start, mid, end, tracker);
    }

    private static void merge(File[] files, int start, int mid, int end, DuplicateTracker<File> tracker) {
        int i = start;
        int j = mid;
        int k = 0;
        File[] merged = new File[end - start];

        while (i < mid && j < end) {
            int cmp = compare(files[i], files[j]);
            if (cmp <= 0) {
                if (cmp == 0) {
                    tracker.add(files[i], files[j]);
                }
                merged[k++] = files[i++];
            } else {
                merged[k++] = files[j++];
            }
        }
        while (i < mid) {
            merged[k++] = files[i++];
        }
        while (j < end) {
            merged[k++] = files[j++];
        }

        System.arraycopy(merged, 0, files, start, merged.length);
    }

    private static int compare(File file1, File file2) {
        int cmp = compareSizes(file1, file2);
        if (cmp != 0) {
            return cmp;
        }

        return compareContent(file1, file2);
    }

    private static int compareSizes(File file1, File file2) {
        return Long.compare(file1.length(), file2.length());
    }

    /**
     * Return 0 if the content of the two files match.
     * Otherwise return non-zero. Doesn't matter if positive or negative,
     * as long as it is a consistent ordering of the files,
     * not necessarily by content.
     *
     * @param file1 first file
     * @param file2 second file
     * @return 0 if the files have identical content, otherwise non-zero
     */
    private static int compareContent(File file1, File file2) {
        try {
            if (IOUtils.contentEquals(file1, file2)) {
                return 0;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return file1.compareTo(file2);
    }
}

Utility class: DuplicateTracker

import java.util.*;

public class DuplicateTracker<T> {

    private Map<T, Set<T>> dups = new HashMap<>();

    public void add(T item1, T item2) {
        Set<T> dups1 = dups.get(item1);
        Set<T> dups2 = dups.get(item2);

        if (dups1 != null && dups2 != null) {
            dups1.addAll(dups2);
            for (T item : dups2) {
                dups.put(item, dups1);
            }
        } else if (dups1 != null) {
            dups1.add(item2);
            dups.put(item2, dups1);
        } else if (dups2 != null) {
            dups2.add(item1);
            dups.put(item1, dups2);
        } else {
            dups1 = new HashSet<>();
            dups1.add(item1);
            dups1.add(item2);
            dups.put(item1, dups1);
            dups.put(item2, dups1);
        }
    }

    public Collection<Set<T>> getDuplicates() {
        return new HashSet<>(dups.values());
    }
}

Unit tests for DuplicateTracker

import org.junit.Test;

import java.util.Collection;
import java.util.Set;
import java.util.TreeSet;
import java.util.stream.Collectors;

import static org.junit.Assert.*;

public class DuplicateTrackerTest {

    private String toString(Collection<Set<String>> duplicates) {
        Collection<String> sorted = duplicates.stream().map(set -> new TreeSet<>(set).toString())
                .collect(Collectors.toCollection(TreeSet::new));
        return sorted.toString();
    }

    @Test
    public void test_a_b_plus_c_d() {
        DuplicateTracker<String> tracker = new DuplicateTracker<>();
        tracker.add("a", "b");
        tracker.add("c", "d");
        assertEquals("[[a, b], [c, d]]", toString(tracker.getDuplicates()));
    }

    @Test
    public void test_a_b_plus_c_d_plus_a_x() {
        DuplicateTracker<String> tracker = new DuplicateTracker<>();
        tracker.add("a", "b");
        tracker.add("c", "d");
        tracker.add("a", "x");
        assertEquals("[[c, d], [a, b, x]]", tracker.getDuplicates().toString());
    }

    @Test
    public void test_a_b_plus_c_d_plus_b_d() {
        DuplicateTracker<String> tracker = new DuplicateTracker<>();
        tracker.add("a", "b");
        tracker.add("c", "d");
        tracker.add("b", "d");
        assertEquals("[[a, b, c, d]]", tracker.getDuplicates().toString());
    }

    @Test
    public void test_a_b_plus_c_d_plus_a_c() {
        DuplicateTracker<String> tracker = new DuplicateTracker<>();
        tracker.add("a", "b");
        tracker.add("c", "d");
        tracker.add("a", "c");
        assertEquals("[[a, b, c, d]]", tracker.getDuplicates().toString());
    }

    @Test
    public void test_a_b_plus_c_d_plus_e_f_plus_a_f_plus_d_e() {
        DuplicateTracker<String> tracker = new DuplicateTracker<>();
        tracker.add("a", "b");
        tracker.add("c", "d");
        tracker.add("e", "f");
        tracker.add("a", "f");
        tracker.add("d", "e");
        assertEquals("[[a, b, c, d, e, f]]", tracker.getDuplicates().toString());
    }
}

Utility class: IOUtils

import java.io.*;

public class IOUtils {

    private static final int EOF = -1;

    private IOUtils() {
        // utility class, forbidden constructor
    }

    public static boolean contentEquals(File file1, File file2) throws IOException {
        return contentEquals(new FileInputStream(file1), new FileInputStream(file2));
    }

    // http://grepcode.com/file/repo1.maven.org/maven2/commons-io/commons-io/2.4/org/apache/commons/io/IOUtils.java
    public static boolean contentEquals(InputStream input1, InputStream input2) throws IOException {
        if (!(input1 instanceof BufferedInputStream)) {
            input1 = new BufferedInputStream(input1);
        }
        if (!(input2 instanceof BufferedInputStream)) {
            input2 = new BufferedInputStream(input2);
        }

        int ch = input1.read();
        while (EOF != ch) {
            int ch2 = input2.read();
            if (ch != ch2) {
                return false;
            }
            ch = input1.read();
        }

        int ch2 = input2.read();
        return ch2 == EOF;
    }

    public static void closeQuietly(Closeable closeable) {
        if (closeable != null) {
            try {
                closeable.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment