frytoli/CompareDir.java

## CompareDir.java
/*
  Quick Java class to compare files within the same directory.
    * Public method "compare" takes in a String path to a directory and String path to an output csv file (i.e. obj.compare(String "./files", String "out.csv");)
    * Files (excluding . files) are retrieved from the given directory, evaluated/compared in parallel, and the results are written to a csv file
    * Results for each file include: file name, file MIME type, sha256 checksum of file, and similarity comparison (between 0.0 and 1.0) to all other files in the directory
    * CSV header rows include: file_name, mime_type, sha256_checksum, and names of all files in the directory...

  One Warning: When testing comparing original text files to copies created via Mac's Finder application (i.e. right click, duplicate) and edited slightly (at the end of the
               file) in a text editor, the copied file was observed to be missing the very first byte. This means that every byte at index i in the copied file is equal to
               the byte at index i+1 in the original file, outside of the part of the file was consciously changed for testing. The simple byte-by-byte comparison algorithm
               employed in this code to counts all mismatched bytes, and therefore in this situation reports a low amount of similarity even though the files are in actuality
               pretty similar. Perhaps this issue could be solved by implementing a fuzzy-matching-like algorithm in the future.

  I just wrote this a little Java refresher. I am no Java developer and am always open to advice/recommendations.
  Leave a comment if you see something I can improve -- Thanks!
*/

package tools;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.FileNotFoundException;
import java.lang.Math;
import java.util.List;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.stream.Collectors;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

/* Requires Tika */
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;


public class CompareDir {
  private static String getChecksum(File file) {
    // Initialize new vars
    FileInputStream fIStream;
    MessageDigest digest = null;

    // Get Message Digest
    try {
      digest = MessageDigest.getInstance("SHA-256");
    } catch (NoSuchAlgorithmException e) {
      // Output error
      System.err.println(e.getMessage());
      // Return with empty checksum
      return "";
    }

    // Get file input stream for reading the file content
    try {
      fIStream = new FileInputStream(file);
    } catch (FileNotFoundException e) {
      // Output error
      System.err.println(e.getMessage());
      // Return with empty checksum
      return "";
    }

    // Create byte array to read data in chunks
    byte[] byteArray = new byte[1024];
    int bytesCount = 0;

    // Read file data and update in message digest
    try {
      while ((bytesCount = fIStream.read(byteArray)) != -1) {
          digest.update(byteArray, 0, bytesCount);
      };
    } catch (IOException e) {
      // Output error
      System.err.println(e.getMessage());
      // Return with empty checksum
      return "";
    }

    // Get the hash's bytes
    byte[] bytes = digest.digest();

    // This bytes[] has bytes in decimal format;
    // Convert it to hexadecimal format
    StringBuilder sb = new StringBuilder();
    for(int i=0; i< bytes.length ;i++)
    {
        sb.append(Integer.toString((bytes[i] & 0xff) + 0x100, 16).substring(1));
    }

    // return complete hash
    return sb.toString();
  }

  public static double getPercentSimilarity(File file1, File file2) {
    // Initialize new vars
    FileInputStream fIStream1;
    FileInputStream fIStream2;

    // Create file input stream for each file content
    try {
      fIStream1 = new FileInputStream(file1);
      fIStream2 = new FileInputStream(file2);
    } catch (FileNotFoundException e) {
      // Output error
      System.err.println(e.getMessage());
      // Return
      return -1.0;
    }

    // Create byte arrays to read data in chunks
    byte[] byteArray1 = new byte[1024];
    byte[] byteArray2 = new byte[1024];

    // Read files as chunks of byte arrays and compare
    try {
      double totalMaxByteCount = 0;
      double notMatchCount = 0;
      int maxBytes = 0;
      int minBytes = 0;
      // While at least one file has more bytes (set bytesCount1 and bytesCount2 before the loop to ensure that both are actually set)
      int bytesCount1 = fIStream1.read(byteArray1);
      int bytesCount2 = fIStream2.read(byteArray2);
      while ((bytesCount1 != -1) || (bytesCount2 != -1)) {
        // If count of both byte arrays is > -1, compare all available bytes (handle different counts)
        if ((bytesCount1 != -1) && (bytesCount2 != -1)) {
          // Evaluate max and min of current byte counts
          maxBytes = Math.max(bytesCount1, bytesCount2);
          minBytes = Math.min(bytesCount1, bytesCount2);
          // Add to total max byte count
          totalMaxByteCount += maxBytes;
          // Iterate over byte arrays and avoid out of scope errors
          for (int itr=0; itr<minBytes; itr++) {
            // If the bytes at the same index are not equal, count the incident
            if (byteArray1[itr] != byteArray2[itr]) {
              notMatchCount += 1;
            }
          }
          // Handle any additional bytes
          notMatchCount += maxBytes-minBytes;
        } else {
          // Else, find not empty byte array and add to notMatchCount
          notMatchCount += Math.max(bytesCount1, bytesCount2);
        }
        // Read the next bytes
        bytesCount1 = fIStream1.read(byteArray1);
        bytesCount2 = fIStream2.read(byteArray2);
      }
      // Evaluate percent not match
      return 1 - (notMatchCount / totalMaxByteCount);
    } catch (IOException e) {
      // Output error
      System.err.println(e.getMessage());
      // Return
      return -1.0;
    }
  }

  // Takes in a File and a list of Files to compare it to and returns a List of Strings
  private static List<String> getComparison(File file, List<File> files) {
    // Initialize new vars
    String mimeType = "";
    List<String> responseList = new ArrayList<String>();

    // Get file name
    responseList.add(file.getName());

    // Detect MIME type with Apache Tika (Holla Prof Mattmann!)
    try {
      Tika tika = new Tika();
      mimeType = tika.detect(file);
      responseList.add(mimeType);
    } catch (TikaException e) {
      // Output error
      System.err.println(e.getMessage());
    }
    responseList.add(mimeType);

    // Get sha-256 hash (checksum) of file
    String shaHash = getChecksum(file);
    responseList.add(shaHash);

    // Find the similarity of the File compared to the Files in the provided list
    for (int itr=0; itr<files.size(); itr+=1) {
      // Hash the file and compare to the main file's checksum
      String shaHash2 = getChecksum(files.get(itr));
      // If the hashes are the same, note that the file is 100% similar
      if (shaHash.equals(shaHash2)) {
        responseList.add("1.00");
      } else {
        // Find the percent similarity of the files
        responseList.add(String.format("%.2f", getPercentSimilarity(file, files.get(itr))));
      }
    }

    // Return list
    return responseList;
  }

  private static void writeToCSV(String pathToOutfile, List<String> headers, List<List<String>> data) throws IOException {
    // Create new FileWriter object
    FileWriter csvWriter = new FileWriter(pathToOutfile);

    // Write header row
    csvWriter.append(String.join(",", headers));
    csvWriter.append("\n");

    // Write other row data
    for (List<String> row : data) {
      csvWriter.append(String.join(",", row));
      csvWriter.append("\n");
    }

    // Flush and close
    csvWriter.flush();
    csvWriter.close();
  }

  // Arguments: path to directory of files to compare, path to output csv file
  public static void compare(String pathToDir, String pathToOutfile) {
    // Initialize new vars
    List<File> files = new ArrayList<File>();
    List<List<String>> fileComparisons = new ArrayList<List<String>>();
    List<String> headers = new ArrayList<String>();

    // Get the list of files from the directory in question and add them to a stream
    try {
      File f = new File(pathToDir);

      // Override FilenameFilter.accept to ignore . files
      FilenameFilter filter = new FilenameFilter() {
        @Override
        public boolean accept(File f, String name) {
          if (name.startsWith(".")) {
            return false;
          } else {
            return true;
          }
        }
      };

      // Create a List of Files from an Array of File objects
      files = Arrays.asList(f.listFiles(filter));
    } catch (Exception e) {
      // Output error
      System.err.println(e.getMessage());
    }
    // Make final clone of files for use in parallel stream
    final List<File> filesClone = new ArrayList<File>(files);

    // Evaluate/compare these files in parallel
    try {
      // https://docs.oracle.com/javase/8/docs/api/java/util/stream/Collectors.html (Lots of cool capabilities I'm not using here)
      fileComparisons = files.parallelStream()
                       .map(f -> getComparison(f, filesClone))
                       .collect(Collectors.toList());
    } catch (Exception e) {
      // Output error
      System.err.println(e.getMessage());
    }

    // Compile headers row
    headers.add("file_name");
    headers.add("mime_type");
    headers.add("sha256_checksum");
    for (File file : filesClone) {
      headers.add(file.getName());
    }
   try {
      // Write output to file
      writeToCSV(pathToOutfile, headers, fileComparisons);
    } catch (IOException e) {
      // Output error and exit
      System.err.println("Comparison data could not be written: " + e.getMessage());
      return;
    }
    System.out.println("Comparison data written to " + pathToOutfile);
  }
}
	/*
	Quick Java class to compare files within the same directory.
	* Public method "compare" takes in a String path to a directory and String path to an output csv file (i.e. obj.compare(String "./files", String "out.csv");)
	* Files (excluding . files) are retrieved from the given directory, evaluated/compared in parallel, and the results are written to a csv file
	* Results for each file include: file name, file MIME type, sha256 checksum of file, and similarity comparison (between 0.0 and 1.0) to all other files in the directory
	* CSV header rows include: file_name, mime_type, sha256_checksum, and names of all files in the directory...

	One Warning: When testing comparing original text files to copies created via Mac's Finder application (i.e. right click, duplicate) and edited slightly (at the end of the
	file) in a text editor, the copied file was observed to be missing the very first byte. This means that every byte at index i in the copied file is equal to
	the byte at index i+1 in the original file, outside of the part of the file was consciously changed for testing. The simple byte-by-byte comparison algorithm
	employed in this code to counts all mismatched bytes, and therefore in this situation reports a low amount of similarity even though the files are in actuality
	pretty similar. Perhaps this issue could be solved by implementing a fuzzy-matching-like algorithm in the future.

	I just wrote this a little Java refresher. I am no Java developer and am always open to advice/recommendations.
	Leave a comment if you see something I can improve -- Thanks!
	*/

	package tools;

	import java.io.File;
	import java.io.FileWriter;
	import java.io.IOException;
	import java.io.FileInputStream;
	import java.io.FilenameFilter;
	import java.io.FileNotFoundException;
	import java.lang.Math;
	import java.util.List;
	import java.util.Arrays;
	import java.util.ArrayList;
	import java.util.stream.Collectors;
	import java.security.MessageDigest;
	import java.security.NoSuchAlgorithmException;

	/* Requires Tika */
	import org.apache.tika.Tika;
	import org.apache.tika.exception.TikaException;


	public class CompareDir {
	private static String getChecksum(File file) {
	// Initialize new vars
	FileInputStream fIStream;
	MessageDigest digest = null;

	// Get Message Digest
	try {
	digest = MessageDigest.getInstance("SHA-256");
	} catch (NoSuchAlgorithmException e) {
	// Output error
	System.err.println(e.getMessage());
	// Return with empty checksum
	return "";
	}

	// Get file input stream for reading the file content
	try {
	fIStream = new FileInputStream(file);
	} catch (FileNotFoundException e) {
	// Output error
	System.err.println(e.getMessage());
	// Return with empty checksum
	return "";
	}

	// Create byte array to read data in chunks
	byte[] byteArray = new byte[1024];
	int bytesCount = 0;

	// Read file data and update in message digest
	try {
	while ((bytesCount = fIStream.read(byteArray)) != -1) {
	digest.update(byteArray, 0, bytesCount);
	};
	} catch (IOException e) {
	// Output error
	System.err.println(e.getMessage());
	// Return with empty checksum
	return "";
	}

	// Get the hash's bytes
	byte[] bytes = digest.digest();

	// This bytes[] has bytes in decimal format;
	// Convert it to hexadecimal format
	StringBuilder sb = new StringBuilder();
	for(int i=0; i< bytes.length ;i++)
	{
	sb.append(Integer.toString((bytes[i] & 0xff) + 0x100, 16).substring(1));
	}

	// return complete hash
	return sb.toString();
	}

	public static double getPercentSimilarity(File file1, File file2) {
	// Initialize new vars
	FileInputStream fIStream1;
	FileInputStream fIStream2;

	// Create file input stream for each file content
	try {
	fIStream1 = new FileInputStream(file1);
	fIStream2 = new FileInputStream(file2);
	} catch (FileNotFoundException e) {
	// Output error
	System.err.println(e.getMessage());
	// Return
	return -1.0;
	}

	// Create byte arrays to read data in chunks
	byte[] byteArray1 = new byte[1024];
	byte[] byteArray2 = new byte[1024];

	// Read files as chunks of byte arrays and compare
	try {
	double totalMaxByteCount = 0;
	double notMatchCount = 0;
	int maxBytes = 0;
	int minBytes = 0;
	// While at least one file has more bytes (set bytesCount1 and bytesCount2 before the loop to ensure that both are actually set)
	int bytesCount1 = fIStream1.read(byteArray1);
	int bytesCount2 = fIStream2.read(byteArray2);
	while ((bytesCount1 != -1) \|\| (bytesCount2 != -1)) {
	// If count of both byte arrays is > -1, compare all available bytes (handle different counts)
	if ((bytesCount1 != -1) && (bytesCount2 != -1)) {
	// Evaluate max and min of current byte counts
	maxBytes = Math.max(bytesCount1, bytesCount2);
	minBytes = Math.min(bytesCount1, bytesCount2);
	// Add to total max byte count
	totalMaxByteCount += maxBytes;
	// Iterate over byte arrays and avoid out of scope errors
	for (int itr=0; itr<minBytes; itr++) {
	// If the bytes at the same index are not equal, count the incident
	if (byteArray1[itr] != byteArray2[itr]) {
	notMatchCount += 1;
	}
	}
	// Handle any additional bytes
	notMatchCount += maxBytes-minBytes;
	} else {
	// Else, find not empty byte array and add to notMatchCount
	notMatchCount += Math.max(bytesCount1, bytesCount2);
	}
	// Read the next bytes
	bytesCount1 = fIStream1.read(byteArray1);
	bytesCount2 = fIStream2.read(byteArray2);
	}
	// Evaluate percent not match
	return 1 - (notMatchCount / totalMaxByteCount);
	} catch (IOException e) {
	// Output error
	System.err.println(e.getMessage());
	// Return
	return -1.0;
	}
	}

	// Takes in a File and a list of Files to compare it to and returns a List of Strings
	private static List<String> getComparison(File file, List<File> files) {
	// Initialize new vars
	String mimeType = "";
	List<String> responseList = new ArrayList<String>();

	// Get file name
	responseList.add(file.getName());

	// Detect MIME type with Apache Tika (Holla Prof Mattmann!)
	try {
	Tika tika = new Tika();
	mimeType = tika.detect(file);
	responseList.add(mimeType);
	} catch (TikaException e) {
	// Output error
	System.err.println(e.getMessage());
	}
	responseList.add(mimeType);

	// Get sha-256 hash (checksum) of file
	String shaHash = getChecksum(file);
	responseList.add(shaHash);

	// Find the similarity of the File compared to the Files in the provided list
	for (int itr=0; itr<files.size(); itr+=1) {
	// Hash the file and compare to the main file's checksum
	String shaHash2 = getChecksum(files.get(itr));
	// If the hashes are the same, note that the file is 100% similar
	if (shaHash.equals(shaHash2)) {
	responseList.add("1.00");
	} else {
	// Find the percent similarity of the files
	responseList.add(String.format("%.2f", getPercentSimilarity(file, files.get(itr))));
	}
	}

	// Return list
	return responseList;
	}

	private static void writeToCSV(String pathToOutfile, List<String> headers, List<List<String>> data) throws IOException {
	// Create new FileWriter object
	FileWriter csvWriter = new FileWriter(pathToOutfile);

	// Write header row
	csvWriter.append(String.join(",", headers));
	csvWriter.append("\n");

	// Write other row data
	for (List<String> row : data) {
	csvWriter.append(String.join(",", row));
	csvWriter.append("\n");
	}

	// Flush and close
	csvWriter.flush();
	csvWriter.close();
	}

	// Arguments: path to directory of files to compare, path to output csv file
	public static void compare(String pathToDir, String pathToOutfile) {
	// Initialize new vars
	List<File> files = new ArrayList<File>();
	List<List<String>> fileComparisons = new ArrayList<List<String>>();
	List<String> headers = new ArrayList<String>();

	// Get the list of files from the directory in question and add them to a stream
	try {
	File f = new File(pathToDir);

	// Override FilenameFilter.accept to ignore . files
	FilenameFilter filter = new FilenameFilter() {
	@Override
	public boolean accept(File f, String name) {
	if (name.startsWith(".")) {
	return false;
	} else {
	return true;
	}
	}
	};

	// Create a List of Files from an Array of File objects
	files = Arrays.asList(f.listFiles(filter));
	} catch (Exception e) {
	// Output error
	System.err.println(e.getMessage());
	}
	// Make final clone of files for use in parallel stream
	final List<File> filesClone = new ArrayList<File>(files);

	// Evaluate/compare these files in parallel
	try {
	// https://docs.oracle.com/javase/8/docs/api/java/util/stream/Collectors.html (Lots of cool capabilities I'm not using here)
	fileComparisons = files.parallelStream()
	.map(f -> getComparison(f, filesClone))
	.collect(Collectors.toList());
	} catch (Exception e) {
	// Output error
	System.err.println(e.getMessage());
	}

	// Compile headers row
	headers.add("file_name");
	headers.add("mime_type");
	headers.add("sha256_checksum");
	for (File file : filesClone) {
	headers.add(file.getName());
	}
	try {
	// Write output to file
	writeToCSV(pathToOutfile, headers, fileComparisons);
	} catch (IOException e) {
	// Output error and exit
	System.err.println("Comparison data could not be written: " + e.getMessage());
	return;
	}
	System.out.println("Comparison data written to " + pathToOutfile);
	}
	}