Skip to content

Instantly share code, notes, and snippets.

@frytoli
Last active February 5, 2021 21:10
Show Gist options
  • Save frytoli/0ac76db6d1786083dd4b0700260fdde0 to your computer and use it in GitHub Desktop.
Save frytoli/0ac76db6d1786083dd4b0700260fdde0 to your computer and use it in GitHub Desktop.
/*
Quick Java class to compare files within the same directory.
* Public static method "compare" takes a String path to a directory and a String path to an output csv file (e.g. CompareDir.compare("./files", "out.csv");)
* Files (excluding . files) are retrieved from the given directory, evaluated/compared in parallel, and the results are written to a csv file
* Results for each file include: file name, file MIME type, sha256 checksum of file, and similarity comparison (between 0.0 and 1.0) to all other files in the directory
* CSV header rows include: file_name, mime_type, sha256_checksum, and names of all files in the directory...
One Warning: When testing by comparing original text files to copies created via Mac's Finder application (i.e. right click, duplicate) and edited slightly (at the end of the
file) in a text editor, the copied file was observed to be missing the very first byte. This means that every byte at index i in the copied file is equal to
the byte at index i+1 in the original file, outside of the part of the file that was consciously changed for testing. The simple byte-by-byte comparison algorithm
employed in this code counts all mismatched bytes, and therefore in this situation reports a low amount of similarity even though the files are in actuality
pretty similar. Perhaps this issue could be solved by implementing a fuzzy-matching-like algorithm in the future.
I just wrote this as a little Java refresher. I am no Java developer and am always open to advice/recommendations.
Leave a comment if you see something I can improve -- Thanks!
*/
package tools;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.lang.Math;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
/* Requires Tika */
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
public class CompareDir {
private static String getChecksum(File file) {
// Initialize new vars
FileInputStream fIStream;
MessageDigest digest = null;
// Get Message Digest
try {
digest = MessageDigest.getInstance("SHA-256");
} catch (NoSuchAlgorithmException e) {
// Output error
System.err.println(e.getMessage());
// Return with empty checksum
return "";
}
// Get file input stream for reading the file content
try {
fIStream = new FileInputStream(file);
} catch (FileNotFoundException e) {
// Output error
System.err.println(e.getMessage());
// Return with empty checksum
return "";
}
// Create byte array to read data in chunks
byte[] byteArray = new byte[1024];
int bytesCount = 0;
// Read file data and update in message digest
try {
while ((bytesCount = fIStream.read(byteArray)) != -1) {
digest.update(byteArray, 0, bytesCount);
};
} catch (IOException e) {
// Output error
System.err.println(e.getMessage());
// Return with empty checksum
return "";
}
// Get the hash's bytes
byte[] bytes = digest.digest();
// This bytes[] has bytes in decimal format;
// Convert it to hexadecimal format
StringBuilder sb = new StringBuilder();
for(int i=0; i< bytes.length ;i++)
{
sb.append(Integer.toString((bytes[i] & 0xff) + 0x100, 16).substring(1));
}
// return complete hash
return sb.toString();
}
public static double getPercentSimilarity(File file1, File file2) {
// Initialize new vars
FileInputStream fIStream1;
FileInputStream fIStream2;
// Create file input stream for each file content
try {
fIStream1 = new FileInputStream(file1);
fIStream2 = new FileInputStream(file2);
} catch (FileNotFoundException e) {
// Output error
System.err.println(e.getMessage());
// Return
return -1.0;
}
// Create byte arrays to read data in chunks
byte[] byteArray1 = new byte[1024];
byte[] byteArray2 = new byte[1024];
// Read files as chunks of byte arrays and compare
try {
double totalMaxByteCount = 0;
double notMatchCount = 0;
int maxBytes = 0;
int minBytes = 0;
// While at least one file has more bytes (set bytesCount1 and bytesCount2 before the loop to ensure that both are actually set)
int bytesCount1 = fIStream1.read(byteArray1);
int bytesCount2 = fIStream2.read(byteArray2);
while ((bytesCount1 != -1) || (bytesCount2 != -1)) {
// If count of both byte arrays is > -1, compare all available bytes (handle different counts)
if ((bytesCount1 != -1) && (bytesCount2 != -1)) {
// Evaluate max and min of current byte counts
maxBytes = Math.max(bytesCount1, bytesCount2);
minBytes = Math.min(bytesCount1, bytesCount2);
// Add to total max byte count
totalMaxByteCount += maxBytes;
// Iterate over byte arrays and avoid out of scope errors
for (int itr=0; itr<minBytes; itr++) {
// If the bytes at the same index are not equal, count the incident
if (byteArray1[itr] != byteArray2[itr]) {
notMatchCount += 1;
}
}
// Handle any additional bytes
notMatchCount += maxBytes-minBytes;
} else {
// Else, find not empty byte array and add to notMatchCount
notMatchCount += Math.max(bytesCount1, bytesCount2);
}
// Read the next bytes
bytesCount1 = fIStream1.read(byteArray1);
bytesCount2 = fIStream2.read(byteArray2);
}
// Evaluate percent not match
return 1 - (notMatchCount / totalMaxByteCount);
} catch (IOException e) {
// Output error
System.err.println(e.getMessage());
// Return
return -1.0;
}
}
// Takes in a File and a list of Files to compare it to and returns a List of Strings
private static List<String> getComparison(File file, List<File> files) {
// Initialize new vars
String mimeType = "";
List<String> responseList = new ArrayList<String>();
// Get file name
responseList.add(file.getName());
// Detect MIME type with Apache Tika (Holla Prof Mattmann!)
try {
Tika tika = new Tika();
mimeType = tika.detect(file);
responseList.add(mimeType);
} catch (TikaException e) {
// Output error
System.err.println(e.getMessage());
}
responseList.add(mimeType);
// Get sha-256 hash (checksum) of file
String shaHash = getChecksum(file);
responseList.add(shaHash);
// Find the similarity of the File compared to the Files in the provided list
for (int itr=0; itr<files.size(); itr+=1) {
// Hash the file and compare to the main file's checksum
String shaHash2 = getChecksum(files.get(itr));
// If the hashes are the same, note that the file is 100% similar
if (shaHash.equals(shaHash2)) {
responseList.add("1.00");
} else {
// Find the percent similarity of the files
responseList.add(String.format("%.2f", getPercentSimilarity(file, files.get(itr))));
}
}
// Return list
return responseList;
}
private static void writeToCSV(String pathToOutfile, List<String> headers, List<List<String>> data) throws IOException {
// Create new FileWriter object
FileWriter csvWriter = new FileWriter(pathToOutfile);
// Write header row
csvWriter.append(String.join(",", headers));
csvWriter.append("\n");
// Write other row data
for (List<String> row : data) {
csvWriter.append(String.join(",", row));
csvWriter.append("\n");
}
// Flush and close
csvWriter.flush();
csvWriter.close();
}
// Arguments: path to directory of files to compare, path to output csv file
public static void compare(String pathToDir, String pathToOutfile) {
// Initialize new vars
List<File> files = new ArrayList<File>();
List<List<String>> fileComparisons = new ArrayList<List<String>>();
List<String> headers = new ArrayList<String>();
// Get the list of files from the directory in question and add them to a stream
try {
File f = new File(pathToDir);
// Override FilenameFilter.accept to ignore . files
FilenameFilter filter = new FilenameFilter() {
@Override
public boolean accept(File f, String name) {
if (name.startsWith(".")) {
return false;
} else {
return true;
}
}
};
// Create a List of Files from an Array of File objects
files = Arrays.asList(f.listFiles(filter));
} catch (Exception e) {
// Output error
System.err.println(e.getMessage());
}
// Make final clone of files for use in parallel stream
final List<File> filesClone = new ArrayList<File>(files);
// Evaluate/compare these files in parallel
try {
// https://docs.oracle.com/javase/8/docs/api/java/util/stream/Collectors.html (Lots of cool capabilities I'm not using here)
fileComparisons = files.parallelStream()
.map(f -> getComparison(f, filesClone))
.collect(Collectors.toList());
} catch (Exception e) {
// Output error
System.err.println(e.getMessage());
}
// Compile headers row
headers.add("file_name");
headers.add("mime_type");
headers.add("sha256_checksum");
for (File file : filesClone) {
headers.add(file.getName());
}
try {
// Write output to file
writeToCSV(pathToOutfile, headers, fileComparisons);
} catch (IOException e) {
// Output error and exit
System.err.println("Comparison data could not be written: " + e.getMessage());
return;
}
System.out.println("Comparison data written to " + pathToOutfile);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment