Last active
February 5, 2021 21:10
-
-
Save frytoli/0ac76db6d1786083dd4b0700260fdde0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
Quick Java class to compare files within the same directory.
 * The public method "compare" takes a String path to a directory and a String path to an output CSV file (e.g. obj.compare("./files", "out.csv");)
 * Files (excluding . files) are retrieved from the given directory, evaluated/compared in parallel, and the results are written to a CSV file
 * Results for each file include: file name, file MIME type, SHA-256 checksum of the file, and a similarity score (between 0.0 and 1.0) against every file in the directory
 * The CSV header row includes: file_name, mime_type, sha256_checksum, and the names of all files in the directory...
One warning: When comparing original text files to copies created via Mac's Finder application (i.e. right click, duplicate) and edited slightly (at the end of the
    file) in a text editor, the copied file was observed to be missing the very first byte. This means that every byte at index i in the copied file is equal to
    the byte at index i+1 in the original file, outside of the part of the file that was consciously changed for testing. The simple byte-by-byte comparison algorithm
    employed in this code counts all mismatched bytes, and therefore in this situation reports a low amount of similarity even though the files are in actuality
    pretty similar. Perhaps this issue could be solved by implementing a fuzzy-matching-like algorithm in the future.
I just wrote this as a little Java refresher. I am no Java developer and am always open to advice/recommendations.
Leave a comment if you see something I can improve -- Thanks!
*/
package tools; | |
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.lang.Math;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/* Requires Tika */
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
public class CompareDir { | |
private static String getChecksum(File file) { | |
// Initialize new vars | |
FileInputStream fIStream; | |
MessageDigest digest = null; | |
// Get Message Digest | |
try { | |
digest = MessageDigest.getInstance("SHA-256"); | |
} catch (NoSuchAlgorithmException e) { | |
// Output error | |
System.err.println(e.getMessage()); | |
// Return with empty checksum | |
return ""; | |
} | |
// Get file input stream for reading the file content | |
try { | |
fIStream = new FileInputStream(file); | |
} catch (FileNotFoundException e) { | |
// Output error | |
System.err.println(e.getMessage()); | |
// Return with empty checksum | |
return ""; | |
} | |
// Create byte array to read data in chunks | |
byte[] byteArray = new byte[1024]; | |
int bytesCount = 0; | |
// Read file data and update in message digest | |
try { | |
while ((bytesCount = fIStream.read(byteArray)) != -1) { | |
digest.update(byteArray, 0, bytesCount); | |
}; | |
} catch (IOException e) { | |
// Output error | |
System.err.println(e.getMessage()); | |
// Return with empty checksum | |
return ""; | |
} | |
// Get the hash's bytes | |
byte[] bytes = digest.digest(); | |
// This bytes[] has bytes in decimal format; | |
// Convert it to hexadecimal format | |
StringBuilder sb = new StringBuilder(); | |
for(int i=0; i< bytes.length ;i++) | |
{ | |
sb.append(Integer.toString((bytes[i] & 0xff) + 0x100, 16).substring(1)); | |
} | |
// return complete hash | |
return sb.toString(); | |
} | |
public static double getPercentSimilarity(File file1, File file2) { | |
// Initialize new vars | |
FileInputStream fIStream1; | |
FileInputStream fIStream2; | |
// Create file input stream for each file content | |
try { | |
fIStream1 = new FileInputStream(file1); | |
fIStream2 = new FileInputStream(file2); | |
} catch (FileNotFoundException e) { | |
// Output error | |
System.err.println(e.getMessage()); | |
// Return | |
return -1.0; | |
} | |
// Create byte arrays to read data in chunks | |
byte[] byteArray1 = new byte[1024]; | |
byte[] byteArray2 = new byte[1024]; | |
// Read files as chunks of byte arrays and compare | |
try { | |
double totalMaxByteCount = 0; | |
double notMatchCount = 0; | |
int maxBytes = 0; | |
int minBytes = 0; | |
// While at least one file has more bytes (set bytesCount1 and bytesCount2 before the loop to ensure that both are actually set) | |
int bytesCount1 = fIStream1.read(byteArray1); | |
int bytesCount2 = fIStream2.read(byteArray2); | |
while ((bytesCount1 != -1) || (bytesCount2 != -1)) { | |
// If count of both byte arrays is > -1, compare all available bytes (handle different counts) | |
if ((bytesCount1 != -1) && (bytesCount2 != -1)) { | |
// Evaluate max and min of current byte counts | |
maxBytes = Math.max(bytesCount1, bytesCount2); | |
minBytes = Math.min(bytesCount1, bytesCount2); | |
// Add to total max byte count | |
totalMaxByteCount += maxBytes; | |
// Iterate over byte arrays and avoid out of scope errors | |
for (int itr=0; itr<minBytes; itr++) { | |
// If the bytes at the same index are not equal, count the incident | |
if (byteArray1[itr] != byteArray2[itr]) { | |
notMatchCount += 1; | |
} | |
} | |
// Handle any additional bytes | |
notMatchCount += maxBytes-minBytes; | |
} else { | |
// Else, find not empty byte array and add to notMatchCount | |
notMatchCount += Math.max(bytesCount1, bytesCount2); | |
} | |
// Read the next bytes | |
bytesCount1 = fIStream1.read(byteArray1); | |
bytesCount2 = fIStream2.read(byteArray2); | |
} | |
// Evaluate percent not match | |
return 1 - (notMatchCount / totalMaxByteCount); | |
} catch (IOException e) { | |
// Output error | |
System.err.println(e.getMessage()); | |
// Return | |
return -1.0; | |
} | |
} | |
// Takes in a File and a list of Files to compare it to and returns a List of Strings | |
private static List<String> getComparison(File file, List<File> files) { | |
// Initialize new vars | |
String mimeType = ""; | |
List<String> responseList = new ArrayList<String>(); | |
// Get file name | |
responseList.add(file.getName()); | |
// Detect MIME type with Apache Tika (Holla Prof Mattmann!) | |
try { | |
Tika tika = new Tika(); | |
mimeType = tika.detect(file); | |
responseList.add(mimeType); | |
} catch (TikaException e) { | |
// Output error | |
System.err.println(e.getMessage()); | |
} | |
responseList.add(mimeType); | |
// Get sha-256 hash (checksum) of file | |
String shaHash = getChecksum(file); | |
responseList.add(shaHash); | |
// Find the similarity of the File compared to the Files in the provided list | |
for (int itr=0; itr<files.size(); itr+=1) { | |
// Hash the file and compare to the main file's checksum | |
String shaHash2 = getChecksum(files.get(itr)); | |
// If the hashes are the same, note that the file is 100% similar | |
if (shaHash.equals(shaHash2)) { | |
responseList.add("1.00"); | |
} else { | |
// Find the percent similarity of the files | |
responseList.add(String.format("%.2f", getPercentSimilarity(file, files.get(itr)))); | |
} | |
} | |
// Return list | |
return responseList; | |
} | |
private static void writeToCSV(String pathToOutfile, List<String> headers, List<List<String>> data) throws IOException { | |
// Create new FileWriter object | |
FileWriter csvWriter = new FileWriter(pathToOutfile); | |
// Write header row | |
csvWriter.append(String.join(",", headers)); | |
csvWriter.append("\n"); | |
// Write other row data | |
for (List<String> row : data) { | |
csvWriter.append(String.join(",", row)); | |
csvWriter.append("\n"); | |
} | |
// Flush and close | |
csvWriter.flush(); | |
csvWriter.close(); | |
} | |
// Arguments: path to directory of files to compare, path to output csv file | |
public static void compare(String pathToDir, String pathToOutfile) { | |
// Initialize new vars | |
List<File> files = new ArrayList<File>(); | |
List<List<String>> fileComparisons = new ArrayList<List<String>>(); | |
List<String> headers = new ArrayList<String>(); | |
// Get the list of files from the directory in question and add them to a stream | |
try { | |
File f = new File(pathToDir); | |
// Override FilenameFilter.accept to ignore . files | |
FilenameFilter filter = new FilenameFilter() { | |
@Override | |
public boolean accept(File f, String name) { | |
if (name.startsWith(".")) { | |
return false; | |
} else { | |
return true; | |
} | |
} | |
}; | |
// Create a List of Files from an Array of File objects | |
files = Arrays.asList(f.listFiles(filter)); | |
} catch (Exception e) { | |
// Output error | |
System.err.println(e.getMessage()); | |
} | |
// Make final clone of files for use in parallel stream | |
final List<File> filesClone = new ArrayList<File>(files); | |
// Evaluate/compare these files in parallel | |
try { | |
// https://docs.oracle.com/javase/8/docs/api/java/util/stream/Collectors.html (Lots of cool capabilities I'm not using here) | |
fileComparisons = files.parallelStream() | |
.map(f -> getComparison(f, filesClone)) | |
.collect(Collectors.toList()); | |
} catch (Exception e) { | |
// Output error | |
System.err.println(e.getMessage()); | |
} | |
// Compile headers row | |
headers.add("file_name"); | |
headers.add("mime_type"); | |
headers.add("sha256_checksum"); | |
for (File file : filesClone) { | |
headers.add(file.getName()); | |
} | |
try { | |
// Write output to file | |
writeToCSV(pathToOutfile, headers, fileComparisons); | |
} catch (IOException e) { | |
// Output error and exit | |
System.err.println("Comparison data could not be written: " + e.getMessage()); | |
return; | |
} | |
System.out.println("Comparison data written to " + pathToOutfile); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment