Skip to content

Instantly share code, notes, and snippets.

@sophistifunk
Created April 22, 2014 14:08
Show Gist options
  • Save sophistifunk/11180650 to your computer and use it in GitHub Desktop.
Save sophistifunk/11180650 to your computer and use it in GitHub Desktop.
Simple, poorly factored proof of concept for de-duping data store algorithm.
package com.expantra;
import com.expantra.buzhash.BuzHash;
import javax.xml.bind.DatatypeConverter;
import java.io.*;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import static java.lang.System.out;
/**
* Created by josh on 16/04/2014.
* <p/>
* Crap, poorly factored code. PoC === PoS. Seriously. Throws IOExceptions it's so weak. Will RTE if you look at
* it funny. Also slow. And did I mention crap? Do not taunt Happy Fun Ball.
*/
public class SaveFile {
    /**
     * Splits a file into content-defined chunks and stores it in a de-duplicating chunk store.
     * <p/>
     * The input is scanned byte by byte with a rolling {@link BuzHash}. A chunk ends when the low 16 bits of
     * the rolling hash are zero (and the hash differs from the one produced by the previous byte), or when
     * the chunk reaches 32k. Each chunk is written to {@code outputDir/chunks/<SHA-1 hex>} unless a file of
     * that name already exists, and the chunk's raw SHA-1 is appended to an index file. When the whole input
     * has been read the index is renamed to {@code <input name>-<SHA-1 hex of the whole file>} and summary
     * statistics are printed to standard out.
     *
     * @param args exactly two entries: the input file and the output directory; anything else prints usage
     * @throws IOException              if reading the input or writing a chunk/index fails, or the index
     *                                  file cannot be renamed to its final name
     * @throws NoSuchAlgorithmException if no SHA-1 implementation is available
     * @throws RuntimeException         if the output or chunk dir is not a directory, or the temporary
     *                                  index file already exists
     */
    public static void main(String[] args) throws IOException, NoSuchAlgorithmException {
        if (args.length != 2) {
            out.println("Usage: SaveFile inputFile outputDir");
            return;
        }

        final int hashWindow = 67;
        final int maxChunkSize = 0x8000; // 32k
        final int readBufferLen = 2048;

        byte[] readBuffer = new byte[readBufferLen];
        byte[] chunkBuffer = new byte[maxChunkSize];

        File inputFile = new File(args[0]);
        File outputDir = new File(args[1]);

        // try-with-resources so the input is closed on every exit path, including the validation failures below.
        try (InputStream inputStream = new BufferedInputStream(new FileInputStream(inputFile))) {
            // Output
            if (!outputDir.isDirectory()) {
                throw new RuntimeException("Output dir \"" + outputDir.getCanonicalPath() + "\" not a directory");
            }

            File indexOutFile = new File(outputDir.getCanonicalPath() + File.separator + inputFile.getName());
            if (indexOutFile.exists()) {
                throw new RuntimeException("Temp index filename \"" + indexOutFile.getCanonicalPath() + "\" exists :(");
            }

            // Validate the chunk dir BEFORE creating the index file, so a failure here does not leave an
            // empty index behind that would make the next run fail with "exists".
            File chunkDir = new File(outputDir.getCanonicalPath() + File.separator + "chunks");
            if (!chunkDir.exists()) {
                chunkDir.mkdir();
            }
            if (!chunkDir.isDirectory()) {
                throw new RuntimeException("Chunk dir \"" + chunkDir.getCanonicalPath() + "\" not a directory");
            }

            // Setup
            BuzHash inputHasher = new BuzHash(hashWindow);
            MessageDigest fileDigest = MessageDigest.getInstance("SHA-1");
            MessageDigest chunkDigest = MessageDigest.getInstance("SHA-1");

            int lastHash = -1;
            int chunkLength = 0;
            int chunkCount = 0;
            long chunkBytesWritten = 0;
            long totalBytes = 0;

            try (FileOutputStream indexOut = new FileOutputStream(indexOutFile)) {
                // read() returning -1 is the only reliable end-of-stream signal; available() is just an estimate.
                int len;
                while ((len = inputStream.read(readBuffer)) != -1) {
                    totalBytes += len;
                    fileDigest.update(readBuffer, 0, len);

                    for (int i = 0; i < len; i++) {
                        byte b = readBuffer[i];
                        int rollingHash = inputHasher.addByte(b);
                        chunkBuffer[chunkLength] = b;
                        chunkLength++;

                        boolean hashBoundary = lastHash != rollingHash && (rollingHash & 0xffff) == 0;
                        if (chunkLength == maxChunkSize || hashBoundary) {
                            chunkBytesWritten += storeChunk(chunkBuffer, chunkLength, chunkDigest, chunkDir, indexOut);
                            chunkLength = 0;
                            chunkCount++;
                        }
                        lastHash = rollingHash;
                    }
                }

                // Final chunk: whatever is left in the buffer after the last boundary.
                if (chunkLength > 0) {
                    chunkBytesWritten += storeChunk(chunkBuffer, chunkLength, chunkDigest, chunkDir, indexOut);
                    chunkCount++;
                }
            }

            byte[] fileHash = fileDigest.digest();

            // Rename indexOutFile based on whole-file hash
            String finalIndexFileName = indexOutFile.getCanonicalPath() + "-" + DatatypeConverter.printHexBinary(fileHash);
            if (!indexOutFile.renameTo(new File(finalIndexFileName))) {
                throw new IOException("Could not rename index file \"" + indexOutFile.getCanonicalPath()
                        + "\" to \"" + finalIndexFileName + "\"");
            }

            out.println();
            out.println();
            out.println("original file : " + inputFile.getCanonicalPath());
            out.println(" index file : " + finalIndexFileName);
            out.println();

            // An empty input produces no chunks and no bytes; avoid dividing by zero in that case.
            long avgSize = chunkCount == 0 ? 0 : totalBytes / chunkCount;
            // One raw digest per chunk is stored in the index (20 bytes each for SHA-1).
            long chunkIndexSize = (long) chunkCount * chunkDigest.getDigestLength();
            long totalStored = chunkBytesWritten + chunkIndexSize;
            double storedPercent = totalBytes == 0 ? 0.0 : (((double) totalStored / (double) totalBytes) * 100);

            out.println();
            out.println(" number of chunks : " + String.format("%,d",chunkCount));
            out.println(" file size : " + String.format("%,d",totalBytes) + " bytes");
            out.println(" unique chunk data : " + String.format("%,d",chunkBytesWritten) + " bytes");
            out.println(" avg chunk size : " + String.format("%,d",avgSize) + " bytes");
            out.println(" index overhead : " + String.format("%,d",chunkIndexSize) + " bytes");
            out.println(" unique + index : " + String.format("%,d",totalStored) + " bytes, " + String.format("%,.3g", storedPercent) + "%");
        }
    }

    /**
     * Finishes one chunk: hashes it, stores it via {@link #writeChunk} and appends its hash to the index.
     * Prints a single progress dot per chunk.
     *
     * @param chunkBuffer buffer whose first {@code chunkLength} bytes are the chunk
     * @param chunkLength number of valid bytes in {@code chunkBuffer}
     * @param chunkDigest digest used for chunk ids; must hold no pending data on entry
     * @param chunkDir    directory that holds the chunk files
     * @param indexOut    index stream that receives the raw chunk hash
     * @return number of chunk bytes newly written to disk (0 if the chunk was already stored)
     * @throws IOException if the chunk file or the index cannot be written
     */
    private static int storeChunk(byte[] chunkBuffer, int chunkLength, MessageDigest chunkDigest,
                                  File chunkDir, OutputStream indexOut) throws IOException {
        out.print(".");

        // Hash the chunk exactly once. digest() also resets the MessageDigest for the next chunk.
        chunkDigest.update(chunkBuffer, 0, chunkLength);
        byte[] chunkHash = chunkDigest.digest();

        // Write chunk
        int written = writeChunk(chunkBuffer, chunkLength, chunkHash, chunkDir);

        // Write chunk id to index
        indexOut.write(chunkHash);
        return written;
    }

    /**
     * Writes a chunk to {@code chunkDir} under the upper-case hex form of its hash, unless a file with that
     * name already exists (existence alone is treated as "already stored"; the content is not compared).
     *
     * @param chunkBuffer buffer whose first {@code chunkLength} bytes are the chunk
     * @param chunkLength number of valid bytes in {@code chunkBuffer}
     * @param chunkHash   hash identifying the chunk; used as the file name
     * @param chunkDir    directory that holds the chunk files
     * @return {@code chunkLength} if the chunk was written, 0 if it was already present
     * @throws IOException if the chunk file cannot be created or written
     */
    private static int writeChunk(byte[] chunkBuffer, int chunkLength, byte[] chunkHash, File chunkDir) throws IOException {
        String hashString = DatatypeConverter.printHexBinary(chunkHash);
        File chunkFile = new File(chunkDir.getCanonicalPath() + File.separator + hashString);

        if (chunkFile.exists()) {
            return 0;
        }

        // try-with-resources so the stream is closed even when the write fails.
        try (FileOutputStream chunkOut = new FileOutputStream(chunkFile)) {
            chunkOut.write(chunkBuffer, 0, chunkLength);
        }
        return chunkLength;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment