Skip to content

Instantly share code, notes, and snippets.

@pawitp
Created October 24, 2021 03:50
Show Gist options
  • Save pawitp/9881e6951eac40babd07a7d05ac74f56 to your computer and use it in GitHub Desktop.
Save pawitp/9881e6951eac40babd07a7d05ac74f56 to your computer and use it in GitHub Desktop.
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Random;
public class Buzhash32Chunking {
// If we use a multiple of 32, we get a lot of cancelling out
private static final int BLOCK_SIZE = 63;
private static final int BH_ROTATE = BLOCK_SIZE % 32;
private static final int BH_ROTATE_COMP = 32 - BH_ROTATE;
private static final int[] HASHES = new int[256];
private static int currentHash;
public static void main(String[] args) throws Exception {
init();
// Read the entire file to memory for simplicity
byte[] input = readInput("ubuntu-20.04.3-live-server-amd64.iso");
long startTime = System.nanoTime();
// Start at the position where we have enough bytes for BLOCK_SIZE
int currentPos = BLOCK_SIZE;
int currentStartPos = 0;
resetHash(input, currentPos);
while (currentPos < input.length) {
roll(input[currentPos - BLOCK_SIZE], input[currentPos]);
if (shouldSplit()) {
System.out.println(currentPos + "," + (currentPos - currentStartPos + 1));
currentStartPos = currentPos + 1;
currentPos += BLOCK_SIZE + 1;
resetHash(input, currentPos);
} else {
currentPos++;
}
}
// Last chunk
currentPos = input.length - 1;
System.out.println(currentPos + "," + (currentPos - currentStartPos + 1));
long endTime = System.nanoTime();
System.out.println("Finished in " + (endTime - startTime) / 1000000 + " ms");
}
private static byte[] readInput(String filename) throws IOException {
File file = new File(filename);
try (FileInputStream fis = new FileInputStream(file)) {
long len = file.length();
byte[] content = new byte[(int) len];
if (len != fis.read(content)) {
throw new IOException("Unexpected bytes read");
}
return content;
}
}
private static void init() {
Random r = new Random(42);
for (int i = 0; i < 256; i++) {
HASHES[i] = r.nextInt();
}
}
private static void resetHash(byte[] b, int pos) {
currentHash = 0;
for (int i = BLOCK_SIZE; i > 0; i--) {
currentHash = (currentHash << 1 | currentHash >>> 31) ^ HASHES[b[pos - i] + 128];
}
}
private static void roll(byte oldByte, byte newByte) {
int oldHash = HASHES[oldByte + 128];
currentHash = (currentHash << 1 | currentHash >>> 31) ^
(oldHash << BH_ROTATE | oldHash >>> BH_ROTATE_COMP) ^
HASHES[newByte + 128];
}
private static boolean shouldSplit() {
// We want 23 bits to be 0
return (currentHash & 0b11111111111111111111111) == 0;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment