Skip to content

Instantly share code, notes, and snippets.

@AshwinJay
Last active September 1, 2016 06:15
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save AshwinJay/8388007 to your computer and use it in GitHub Desktop.
Save AshwinJay/8388007 to your computer and use it in GitHub Desktop.
package com.javaforu.rsync;
import com.javaforu.rsync.Sync.CharRingBuffer.Visitor;
import rollinghash.RabinKarpHash;
import java.util.*;
import java.util.zip.CRC32;
/**
* Author: Ashwin Jayaprakash / ashwin.jayaprakash@gmail.com / http://www.ashwinjayaprakash.com
*
* License: Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0.html)
*
* Utility class to perform Rsync type of diff/patch.
*/
public abstract class Sync {
private Sync() {
}
/**
* Use the given input sequence and the block size and generate a sequence of "blocks". This
* will be used later for creating patches.
*
* @param str
* @param blockSize
* @return
*/
public static Blocks generateBlocks(CharSequence str, int blockSize) {
CRC32 hash = new CRC32();
RabinKarpHash rollingHash = new RabinKarpHash(blockSize);
CharRingBuffer ringBuffer = new CharRingBuffer(blockSize);
Blocks blocks = new Blocks();
final int strLen = str.length();
for (int i = 0; i < strLen; ) {
int blockEnd = Math.min(i + blockSize, strLen);
hash.reset();
int blockStart = i;
for (; i < blockEnd; i++) {
char c = str.charAt(i);
if (i < blockSize) {
rollingHash.eat(c);
ringBuffer.add(c);
}
else {
char prevChar = ringBuffer.add(c);
rollingHash.update(prevChar, c);
}
hash.update(c >> 8);
hash.update(c);
}
blocks.add(new Block(rollingHash.hashvalue, hash.getValue(), blockStart, blockEnd));
}
return blocks;
}
/**
* Uses the pre-generated blocks and runs it against the given char sequence to identify commonalities/overlap.
* If there are common sequences in the given string and the pre-generated blocks, then the patch will be small.
* Otherwise, the patch will be unable to reuse anything from the blocks.
*
* @param input The string to be patched
* @param blocks The original source against which the input string is being diff'ed and patched.
* @param blockSize The same size that was used for creating the blocks.
* @return The patched output which may reuse data from the blocks if it overlaps considerably with the given
* input string.
* @see #generateBlocks(CharSequence, int)
*/
public static CharSequencePatch generatePatch(CharSequence input, Blocks blocks, int blockSize) {
final CRC32 hash = new CRC32();
Visitor hashInputFiller = new Visitor() {
@Override
public void visit(char c) {
hash.update(c >> 8);
hash.update(c);
}
};
RabinKarpHash rollingHash = new RabinKarpHash(blockSize);
CharRingBuffer rollingHashBuffer = new CharRingBuffer(blockSize);
final CharSequencePatch patch = new CharSequencePatch(blockSize);
final int inputLen = input.length();
for (int i = 0; i < inputLen; ) {
int blockEnd = Math.min(i + blockSize, inputLen);
for (; i < blockEnd; i++) {
char c = input.charAt(i);
if (!rollingHashBuffer.isFull()) {
rollingHashBuffer.add(c);
rollingHash.eat(c);
}
else {
char prevChar = rollingHashBuffer.add(c);
rollingHash.update(prevChar, c);
//Add the bumped off char to the patch.
patch.add(prevChar);
}
if (blocks.blockMaybePresent(rollingHash.hashvalue)) {
hash.reset();
//Peek into rolling hash buffer contents and generate a strong hash.
rollingHashBuffer.iterate(hashInputFiller);
Block foundBlock = blocks.getBlock(rollingHash.hashvalue, hash.getValue());
if (foundBlock != null) {
patch.add(foundBlock);
//Dump the buffers. No need for them as we have the block.
rollingHashBuffer.reset();
rollingHash = new RabinKarpHash(blockSize);
i++;
break;
}
}
}
}
//Fliush any trailing chars.
CharBufferCopier copier = new CharBufferCopier(blockSize);
if (rollingHashBuffer.iterate(copier) > 0) {
for (char x : copier.flushAndReset()) {
patch.add(x);
}
rollingHashBuffer.reset();
}
patch.seal();
return patch;
}
//-------------
public static class CharRingBuffer {
private final char[] chars;
private int size;
private int addAt;
public CharRingBuffer(int capacity) {
this.chars = new char[capacity];
}
public int getCapacity() {
return chars.length;
}
public boolean isFull() {
return size == chars.length;
}
public char add(char c) {
char retVal = chars[addAt];
chars[addAt] = c;
addAt = (addAt + 1) % chars.length;
if (size < chars.length) {
size++;
retVal = 0;
}
return retVal;
}
/**
* @param visitor
* @return The number of characters that were visited.
*/
public int iterate(Visitor visitor) {
int count = 0;
if (size < chars.length) {
for (int i = 0; i < addAt; i++, count++) {
visitor.visit(chars[i]);
}
return count;
}
int i = addAt;
do {
visitor.visit(chars[i]);
i = (i + 1) % chars.length;
count++;
}
while (i != addAt);
return count;
}
public void reset() {
size = 0;
addAt = 0;
}
//-------------
public interface Visitor {
void visit(char c);
}
}
public static class CharBufferCopier implements Visitor {
private char[] copiedChars;
private int addAt;
public CharBufferCopier(int capacity) {
this.copiedChars = new char[capacity];
}
@Override
public void visit(char c) {
//No need to check buffer overflow.
copiedChars[addAt++] = c;
}
public final boolean isfull() {
return (addAt == copiedChars.length);
}
public char[] flushAndReset() {
char[] array = null;
if (addAt > 0) {
if (isfull()) {
array = copiedChars;
copiedChars = new char[copiedChars.length];
}
else {
array = new char[addAt];
System.arraycopy(copiedChars, 0, array, 0, addAt);
}
addAt = 0;
}
return array;
}
}
public static class Blocks {
private final Map<Integer, List<Block>> rollingHashToBlocks;
private Block firstBlock;
private Block lastBlock;
public Blocks() {
this.rollingHashToBlocks = new HashMap<Integer, List<Block>>();
}
public Block getFirstBlock() {
return firstBlock;
}
final void add(Block block) {
Integer key = block.rollingHash;
List<Block> list = rollingHashToBlocks.get(key);
if (list == null) {
list = new LinkedList<Block>();
rollingHashToBlocks.put(key, list);
}
list.add(block);
if (lastBlock != null) {
lastBlock.nextBlock = block;
}
else {
firstBlock = block;
}
lastBlock = block;
}
boolean blockMaybePresent(int rollingHash) {
return rollingHashToBlocks.get(rollingHash) != null;
}
Block getBlock(int rollingHash, long hash) {
List<Block> list = rollingHashToBlocks.get(rollingHash);
if (list == null) {
return null;
}
for (Block block : list) {
if (block.hash == hash) {
return block;
}
}
return null;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
Block block = firstBlock;
while (block != null) {
sb.append(block).append("\n");
block = block.nextBlock;
}
return sb.toString();
}
}
/**
* Summaries of the input text block.
*/
public static class Block {
private final int rollingHash;
private final long hash;
private final int startOffsetInc;
private final int endOffsetExc;
private Block nextBlock;
public Block(int rollingHash, long hash, int startOffsetInc, int endOffsetExc) {
this.rollingHash = rollingHash;
this.hash = hash;
this.startOffsetInc = startOffsetInc;
this.endOffsetExc = endOffsetExc;
}
public int getRollingHash() {
return rollingHash;
}
public long getHash() {
return hash;
}
public int getStartOffsetInc() {
return startOffsetInc;
}
public int getEndOffsetExc() {
return endOffsetExc;
}
public Block getNextBlock() {
return nextBlock;
}
@Override
public String toString() {
return "Block{" +
"rollingHash=" + rollingHash +
", hash=" + hash +
", startOffsetInc=" + startOffsetInc +
", endOffsetExc=" + endOffsetExc +
", nextBlock?" + (nextBlock != null) +
'}';
}
}
public static class Patch implements Iterable<Object> {
protected final List<Object> parts;
public Patch() {
parts = new LinkedList<Object>();
}
void add(Block block) {
parts.add(block);
}
void add(Object o) {
parts.add(o);
}
/**
* Should be called once the patch creation is completed.
*/
public void seal() {
}
@Override
public Iterator<Object> iterator() {
return parts.iterator();
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
for (Object part : parts) {
sb.append(part).append("\n");
}
return sb.toString();
}
}
public static class CharSequencePatch extends Patch {
private static final String TEMPLATE_ERR_MSG =
"Only " + Block.class.getName() + " and char[] are supported in the patch. Unrecognized type [%s]";
private CharRingBuffer charBuffer;
private CharBufferCopier charCopier;
public CharSequencePatch(int blockSize) {
this.charBuffer = new CharRingBuffer(blockSize);
this.charCopier = new CharBufferCopier(blockSize);
}
private void flushBuffer() {
//Make some room.
char[] chars = charCopier.flushAndReset();
if (chars != null) {
parts.add(chars);
}
//Copy new chars.
int i = charBuffer.iterate(charCopier);
charBuffer.reset();
//Flush newly copied chars.
if (i > 0) {
chars = charCopier.flushAndReset();
if (chars != null) {
parts.add(chars);
}
}
}
@Override
void add(Block block) {
flushBuffer();
super.add(block);
}
@Override
void add(Object o) {
if (o instanceof Block) {
add((Block) o);
}
else if (o instanceof Character) {
add(((Character) o).charValue());
}
else if (o instanceof char[]) {
for (char c : (char[]) o) {
add(c);
}
}
else if (o instanceof Character[]) {
for (Character c : (Character[]) o) {
add(c.charValue());
}
}
else {
throw new RuntimeException(String.format(TEMPLATE_ERR_MSG, o));
}
}
void add(char c) {
if (charBuffer.isFull()) {
flushBuffer();
}
charBuffer.add(c);
}
@Override
public void seal() {
flushBuffer();
charBuffer = null;
charCopier = null;
}
/**
* @param source The source to be patched (The string that was use to create the {@link #add(Block) blocks}).
* @return The patched string.
*/
public String apply(CharSequence source) {
StringBuilder sb = new StringBuilder(source.length() /*At least this long*/);
for (Object part : parts) {
if (part instanceof Block) {
Block block = (Block) part;
sb.append(source.subSequence(block.startOffsetInc, block.endOffsetExc));
}
else if (part instanceof char[]) {
sb.append((char[]) part);
}
else {
throw new RuntimeException(String.format(TEMPLATE_ERR_MSG, part));
}
}
return sb.toString();
}
/**
* @return An int[] with [0] = "Total number of {@link Block}s" and [1] = "Total number of chars" in this
* patch.
*/
public int[] analyze() {
int blocks = 0;
int chars = 0;
for (Object part : parts) {
if (part instanceof Block) {
blocks++;
}
else if (part instanceof Character) {
chars++;
}
else if (part instanceof char[]) {
for (char c : (char[]) part) {
chars++;
}
}
else if (part instanceof Character[]) {
for (Character c : (Character[]) part) {
chars++;
}
}
else {
}
}
return new int[]{blocks, chars};
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
for (Object part : parts) {
if (part instanceof Block) {
sb.append(part).append("\n");
}
else if (part instanceof Character) {
sb.append(((Character) part).charValue()).append("\n");
}
else if (part instanceof char[]) {
sb.append("[ ");
for (char c : (char[]) part) {
sb.append(c).append(' ');
}
sb.append("]\n");
}
else if (part instanceof Character[]) {
for (Character c : (Character[]) part) {
sb.append(c.charValue()).append("\n");
}
}
}
return sb.toString();
}
}
}
package com.javaforu.rsync;
import com.javaforu.rsync.Sync.Blocks;
import com.javaforu.rsync.Sync.CharSequencePatch;
import org.testng.Assert;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import java.lang.reflect.Method;
import java.security.NoSuchAlgorithmException;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
/**
* Author: Ashwin Jayaprakash / ashwin.jayaprakash@gmail.com / http://www.ashwinjayaprakash.com
*
* License: Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0.html)
*/
public class SyncTest {
private static final boolean DEBUG = false;
String testName;
private static void sysPropsToMap(Map<String, String> map) {
for (Entry<Object, Object> entry : System.getProperties().entrySet()) {
map.put((String) entry.getKey(), (String) entry.getValue());
}
}
@BeforeMethod
public void nameBefore(Method method) {
testName = method.getName();
}
String diffAndPatch(int blockSize, String from, String to) {
Blocks blocks = Sync.generateBlocks(from, blockSize);
if (DEBUG) {
System.out.println("From blocks:");
System.out.println(blocks);
}
//At least 1 block is present.
Assert.assertNotNull(blocks.getFirstBlock());
//-------------
CharSequencePatch patch = Sync.generatePatch(to, blocks, blockSize);
if (DEBUG) {
System.out.println("To patch:");
System.out.println(patch);
}
//-------------
if (DEBUG) {
System.out.println();
System.out.println("From : " + from);
System.out.println("To : " + to);
}
Assert.assertFalse(from.equals(to));
String s3 = patch.apply(from);
if (DEBUG) {
System.out.println("Patched: " + s3);
}
Assert.assertEquals(s3, to);
int[] analysis = patch.analyze();
System.out
.printf("(%s) Number of blocks reused [%d], number of new chars sent [%d] versus total number of chars" +
" in naive approach [%d]. Block size [%d] chars%n",
testName, analysis[0], analysis[1], to.length(), blockSize);
return s3;
}
@Test
public void testSimple() throws NoSuchAlgorithmException, CloneNotSupportedException {
final int blockSize = 8;
TreeMap<String, String> props1 = new TreeMap<String, String>();
props1.put("aaa", "100");
props1.put("bbb", "200");
props1.put("ccc", "300");
props1.put("ddd", "400");
props1.put("eee", "500");
props1.put("fff", "600");
props1.put("ggg", "700");
props1.put("hhh", "800");
props1.put("iii", "900");
props1.put("jjj", "1000");
props1.put("kkk", "1100");
props1.put("lll", "1200");
String s1 = props1.toString();
//-------------
TreeMap<String, String> props2 = new TreeMap<String, String>();
props2.put("aaa", "100");
props2.put("bbb", "200");
props2.put("ccc", "300");
//Del.
//props.put("ddd", "400");
props2.put("eee", "500");
props2.put("fff", "600");
props2.put("ggg", "700");
props2.put("hhh", "800");
props2.put("iii", "900");
//Add.
props2.put("www", "777");
props2.put("xxx", "888");
props2.put("jjj", "1000");
props2.put("kkk", "1100");
props2.put("lll", "1200");
String s2 = props2.toString();
diffAndPatch(blockSize, s1, s2);
}
@Test
public void testMediumString() throws NoSuchAlgorithmException, CloneNotSupportedException {
final int blockSize = 32;
String s1 = "" +
"Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat. Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Nam liber tempor cum soluta nobis eleifend option congue nihil imperdiet doming id quod mazim placerat facer possim assum. Typi non habent claritatem insitam; est usus legentis in iis qui facit eorum claritatem. Investigationes demonstraverunt lectores legere me lius quod ii legunt saepius. Claritas est etiam processus dynamicus, qui sequitur mutationem consuetudium lectorum. Mirum est notare quam littera gothica, quam nunc putamus parum claram, anteposuerit litterarum formas humanitatis per seacula quarta decima et quinta decima. Eodem modo typi, qui nunc nobis videntur parum clari, fiant sollemnes in futurum.";
System.out.println(s1);
//-------------
String s2 = "" +
"Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat. Ut wisi enim ad minim veniam, quis nostrud exerci tation. Claritas est etiam processus dynamicus, qui sequitur mutationem consuetudium lectorum. Mirum est notare quam littera gothica, quam nunc putamus parum claram, anteposuerit litterarum formas humanitatis per seacula quarta decima et quinta decima.";
diffAndPatch(blockSize, s1, s2);
}
@Test
public void testBigString32() throws NoSuchAlgorithmException, CloneNotSupportedException {
Map<String, String> map = new HashMap<String, String>();
sysPropsToMap(map);
doBigStringTest(32, map);
}
@Test
public void testBigString32Sorted() throws NoSuchAlgorithmException, CloneNotSupportedException {
Map<String, String> map = new TreeMap<String, String>();
sysPropsToMap(map);
doBigStringTest(32, map);
}
@Test
public void testBigString64() throws NoSuchAlgorithmException, CloneNotSupportedException {
Map<String, String> map = new HashMap<String, String>();
sysPropsToMap(map);
doBigStringTest(64, map);
}
@Test
public void testBigString64Sorted() throws NoSuchAlgorithmException, CloneNotSupportedException {
Map<String, String> map = new TreeMap<String, String>();
sysPropsToMap(map);
doBigStringTest(64, map);
}
@Test
public void testBigString96() throws NoSuchAlgorithmException, CloneNotSupportedException {
Map<String, String> map = new HashMap<String, String>();
sysPropsToMap(map);
doBigStringTest(96, map);
}
@Test
public void testBigString96Sorted() throws NoSuchAlgorithmException, CloneNotSupportedException {
Map<String, String> map = new TreeMap<String, String>();
sysPropsToMap(map);
doBigStringTest(96, map);
}
@Test
public void testBigString128() throws NoSuchAlgorithmException, CloneNotSupportedException {
Map<String, String> map = new HashMap<String, String>();
sysPropsToMap(map);
doBigStringTest(128, map);
}
@Test
public void testBigString128Sorted() throws NoSuchAlgorithmException, CloneNotSupportedException {
Map<String, String> map = new TreeMap<String, String>();
sysPropsToMap(map);
doBigStringTest(128, map);
}
@Test
public void testBigString256() throws NoSuchAlgorithmException, CloneNotSupportedException {
Map<String, String> map = new HashMap<String, String>();
sysPropsToMap(map);
doBigStringTest(256, map);
}
@Test
public void testBigString256Sorted() throws NoSuchAlgorithmException, CloneNotSupportedException {
Map<String, String> map = new TreeMap<String, String>();
sysPropsToMap(map);
doBigStringTest(256, map);
}
void doBigStringTest(int blockSize, Map<String, String> map) {
HashMap<String, String> diff = new HashMap<String, String>();
diff.put("aaa", "100");
diff.put("bbb", "200");
diff.put("ccc", "300");
diff.put("ddd", "400");
diff.put("eee", "500");
diff.put("fff", "600");
diff.put("ggg", "700");
diff.put("hhh", "800");
diff.put("iii", "900");
diff.put("jjj", "1000");
diff.put("kkk", "1100");
diff.put("lll", "1200");
//Cleanup from previous runs.
for (String s : diff.keySet()) {
map.remove(s);
}
String s1 = map.toString();
for (Entry<String, String> entry : diff.entrySet()) {
map.put(entry.getKey(), entry.getValue());
}
String s2 = map.toString();
diffAndPatch(blockSize, s1, s2);
}
}
(testBigString128) Number of blocks reused [31], number of new chars sent [1136] versus total number of chars in naive approach [5104]. Block size [128] chars
(testBigString128Sorted) Number of blocks reused [35], number of new chars sent [624] versus total number of chars in naive approach [5104]. Block size [128] chars
(testBigString256) Number of blocks reused [13], number of new chars sent [1776] versus total number of chars in naive approach [5104]. Block size [256] chars
(testBigString256Sorted) Number of blocks reused [16], number of new chars sent [1008] versus total number of chars in naive approach [5104]. Block size [256] chars
(testBigString32) Number of blocks reused [149], number of new chars sent [336] versus total number of chars in naive approach [5104]. Block size [32] chars
(testBigString32Sorted) Number of blocks reused [151], number of new chars sent [272] versus total number of chars in naive approach [5104]. Block size [32] chars
(testBigString64) Number of blocks reused [70], number of new chars sent [624] versus total number of chars in naive approach [5104]. Block size [64] chars
(testBigString64Sorted) Number of blocks reused [73], number of new chars sent [432] versus total number of chars in naive approach [5104]. Block size [64] chars
(testBigString96) Number of blocks reused [45], number of new chars sent [784] versus total number of chars in naive approach [5104]. Block size [96] chars
(testBigString96Sorted) Number of blocks reused [47], number of new chars sent [592] versus total number of chars in naive approach [5104]. Block size [96] chars
(testMediumString) Number of blocks reused [12], number of new chars sent [64] versus total number of chars in naive approach [448]. Block size [32] chars
(testSimple) Number of blocks reused [11], number of new chars sent [32] versus total number of chars in naive approach [120]. Block size [8] chars
@AshwinJay
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment