Skip to content

Instantly share code, notes, and snippets.

@amintos
Created July 6, 2013 20:33
Show Gist options
  • Save amintos/5941184 to your computer and use it in GitHub Desktop.
Save amintos/5941184 to your computer and use it in GitHub Desktop.
Simple LZ-Like compression in Java.
/** Fast and simple text compression.
*
* This is a modified variant of LZ77:
* The algorithm inserts "marker bytes" whose bits state which
* of the following 8 symbols are actual symbols (0) or backward
* references (1). Backward references are actually two bytes, the
* first 6 bits encoding the length and the next 10 bits encoding
* the negative offset where the repetition occurred.
* The idea is related to LZO and LZJB compression.
*
* To detect backward references, the position of each trigram is
* stored to a limited-size hash table, eventually being
* overwritten by hash collisions.
*
*
*
* */
public class Compressor {
private static final int TABLE_SIZE = 256;
private static final int MATCH_BITS = 6;
private static final int MATCH_MIN = 3;
private static final int MATCH_MAX =
((1 << MATCH_BITS) + (MATCH_MIN - 1));
private static final int OFFSET_MASK =
((1 << (16 - MATCH_BITS)) - 1);
public static String compress(String text) {
int[] table = new int[TABLE_SIZE];
char[] input = text.toCharArray();
StringBuffer output = new StringBuffer();
int len = input.length;
int src = 0;
int dst = 0;
int cpy = 0;
int copymask = 1 << 7; // next indicator bit to be set
int copymap = 0; // last indicator byte
for (int i = 0; i < TABLE_SIZE; ++i) {
table[i] = 0xdeadbeef;
}
while (src < len) {
copymask <<= 1;
if (copymask == 1 << 8) {
// need next indicator byte.
copymask = 1;
copymap = dst;
output.append((char)0);
dst++;
}
if (src > len - MATCH_MAX) {
// better not match past this point
output.append(input[src++]);
dst++;
continue;
}
// simple hash function
int hash = (((int)(input[src]) + 13) ^
((int)(input[src + 1]) - 13) ^
((int)(input[src + 2]))) % TABLE_SIZE;
// hash-table lookup
int offset = (src - table[hash]) & OFFSET_MASK;
// store current trigram position to hash table
table[hash] = src;
cpy = src - offset;
// check whether real match found or overwritten table entry
if (cpy >= 0 && cpy != src &&
input[src] == input[cpy] &&
input[src + 1] == input[cpy + 1] &&
input[src + 2] == input[cpy + 2]) {
// toggle indicator bit in indicator byte.
output.setCharAt(copymap, (char) (output.charAt(copymap) | copymask));
// do forward matching from there
int mlen;
for (mlen = MATCH_MIN; mlen < MATCH_MAX; ++mlen) {
if (input[src + mlen] != input[cpy + mlen]) break;
}
// output 2 bytes, 6 bit length + 10 bit offset
output.append( (char)(
((mlen - MATCH_MIN ) << (8 - MATCH_BITS)) |
(offset >> 8) ));
output.append( (char)(offset & 0xff) );
src += mlen;
dst += 2;
// no match, just emit uncompressed symbol
} else {
output.append(input[src++]);
dst++;
}
}
return output.toString();
}
public static String decompress(String source) {
char[] input = source.toCharArray();
StringBuffer output = new StringBuffer();
int src = 0;
int dst = 0;
int cpy = 0;
int copymask = 1 << 7;
int copymap = 0;
int len = input.length;
while (src < len) {
copymask <<= 1;
if (copymask == 1 << 8) {
// indicator byte. just read it.
copymask = 1;
copymap = input[src++];
}
// indicator byte indicates a backward reference here
if ((copymap & copymask) > 0) {
// read and decode length and offset of reference
int mlen = (input[src] >> (8 - MATCH_BITS)) + MATCH_MIN;
int offset = ((input[src] << 8) | input[src + 1]) & OFFSET_MASK;
src += 2;
cpy = dst - offset;
// copy all bytes from reference
if (cpy >= 0) {
while (--mlen >= 0) {
output.append(output.charAt(cpy));
cpy++;
dst++;
}
} else {
// stream broken. just emit what we have and cancel.
return output.toString();
//throw new RuntimeException("Compressed data corrupted");
}
// no match, just emit uncompressed symbol
} else {
output.append(input[src++]);
dst++;
}
}
return output.toString();
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment