Skip to content

Instantly share code, notes, and snippets.

@avesus
Last active August 10, 2018 17:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avesus/9e8c6801a0d986252de38de55295be98 to your computer and use it in GitHub Desktop.
Save avesus/9e8c6801a0d986252de38de55295be98 to your computer and use it in GitHub Desktop.
"Vanilla" LZ77
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
unsigned int countBits(unsigned int number)
{
return (int)log2(number) + 1;
}
const char* longestPrefixOfInputThatBeginsInWindow (const char* searchTerm, int maxSearchTermLength, const char* windowPtr, int windowSize, int* longestMatchLength, int* prefixOffset, int* srcVarsMap, int* srcVars) {
// printf("\nsearchTerm: %s, inputSize: %d, windowPtr: %s, windowSize: %d\n", searchTerm, maxSearchTermLength, windowPtr, windowSize);
const char* result = 0;
* prefixOffset = 0;
* longestMatchLength = 0;
if (!windowSize) {
return 0;
}
for (int offset = 0; offset < windowSize; ++offset) {
const char* cmpWith = & windowPtr[ windowSize - offset - 1 ];
// Collect statistics about characters
if (! srcVarsMap[ (unsigned char)cmpWith[0] ]) {
++(*srcVars);
++srcVarsMap[ (unsigned char)cmpWith[0] ];
}
// Search a match
for (int currMatchLength = 1; currMatchLength <= maxSearchTermLength; ++currMatchLength) {
if (0 == memcmp(searchTerm, cmpWith, currMatchLength)) {
if (currMatchLength > * longestMatchLength) {
// printf("\nMATCH: %s %s %d\n", searchTerm, cmpWith, currMatchLength);
* longestMatchLength = currMatchLength;
*prefixOffset = offset + 1;
}
}
}
}
return & windowPtr[ windowSize - *prefixOffset ];
}
// Determine window and lookahead buffer sizes depending on coding position
void buffersFromCodingPosition (int pos, int* windowStart, int* windowSize, int* lookaheadStart, int* lookaheadSize, int uncompressedSize, int maxWindowSize, int maxLookaheadSize) {
if (uncompressedSize < maxWindowSize) {
*windowStart = 0;
*windowSize = pos;
} else {
*windowStart = pos > maxWindowSize ? pos - maxWindowSize : 0;
*windowSize = pos - *windowStart;
}
if (uncompressedSize < maxLookaheadSize) {
*lookaheadSize = uncompressedSize - pos;
} else {
*lookaheadSize = pos < uncompressedSize - maxLookaheadSize ? maxLookaheadSize : uncompressedSize - pos;
}
*lookaheadStart = pos;
}
// 4 bytes:
struct lz77rec {
unsigned int offset : 14;
unsigned int len : 10;
char code : 8;
};
lz77rec encoded[ 400000 ] = { 0 };
int encodedSize = 0;
void compress (const char* uncompressed, int uncompressedSize, char* compressed, int* compressedSize, int maxWindowSize = 14, int maxLookaheadSize = 6) {
int codingPosition = 0;
int maxOffset = 0;
int maxSize = 0;
int emptyPointers = 0;
int srcVars = 0;
// Char frequencies for stats to chose optimal packing format
int srcVarsMap[ 256 ] = { 0 };
while (codingPosition < uncompressedSize) {
int windowStart = 0;
int windowSize = 0;
int lookaheadStart = 0;
int lookaheadSize = 0;
buffersFromCodingPosition (codingPosition, & windowStart, & windowSize, & lookaheadStart, & lookaheadSize, uncompressedSize, maxWindowSize, maxLookaheadSize);
// printf("\n[%d] win %d<%d> lookahead %d<%d>: ", codingPosition, windowStart, windowSize, lookaheadStart, lookaheadSize);
int prefixSize = 0;
int prefixOffset = 0;
const char* prefix = longestPrefixOfInputThatBeginsInWindow(
& uncompressed[ lookaheadStart ], lookaheadSize, & uncompressed[ windowStart ], windowSize, & prefixSize, & prefixOffset, srcVarsMap, & srcVars);
if (!prefixOffset) {
++emptyPointers;
}
if (maxOffset < prefixOffset) {
maxOffset = prefixOffset;
}
if (maxSize < prefixSize) {
maxSize = prefixSize;
}
if (prefixSize > prefixOffset) {
}
printf("%04d (from %d) Go back %d copy %d then add \"%c\"\n",
encodedSize, codingPosition, prefixOffset, prefixSize, uncompressed[ codingPosition + prefixSize ]);
encoded[ encodedSize++ ] = { (unsigned int)prefixOffset, (unsigned int)prefixSize, uncompressed[ codingPosition + prefixSize ] };
//for (int u = 0; u < prefixSize; ++u) {
// printf("%c", prefix[ u ]);
//}
codingPosition += prefixSize ? prefixSize + 1 : 1;
}
*compressedSize = encodedSize;
printf("At least ratio %f (packed size at least %d bytes; max go back: %d bits; max copy paste size: %d bits; empty pointers: %d; max src symbol bits: %d (%d symbols in alphabet))\n",
(double)uncompressedSize / double(encodedSize * (countBits(maxOffset) + countBits(maxSize) + countBits(srcVars)) / 8),
encodedSize * (countBits(maxOffset) + countBits(maxSize) + countBits(srcVars)) / 8,
countBits(maxOffset), countBits(maxSize), emptyPointers, countBits(srcVars), srcVars);
// Pack
*compressedSize = 0;
for (int i = 0; i < encodedSize; i += 1) {
unsigned int low = * ((unsigned int*)&encoded[ i ]);
// unsigned long int high = * ((unsigned long int*)&encoded[ i + 1 ]);
// unsigned long int packed7bytes = low | high << 28;
* ((unsigned int*)compressed) = low;
compressed += 4;
*compressedSize += 4;
}
}
void decompress (char* compressed, int compressedSize, char* uncompressed, int* uncompressedSize) {
char* decoded = uncompressed;
int decodedSize = 0;
for (int i = 0; i < compressedSize; ++i) {
lz77rec codePoint = encoded[ i ];
//lz77rec codePoint = * ((lz77rec*)((unsigned int*)compressed));
compressed += 4;
if (codePoint.offset) {
if (codePoint.offset > codePoint.len) {
memcpy(& decoded[ decodedSize ], & decoded[ decodedSize - codePoint.offset ], codePoint.len);
decodedSize += codePoint.len;
} else {
const char* beg = & decoded[ decodedSize ];
for (int n = 0; n < codePoint.len; ++n) {
decoded[ decodedSize ] = decoded[ decodedSize - codePoint.offset ];
++decodedSize;
}
}
}
if (codePoint.code || i < compressedSize - 1) {
decoded[ decodedSize++ ] = codePoint.code;
}
}
*uncompressedSize = decodedSize;
// printf("\n%s\n", decoded);
}
char* readFile (const char* filename, int* fileSize) {
char *buffer = 0;
int string_size, read_size;
FILE *handler = fopen(filename, "r");
if (handler)
{
// Seek the last byte of the file
fseek(handler, 0, SEEK_END);
// Offset from the first to the last byte, or in other words, filesize
string_size = ftell(handler);
// go back to the start of the file
rewind(handler);
// Allocate a string that can hold it all
buffer = (char*) malloc(string_size);
// Read it all in one operation
read_size = fread(buffer, sizeof(char), string_size, handler);
if (string_size != read_size)
{
// Something went wrong, throw away the memory and set
// the buffer to NULL
free(buffer);
buffer = 0;
}
// Always remember to close the file.
fclose(handler);
}
*fileSize = string_size;
return buffer;
}
int main () {
int size = 0;
char* source = readFile("./src", &size);
// printf("%d %s\n", size, source);
if (source) {
char compressedBuffer[ 400000 ] = { 0 };
int compressedSize = 0;
//source = "AABCBBABC";
//size = 9;
source = "abrababababcadabrarray";
size = 22;
compress(source, size, compressedBuffer, &compressedSize, 8192, 256);
printf("\nCompressed size: %d. Ratio: %f.\n", compressedSize, (double)size/(double)compressedSize);
char decompressedBuffer[ 400000 ] = { 0 };
int decompressedSize = 0;
decompress(compressedBuffer, compressedSize, decompressedBuffer, & decompressedSize);
int ok = memcmp(source, decompressedBuffer, size);
printf("%d %d %d %d", ok, size, decompressedSize, strlen(decompressedBuffer));
// free(source);
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment