Last active
August 10, 2018 17:24
-
-
Save avesus/9e8c6801a0d986252de38de55295be98 to your computer and use it in GitHub Desktop.
"Vanilla" LZ77
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <math.h> | |
unsigned int countBits(unsigned int number) | |
{ | |
return (int)log2(number) + 1; | |
} | |
const char* longestPrefixOfInputThatBeginsInWindow (const char* searchTerm, int maxSearchTermLength, const char* windowPtr, int windowSize, int* longestMatchLength, int* prefixOffset, int* srcVarsMap, int* srcVars) { | |
// printf("\nsearchTerm: %s, inputSize: %d, windowPtr: %s, windowSize: %d\n", searchTerm, maxSearchTermLength, windowPtr, windowSize); | |
const char* result = 0; | |
* prefixOffset = 0; | |
* longestMatchLength = 0; | |
if (!windowSize) { | |
return 0; | |
} | |
for (int offset = 0; offset < windowSize; ++offset) { | |
const char* cmpWith = & windowPtr[ windowSize - offset - 1 ]; | |
// Collect statistics about characters | |
if (! srcVarsMap[ (unsigned char)cmpWith[0] ]) { | |
++(*srcVars); | |
++srcVarsMap[ (unsigned char)cmpWith[0] ]; | |
} | |
// Search a match | |
for (int currMatchLength = 1; currMatchLength <= maxSearchTermLength; ++currMatchLength) { | |
if (0 == memcmp(searchTerm, cmpWith, currMatchLength)) { | |
if (currMatchLength > * longestMatchLength) { | |
// printf("\nMATCH: %s %s %d\n", searchTerm, cmpWith, currMatchLength); | |
* longestMatchLength = currMatchLength; | |
*prefixOffset = offset + 1; | |
} | |
} | |
} | |
} | |
return & windowPtr[ windowSize - *prefixOffset ]; | |
} | |
// Determine window and lookahead buffer sizes depending on coding position | |
void buffersFromCodingPosition (int pos, int* windowStart, int* windowSize, int* lookaheadStart, int* lookaheadSize, int uncompressedSize, int maxWindowSize, int maxLookaheadSize) { | |
if (uncompressedSize < maxWindowSize) { | |
*windowStart = 0; | |
*windowSize = pos; | |
} else { | |
*windowStart = pos > maxWindowSize ? pos - maxWindowSize : 0; | |
*windowSize = pos - *windowStart; | |
} | |
if (uncompressedSize < maxLookaheadSize) { | |
*lookaheadSize = uncompressedSize - pos; | |
} else { | |
*lookaheadSize = pos < uncompressedSize - maxLookaheadSize ? maxLookaheadSize : uncompressedSize - pos; | |
} | |
*lookaheadStart = pos; | |
} | |
// 4 bytes: | |
struct lz77rec { | |
unsigned int offset : 14; | |
unsigned int len : 10; | |
char code : 8; | |
}; | |
lz77rec encoded[ 400000 ] = { 0 }; | |
int encodedSize = 0; | |
void compress (const char* uncompressed, int uncompressedSize, char* compressed, int* compressedSize, int maxWindowSize = 14, int maxLookaheadSize = 6) { | |
int codingPosition = 0; | |
int maxOffset = 0; | |
int maxSize = 0; | |
int emptyPointers = 0; | |
int srcVars = 0; | |
// Char frequencies for stats to chose optimal packing format | |
int srcVarsMap[ 256 ] = { 0 }; | |
while (codingPosition < uncompressedSize) { | |
int windowStart = 0; | |
int windowSize = 0; | |
int lookaheadStart = 0; | |
int lookaheadSize = 0; | |
buffersFromCodingPosition (codingPosition, & windowStart, & windowSize, & lookaheadStart, & lookaheadSize, uncompressedSize, maxWindowSize, maxLookaheadSize); | |
// printf("\n[%d] win %d<%d> lookahead %d<%d>: ", codingPosition, windowStart, windowSize, lookaheadStart, lookaheadSize); | |
int prefixSize = 0; | |
int prefixOffset = 0; | |
const char* prefix = longestPrefixOfInputThatBeginsInWindow( | |
& uncompressed[ lookaheadStart ], lookaheadSize, & uncompressed[ windowStart ], windowSize, & prefixSize, & prefixOffset, srcVarsMap, & srcVars); | |
if (!prefixOffset) { | |
++emptyPointers; | |
} | |
if (maxOffset < prefixOffset) { | |
maxOffset = prefixOffset; | |
} | |
if (maxSize < prefixSize) { | |
maxSize = prefixSize; | |
} | |
if (prefixSize > prefixOffset) { | |
} | |
printf("%04d (from %d) Go back %d copy %d then add \"%c\"\n", | |
encodedSize, codingPosition, prefixOffset, prefixSize, uncompressed[ codingPosition + prefixSize ]); | |
encoded[ encodedSize++ ] = { (unsigned int)prefixOffset, (unsigned int)prefixSize, uncompressed[ codingPosition + prefixSize ] }; | |
//for (int u = 0; u < prefixSize; ++u) { | |
// printf("%c", prefix[ u ]); | |
//} | |
codingPosition += prefixSize ? prefixSize + 1 : 1; | |
} | |
*compressedSize = encodedSize; | |
printf("At least ratio %f (packed size at least %d bytes; max go back: %d bits; max copy paste size: %d bits; empty pointers: %d; max src symbol bits: %d (%d symbols in alphabet))\n", | |
(double)uncompressedSize / double(encodedSize * (countBits(maxOffset) + countBits(maxSize) + countBits(srcVars)) / 8), | |
encodedSize * (countBits(maxOffset) + countBits(maxSize) + countBits(srcVars)) / 8, | |
countBits(maxOffset), countBits(maxSize), emptyPointers, countBits(srcVars), srcVars); | |
// Pack | |
*compressedSize = 0; | |
for (int i = 0; i < encodedSize; i += 1) { | |
unsigned int low = * ((unsigned int*)&encoded[ i ]); | |
// unsigned long int high = * ((unsigned long int*)&encoded[ i + 1 ]); | |
// unsigned long int packed7bytes = low | high << 28; | |
* ((unsigned int*)compressed) = low; | |
compressed += 4; | |
*compressedSize += 4; | |
} | |
} | |
void decompress (char* compressed, int compressedSize, char* uncompressed, int* uncompressedSize) { | |
char* decoded = uncompressed; | |
int decodedSize = 0; | |
for (int i = 0; i < compressedSize; ++i) { | |
lz77rec codePoint = encoded[ i ]; | |
//lz77rec codePoint = * ((lz77rec*)((unsigned int*)compressed)); | |
compressed += 4; | |
if (codePoint.offset) { | |
if (codePoint.offset > codePoint.len) { | |
memcpy(& decoded[ decodedSize ], & decoded[ decodedSize - codePoint.offset ], codePoint.len); | |
decodedSize += codePoint.len; | |
} else { | |
const char* beg = & decoded[ decodedSize ]; | |
for (int n = 0; n < codePoint.len; ++n) { | |
decoded[ decodedSize ] = decoded[ decodedSize - codePoint.offset ]; | |
++decodedSize; | |
} | |
} | |
} | |
if (codePoint.code || i < compressedSize - 1) { | |
decoded[ decodedSize++ ] = codePoint.code; | |
} | |
} | |
*uncompressedSize = decodedSize; | |
// printf("\n%s\n", decoded); | |
} | |
char* readFile (const char* filename, int* fileSize) { | |
char *buffer = 0; | |
int string_size, read_size; | |
FILE *handler = fopen(filename, "r"); | |
if (handler) | |
{ | |
// Seek the last byte of the file | |
fseek(handler, 0, SEEK_END); | |
// Offset from the first to the last byte, or in other words, filesize | |
string_size = ftell(handler); | |
// go back to the start of the file | |
rewind(handler); | |
// Allocate a string that can hold it all | |
buffer = (char*) malloc(string_size); | |
// Read it all in one operation | |
read_size = fread(buffer, sizeof(char), string_size, handler); | |
if (string_size != read_size) | |
{ | |
// Something went wrong, throw away the memory and set | |
// the buffer to NULL | |
free(buffer); | |
buffer = 0; | |
} | |
// Always remember to close the file. | |
fclose(handler); | |
} | |
*fileSize = string_size; | |
return buffer; | |
} | |
int main () { | |
int size = 0; | |
char* source = readFile("./src", &size); | |
// printf("%d %s\n", size, source); | |
if (source) { | |
char compressedBuffer[ 400000 ] = { 0 }; | |
int compressedSize = 0; | |
//source = "AABCBBABC"; | |
//size = 9; | |
source = "abrababababcadabrarray"; | |
size = 22; | |
compress(source, size, compressedBuffer, &compressedSize, 8192, 256); | |
printf("\nCompressed size: %d. Ratio: %f.\n", compressedSize, (double)size/(double)compressedSize); | |
char decompressedBuffer[ 400000 ] = { 0 }; | |
int decompressedSize = 0; | |
decompress(compressedBuffer, compressedSize, decompressedBuffer, & decompressedSize); | |
int ok = memcmp(source, decompressedBuffer, size); | |
printf("%d %d %d %d", ok, size, decompressedSize, strlen(decompressedBuffer)); | |
// free(source); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment