Skip to content

Instantly share code, notes, and snippets.

@wernsey
Last active August 20, 2022 10:25
Show Gist options
  • Save wernsey/70ce35a0ed019521943942c5d0f5518b to your computer and use it in GitHub Desktop.
Save wernsey/70ce35a0ed019521943942c5d0f5518b to your computer and use it in GitHub Desktop.
LZSS.C -- A Data Compression Program
/**************************************************************
LZSS.C -- A Data Compression Program
(tab = 4 spaces)
***************************************************************
4/6/1989 Haruhiko Okumura
Use, distribute, and modify this program freely.
Please send me your improved versions.
PC-VAN SCIENCE
NIFTY-Serve PAF01022
CompuServe 74050,1022
17/9/2021 Werner Stoop
I made some changes to
* make the code reentrant
* be useable as a single header library
* compress/decompress files or memory buffers
* specify limits on how many bytes to read (so that multiple LZSS
blocks can be put in a single file)
GitHub user r-lyeh also has a modified version of LZSS.c, and I based
some of my changes on what they did:
https://github.com/r-lyeh/stdpack.c/blob/master/src/lzss.c
I found the original in an archive of the Simtel mirrors
https://web.archive.org/web/19990203141013/http://oak.oakland.edu/pub/simtelnet/msdos/arcutils/lz_comp2.zip
Relevant links:
* https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Storer%E2%80%93Szymanski
* https://go-compression.github.io/algorithms/lzss/
To incorporate it as a library in another project:
#define LZSS_IMPLEMENTATION
#include "lzss.c"
Compile with the LZSS_EXEC symbol defined to create an executable.
$ gcc -DLZSS_EXEC -Wall lzss.c
**************************************************************/
#include <stdio.h>
typedef struct LzssIO {
/* The `get` function reads one byte from its input.
It should return EOF when the end of stream is reached */
int (*get)(struct LzssIO *);
/* The `put` function writes one byte to its output.
It should return EOF on error */
int (*put)(char, struct LzssIO *);
size_t limit, count;
union {
FILE *file;
char *buffer;
} io;
} LzssIO;
LzssIO *lzss_file_io(LzssIO *fr, FILE *f, size_t limit);
LzssIO *lzss_buffer_io(LzssIO *br, char *buffer, size_t buffer_len);
size_t lzss_encode(LzssIO *in, LzssIO *out);
size_t lzss_decode(LzssIO *in, LzssIO *out);
#if defined(LZSS_EXEC) && !defined(LZSS_IMPLEMENTATION)
# define LZSS_IMPLEMENTATION
#endif
#if defined(LZSS_IMPLEMENTATION)
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#define N 4096 /* size of ring buffer */
#define F 18 /* upper limit for match_length */
#define THRESHOLD 2 /* encode string into position and length
if match_length is greater than this */
#define NIL N /* index for root of binary search trees */
typedef struct {
int match_position, match_length, /* of longest match. These are
set by the InsertNode() procedure. */
lson[N + 1], rson[N + 257], dad[N + 1]; /* left & right children &
parents -- These constitute binary search trees. */
} Tree;
static int file_get(LzssIO *r) {
return getc(r->io.file);
}
static int file_put(char c, LzssIO *w) {
return putc(c, w->io.file);
}
LzssIO *lzss_file_io(LzssIO *fio, FILE *f, size_t limit) {
fio->io.file = f;
fio->get = file_get;
fio->put = file_put;
fio->limit = limit;
return fio;
}
static int buffer_get(LzssIO *r) {
return r->io.buffer[r->count] & 0xFF;
}
static int buffer_put(char c, LzssIO *w) {
w->io.buffer[w->count] = c;
return 1;
}
LzssIO *lzss_buffer_io(LzssIO *bio, char *buffer, size_t buffer_len) {
bio->io.buffer = buffer;
bio->get = buffer_get;
bio->put = buffer_put;
bio->limit = buffer_len;
return bio;
}
static int getbyte(LzssIO *in) {
int c;
if(in->limit > 0 && in->count >= in->limit)
return EOF;
c = in->get(in);
if(c == EOF)
return EOF;
in->count++;
return c;
}
static int putbyte(char c, LzssIO *out) {
int v;
if(out->limit > 0 && out->count > out->limit)
return EOF;
v = out->put(c, out);
if(v == EOF)
return EOF;
out->count++;
return v;
}
static void InitTree(Tree *en) /* initialize trees */
{
int i;
/* For i = 0 to N - 1, rson[i] and lson[i] will be the right and
left children of node i. These nodes need not be initialized.
Also, dad[i] is the parent of node i. These are initialized to
NIL (= N), which stands for 'not used.'
For i = 0 to 255, rson[N + i + 1] is the root of the tree
for strings that begin with character i. These are initialized
to NIL. Note there are 256 trees. */
for (i = N + 1; i <= N + 256; i++)
en->rson[i] = NIL;
for (i = 0; i < N; i++)
en->dad[i] = NIL;
}
static void InsertNode(Tree *en, unsigned char *text_buf, int r)
/* Inserts string of length F, text_buf[r..r+F-1], into one of the
trees (text_buf[r]'th tree) and returns the longest-match position
and length via the global variables match_position and match_length.
If match_length = F, then removes the old node in favor of the new
one, because the old one will be deleted sooner.
Note r plays double role, as tree node and position in buffer. */
{
int i, p, cmp;
unsigned char *key;
cmp = 1; key = &text_buf[r]; p = N + 1 + key[0];
en->rson[r] = en->lson[r] = NIL; en->match_length = 0;
for ( ; ; ) {
if (cmp >= 0) {
if (en->rson[p] != NIL)
p = en->rson[p];
else {
en->rson[p] = r;
en->dad[r] = p;
return;
}
} else {
if (en->lson[p] != NIL)
p = en->lson[p];
else {
en->lson[p] = r;
en->dad[r] = p;
return;
}
}
for (i = 1; i < F; i++)
if ((cmp = key[i] - text_buf[p + i]) != 0)
break;
if (i > en->match_length) {
en->match_position = p;
if ((en->match_length = i) >= F)
break;
}
}
en->dad[r] = en->dad[p]; en->lson[r] = en->lson[p]; en->rson[r] = en->rson[p];
en->dad[en->lson[p]] = r; en->dad[en->rson[p]] = r;
if (en->rson[en->dad[p]] == p)
en->rson[en->dad[p]] = r;
else
en->lson[en->dad[p]] = r;
en->dad[p] = NIL; /* remove p */
}
static void DeleteNode(Tree *en, int p) /* deletes node p from tree */
{
int q;
if (en->dad[p] == NIL)
return; /* not in tree */
if (en->rson[p] == NIL)
q = en->lson[p];
else if (en->lson[p] == NIL)
q = en->rson[p];
else {
q = en->lson[p];
if (en->rson[q] != NIL) {
do {
q = en->rson[q];
} while (en->rson[q] != NIL);
en->rson[en->dad[q]] = en->lson[q];
en->dad[en->lson[q]] = en->dad[q];
en->lson[q] = en->lson[p];
en->dad[en->lson[p]] = q;
}
en->rson[q] = en->rson[p];
en->dad[en->rson[p]] = q;
}
en->dad[q] = en->dad[p];
if (en->rson[en->dad[p]] == p)
en->rson[en->dad[p]] = q;
else
en->lson[en->dad[p]] = q;
en->dad[p] = NIL;
}
size_t lzss_encode(LzssIO *in, LzssIO *out)
{
int i, c, len, r, s, last_match_length, code_buf_ptr;
unsigned char code_buf[17], mask;
Tree en;
unsigned char text_buf[N + F - 1]; /* ring buffer of size N,
with extra F-1 bytes to facilitate string comparison */
in->count = 0;
out->count = 0;
InitTree(&en); /* initialize trees */
code_buf[0] = 0; /* code_buf[1..16] saves eight units of code, and
code_buf[0] works as eight flags, "1" representing that the unit
is an unencoded letter (1 byte), "0" a position-and-length pair
(2 bytes). Thus, eight units require at most 16 bytes of code. */
code_buf_ptr = mask = 1;
s = 0; r = N - F;
for (i = s; i < r; i++) text_buf[i] = ' '; /* Clear the buffer with
any character that will appear often. */
for (len = 0; len < F && (c = getbyte(in)) != EOF; len++)
text_buf[r + len] = c; /* Read F bytes into the last F bytes of
the buffer */
if (len == 0)
return 0; /* text of size zero */
for (i = 1; i <= F; i++)
InsertNode(&en, text_buf, r - i); /* Insert the F strings,
each of which begins with one or more 'space' characters. Note
the order in which these strings are inserted. This way,
degenerate trees will be less likely to occur. */
InsertNode(&en, text_buf, r); /* Finally, insert the whole string just read. The
global variables match_length and match_position are set. */
do {
if (en.match_length > len)
en.match_length = len; /* match_length
may be spuriously long near the end of text. */
if (en.match_length <= THRESHOLD) {
en.match_length = 1; /* Not long enough match. Send one byte. */
code_buf[0] |= mask; /* 'send one byte' flag */
code_buf[code_buf_ptr++] = text_buf[r]; /* Send uncoded. */
} else {
code_buf[code_buf_ptr++] = (unsigned char) en.match_position;
code_buf[code_buf_ptr++] = (unsigned char)
(((en.match_position >> 4) & 0xf0)
| (en.match_length - (THRESHOLD + 1))); /* Send position and
length pair. Note match_length > THRESHOLD. */
}
if ((mask <<= 1) == 0) { /* Shift mask left one bit. */
for (i = 0; i < code_buf_ptr; i++) /* Send at most 8 units of */
if(putbyte(code_buf[i], out) == EOF) /* code together */
return 0;
code_buf[0] = 0; code_buf_ptr = mask = 1;
}
last_match_length = en.match_length;
for (i = 0; i < last_match_length &&
(c = getbyte(in)) != EOF; i++) {
DeleteNode(&en, s); /* Delete old strings and */
text_buf[s] = c; /* read new bytes */
if (s < F - 1)
text_buf[s + N] = c; /* If the position is
near the end of buffer, extend the buffer to make
string comparison easier. */
s = (s + 1) & (N - 1); r = (r + 1) & (N - 1);
/* Since this is a ring buffer, increment the position
modulo N. */
InsertNode(&en, text_buf, r); /* Register the string in text_buf[r..r+F-1] */
}
while (i++ < last_match_length) { /* After the end of text, */
DeleteNode(&en, s); /* no need to read, but */
s = (s + 1) & (N - 1);
r = (r + 1) & (N - 1);
if (--len)
InsertNode(&en, text_buf, r); /* buffer may not be empty. */
}
} while (len > 0); /* until length of string to be processed is zero */
if (code_buf_ptr > 1) { /* Send remaining code. */
for (i = 0; i < code_buf_ptr; i++)
if(putbyte(code_buf[i], out) == EOF)
return 0;
}
return out->count;
}
size_t lzss_decode(LzssIO *in, LzssIO *out) /* Just the reverse of lzss_encode(). */
{
int i, j, k, r, c;
unsigned int flags;
unsigned char text_buf[N + F - 1]; /* ring buffer of size N,
with extra F-1 bytes to facilitate string comparison */
in->count = 0;
out->count = 0;
for (i = 0; i < N - F; i++) text_buf[i] = ' ';
r = N - F; flags = 0;
for ( ; ; ) {
if (((flags >>= 1) & 256) == 0) {
if ((c = getbyte(in)) == EOF)
break;
flags = c | 0xff00; /* uses higher byte cleverly */
} /* to count eight */
if (flags & 1) {
if ((c = getbyte(in)) == EOF)
break;
if(putbyte(c, out) == EOF)
return 0;
text_buf[r++] = c;
r &= (N - 1);
} else {
if ((i = getbyte(in)) == EOF)
break;
if ((j = getbyte(in)) == EOF)
break;
i |= ((j & 0xf0) << 4);
j = (j & 0x0f) + THRESHOLD;
for (k = 0; k <= j; k++) {
c = text_buf[(i + k) & (N - 1)];
if(putbyte(c, out) == EOF)
return 0;
text_buf[r++] = c;
r &= (N - 1);
}
}
}
return out->count;
}
#endif /* defined(LZSS_IMPLEMENTATION) */
#if defined(LZSS_EXEC)
int main(int argc, char *argv[])
{
char *s;
LzssIO in, out;
FILE *infile, *outfile;
size_t bytes;
if (argc != 4) {
printf("'lzss e file1 file2' encodes file1 into file2.\n"
"'lzss d file2 file1' decodes file2 into file1.\n");
return EXIT_FAILURE;
}
if ((s = argv[1], s[1] || strpbrk(s, "DEde") == NULL)
|| (s = argv[2], (infile = fopen(s, "rb")) == NULL)
|| (s = argv[3], (outfile = fopen(s, "wb")) == NULL)) {
printf("??? %s\n", s);
return EXIT_FAILURE;
}
lzss_file_io(&in, infile, 0);
lzss_file_io(&out, outfile, 0);
if (toupper(*argv[1]) == 'E') {
bytes = lzss_encode(&in, &out);
printf("%lu bytes encoded\n", (unsigned long)bytes);
} else {
bytes = lzss_decode(&in, &out);
printf("%lu bytes decoded\n", (unsigned long)bytes);
}
fclose(infile);
fclose(outfile);
return EXIT_SUCCESS;
}
#endif /* defined(LZSS_EXEC) */
#define LZSS_IMPLEMENTATION
#include "lzss.c"
int main(int argc, char *argv[]) {
LzssIO br;
LzssIO bw;
char *inbuffer = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Cras viverra, mauris fringilla aliquam mollis, odio enim semper lacus, aliquet pharetra enim sem quis dolor. Sed dictum quam eu urna fringilla, at rhoncus metus rhoncus. Nulla facilisi. Nulla sodales erat eget egestas aliquam. Aenean ac venenatis ante. Praesent quis sem eu odio lacinia sodales aliquet quis leo. Ut blandit, mauris ut euismod vehicula, eros purus vulputate diam, eget congue ipsum felis non massa. Nullam dictum aliquet dolor vitae facilisis.\n\n"
"Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Duis venenatis finibus velit vitae tincidunt. Integer placerat nunc sit amet nibh lobortis convallis. Ut varius dui sit amet nibh dapibus, luctus blandit mi efficitur. Morbi vel erat ultrices, ultricies nisi eget, auctor leo. Sed scelerisque porta finibus. Aliquam neque ante, congue eget magna vel, venenatis ornare lectus. Proin scelerisque, nibh consectetur sagittis commodo, orci ipsum viverra enim, non cursus urna libero ut odio. Cras et tristique tortor, aliquet laoreet augue. Curabitur porttitor tortor vel lobortis gravida. Nunc vitae sapien urna. Donec erat turpis, efficitur vulputate quam non, sodales commodo elit. Quisque a orci at est varius auctor eu ut sapien. Integer pretium elit vel nisl sollicitudin, et dignissim ipsum vulputate. Aenean sed dui aliquet, aliquam massa eget, ultrices turpis.\n\n"
"Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Nunc ut ante pulvinar, venenatis eros a, mollis dui. Fusce commodo finibus erat id sollicitudin. Maecenas gravida purus non enim porta tincidunt. Nulla bibendum congue tincidunt. Maecenas vulputate diam enim, molestie viverra diam volutpat eget. Donec quis dictum metus. Cras semper tincidunt enim ac blandit.";
char outbuffer1[2048], outbuffer2[2048];
size_t bytes = strlen(inbuffer) + 1; /* remember the '\0' terminator */
printf("The input buffer is %lu bytes\n", (unsigned long)bytes);
lzss_buffer_io(&br, inbuffer, bytes);
lzss_buffer_io(&bw, outbuffer1, sizeof outbuffer1);
bytes = lzss_encode(&br, &bw);
printf("It was compressed to %lu bytes\n", (unsigned long)bytes);
lzss_buffer_io(&br, outbuffer1, bytes);
lzss_buffer_io(&bw, outbuffer2, sizeof outbuffer2);
bytes = lzss_decode(&br, &bw);
printf("It was decompressed to %lu bytes\n", (unsigned long)bytes);
printf("Payload:\n%s\n", outbuffer2);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment