Skip to content

Instantly share code, notes, and snippets.

@daskol
Created February 10, 2024 23:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save daskol/9104d676641b9b0f5da628cad1a259f1 to your computer and use it in GitHub Desktop.
Save daskol/9104d676641b9b0f5da628cad1a259f1 to your computer and use it in GitHub Desktop.
Philip Gage's original code of Byte-Pair Encoding (BPE) compression technique.
/* compress.c */
/* Copyright 1994 by Philip Gage */
#include <stdio.h>
#define BLOCKSIZE 5000 /* Maximum block size */
#define HASHSIZE 4096 /* Size of hash table */
#define MAXCHARS 200 /* Char set per block */
#define THRESHOLD 3 /* Minimum pair count */
unsigned char buffer[BLOCKSIZE]; /* Data block */
unsigned char leftcode[256]; /* Pair table */
unsigned char rightcode[256]; /* Pair table */
unsigned char left[HASHSIZE]; /* Hash table */
unsigned char right[HASHSIZE]; /* Hash table */
unsigned char count[HASHSIZE]; /* Pair count */
int size; /* Size of current data block */
/* Function prototypes */
int lookup(unsigned char, unsigned char);
int fileread(FILE *);
void filewrite(FILE *);
void compress(FILE *, FILE *);
/* Return index of character pair in hash table */
/* Deleted nodes have count of 1 for hashing */
int lookup(unsigned char a, unsigned char b) {
int index;
/* Compute hash key from both characters */
index = (a ^ (b << 5)) & (HASHSIZE - 1);
/* Search for pair or first empty slot */
while ((left[index] != a || right[index] != b) && count[index] != 0)
index = (index + 1) & (HASHSIZE - 1);
/* Store pair in table */
left[index] = a;
right[index] = b;
return index;
}
/* Read next block from input file into buffer */
int fileread(FILE *input) {
int c, index, used = 0;
/* Reset hash table and pair table */
for (c = 0; c < HASHSIZE; c++)
count[c] = 0;
for (c = 0; c < 256; c++) {
leftcode[c] = c;
rightcode[c] = 0;
}
size = 0;
/* Read data until full or few unused chars */
while (size < BLOCKSIZE && used < MAXCHARS && (c = getc(input)) != EOF) {
if (size > 0) {
index = lookup(buffer[size - 1], c);
if (count[index] < 255)
++count[index];
}
buffer[size++] = c;
/* Use rightcode to flag data chars found */
if (!rightcode[c]) {
rightcode[c] = 1;
used++;
}
}
return c == EOF;
}
/* Write each pair table and data block to output */
void filewrite(FILE *output) {
int i, len, c = 0;
/* For each character 0..255 */
while (c < 256) {
/* If not a pair code, count run of literals */
if (c == leftcode[c]) {
len = 1;
c++;
while (len < 127 && c < 256 && c == leftcode[c]) {
len++;
c++;
}
putc(len + 127, output);
len = 0;
if (c == 256)
break;
}
/* Else count run of pair codes */
else {
len = 0;
c++;
while (len < 127 && c < 256 && c != leftcode[c] ||
len < 125 && c < 254 && c + 1 != leftcode[c + 1]) {
len++;
c++;
}
putc(len, output);
c -= len + 1;
}
/* Write range of pairs to output */
for (i = 0; i <= len; i++) {
putc(leftcode[c], output);
if (c != leftcode[c])
putc(rightcode[c], output);
c++;
}
}
/* Write size bytes and compressed data block */
putc(size / 256, output);
putc(size % 256, output);
fwrite(buffer, size, 1, output);
}
/* Compress from input file to output file */
void compress(FILE *infile, FILE *outfile) {
int leftch, rightch, code, oldsize;
int index, r, w, best, done = 0;
/* Compress each data block until end of file */
while (!done) {
done = fileread(infile);
code = 256;
/* Compress this block */
for (;;) {
/* Get next unused char for pair code */
for (code--; code >= 0; code--)
if (code == leftcode[code] && !rightcode[code])
break;
/* Must quit if no unused chars left */
if (code < 0)
break;
/* Find most frequent pair of chars */
for (best = 2, index = 0; index < HASHSIZE; index++)
if (count[index] > best) {
best = count[index];
leftch = left[index];
rightch = right[index];
}
/* Done if no more compression possible */
if (best < THRESHOLD)
break;
/* Replace pairs in data, adjust pair counts */
oldsize = size - 1;
for (w = 0, r = 0; r < oldsize; r++) {
if (buffer[r] == leftch && buffer[r + 1] == rightch) {
if (r > 0) {
index = lookup(buffer[w - 1], leftch);
if (count[index] > 1)
--count[index];
index = lookup(buffer[w - 1], code);
if (count[index] < 255)
++count[index];
}
if (r < oldsize - 1) {
index = lookup(rightch, buffer[r + 2]);
if (count[index] > 1)
--count[index];
index = lookup(code, buffer[r + 2]);
if (count[index] < 255)
++count[index];
}
buffer[w++] = code;
r++;
size--;
} else
buffer[w++] = buffer[r];
}
buffer[w] = buffer[r];
/* Add to pair substitution table */
leftcode[code] = leftch;
rightcode[code] = rightch;
/* Delete pair from hash table */
index = lookup(leftch, rightch);
count[index] = 1;
}
filewrite(outfile);
}
}
void main(int argc, char *argv[]) {
FILE *infile, *outfile;
if (argc != 3)
printf("Usage: compress infile outfile\n");
else if ((infile = fopen(argv[1], "rb")) == NULL)
printf("Error opening input %s\n", argv[1]);
else if ((outfile = fopen(argv[2], "wb")) == NULL)
printf("Error opening output %s\n", argv[2]);
else {
compress(infile, outfile);
fclose(outfile);
fclose(infile);
}
}
/*End of File */
/* expand.c */
/* Copyright 1994 by Philip Gage */
#include <stdio.h>
/* Decompress data from input to output */
void expand(FILE *input, FILE *output) {
unsigned char left[256], right[256], stack[30];
short int c, count, i, size;
/* Unpack each block until end of file */
while ((count = getc(input)) != EOF) {
/* Set left to itself as literal flag */
for (i = 0; i < 256; i++)
left[i] = i;
/* Read pair table */
for (c = 0;;) {
/* Skip range of literal bytes */
if (count > 127) {
c += count - 127;
count = 0;
}
if (c == 256)
break;
/* Read pairs, skip right if literal */
for (i = 0; i <= count; i++, c++) {
left[c] = getc(input);
if (c != left[c])
right[c] = getc(input);
}
if (c == 256)
break;
count = getc(input);
}
/* Calculate packed data block size */
size = 256 * getc(input) + getc(input);
/* Unpack data block */
for (i = 0;;) {
/* Pop byte from stack or read byte */
if (i)
c = stack[--i];
else {
if (!size--)
break;
c = getc(input);
}
/* Output byte or push pair on stack */
if (c == left[c])
putc(c, output);
else {
stack[i++] = right[c];
stack[i++] = left[c];
}
}
}
}
void main(int argc, char *argv[]) {
FILE *infile, *outfile;
if (argc != 3)
printf("Usage: expand infile outfile\n");
else if ((infile = fopen(argv[1], "rb")) == NULL)
printf("Error opening input %s\n", argv[1]);
else if ((outfile = fopen(argv[2], "wb")) == NULL)
printf("Error opening output %s\n", argv[2]);
else {
expand(infile, outfile);
fclose(outfile);
fclose(infile);
}
}
/* End of File */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment