Created
February 10, 2024 23:17
-
-
Save daskol/9104d676641b9b0f5da628cad1a259f1 to your computer and use it in GitHub Desktop.
Philip Gage's original code of Byte-Pair Encoding (BPE) compression technique.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* compress.c */ | |
/* Copyright 1994 by Philip Gage */ | |
#include <stdio.h> | |
#define BLOCKSIZE 5000 /* Maximum block size */ | |
#define HASHSIZE 4096 /* Size of hash table */ | |
#define MAXCHARS 200 /* Char set per block */ | |
#define THRESHOLD 3 /* Minimum pair count */ | |
unsigned char buffer[BLOCKSIZE]; /* Data block */ | |
unsigned char leftcode[256]; /* Pair table */ | |
unsigned char rightcode[256]; /* Pair table */ | |
unsigned char left[HASHSIZE]; /* Hash table */ | |
unsigned char right[HASHSIZE]; /* Hash table */ | |
unsigned char count[HASHSIZE]; /* Pair count */ | |
int size; /* Size of current data block */ | |
/* Function prototypes */ | |
int lookup(unsigned char, unsigned char); | |
int fileread(FILE *); | |
void filewrite(FILE *); | |
void compress(FILE *, FILE *); | |
/* Return index of character pair in hash table */ | |
/* Deleted nodes have count of 1 for hashing */ | |
int lookup(unsigned char a, unsigned char b) { | |
int index; | |
/* Compute hash key from both characters */ | |
index = (a ^ (b << 5)) & (HASHSIZE - 1); | |
/* Search for pair or first empty slot */ | |
while ((left[index] != a || right[index] != b) && count[index] != 0) | |
index = (index + 1) & (HASHSIZE - 1); | |
/* Store pair in table */ | |
left[index] = a; | |
right[index] = b; | |
return index; | |
} | |
/* Read next block from input file into buffer */ | |
int fileread(FILE *input) { | |
int c, index, used = 0; | |
/* Reset hash table and pair table */ | |
for (c = 0; c < HASHSIZE; c++) | |
count[c] = 0; | |
for (c = 0; c < 256; c++) { | |
leftcode[c] = c; | |
rightcode[c] = 0; | |
} | |
size = 0; | |
/* Read data until full or few unused chars */ | |
while (size < BLOCKSIZE && used < MAXCHARS && (c = getc(input)) != EOF) { | |
if (size > 0) { | |
index = lookup(buffer[size - 1], c); | |
if (count[index] < 255) | |
++count[index]; | |
} | |
buffer[size++] = c; | |
/* Use rightcode to flag data chars found */ | |
if (!rightcode[c]) { | |
rightcode[c] = 1; | |
used++; | |
} | |
} | |
return c == EOF; | |
} | |
/* Write each pair table and data block to output */ | |
void filewrite(FILE *output) { | |
int i, len, c = 0; | |
/* For each character 0..255 */ | |
while (c < 256) { | |
/* If not a pair code, count run of literals */ | |
if (c == leftcode[c]) { | |
len = 1; | |
c++; | |
while (len < 127 && c < 256 && c == leftcode[c]) { | |
len++; | |
c++; | |
} | |
putc(len + 127, output); | |
len = 0; | |
if (c == 256) | |
break; | |
} | |
/* Else count run of pair codes */ | |
else { | |
len = 0; | |
c++; | |
while (len < 127 && c < 256 && c != leftcode[c] || | |
len < 125 && c < 254 && c + 1 != leftcode[c + 1]) { | |
len++; | |
c++; | |
} | |
putc(len, output); | |
c -= len + 1; | |
} | |
/* Write range of pairs to output */ | |
for (i = 0; i <= len; i++) { | |
putc(leftcode[c], output); | |
if (c != leftcode[c]) | |
putc(rightcode[c], output); | |
c++; | |
} | |
} | |
/* Write size bytes and compressed data block */ | |
putc(size / 256, output); | |
putc(size % 256, output); | |
fwrite(buffer, size, 1, output); | |
} | |
/* Compress from input file to output file */ | |
void compress(FILE *infile, FILE *outfile) { | |
int leftch, rightch, code, oldsize; | |
int index, r, w, best, done = 0; | |
/* Compress each data block until end of file */ | |
while (!done) { | |
done = fileread(infile); | |
code = 256; | |
/* Compress this block */ | |
for (;;) { | |
/* Get next unused char for pair code */ | |
for (code--; code >= 0; code--) | |
if (code == leftcode[code] && !rightcode[code]) | |
break; | |
/* Must quit if no unused chars left */ | |
if (code < 0) | |
break; | |
/* Find most frequent pair of chars */ | |
for (best = 2, index = 0; index < HASHSIZE; index++) | |
if (count[index] > best) { | |
best = count[index]; | |
leftch = left[index]; | |
rightch = right[index]; | |
} | |
/* Done if no more compression possible */ | |
if (best < THRESHOLD) | |
break; | |
/* Replace pairs in data, adjust pair counts */ | |
oldsize = size - 1; | |
for (w = 0, r = 0; r < oldsize; r++) { | |
if (buffer[r] == leftch && buffer[r + 1] == rightch) { | |
if (r > 0) { | |
index = lookup(buffer[w - 1], leftch); | |
if (count[index] > 1) | |
--count[index]; | |
index = lookup(buffer[w - 1], code); | |
if (count[index] < 255) | |
++count[index]; | |
} | |
if (r < oldsize - 1) { | |
index = lookup(rightch, buffer[r + 2]); | |
if (count[index] > 1) | |
--count[index]; | |
index = lookup(code, buffer[r + 2]); | |
if (count[index] < 255) | |
++count[index]; | |
} | |
buffer[w++] = code; | |
r++; | |
size--; | |
} else | |
buffer[w++] = buffer[r]; | |
} | |
buffer[w] = buffer[r]; | |
/* Add to pair substitution table */ | |
leftcode[code] = leftch; | |
rightcode[code] = rightch; | |
/* Delete pair from hash table */ | |
index = lookup(leftch, rightch); | |
count[index] = 1; | |
} | |
filewrite(outfile); | |
} | |
} | |
void main(int argc, char *argv[]) { | |
FILE *infile, *outfile; | |
if (argc != 3) | |
printf("Usage: compress infile outfile\n"); | |
else if ((infile = fopen(argv[1], "rb")) == NULL) | |
printf("Error opening input %s\n", argv[1]); | |
else if ((outfile = fopen(argv[2], "wb")) == NULL) | |
printf("Error opening output %s\n", argv[2]); | |
else { | |
compress(infile, outfile); | |
fclose(outfile); | |
fclose(infile); | |
} | |
} | |
/*End of File */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* expand.c */ | |
/* Copyright 1994 by Philip Gage */ | |
#include <stdio.h> | |
/* Decompress data from input to output */ | |
void expand(FILE *input, FILE *output) { | |
unsigned char left[256], right[256], stack[30]; | |
short int c, count, i, size; | |
/* Unpack each block until end of file */ | |
while ((count = getc(input)) != EOF) { | |
/* Set left to itself as literal flag */ | |
for (i = 0; i < 256; i++) | |
left[i] = i; | |
/* Read pair table */ | |
for (c = 0;;) { | |
/* Skip range of literal bytes */ | |
if (count > 127) { | |
c += count - 127; | |
count = 0; | |
} | |
if (c == 256) | |
break; | |
/* Read pairs, skip right if literal */ | |
for (i = 0; i <= count; i++, c++) { | |
left[c] = getc(input); | |
if (c != left[c]) | |
right[c] = getc(input); | |
} | |
if (c == 256) | |
break; | |
count = getc(input); | |
} | |
/* Calculate packed data block size */ | |
size = 256 * getc(input) + getc(input); | |
/* Unpack data block */ | |
for (i = 0;;) { | |
/* Pop byte from stack or read byte */ | |
if (i) | |
c = stack[--i]; | |
else { | |
if (!size--) | |
break; | |
c = getc(input); | |
} | |
/* Output byte or push pair on stack */ | |
if (c == left[c]) | |
putc(c, output); | |
else { | |
stack[i++] = right[c]; | |
stack[i++] = left[c]; | |
} | |
} | |
} | |
} | |
void main(int argc, char *argv[]) { | |
FILE *infile, *outfile; | |
if (argc != 3) | |
printf("Usage: expand infile outfile\n"); | |
else if ((infile = fopen(argv[1], "rb")) == NULL) | |
printf("Error opening input %s\n", argv[1]); | |
else if ((outfile = fopen(argv[2], "wb")) == NULL) | |
printf("Error opening output %s\n", argv[2]); | |
else { | |
expand(infile, outfile); | |
fclose(outfile); | |
fclose(infile); | |
} | |
} | |
/* End of File */ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment