Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mohamed-ennahdi/e1c44eb1a3d4830cddb5 to your computer and use it in GitHub Desktop.
Save mohamed-ennahdi/e1c44eb1a3d4830cddb5 to your computer and use it in GitHub Desktop.
//*******************************************************************
// File: HuffmanEncoding.c
// Author(s): Mohamed Ennahdi El Idrissi
// Date: 14-Aug-2012
//
// Input Files: in.txt
// Output Files: out.txt
// Description: CSC 2302 - <Data Structures>
// <Struct, Array, File I/O, Recursion, Pointers, Binary Tree>
// This program covers the Huffman Encoding concept.
// We first read a file, from we which we count the number of characters, and then reckon the frequency
// of each letter individually. Each letter's frequency is stored in a node with its respective character.
// This node is stored in an array of 26 elements (element 0 -> 'A', element 1 -> 'B'...element 25 -> 'Z').
// Each element is a pointer, and each pointer is supposed to be a root of a tree (sub tree).
// After processing all characters of the text (read from a file), we end up with an array with
// 25 NULL elements. The only element that is not NULL is the root of the tree that gathers the different
// nodes of each letter.
// Deducing the encoding of each letter if performed with intermediary of the prefix traversal.
// To summarize, the pseudo-code is:
// - Initialize the letters array
// - Read the output file
// - Increment each letter frequency + compute the number of characters in the file
// - Store in the array's node the frequency of each letter
// - Compute the number (N) of involved characters (Sometimes, texts don't include all letters. In our case 'Q' and 'Z' are absent).
// - Loop N times
// - find Minimum and second minimum
// - create a new node, its left child contains the minimum and the right child contains the second minimum
// - minimum position points on the new node, and the second minimum's array position points on NULL
// - Browse the array till the unique non NULL element is encountered
// - invoke prefix traversal function
// - build the encoding of each character
// - display the letter and its characteristics when found.
// - Finally, read the output file to interpret its content
// - if root contains a character (A - Z), display character
// - else, if the current character is '0', browse the left leaf
// - else, if the current character is '1', browse the right leaf
//
//*******************************************************************
#include <stdio.h>
#define NBR_OF_LETTERS 26
#define LEFT 'L'
#define RIGHT 'R'
#define CODE_SIZE 128
#define TYPED_ALLOC(type) (type *) malloc( sizeof(type) )
#define BYTE_SIZE 8
#define IN_PATH "./files/in.txt"
#define OUT_PATH "./files/out.txt"
typedef struct tree_node_s {
float frequency;
char c;
char code[CODE_SIZE];
struct tree_node_s *left;
struct tree_node_s *right;
} tree_node_t;
tree_node_t *arr[NBR_OF_LETTERS], *letters[NBR_OF_LETTERS];
void findMinAndSecondMin(tree_node_t **, float *, int *, float *, int *);
void printTree(tree_node_t *);
void interpret(char *, int *, tree_node_t *);
void printTree(tree_node_t *);
void encode(tree_node_t *, tree_node_t **, char, short, char*);
/*
*
*/
int main() {
char str[CODE_SIZE];
int fileReadingVerdict;
int i, j, k, index, n;
float min, secondMin;
int minIndex, secondMinIndex;
int numberOfCharacters = 0;
tree_node_t *tree;
FILE *in = fopen(IN_PATH, "r");
FILE *out;
if ( in == NULL ) {
printf("\nFile not found");
return 0;
} else {
/*
* Begin: Array Initialization
*/
for (i = 'A'; i <= 'Z'; i++) {
index = i - 'A';
arr[index] = NULL;
}
/*
* End: Array Initialization
*/
numberOfCharacters = 0;
fileReadingVerdict = fgets(str, CODE_SIZE, in) != NULL;
while(!feof(in) || fileReadingVerdict) {
n = strlen(str);
printf("\n%s", str);
for (i = 0; i < n ; i ++ ) {
str[i] = toupper(str[i]);
if (str[i] >= 'A' && str[i] <= 'Z') {
numberOfCharacters ++;
index = str[i] - 'A';
if (arr[index] == NULL) {
arr[index] = TYPED_ALLOC(tree_node_t);// malloc(sizeof(tree_node_t));
arr[index]->c = str[i];
arr[index]->frequency = 1;
arr[index]->left = arr[index]->right = NULL;
} else {
arr[index]->frequency += 1;
}
}
}
if (fileReadingVerdict) {
fileReadingVerdict = fgets(str, CODE_SIZE, in) != NULL;
}
}
}
fclose(in);
for ( i = 0, n = 0 ; i < NBR_OF_LETTERS ; i ++ ) {
letters[i] = arr[i];
if (arr[i] != NULL) {
arr[i]->frequency /= numberOfCharacters; // Computing the frequency.
n ++; // n is the number of involved letters which is going to be consumed in the do while loop's condition
}
}
j = 1;
do {
findMinAndSecondMin(arr, &min, &minIndex, &secondMin, &secondMinIndex);
if (minIndex != -1 && secondMinIndex != -1 && minIndex != secondMinIndex) {
tree_node_t *temp;
tree = TYPED_ALLOC(tree_node_t);// malloc(sizeof(tree_node_t));
tree->frequency = arr[minIndex]->frequency + arr[secondMinIndex]->frequency;
tree->c = j;
tree->left = arr[minIndex];
temp = TYPED_ALLOC(tree_node_t);// malloc(sizeof(tree_node_t));
temp->c = arr[secondMinIndex]->c;
temp->frequency = arr[secondMinIndex]->frequency;
temp->left = arr[secondMinIndex]->left;
temp->right = arr[secondMinIndex]->right;
tree->right = temp;
arr[minIndex] = tree;
arr[secondMinIndex] = NULL;
}
j ++;
} while( j < n );
for ( i = 0 ; i < NBR_OF_LETTERS ; i ++ ) {
if (arr[i] != NULL) {
char code[CODE_SIZE];
strcpy(code, "");
encode(tree = arr[i], letters, 0, 0, code);
puts("\nSuccessful encoding");
printTree(arr[i]);
break;
}
}
in = fopen(IN_PATH, "r");
out = fopen(OUT_PATH, "w");
fileReadingVerdict = fgets(str, CODE_SIZE, in) != NULL;
while(!feof(in) || fileReadingVerdict) {
n = strlen(str);
for (i = 0; i < n ; i ++ ) {
str[i] = toupper(str[i]);
if (str[i] >= 'A' && str[i] <= 'Z') {
index = str[i] - 'A';
fputs(letters[index]->code, out);
}
}
if (fileReadingVerdict) {
fileReadingVerdict = fgets(str, CODE_SIZE, in) != NULL;
}
}
fclose(in);
fclose(out);
printf("\nFile size (only letters) of the input file: %d bits", numberOfCharacters * BYTE_SIZE);
out = fopen(OUT_PATH, "r");
fileReadingVerdict = fgets(str, CODE_SIZE, out) != NULL;
numberOfCharacters = 0;
while(!feof(out) || fileReadingVerdict) {
numberOfCharacters += strlen(str);
if (fileReadingVerdict) {
fileReadingVerdict = fgets(str, CODE_SIZE, out) != NULL;
}
}
fclose(out);
printf("\nFile size of the output file: %d bits", numberOfCharacters);
printf("\nInterpreting output file:\n");
out = fopen(OUT_PATH, "r");
fileReadingVerdict = fgets(str, CODE_SIZE, out) != NULL;
while(!feof(out) || fileReadingVerdict) {
n = strlen(str);
i = 0 ;
while(i < n) {
interpret(str, &i, tree);
}
if (fileReadingVerdict) {
fileReadingVerdict = fgets(str, CODE_SIZE, out) != NULL;
}
}
fclose(out);
puts("\n");
return 0;
}
/*
*
*/
void encode(tree_node_t *node, tree_node_t **letters, char direction, short level, char* code) {
int n;
if ( node != NULL ) {
if ((n = strlen(code)) < level) {
if (direction == RIGHT) {
strcat(code, "1");
} else {
if (direction == LEFT) {
strcat(code, "0");
}
}
} else {
if (n >= level) {
code[n - (n - level) - 1] = 0;
if (direction == RIGHT) {
strcat(code, "1");
} else {
if (direction == LEFT) {
strcat(code, "0");
}
}
}
}
if (node->c >= 'A' && node->c <= 'Z') {
strcpy(node->code, code);
strcpy(letters[node->c - 'A']->code, code);
}
encode(node->left, letters, LEFT, level + 1, code);
encode(node->right, letters, RIGHT, level + 1, code);
}
}
void printTree(tree_node_t *node) {
int n;
if ( node != NULL ) {
if (node->c >= 'A' && node->c <= 'Z') {
printf("\t%c - frequency: %.10f\tencoding: %s\n", node->c, node->frequency, node->code);
}
printTree(node->left);
printTree(node->right);
}
}
/*
* Begin: Minimum and second minimum
*/
void findMinAndSecondMin(tree_node_t *arr[], float *min, int *minIndex, float *secondMin, int *secondMinIndex) {
int i, k;
k = 0;
*minIndex = -1;
/*
* Skipping all the NULL elements.
*/
while (k < NBR_OF_LETTERS && arr[k] == NULL) k++;
*minIndex = k;
*min = arr[k]->frequency;
for ( i = k ; i < NBR_OF_LETTERS; i ++ ) {
if ( arr[i] != NULL && arr[i]->frequency < *min ) {
*min = arr[i]->frequency;
*minIndex = i;
}
}
k = 0;
*secondMinIndex = -1;
/*
* Skipping all the NULL elements.
*/
while ((k < NBR_OF_LETTERS && arr[k] == NULL) || (k == *minIndex && arr[k] != NULL)) k++;
*secondMin = arr[k]->frequency;
*secondMinIndex = k;
if (k == *minIndex) k ++;
for ( i = k ; i < NBR_OF_LETTERS; i ++ ) {
if ( arr[i] != NULL && arr[i]->frequency < *secondMin && i != *minIndex ) {
*secondMin = arr[i]->frequency;
*secondMinIndex = i;
}
}
/*
* End: Minimum and second minimum
*/
}
void interpret(char *str, int *index, tree_node_t *tree) {
int n = strlen(str);
if (tree->c >= 'A' && tree->c <= 'Z') {
printf("%c ", tree->c);
return ;
} else {
if ( *index < n ) {
if (str[*index] == '0') {
(*index) ++;
interpret(str, index, tree->left);
} else {
if (str[*index] == '1') {
(*index) ++;
interpret(str, index, tree->right);
}
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment