mohamed-ennahdi/HuffmanEncoding_FromMarkAllenWeissBook.c

## HuffmanEncoding_FromMarkAllenWeissBook.c
//*******************************************************************
// File:			HuffmanEncoding.c
// Author(s):		Mohamed Ennahdi El Idrissi
// Date:  			14-Aug-2012
//
// Input Files:		in.txt
// Output Files:	out.txt
// Description:  	CSC 2302 - <Data Structures>
//							   <Struct, Array, File I/O, Recursion, Pointers, Binary Tree>
//					This program covers the Huffman Encoding concept.
//					We first read a file, from we which we count the number of characters, and then reckon the frequency
//					of each letter individually. Each letter's frequency is stored in a node with its respective character.
//					This node is stored in an array of 26 elements (element 0 -> 'A', element 1 -> 'B'...element 25 -> 'Z').
//					Each element is a pointer, and each pointer is supposed to be a root of a tree (sub tree).
//					After processing all characters of the text (read from a file), we end up with an array with
//					25 NULL elements. The only element that is not NULL is the root of the tree that gathers the different
//					nodes of each letter.
//					Deducing the encoding of each letter if performed with intermediary of the prefix traversal.
//					To summarize, the pseudo-code is:
//						- Initialize the letters array
//						- Read the output file
//							- Increment each letter frequency + compute the number of characters in the file
//							- Store in the array's node the frequency of each letter
//						- Compute the number (N) of involved characters (Sometimes, texts don't include all letters. In our case 'Q' and 'Z' are absent).
//						- Loop N times
//							- find Minimum and second minimum
//							- create a new node, its left child contains the minimum and the right child contains the second minimum
//							- minimum position points on the new node, and the second minimum's array position points on NULL
//						- Browse the array till the unique non NULL element is encountered
//							- invoke prefix traversal function
//								- build the encoding of each character
//								- display the letter and its characteristics when found.
//						- Finally, read the output file to interpret its content
//							- if root contains a character (A - Z), display character
//							- else, if the current character is '0', browse the left leaf
//							- else, if the current character is '1', browse the right leaf
//
//*******************************************************************
#include <stdio.h>

#define NBR_OF_LETTERS 26

#define LEFT 'L'
#define RIGHT 'R'

#define CODE_SIZE 128

#define TYPED_ALLOC(type) (type *) malloc( sizeof(type) )
#define BYTE_SIZE 8

#define IN_PATH "./files/in.txt"
#define OUT_PATH "./files/out.txt"

typedef struct tree_node_s {
	float frequency;
	char c;
	char code[CODE_SIZE];
	struct tree_node_s *left;
	struct tree_node_s *right;
} tree_node_t;

tree_node_t *arr[NBR_OF_LETTERS], *letters[NBR_OF_LETTERS];


void findMinAndSecondMin(tree_node_t **, float *, int *, float *, int *);
void printTree(tree_node_t *);
void interpret(char *, int *, tree_node_t *);
void printTree(tree_node_t *);
void encode(tree_node_t *, tree_node_t **, char, short, char*);

/*
 *
 */
int main() {
	char str[CODE_SIZE];
	int fileReadingVerdict;
	int i, j, k, index, n;
	float min, secondMin;
	int minIndex, secondMinIndex;
	int numberOfCharacters = 0;
	tree_node_t *tree;
	FILE *in = fopen(IN_PATH, "r");
	FILE *out;
	if ( in == NULL ) {
		printf("\nFile not found");
		return 0;
	} else {
		/*
		 *	Begin: Array Initialization
		 */
		for (i = 'A'; i <= 'Z'; i++) {
			index = i - 'A';
			arr[index] = NULL;
		}
		/*
		 *	End:	Array Initialization
		 */
		numberOfCharacters = 0;
		fileReadingVerdict = fgets(str, CODE_SIZE, in) != NULL;
		while(!feof(in) || fileReadingVerdict) {
			n = strlen(str);
			printf("\n%s", str);
			for (i = 0; i < n ; i ++ ) {
				str[i] = toupper(str[i]);
				if (str[i] >= 'A' && str[i] <= 'Z') {
					numberOfCharacters ++;
					index = str[i] - 'A';
					if (arr[index] == NULL) {
						arr[index] = TYPED_ALLOC(tree_node_t);// malloc(sizeof(tree_node_t));
						arr[index]->c = str[i];
						arr[index]->frequency = 1;
						arr[index]->left = arr[index]->right = NULL;
					} else {
						arr[index]->frequency += 1;
					}
				}
			}
			if (fileReadingVerdict) {
				fileReadingVerdict = fgets(str, CODE_SIZE, in) != NULL;
			}
		}
	}
	fclose(in);

	for ( i = 0, n = 0 ; i < NBR_OF_LETTERS ; i ++ ) {
		letters[i] = arr[i];
		if (arr[i] != NULL) {
			arr[i]->frequency /= numberOfCharacters;	// Computing the frequency.
			n ++;										// n is the number of involved letters which is going to be consumed in the do while loop's condition
		}
	}

	j = 1;
	do {
		findMinAndSecondMin(arr, &min, &minIndex, &secondMin, &secondMinIndex);

		if (minIndex != -1 && secondMinIndex != -1 && minIndex != secondMinIndex) {
			tree_node_t *temp;
			tree = TYPED_ALLOC(tree_node_t);// malloc(sizeof(tree_node_t));
			tree->frequency = arr[minIndex]->frequency + arr[secondMinIndex]->frequency;
			tree->c = j;
			tree->left = arr[minIndex];
			temp = TYPED_ALLOC(tree_node_t);// malloc(sizeof(tree_node_t));
			temp->c = arr[secondMinIndex]->c;
			temp->frequency = arr[secondMinIndex]->frequency;
			temp->left = arr[secondMinIndex]->left;
			temp->right = arr[secondMinIndex]->right;
			tree->right = temp;

			arr[minIndex] = tree;

			arr[secondMinIndex] = NULL;
		}
		j ++;
	} while( j < n );

	for ( i = 0 ; i < NBR_OF_LETTERS ; i ++ ) {
		if (arr[i] != NULL)  {
			char code[CODE_SIZE];
			strcpy(code, "");
			encode(tree = arr[i], letters, 0, 0, code);
			puts("\nSuccessful encoding");
			printTree(arr[i]);
			break;
		}
	}
	in = fopen(IN_PATH, "r");
	out = fopen(OUT_PATH, "w");
	fileReadingVerdict = fgets(str, CODE_SIZE, in) != NULL;
	while(!feof(in) || fileReadingVerdict) {
		n = strlen(str);
		for (i = 0; i < n ; i ++ ) {
			str[i] = toupper(str[i]);
			if (str[i] >= 'A' && str[i] <= 'Z') {
				index = str[i] - 'A';
				fputs(letters[index]->code, out);
			}
		}
		if (fileReadingVerdict) {
			fileReadingVerdict = fgets(str, CODE_SIZE, in) != NULL;
		}
	}

	fclose(in);
	fclose(out);

	printf("\nFile size (only letters) of the input file:	%d bits", numberOfCharacters * BYTE_SIZE);

	out = fopen(OUT_PATH, "r");
	fileReadingVerdict = fgets(str, CODE_SIZE, out) != NULL;
	numberOfCharacters = 0;
	while(!feof(out) || fileReadingVerdict) {
		numberOfCharacters += strlen(str);
		if (fileReadingVerdict) {
			fileReadingVerdict = fgets(str, CODE_SIZE, out) != NULL;
		}
	}
	fclose(out);

	printf("\nFile size of the output file:	%d bits", numberOfCharacters);

	printf("\nInterpreting output file:\n");
	out = fopen(OUT_PATH, "r");
	fileReadingVerdict = fgets(str, CODE_SIZE, out) != NULL;
	while(!feof(out) || fileReadingVerdict) {
		n = strlen(str);
		i = 0 ;
		while(i < n) {
			interpret(str, &i, tree);
		}
		if (fileReadingVerdict) {
			fileReadingVerdict = fgets(str, CODE_SIZE, out) != NULL;
		}
	}
	fclose(out);

	puts("\n");
	return 0;
}
/*
 *
 */
void encode(tree_node_t *node, tree_node_t **letters, char direction, short level, char* code) {
	int n;
	if ( node != NULL ) {
		if ((n = strlen(code)) < level) {
			if (direction == RIGHT) {
				strcat(code, "1");
			} else {
				if (direction == LEFT) {
					strcat(code, "0");
				}
			}
		} else {
			if (n >= level) {
				code[n - (n - level) - 1] = 0;
				if (direction == RIGHT) {
					strcat(code, "1");
				} else {
					if (direction == LEFT) {
						strcat(code, "0");
					}
				}
			}
		}
		if (node->c >= 'A' && node->c <= 'Z') {
			strcpy(node->code, code);
			strcpy(letters[node->c - 'A']->code, code);
		}
		encode(node->left, letters, LEFT, level + 1, code);
		encode(node->right, letters, RIGHT, level + 1, code);
	}
}

void printTree(tree_node_t *node) {
	int n;
	if ( node != NULL ) {
		if (node->c >= 'A' && node->c <= 'Z') {
			printf("\t%c - frequency: %.10f\tencoding: %s\n", node->c, node->frequency, node->code);
		}
		printTree(node->left);
		printTree(node->right);
	}
}

/*
 *	Begin:	Minimum and second minimum
 */
void findMinAndSecondMin(tree_node_t *arr[], float *min, int *minIndex, float *secondMin, int *secondMinIndex) {
	int i, k;
	k = 0;
 	*minIndex = -1;
	/*
	 * Skipping all the NULL elements.
	 */
	while (k < NBR_OF_LETTERS && arr[k] == NULL) k++;

	*minIndex = k;
	*min = arr[k]->frequency;

	for ( i = k ; i < NBR_OF_LETTERS; i ++ ) {
		if ( arr[i] != NULL && arr[i]->frequency < *min ) {
			*min = arr[i]->frequency;
			*minIndex = i;
		}
	}

	k = 0;
	*secondMinIndex = -1;
	/*
	 * Skipping all the NULL elements.
	 */
	while ((k < NBR_OF_LETTERS && arr[k] == NULL) || (k == *minIndex && arr[k] != NULL)) k++;

	*secondMin = arr[k]->frequency;
	*secondMinIndex = k;

	if (k == *minIndex) k ++;

	for ( i = k ; i < NBR_OF_LETTERS; i ++ ) {
		if ( arr[i] != NULL && arr[i]->frequency < *secondMin && i != *minIndex ) {
			*secondMin = arr[i]->frequency;
			*secondMinIndex = i;
		}
	}
 	/*
	 *	End:	Minimum and second minimum
	 */
}

void interpret(char *str, int *index, tree_node_t *tree) {
	int n = strlen(str);
	if (tree->c >= 'A' && tree->c <= 'Z') {
		printf("%c ", tree->c);
		return ;
	} else {
		if ( *index < n ) {
			if (str[*index] == '0') {
				(*index) ++;
				interpret(str, index, tree->left);
			} else {
				if (str[*index] == '1') {
					(*index) ++;
					interpret(str, index, tree->right);
				}
			}
		}
	}
}
	//*******************************************************************
	// File: HuffmanEncoding.c
	// Author(s): Mohamed Ennahdi El Idrissi
	// Date: 14-Aug-2012
	//
	// Input Files: in.txt
	// Output Files: out.txt
	// Description: CSC 2302 - <Data Structures>
	// <Struct, Array, File I/O, Recursion, Pointers, Binary Tree>
	// This program covers the Huffman Encoding concept.
	// We first read a file, from we which we count the number of characters, and then reckon the frequency
	// of each letter individually. Each letter's frequency is stored in a node with its respective character.
	// This node is stored in an array of 26 elements (element 0 -> 'A', element 1 -> 'B'...element 25 -> 'Z').
	// Each element is a pointer, and each pointer is supposed to be a root of a tree (sub tree).
	// After processing all characters of the text (read from a file), we end up with an array with
	// 25 NULL elements. The only element that is not NULL is the root of the tree that gathers the different
	// nodes of each letter.
	// Deducing the encoding of each letter if performed with intermediary of the prefix traversal.
	// To summarize, the pseudo-code is:
	// - Initialize the letters array
	// - Read the output file
	// - Increment each letter frequency + compute the number of characters in the file
	// - Store in the array's node the frequency of each letter
	// - Compute the number (N) of involved characters (Sometimes, texts don't include all letters. In our case 'Q' and 'Z' are absent).
	// - Loop N times
	// - find Minimum and second minimum
	// - create a new node, its left child contains the minimum and the right child contains the second minimum
	// - minimum position points on the new node, and the second minimum's array position points on NULL
	// - Browse the array till the unique non NULL element is encountered
	// - invoke prefix traversal function
	// - build the encoding of each character
	// - display the letter and its characteristics when found.
	// - Finally, read the output file to interpret its content
	// - if root contains a character (A - Z), display character
	// - else, if the current character is '0', browse the left leaf
	// - else, if the current character is '1', browse the right leaf
	//
	//*******************************************************************
	#include <stdio.h>

	#define NBR_OF_LETTERS 26

	#define LEFT 'L'
	#define RIGHT 'R'

	#define CODE_SIZE 128

	#define TYPED_ALLOC(type) (type *) malloc( sizeof(type) )
	#define BYTE_SIZE 8

	#define IN_PATH "./files/in.txt"
	#define OUT_PATH "./files/out.txt"

	typedef struct tree_node_s {
	float frequency;
	char c;
	char code[CODE_SIZE];
	struct tree_node_s *left;
	struct tree_node_s *right;
	} tree_node_t;

	tree_node_t arr[NBR_OF_LETTERS], letters[NBR_OF_LETTERS];


	void findMinAndSecondMin(tree_node_t *, float , int , float , int *);
	void printTree(tree_node_t *);
	void interpret(char , int , tree_node_t *);
	void printTree(tree_node_t *);
	void encode(tree_node_t , tree_node_t , char, short, char);

	/*
	*
	*/
	int main() {
	char str[CODE_SIZE];
	int fileReadingVerdict;
	int i, j, k, index, n;
	float min, secondMin;
	int minIndex, secondMinIndex;
	int numberOfCharacters = 0;
	tree_node_t *tree;
	FILE *in = fopen(IN_PATH, "r");
	FILE *out;
	if ( in == NULL ) {
	printf("\nFile not found");
	return 0;
	} else {
	/*
	* Begin: Array Initialization
	*/
	for (i = 'A'; i <= 'Z'; i++) {
	index = i - 'A';
	arr[index] = NULL;
	}
	/*
	* End: Array Initialization
	*/
	numberOfCharacters = 0;
	fileReadingVerdict = fgets(str, CODE_SIZE, in) != NULL;
	while(!feof(in) \|\| fileReadingVerdict) {
	n = strlen(str);
	printf("\n%s", str);
	for (i = 0; i < n ; i ++ ) {
	str[i] = toupper(str[i]);
	if (str[i] >= 'A' && str[i] <= 'Z') {
	numberOfCharacters ++;
	index = str[i] - 'A';
	if (arr[index] == NULL) {
	arr[index] = TYPED_ALLOC(tree_node_t);// malloc(sizeof(tree_node_t));
	arr[index]->c = str[i];
	arr[index]->frequency = 1;
	arr[index]->left = arr[index]->right = NULL;
	} else {
	arr[index]->frequency += 1;
	}
	}
	}
	if (fileReadingVerdict) {
	fileReadingVerdict = fgets(str, CODE_SIZE, in) != NULL;
	}
	}
	}
	fclose(in);

	for ( i = 0, n = 0 ; i < NBR_OF_LETTERS ; i ++ ) {
	letters[i] = arr[i];
	if (arr[i] != NULL) {
	arr[i]->frequency /= numberOfCharacters; // Computing the frequency.
	n ++; // n is the number of involved letters which is going to be consumed in the do while loop's condition
	}
	}

	j = 1;
	do {
	findMinAndSecondMin(arr, &min, &minIndex, &secondMin, &secondMinIndex);

	if (minIndex != -1 && secondMinIndex != -1 && minIndex != secondMinIndex) {
	tree_node_t *temp;
	tree = TYPED_ALLOC(tree_node_t);// malloc(sizeof(tree_node_t));
	tree->frequency = arr[minIndex]->frequency + arr[secondMinIndex]->frequency;
	tree->c = j;
	tree->left = arr[minIndex];
	temp = TYPED_ALLOC(tree_node_t);// malloc(sizeof(tree_node_t));
	temp->c = arr[secondMinIndex]->c;
	temp->frequency = arr[secondMinIndex]->frequency;
	temp->left = arr[secondMinIndex]->left;
	temp->right = arr[secondMinIndex]->right;
	tree->right = temp;

	arr[minIndex] = tree;

	arr[secondMinIndex] = NULL;
	}
	j ++;
	} while( j < n );

	for ( i = 0 ; i < NBR_OF_LETTERS ; i ++ ) {
	if (arr[i] != NULL) {
	char code[CODE_SIZE];
	strcpy(code, "");
	encode(tree = arr[i], letters, 0, 0, code);
	puts("\nSuccessful encoding");
	printTree(arr[i]);
	break;
	}
	}
	in = fopen(IN_PATH, "r");
	out = fopen(OUT_PATH, "w");
	fileReadingVerdict = fgets(str, CODE_SIZE, in) != NULL;
	while(!feof(in) \|\| fileReadingVerdict) {
	n = strlen(str);
	for (i = 0; i < n ; i ++ ) {
	str[i] = toupper(str[i]);
	if (str[i] >= 'A' && str[i] <= 'Z') {
	index = str[i] - 'A';
	fputs(letters[index]->code, out);
	}
	}
	if (fileReadingVerdict) {
	fileReadingVerdict = fgets(str, CODE_SIZE, in) != NULL;
	}
	}

	fclose(in);
	fclose(out);

	printf("\nFile size (only letters) of the input file: %d bits", numberOfCharacters * BYTE_SIZE);

	out = fopen(OUT_PATH, "r");
	fileReadingVerdict = fgets(str, CODE_SIZE, out) != NULL;
	numberOfCharacters = 0;
	while(!feof(out) \|\| fileReadingVerdict) {
	numberOfCharacters += strlen(str);
	if (fileReadingVerdict) {
	fileReadingVerdict = fgets(str, CODE_SIZE, out) != NULL;
	}
	}
	fclose(out);

	printf("\nFile size of the output file: %d bits", numberOfCharacters);

	printf("\nInterpreting output file:\n");
	out = fopen(OUT_PATH, "r");
	fileReadingVerdict = fgets(str, CODE_SIZE, out) != NULL;
	while(!feof(out) \|\| fileReadingVerdict) {
	n = strlen(str);
	i = 0 ;
	while(i < n) {
	interpret(str, &i, tree);
	}
	if (fileReadingVerdict) {
	fileReadingVerdict = fgets(str, CODE_SIZE, out) != NULL;
	}
	}
	fclose(out);

	puts("\n");
	return 0;
	}
	/*
	*
	*/
	void encode(tree_node_t node, tree_node_t letters, char direction, short level, char code) {
	int n;
	if ( node != NULL ) {
	if ((n = strlen(code)) < level) {
	if (direction == RIGHT) {
	strcat(code, "1");
	} else {
	if (direction == LEFT) {
	strcat(code, "0");
	}
	}
	} else {
	if (n >= level) {
	code[n - (n - level) - 1] = 0;
	if (direction == RIGHT) {
	strcat(code, "1");
	} else {
	if (direction == LEFT) {
	strcat(code, "0");
	}
	}
	}
	}
	if (node->c >= 'A' && node->c <= 'Z') {
	strcpy(node->code, code);
	strcpy(letters[node->c - 'A']->code, code);
	}
	encode(node->left, letters, LEFT, level + 1, code);
	encode(node->right, letters, RIGHT, level + 1, code);
	}
	}

	void printTree(tree_node_t *node) {
	int n;
	if ( node != NULL ) {
	if (node->c >= 'A' && node->c <= 'Z') {
	printf("\t%c - frequency: %.10f\tencoding: %s\n", node->c, node->frequency, node->code);
	}
	printTree(node->left);
	printTree(node->right);
	}
	}

	/*
	* Begin: Minimum and second minimum
	*/
	void findMinAndSecondMin(tree_node_t arr[], float min, int minIndex, float secondMin, int *secondMinIndex) {
	int i, k;
	k = 0;
	*minIndex = -1;
	/*
	* Skipping all the NULL elements.
	*/
	while (k < NBR_OF_LETTERS && arr[k] == NULL) k++;

	*minIndex = k;
	*min = arr[k]->frequency;

	for ( i = k ; i < NBR_OF_LETTERS; i ++ ) {
	if ( arr[i] != NULL && arr[i]->frequency < *min ) {
	*min = arr[i]->frequency;
	*minIndex = i;
	}
	}

	k = 0;
	*secondMinIndex = -1;
	/*
	* Skipping all the NULL elements.
	*/
	while ((k < NBR_OF_LETTERS && arr[k] == NULL) \|\| (k == *minIndex && arr[k] != NULL)) k++;

	*secondMin = arr[k]->frequency;
	*secondMinIndex = k;

	if (k == *minIndex) k ++;

	for ( i = k ; i < NBR_OF_LETTERS; i ++ ) {
	if ( arr[i] != NULL && arr[i]->frequency < secondMin && i != minIndex ) {
	*secondMin = arr[i]->frequency;
	*secondMinIndex = i;
	}
	}
	/*
	* End: Minimum and second minimum
	*/
	}

	void interpret(char str, int index, tree_node_t *tree) {
	int n = strlen(str);
	if (tree->c >= 'A' && tree->c <= 'Z') {
	printf("%c ", tree->c);
	return ;
	} else {
	if ( *index < n ) {
	if (str[*index] == '0') {
	(*index) ++;
	interpret(str, index, tree->left);
	} else {
	if (str[*index] == '1') {
	(*index) ++;
	interpret(str, index, tree->right);
	}
	}
	}
	}
	}