Artiavis/tokenizer.c

## tokenizer.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Tokenizer state: the separator set, the string being tokenized, the cached
 * lengths of both, and the position at which the next scan starts.
 */
typedef struct TokenizerT_ {
    char *separators;             // NUL-terminated set of separator characters
    char *tokenString;            // NUL-terminated string that tokens are cut from
    size_t separatorStringLength; // number of characters in separators
    size_t tokenStringLength;     // number of characters in tokenString
    int cursorPosition;           // index into tokenString where the next scan begins
} TokenizerT;

/*
 * Translates the letter that follows a backslash into the control character
 * it stands for (n, t, v, b, r, f, a). Every other character, including
 * backslash, double quote and '\0', is returned unchanged.
 */
char escapeChar(char c) {
    static const char letters[] = "ntvbrfa";
    static const char controls[] = "\n\t\v\b\r\f\a";
    int k;

    // letters[] and controls[] are parallel: letters[k] escapes to controls[k]
    for (k = 0; letters[k] != '\0'; k++) {
        if (letters[k] == c) {
            return controls[k];
        }
    }

    return c;
}

/*
 * Writes one character to stdout. Newline, tab, vertical tab, backspace,
 * carriage return, form feed, bell, backslash and double quote are written
 * as a bracketed hex code such as [0x0a]; any other character is written
 * unchanged.
 */
void printChar(char c) {
    static const char special[] = "\n\t\v\b\r\f\a\\\"";
    static const char *const hexCode[] = {
        "[0x0a]", "[0x09]", "[0x0b]", "[0x08]", "[0x0d]",
        "[0x0c]", "[0x07]", "[0x5c]", "[0x22]"
    };
    int k;

    // special[] and hexCode[] are parallel: special[k] is printed as hexCode[k]
    for (k = 0; special[k] != '\0'; k++) {
        if (special[k] == c) {
            fputs(hexCode[k], stdout);
            return;
        }
    }

    putchar(c);
}

/*
 * Returns a newly allocated copy of str in which every backslash escape
 * sequence has been replaced by the single character escapeChar() maps it to.
 * A lone backslash at the very end of str is dropped.
 *
 * str:     NUL-terminated input; may be NULL.
 * returns: heap string the caller must free(), or NULL if str is NULL or the
 *          allocation fails.
 */
char* escapeString(char* str) {

    size_t length;
    size_t in = 0;  // read index into str
    size_t out = 0; // write index into result (never ahead of in)
    char* result;

    if (str == NULL) {
        return NULL;
    }

    length = strlen(str);

    // The escaped text is never longer than the input, so length + 1 suffices
    result = malloc(length + 1);
    if (result == NULL) {
        return NULL;
    }

    while (in < length) {
        if (str[in] != '\\') {
            result[out++] = str[in++];
            continue;
        }

        in++; // step over the backslash
        if (in == length) {
            break; // trailing backslash with nothing after it
        }
        result[out++] = escapeChar(str[in++]);
    }

    result[out] = '\0';

    return result;
}

/*
 * Builds a tokenizer for the string ts using the characters of separators as
 * delimiters. Backslash escape sequences in both arguments are expanded with
 * escapeString() first, and the tokenizer keeps its own exactly-sized copies,
 * so the caller's strings are neither modified nor retained.
 *
 * separators: NUL-terminated set of separator characters.
 * ts:         NUL-terminated string to tokenize.
 * returns:    a tokenizer to be released with TKDestroy(), or NULL if either
 *             argument is NULL or any allocation fails (nothing is leaked).
 */
TokenizerT *TKCreate(char *separators, char *ts) {

    TokenizerT* tokenizer;
    char* escapedSeps;
    char* escapedToks;

    if (separators == NULL || ts == NULL) {
        return NULL;
    }

    tokenizer = malloc(sizeof *tokenizer);
    if (tokenizer == NULL) {
        return NULL;
    }

    // Escape the control sequences inside the strings
    escapedSeps = escapeString(separators);
    escapedToks = escapeString(ts);

    if (escapedSeps == NULL || escapedToks == NULL) {
        free(escapedSeps);
        free(escapedToks);
        free(tokenizer);
        return NULL;
    }

    // Copy the values over (eliminates dead space at end of strings)
    tokenizer->separatorStringLength = strlen(escapedSeps);
    tokenizer->tokenStringLength = strlen(escapedToks);
    tokenizer->separators = malloc(tokenizer->separatorStringLength + 1);
    tokenizer->tokenString = malloc(tokenizer->tokenStringLength + 1);

    if (tokenizer->separators == NULL || tokenizer->tokenString == NULL) {
        free(tokenizer->separators);
        free(tokenizer->tokenString);
        free(escapedSeps);
        free(escapedToks);
        free(tokenizer);
        return NULL;
    }

    // + 1 so the terminating '\0' is copied as well
    memcpy(tokenizer->separators, escapedSeps, tokenizer->separatorStringLength + 1);
    memcpy(tokenizer->tokenString, escapedToks, tokenizer->tokenStringLength + 1);
    tokenizer->cursorPosition = 0;

    // Free temp strings
    free(escapedSeps);
    free(escapedToks);

    return tokenizer;
}

/*
 * Releases a tokenizer together with the two strings it owns.
 *
 * tk: tokenizer to destroy; NULL is accepted and ignored, mirroring free().
 */
void TKDestroy(TokenizerT *tk) {

    if (tk == NULL) {
        return;
    }

    free(tk->separators);
    free(tk->tokenString);
    free(tk);
}

/*
 * Tests whether c is absent from the tokenizer's separator set.
 *
 * c:       character to look up.
 * tk:      tokenizer whose separators and separatorStringLength are consulted.
 * returns: 1 if c is not a separator, 0 if it is.
 */
int charNotSeparator(char c, TokenizerT *tk) {

    // Search only the first separatorStringLength bytes, so the terminating
    // '\0' of the separator string is never treated as a separator.
    return memchr(tk->separators, c, tk->separatorStringLength) == NULL;
}

/*
 * Extracts the next token from the tokenizer and advances its cursor.
 *
 * A token is the run of non-separator characters starting at the cursor. If
 * the cursor sits on a separator, an empty string is returned and only that
 * separator is consumed; otherwise the token and the separator that ended it
 * (if any) are consumed.
 *
 * tk:      tokenizer to read from.
 * returns: a newly allocated NUL-terminated token the caller must free(), or
 *          NULL when tk is NULL, the input is exhausted, or allocation fails
 *          (the cursor is left untouched in those cases).
 */
char *TKGetNextToken(TokenizerT *tk) {

    char* nextToken;
    int tokenLength = 0;
    int startPosition;
    int tokenStringLength;

    if (tk == NULL) {
        return NULL;
    }

    startPosition = tk->cursorPosition;
    tokenStringLength = (int) tk->tokenStringLength;

    // Nothing left to scan
    if (startPosition >= tokenStringLength) {
        return NULL;
    }

    // Grow the token until a separator or the end of the string is reached
    while (startPosition + tokenLength < tokenStringLength
           && charNotSeparator(tk->tokenString[startPosition + tokenLength], tk)) {
        tokenLength++;
    }

    nextToken = malloc((size_t) tokenLength + 1);
    if (nextToken == NULL) {
        return NULL;
    }

    memcpy(nextToken, tk->tokenString + startPosition, (size_t) tokenLength);
    nextToken[tokenLength] = '\0'; // terminator goes directly after the last copied character

    // Skip the token plus one separator; for an empty token this skips just
    // the separator under the cursor.
    tk->cursorPosition += tokenLength + 1;

    return nextToken;
}


/*
 * Command-line driver: tokenizes argv[2] using the separator characters in
 * argv[1] and prints each non-empty token on its own line, rendering special
 * characters through printChar().
 *
 * returns: 0 on success, EXIT_FAILURE if the two arguments are missing or the
 *          tokenizer cannot be created.
 */
int main(int argc, char **argv) {

    char* tokenResultString;
    TokenizerT* tokenizer;

    if (argc < 3) {
        fprintf(stderr, "usage: %s <separators> <string>\n",
                (argc > 0 && argv[0] != NULL) ? argv[0] : "tokenizer");
        return EXIT_FAILURE;
    }

    tokenizer = TKCreate(argv[1], argv[2]);
    if (tokenizer == NULL) {
        return EXIT_FAILURE;
    }

    while ((tokenResultString = TKGetNextToken(tokenizer)) != NULL) {

        // Empty tokens (produced by adjacent separators) print nothing
        if (tokenResultString[0] != '\0') {
            const char* p;
            for (p = tokenResultString; *p != '\0'; p++) {
                printChar(*p);
            }
            putchar('\n');
        }

        // Every token is heap-allocated, including the empty ones
        free(tokenResultString);
    }

    TKDestroy(tokenizer);

    return 0;
}
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/*
	 * Tokenizer state: the separator set, the string being tokenized, the cached
	 * lengths of both, and the position at which the next scan starts.
	 */
	typedef struct TokenizerT_ {
	    char *separators;             // NUL-terminated set of separator characters
	    char *tokenString;            // NUL-terminated string that tokens are cut from
	    size_t separatorStringLength; // number of characters in separators
	    size_t tokenStringLength;     // number of characters in tokenString
	    int cursorPosition;           // index into tokenString where the next scan begins
	} TokenizerT;

	/*
	 * Translates the letter that follows a backslash into the control character
	 * it stands for (n, t, v, b, r, f, a). Every other character, including
	 * backslash, double quote and '\0', is returned unchanged.
	 */
	char escapeChar(char c) {
	    static const char letters[] = "ntvbrfa";
	    static const char controls[] = "\n\t\v\b\r\f\a";
	    int k;

	    // letters[] and controls[] are parallel: letters[k] escapes to controls[k]
	    for (k = 0; letters[k] != '\0'; k++) {
	        if (letters[k] == c) {
	            return controls[k];
	        }
	    }

	    return c;
	}

	/*
	 * Writes one character to stdout. Newline, tab, vertical tab, backspace,
	 * carriage return, form feed, bell, backslash and double quote are written
	 * as a bracketed hex code such as [0x0a]; any other character is written
	 * unchanged.
	 */
	void printChar(char c) {
	    static const char special[] = "\n\t\v\b\r\f\a\\\"";
	    static const char *const hexCode[] = {
	        "[0x0a]", "[0x09]", "[0x0b]", "[0x08]", "[0x0d]",
	        "[0x0c]", "[0x07]", "[0x5c]", "[0x22]"
	    };
	    int k;

	    // special[] and hexCode[] are parallel: special[k] is printed as hexCode[k]
	    for (k = 0; special[k] != '\0'; k++) {
	        if (special[k] == c) {
	            fputs(hexCode[k], stdout);
	            return;
	        }
	    }

	    putchar(c);
	}

	/*
	 * Returns a newly allocated copy of str in which every backslash escape
	 * sequence has been replaced by the single character escapeChar() maps it to.
	 * A lone backslash at the very end of str is dropped.
	 *
	 * str:     NUL-terminated input; may be NULL.
	 * returns: heap string the caller must free(), or NULL if str is NULL or the
	 *          allocation fails.
	 */
	char* escapeString(char* str) {

	    size_t length;
	    size_t in = 0;  // read index into str
	    size_t out = 0; // write index into result (never ahead of in)
	    char* result;

	    if (str == NULL) {
	        return NULL;
	    }

	    length = strlen(str);

	    // The escaped text is never longer than the input, so length + 1 suffices
	    result = malloc(length + 1);
	    if (result == NULL) {
	        return NULL;
	    }

	    while (in < length) {
	        if (str[in] != '\\') {
	            result[out++] = str[in++];
	            continue;
	        }

	        in++; // step over the backslash
	        if (in == length) {
	            break; // trailing backslash with nothing after it
	        }
	        result[out++] = escapeChar(str[in++]);
	    }

	    result[out] = '\0';

	    return result;
	}

	/*
	 * Builds a tokenizer for the string ts using the characters of separators as
	 * delimiters. Backslash escape sequences in both arguments are expanded with
	 * escapeString() first, and the tokenizer keeps its own exactly-sized copies,
	 * so the caller's strings are neither modified nor retained.
	 *
	 * separators: NUL-terminated set of separator characters.
	 * ts:         NUL-terminated string to tokenize.
	 * returns:    a tokenizer to be released with TKDestroy(), or NULL if either
	 *             argument is NULL or any allocation fails (nothing is leaked).
	 */
	TokenizerT *TKCreate(char *separators, char *ts) {

	    TokenizerT* tokenizer;
	    char* escapedSeps;
	    char* escapedToks;

	    if (separators == NULL || ts == NULL) {
	        return NULL;
	    }

	    tokenizer = malloc(sizeof *tokenizer);
	    if (tokenizer == NULL) {
	        return NULL;
	    }

	    // Escape the control sequences inside the strings
	    escapedSeps = escapeString(separators);
	    escapedToks = escapeString(ts);

	    if (escapedSeps == NULL || escapedToks == NULL) {
	        free(escapedSeps);
	        free(escapedToks);
	        free(tokenizer);
	        return NULL;
	    }

	    // Copy the values over (eliminates dead space at end of strings)
	    tokenizer->separatorStringLength = strlen(escapedSeps);
	    tokenizer->tokenStringLength = strlen(escapedToks);
	    tokenizer->separators = malloc(tokenizer->separatorStringLength + 1);
	    tokenizer->tokenString = malloc(tokenizer->tokenStringLength + 1);

	    if (tokenizer->separators == NULL || tokenizer->tokenString == NULL) {
	        free(tokenizer->separators);
	        free(tokenizer->tokenString);
	        free(escapedSeps);
	        free(escapedToks);
	        free(tokenizer);
	        return NULL;
	    }

	    // + 1 so the terminating '\0' is copied as well
	    memcpy(tokenizer->separators, escapedSeps, tokenizer->separatorStringLength + 1);
	    memcpy(tokenizer->tokenString, escapedToks, tokenizer->tokenStringLength + 1);
	    tokenizer->cursorPosition = 0;

	    // Free temp strings
	    free(escapedSeps);
	    free(escapedToks);

	    return tokenizer;
	}

	/*
	 * Releases a tokenizer together with the two strings it owns.
	 *
	 * tk: tokenizer to destroy; NULL is accepted and ignored, mirroring free().
	 */
	void TKDestroy(TokenizerT *tk) {

	    if (tk == NULL) {
	        return;
	    }

	    free(tk->separators);
	    free(tk->tokenString);
	    free(tk);
	}

	/*
	 * Tests whether c is absent from the tokenizer's separator set.
	 *
	 * c:       character to look up.
	 * tk:      tokenizer whose separators and separatorStringLength are consulted.
	 * returns: 1 if c is not a separator, 0 if it is.
	 */
	int charNotSeparator(char c, TokenizerT *tk) {

	    // Search only the first separatorStringLength bytes, so the terminating
	    // '\0' of the separator string is never treated as a separator.
	    return memchr(tk->separators, c, tk->separatorStringLength) == NULL;
	}

	/*
	 * Extracts the next token from the tokenizer and advances its cursor.
	 *
	 * A token is the run of non-separator characters starting at the cursor. If
	 * the cursor sits on a separator, an empty string is returned and only that
	 * separator is consumed; otherwise the token and the separator that ended it
	 * (if any) are consumed.
	 *
	 * tk:      tokenizer to read from.
	 * returns: a newly allocated NUL-terminated token the caller must free(), or
	 *          NULL when tk is NULL, the input is exhausted, or allocation fails
	 *          (the cursor is left untouched in those cases).
	 */
	char *TKGetNextToken(TokenizerT *tk) {

	    char* nextToken;
	    int tokenLength = 0;
	    int startPosition;
	    int tokenStringLength;

	    if (tk == NULL) {
	        return NULL;
	    }

	    startPosition = tk->cursorPosition;
	    tokenStringLength = (int) tk->tokenStringLength;

	    // Nothing left to scan
	    if (startPosition >= tokenStringLength) {
	        return NULL;
	    }

	    // Grow the token until a separator or the end of the string is reached
	    while (startPosition + tokenLength < tokenStringLength
	           && charNotSeparator(tk->tokenString[startPosition + tokenLength], tk)) {
	        tokenLength++;
	    }

	    nextToken = malloc((size_t) tokenLength + 1);
	    if (nextToken == NULL) {
	        return NULL;
	    }

	    memcpy(nextToken, tk->tokenString + startPosition, (size_t) tokenLength);
	    nextToken[tokenLength] = '\0'; // terminator goes directly after the last copied character

	    // Skip the token plus one separator; for an empty token this skips just
	    // the separator under the cursor.
	    tk->cursorPosition += tokenLength + 1;

	    return nextToken;
	}


	/*
	 * Command-line driver: tokenizes argv[2] using the separator characters in
	 * argv[1] and prints each non-empty token on its own line, rendering special
	 * characters through printChar().
	 *
	 * returns: 0 on success, EXIT_FAILURE if the two arguments are missing or the
	 *          tokenizer cannot be created.
	 */
	int main(int argc, char **argv) {

	    char* tokenResultString;
	    TokenizerT* tokenizer;

	    if (argc < 3) {
	        fprintf(stderr, "usage: %s <separators> <string>\n",
	                (argc > 0 && argv[0] != NULL) ? argv[0] : "tokenizer");
	        return EXIT_FAILURE;
	    }

	    tokenizer = TKCreate(argv[1], argv[2]);
	    if (tokenizer == NULL) {
	        return EXIT_FAILURE;
	    }

	    while ((tokenResultString = TKGetNextToken(tokenizer)) != NULL) {

	        // Empty tokens (produced by adjacent separators) print nothing
	        if (tokenResultString[0] != '\0') {
	            const char* p;
	            for (p = tokenResultString; *p != '\0'; p++) {
	                printChar(*p);
	            }
	            putchar('\n');
	        }

	        // Every token is heap-allocated, including the empty ones
	        free(tokenResultString);
	    }

	    TKDestroy(tokenizer);

	    return 0;
	}