Created
January 30, 2014 00:55
-
-
Save Artiavis/8700573 to your computer and use it in GitHub Desktop.
Intro assignment for Systems Programming at Rutgers. Create a string tokenizer.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
// State of one tokenizing session.
// Both strings are heap copies made by TKCreate and released by TKDestroy.
struct TokenizerT_ {
    char *separators;             // NUL-terminated set of separator characters (escapes already resolved)
    char *tokenString;            // NUL-terminated text being tokenized (escapes already resolved)
    size_t separatorStringLength; // strlen(separators), cached
    size_t tokenStringLength;     // strlen(tokenString), cached
    int cursorPosition;           // index into tokenString where the next scan starts
};

typedef struct TokenizerT_ TokenizerT;
// Translates the character that follows a backslash into the character the
// escape sequence stands for.
//
//   c        the character immediately after the backslash
//   returns  the matching control character for n, t, v, b, r, f and a;
//            any other character (including '\\' and '"') is returned unchanged
char escapeChar(char c) {
    switch (c) {
    case 'n':  return '\n';
    case 't':  return '\t';
    case 'v':  return '\v';
    case 'b':  return '\b';
    case 'r':  return '\r';
    case 'f':  return '\f';
    case 'a':  return '\a';
    case '\\': return '\\';
    case '\"': return '\"';
    default:   return c; // unknown escape: keep the character itself
    }
}
// Writes one character of a token to stdout.
// The control characters \n \t \v \b \r \f \a, the backslash and the double
// quote are written as their bracketed two-digit hex code (for example a
// newline is written as "[0x0a]"); every other character is written as-is.
void printChar(char c) {
    switch (c) {
    case '\n':
    case '\t':
    case '\v':
    case '\b':
    case '\r':
    case '\f':
    case '\a':
    case '\\':
    case '\"':
        // Go through unsigned char so the value handed to %x is never negative.
        printf("[0x%02x]", (unsigned int) (unsigned char) c);
        break;
    default:
        putchar(c);
        break;
    }
}
// Builds a copy of str in which every backslash escape sequence has been
// replaced by the single character escapeChar maps it to.
//
//   str      NUL-terminated input; not modified
//   returns  a newly malloc'd NUL-terminated string the caller must free,
//            or NULL if str is NULL or the allocation fails
//
// A lone backslash at the very end of str has nothing to escape and is dropped.
char* escapeString(char* str) {
    size_t length;
    size_t in = 0;  // read position in str
    size_t out = 0; // write position in escaped
    char *escaped;

    if (str == NULL) {
        return NULL;
    }

    length = strlen(str);
    // The result is never longer than the input: each escape shrinks two
    // characters into one.
    escaped = malloc(length + 1);
    if (escaped == NULL) {
        return NULL;
    }

    while (in < length) {
        char c = str[in++];
        if (c == '\\') {
            if (in == length) {
                break; // trailing backslash with no escaped character
            }
            c = escapeChar(str[in++]);
        }
        escaped[out++] = c;
    }
    escaped[out] = '\0';

    return escaped;
}
TokenizerT *TKCreate(char *separators, char *ts) { | |
TokenizerT* tokenizer = (TokenizerT*) malloc(sizeof(TokenizerT)); | |
if (tokenizer) { | |
// Escape the control sequences inside the strings | |
char* escapedSeps = escapeString(separators); | |
char* escapedToks = escapeString(ts); | |
// Copy the values over (eliminates dead space at end of strings) | |
tokenizer->separatorStringLength = strlen(escapedSeps); | |
tokenizer->tokenStringLength = strlen(escapedToks); | |
tokenizer->separators = (char*) malloc(sizeof(char)*(1+tokenizer->separatorStringLength)); | |
tokenizer->tokenString = (char*) malloc(sizeof(char)*(1+tokenizer->tokenStringLength)); | |
strcpy(tokenizer->separators, escapedSeps); | |
strcpy(tokenizer->tokenString, escapedToks); | |
tokenizer->cursorPosition = 0; | |
// Free temp strings | |
free(escapedSeps); | |
free(escapedToks); | |
} | |
// If malloc fails, pointer will be NULL anyway | |
return tokenizer; | |
} | |
// Releases a tokenizer and the two strings it owns.
// Passing NULL is allowed and does nothing, so the result of a failed
// TKCreate can be handed over without a check.
void TKDestroy(TokenizerT *tk) {
    if (tk == NULL) {
        return;
    }
    free(tk->separators);
    free(tk->tokenString);
    free(tk);
}
// Reports whether c is absent from the tokenizer's separator set.
//
//   c        character to test
//   tk       tokenizer whose separators are consulted
//   returns  1 if c is not a separator, 0 if it is
int charNotSeparator(char c, TokenizerT *tk) {
    // memchr looks at exactly separatorStringLength bytes, so the terminating
    // NUL of the separator string is never treated as a separator.
    return memchr(tk->separators, c, tk->separatorStringLength) == NULL;
}
// Returns the next token from the tokenizer and advances its cursor.
//
// Starting at the cursor, characters are collected up to (not including) the
// next separator or the end of the string. The cursor then moves past the
// token and the one separator that ended it. When the cursor sits directly on
// a separator, that separator is consumed and an empty string is returned.
//
//   tk       tokenizer created by TKCreate
//   returns  a newly malloc'd NUL-terminated token the caller must free, or
//            NULL when the input is exhausted, tk is NULL, or the allocation
//            fails (the cursor is left untouched in that case)
char *TKGetNextToken(TokenizerT *tk) {
    size_t start;
    size_t total;
    size_t tokenLength = 0;
    char *nextToken;

    if (tk == NULL) {
        return NULL;
    }

    start = (size_t) tk->cursorPosition;
    total = tk->tokenStringLength;

    // Nothing left to scan
    if (start >= total) {
        return NULL;
    }

    // Grow the token while the next character is not a separator
    while (start + tokenLength < total &&
           charNotSeparator(tk->tokenString[start + tokenLength], tk)) {
        tokenLength++;
    }

    nextToken = malloc(tokenLength + 1);
    if (nextToken == NULL) {
        return NULL;
    }

    memcpy(nextToken, tk->tokenString + start, tokenLength);
    nextToken[tokenLength] = '\0'; // terminator goes right after the last copied byte

    // Step over the token plus the separator that ended it. After the final
    // token this leaves the cursor one past the end, which the bounds check
    // above turns into NULL on the next call.
    tk->cursorPosition += (int) tokenLength + 1;

    return nextToken;
}
int main(int argc, char **argv) { | |
char* separatorString; | |
char* tokenString; | |
char* tokenResultString; | |
TokenizerT* tokenizer; | |
separatorString = argv[1]; | |
tokenString = argv[2]; | |
tokenizer = TKCreate(separatorString, tokenString); | |
if (tokenizer) { | |
tokenResultString = TKGetNextToken(tokenizer); | |
while( tokenResultString != 0 ) { | |
if (strcmp("", tokenResultString) == 0) { | |
printf(tokenResultString); | |
} else { | |
int i; | |
size_t length = strlen(tokenResultString); | |
for (i = 0; i < length; i++) { | |
printChar(tokenResultString[i]); | |
} | |
putchar('\n'); | |
free (tokenResultString); | |
} | |
tokenResultString = TKGetNextToken(tokenizer); | |
} | |
TKDestroy(tokenizer); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment