Skip to content

Instantly share code, notes, and snippets.

@Artiavis
Created January 30, 2014 00:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Artiavis/8700573 to your computer and use it in GitHub Desktop.
Save Artiavis/8700573 to your computer and use it in GitHub Desktop.
Intro assignment for Systems Programming at Rutgers. Create a string tokenizer.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// State of one tokenization pass: the set of separator characters, the text
// being split, and how far into that text the tokenizer has read.
typedef struct TokenizerT_ {
    char *separators;             // NUL-terminated string; each char is a separator
    char *tokenString;            // NUL-terminated text that tokens are cut from
    size_t separatorStringLength; // number of characters in separators
    size_t tokenStringLength;     // number of characters in tokenString
    int cursorPosition;           // index into tokenString of the next unread char
} TokenizerT;
// Translates the character that follows a backslash into the character the
// escape sequence stands for (e.g. 'n' -> newline, 't' -> tab).
// A character with no entry in the table is returned unchanged.
char escapeChar(char c) {
    static const struct {
        char letter; // character written after the backslash
        char value;  // character the escape sequence denotes
    } escapes[] = {
        { 'n',  '\n' },
        { 't',  '\t' },
        { 'v',  '\v' },
        { 'b',  '\b' },
        { 'r',  '\r' },
        { 'f',  '\f' },
        { 'a',  '\a' },
        { '\\', '\\' },
        { '"',  '"'  },
    };
    size_t k;

    for (k = 0; k < sizeof escapes / sizeof escapes[0]; k++) {
        if (escapes[k].letter == c) {
            return escapes[k].value;
        }
    }
    return c;
}
// Writes one character to stdout. The control characters listed in the table,
// plus backslash and double quote, are written as a bracketed hex code such as
// "[0x0a]"; every other character is written as-is.
void printChar(char c) {
    static const struct {
        char ch;          // character to be replaced on output
        const char *text; // bracketed hex representation that is printed
    } replacements[] = {
        { '\n', "[0x0a]" },
        { '\t', "[0x09]" },
        { '\v', "[0x0b]" },
        { '\b', "[0x08]" },
        { '\r', "[0x0d]" },
        { '\f', "[0x0c]" },
        { '\a', "[0x07]" },
        { '\\', "[0x5c]" },
        { '"',  "[0x22]" },
    };
    size_t k;

    for (k = 0; k < sizeof replacements / sizeof replacements[0]; k++) {
        if (replacements[k].ch == c) {
            printf("%s", replacements[k].text);
            return;
        }
    }
    putchar(c);
}
/*
 * Returns a newly allocated copy of str in which every two-character
 * backslash sequence has been replaced by the single character that
 * escapeChar() maps its second character to.
 *
 * A lone backslash at the very end of str has nothing to escape and is
 * dropped. The result is always NUL-terminated and never longer than str.
 *
 * Returns NULL if str is NULL or if memory cannot be allocated.
 * The caller owns the returned buffer and must free() it.
 */
char* escapeString(char* str) {
    size_t srcLength;
    size_t in = 0;  // read position in str
    size_t out = 0; // write position in result
    char *result;

    if (str == NULL) {
        return NULL;
    }

    // Escaping only ever shrinks the text, so the source length is enough.
    srcLength = strlen(str);
    result = malloc(srcLength + 1);
    if (result == NULL) {
        return NULL;
    }

    while (in < srcLength) {
        if (str[in] != '\\') {
            result[out++] = str[in++];
        } else if (in + 1 < srcLength) {
            // Backslash followed by a character: emit the mapped character
            // and consume both input characters.
            result[out++] = escapeChar(str[in + 1]);
            in += 2;
        } else {
            // Backslash is the last character: drop it.
            in++;
        }
    }
    result[out] = '\0';
    return result;
}
/*
 * Creates a tokenizer for the token string ts, split on any character found
 * in separators. Backslash escape sequences in both arguments are resolved
 * with escapeString() before they are stored.
 *
 * The tokenizer keeps its own copies of both strings; the arguments are not
 * retained. Release the result with TKDestroy().
 *
 * Returns NULL if either argument is NULL or if any allocation fails; in that
 * case nothing is leaked.
 */
TokenizerT *TKCreate(char *separators, char *ts) {
    TokenizerT *tokenizer;
    char *escapedSeps = NULL;
    char *escapedToks = NULL;

    if (separators == NULL || ts == NULL) {
        return NULL;
    }

    tokenizer = malloc(sizeof *tokenizer);
    if (tokenizer == NULL) {
        return NULL;
    }
    // Start from a fully defined state so the failure path can free safely.
    tokenizer->separators = NULL;
    tokenizer->tokenString = NULL;
    tokenizer->separatorStringLength = 0;
    tokenizer->tokenStringLength = 0;
    tokenizer->cursorPosition = 0;

    // Escape the control sequences inside the strings
    escapedSeps = escapeString(separators);
    escapedToks = escapeString(ts);
    if (escapedSeps == NULL || escapedToks == NULL) {
        goto fail;
    }

    // Copy into exactly-sized buffers: the escaped text can be shorter than
    // the buffer escapeString() allocated for it.
    tokenizer->separatorStringLength = strlen(escapedSeps);
    tokenizer->tokenStringLength = strlen(escapedToks);
    tokenizer->separators = malloc(tokenizer->separatorStringLength + 1);
    tokenizer->tokenString = malloc(tokenizer->tokenStringLength + 1);
    if (tokenizer->separators == NULL || tokenizer->tokenString == NULL) {
        goto fail;
    }
    memcpy(tokenizer->separators, escapedSeps, tokenizer->separatorStringLength + 1);
    memcpy(tokenizer->tokenString, escapedToks, tokenizer->tokenStringLength + 1);

    // Free temp strings
    free(escapedSeps);
    free(escapedToks);
    return tokenizer;

fail:
    free(escapedSeps);
    free(escapedToks);
    free(tokenizer->separators);
    free(tokenizer->tokenString);
    free(tokenizer);
    return NULL;
}
/*
 * Releases a tokenizer: frees its separator string, its token string, and
 * the tokenizer object itself. Passing NULL is a no-op, mirroring free().
 * The pointer must not be used after this call.
 */
void TKDestroy(TokenizerT *tk) {
    if (tk == NULL) {
        return;
    }
    free(tk->separators);
    free(tk->tokenString);
    free(tk);
}
// Reports whether c is absent from the tokenizer's separator set.
// Returns 1 when c is not a separator and 0 when it is one.
int charNotSeparator(char c, TokenizerT *tk) {
    const char *candidate = tk->separators;
    size_t remaining = tk->separatorStringLength;

    // Walk the separator string; the first match settles the answer.
    while (remaining-- > 0) {
        if (*candidate++ == c) {
            return 0;
        }
    }
    return 1;
}
/*
 * Returns the next token from the tokenizer as a newly allocated,
 * NUL-terminated string that the caller must free().
 *
 *  - If the cursor sits on a separator, an empty string "" is returned and
 *    the cursor moves past that one separator.
 *  - Otherwise the run of non-separator characters starting at the cursor is
 *    returned and the cursor moves past the token and the character that
 *    ended it.
 *  - When the cursor has reached the end of the token string, NULL is
 *    returned. NULL is also returned for a NULL tokenizer or when memory
 *    cannot be allocated; the cursor is not advanced in that case.
 */
char *TKGetNextToken(TokenizerT *tk) {
    size_t start;
    size_t total;
    size_t tokenLength = 0;
    char *nextToken;

    if (tk == NULL || tk->tokenString == NULL || tk->cursorPosition < 0) {
        return NULL;
    }

    start = (size_t) tk->cursorPosition;
    total = tk->tokenStringLength;
    if (start >= total) {
        return NULL; // nothing left to tokenize
    }

    // Measure the run of non-separator characters starting at the cursor.
    while (start + tokenLength < total
           && charNotSeparator(tk->tokenString[start + tokenLength], tk)) {
        tokenLength++;
    }

    // tokenLength == 0 means the cursor is on a separator; the copy below
    // then yields the empty string.
    nextToken = malloc(tokenLength + 1);
    if (nextToken == NULL) {
        return NULL;
    }
    memcpy(nextToken, tk->tokenString + start, tokenLength);
    nextToken[tokenLength] = '\0'; // terminator goes right after the last copied char

    // Skip the token (possibly empty) plus the single character that ended it.
    tk->cursorPosition += (int) (tokenLength + 1);
    return nextToken;
}
/*
 * Entry point. Expects two arguments: the separator characters and the
 * string to tokenize. Prints every non-empty token on its own line, with the
 * characters handled by printChar() shown as bracketed hex codes.
 *
 * Returns EXIT_SUCCESS after tokenizing, or EXIT_FAILURE when arguments are
 * missing or the tokenizer cannot be created.
 */
int main(int argc, char **argv) {
    TokenizerT *tokenizer;
    char *token;

    // argv[1] and argv[2] are only valid when at least two arguments were given.
    if (argc < 3) {
        fprintf(stderr, "Usage: %s <separators> <token string>\n",
                (argc > 0 && argv[0] != NULL) ? argv[0] : "tokenizer");
        return EXIT_FAILURE;
    }

    tokenizer = TKCreate(argv[1], argv[2]);
    if (tokenizer == NULL) {
        fprintf(stderr, "Error: could not create tokenizer\n");
        return EXIT_FAILURE;
    }

    while ((token = TKGetNextToken(tokenizer)) != NULL) {
        size_t length = strlen(token);
        size_t i;

        // Empty tokens mark separators and produce no output.
        if (length > 0) {
            for (i = 0; i < length; i++) {
                printChar(token[i]);
            }
            putchar('\n');
        }
        // Every returned token is heap-allocated, including empty ones.
        free(token);
    }

    TKDestroy(tokenizer);
    return EXIT_SUCCESS;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment