Last active
February 29, 2020 07:53
-
-
Save zambony/3094139c40813f3f55b73f1fdf28001a to your computer and use it in GitHub Desktop.
Better string separation in C
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Minimal boolean type for translation units that do not use <stdbool.h>.
 *
 * The definition is skipped when bool/true/false already exist: in C++,
 * after <stdbool.h> has been included (it defines
 * __bool_true_false_are_defined), and in C23 and later where they are
 * keywords. Without the guard, any of those situations turns this
 * typedef into a compile error.
 */
#if !defined(__cplusplus) && !defined(__bool_true_false_are_defined) && \
    !(defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L)
typedef enum BOOL
{
    false = 0,
    true = 1
} bool;
#endif
/** | |
* Tokenize a string using a custom separator and obey tags (such as chevrons or double quotes) | |
* | |
* When finished, remember to free each entry and then the array itself | |
* | |
* @param string The string to break apart | |
* @param separator The separator to break tokens apart by | |
* @param openTag The character which signals the start of a tagged argument | |
* @param closeTag The character which signals the end of a tagged argument | |
* @param bRemoveTag Whether or not to remove the tags from the final result | |
* @param numTokens Number of entries is assigned to this variable if not NULL | |
* | |
* @return The token array | |
*/ | |
char **explodeByTag(const char *string, | |
const char separator, | |
const char openTag, | |
const char closeTag, | |
bool bRemoveTag, | |
size_t *const numTokens) | |
{ | |
size_t tokensAllocated = 1; | |
size_t tokenCount = 0; | |
char **tokenList = calloc(tokensAllocated, sizeof(char *)); // allocate an array to store whole words per-index | |
char *buffer = (char *) malloc(CHUNK_SIZE); | |
ssize_t bufSize = CHUNK_SIZE; | |
bool bTag = false; | |
int i = 0; // which character are we reading from the string | |
int pos = 0; // current length of the read buffer, used to track where to append new characters | |
for (i = 0; string[i] != 0; i++) | |
{ | |
char read = string[i]; | |
if (tokenCount == tokensAllocated) | |
{ | |
tokensAllocated *= 2; // grow buffer more than needed for safety | |
tokenList = realloc(tokenList, tokensAllocated * sizeof(char *)); | |
} | |
// If we're not inside a tag... | |
if (!bTag) | |
{ | |
// If the current character is the opening tag... | |
if (read == openTag) | |
{ | |
// If they would like to keep the tag... | |
if (!bRemoveTag) | |
{ | |
// Concatenate the current character | |
buffer[pos++] = read; | |
} | |
// Set state to inside tag | |
bTag = true; | |
} | |
// We encountered a regular separator, concatenate the current buffer | |
else if (read == separator) | |
{ | |
// Terminate string, and reset buffer size | |
buffer[pos] = '\0'; | |
tokenList[tokenCount++] = strdup(trim(buffer)); | |
// Clear the contents of the buffer. After clear, reset buffer size and set the buffer to the default size | |
memset(buffer, 0, bufSize + 1); | |
bufSize = CHUNK_SIZE; | |
buffer = realloc(buffer, bufSize); | |
pos = 0; | |
} | |
else | |
{ | |
buffer[pos++] = read; | |
} | |
} | |
else | |
{ | |
if (read == closeTag) | |
{ | |
if (!bRemoveTag) | |
{ | |
buffer[pos++] = read; | |
} | |
bTag = false; | |
} | |
else | |
{ | |
buffer[pos++] = read; | |
} | |
} | |
// The length of the current buffer has exceeded the buffer size, reallocate! | |
if (pos == bufSize) | |
{ | |
bufSize += CHUNK_SIZE; | |
buffer = realloc(buffer, bufSize); | |
} | |
} | |
if (buffer[0] != 0) | |
{ | |
tokenList = realloc(tokenList, (tokenCount + 1) * sizeof(char *)); | |
buffer = realloc(buffer, pos + 1); | |
buffer[pos] = '\0'; | |
tokenList[tokenCount++] = strdup(trim(buffer)); | |
} | |
if (tokenCount == 0) // if we have no tokens just delete our list | |
{ | |
free(tokenList); | |
tokenList = NULL; | |
} | |
else // otherwise shrink to fit! | |
{ | |
tokenList = realloc(tokenList, tokenCount * sizeof(char *)); | |
} | |
*numTokens = tokenCount; | |
free(buffer); // clean the buffer out | |
return tokenList; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment