Skip to content

Instantly share code, notes, and snippets.

@zambony
Last active February 29, 2020 07:53
Show Gist options
  • Save zambony/3094139c40813f3f55b73f1fdf28001a to your computer and use it in GitHub Desktop.
Save zambony/3094139c40813f3f55b73f1fdf28001a to your computer and use it in GitHub Desktop.
Better string separation in C
/*
 * Minimal boolean type: `false` is 0 and `true` is 1.
 *
 * These names collide with the macros from <stdbool.h> (and with the C23
 * keywords), so this definition must not be combined with either.
 */
enum BOOL
{
    false = 0,
    true = 1
};
typedef enum BOOL bool;
/**
 * Append a heap-allocated copy of trim(text) to a growable token array.
 *
 * The array capacity is doubled whenever it is full. On failure nothing is
 * appended and the array, count and capacity are left untouched, so the
 * caller can still release everything it owns.
 *
 * @param tokenList       Address of the token array (may be moved by realloc)
 * @param tokenCount      Address of the number of tokens currently stored
 * @param tokensAllocated Address of the current capacity of the array
 * @param text            NUL-terminated text to trim and copy
 *
 * @return 0 on success, -1 if an allocation failed
 */
static int explodePushToken(char ***tokenList,
                            size_t *tokenCount,
                            size_t *tokensAllocated,
                            char *text)
{
    if (*tokenCount == *tokensAllocated)
    {
        size_t grownCapacity = *tokensAllocated * 2;
        // Use a temporary so the original array is not lost if realloc fails
        char **grownList = realloc(*tokenList, grownCapacity * sizeof *grownList);

        if (grownList == NULL)
        {
            return -1;
        }

        *tokenList = grownList;
        *tokensAllocated = grownCapacity;
    }

    // NOTE(review): trim() is defined elsewhere; presumably it strips surrounding
    // whitespace and returns a pointer into `text` - confirm it does not allocate.
    char *copy = strdup(trim(text));

    if (copy == NULL)
    {
        return -1;
    }

    (*tokenList)[(*tokenCount)++] = copy;
    return 0;
}

/**
 * Tokenize a string using a custom separator and obey tags (such as chevrons or double quotes)
 *
 * Separators that appear between openTag and closeTag do not split the token.
 * A token is emitted for every separator found outside a tag, even when
 * nothing was collected since the previous one; text left over at the end of
 * the string is emitted as a final token only if it is non-empty. If the
 * string ends while a tag is still open, the text gathered so far is still
 * emitted. Each token is passed through trim() before being copied.
 *
 * When finished, remember to free each entry and then the array itself
 *
 * @param string The string to break apart (NULL yields NULL and zero tokens)
 * @param separator The separator to break tokens apart by
 * @param openTag The character which signals the start of a tagged argument
 * @param closeTag The character which signals the end of a tagged argument
 * @param bRemoveTag Whether or not to remove the tags from the final result
 * @param numTokens Number of entries is assigned to this variable if not NULL
 *
 * @return The token array, or NULL when there are no tokens or when a memory
 *         allocation failed (in both cases *numTokens is set to 0)
 */
char **explodeByTag(const char *string,
                    const char separator,
                    const char openTag,
                    const char closeTag,
                    bool bRemoveTag,
                    size_t *const numTokens)
{
    size_t tokensAllocated = 1;
    size_t tokenCount = 0;
    char **tokenList = NULL; // array storing one whole token per index
    char *buffer = NULL;     // scratch space for the token being assembled
    size_t bufSize = CHUNK_SIZE; // assumes CHUNK_SIZE is a positive size - defined elsewhere
    size_t pos = 0;          // characters collected so far for the current token
    bool bTag = false;       // are we currently between openTag and closeTag?

    if (string == NULL)
    {
        goto fail;
    }

    tokenList = calloc(tokensAllocated, sizeof *tokenList);
    buffer = malloc(bufSize);
    if (tokenList == NULL || buffer == NULL)
    {
        goto fail;
    }

    // Invariant at the top of every iteration: pos < bufSize, so there is
    // always room for one more character or for the terminating NUL.
    for (size_t i = 0; string[i] != '\0'; i++)
    {
        const char read = string[i];
        bool bKeep = true; // should this character be copied into the token?

        if (!bTag)
        {
            // The opening tag takes precedence over the separator
            if (read == openTag)
            {
                bTag = true;
                bKeep = !bRemoveTag;
            }
            else if (read == separator)
            {
                // Terminate and store the token, then start collecting a new one.
                // The buffer is simply reused; its stale contents are never read
                // because every token is re-terminated at `pos` before use.
                buffer[pos] = '\0';
                if (explodePushToken(&tokenList, &tokenCount, &tokensAllocated, buffer) != 0)
                {
                    goto fail;
                }
                pos = 0;
                continue;
            }
        }
        else if (read == closeTag)
        {
            bTag = false;
            bKeep = !bRemoveTag;
        }

        if (!bKeep)
        {
            continue;
        }

        buffer[pos++] = read;

        // The buffer is full: grow it so the invariant above keeps holding
        if (pos == bufSize)
        {
            char *grownBuffer = realloc(buffer, bufSize + CHUNK_SIZE);

            if (grownBuffer == NULL)
            {
                goto fail;
            }

            buffer = grownBuffer;
            bufSize += CHUNK_SIZE;
        }
    }

    // Flush whatever is left after the last separator. Test `pos`, not
    // buffer[0]: the buffer may never have been written to.
    if (pos > 0)
    {
        buffer[pos] = '\0';
        if (explodePushToken(&tokenList, &tokenCount, &tokensAllocated, buffer) != 0)
        {
            goto fail;
        }
    }

    free(buffer);

    if (tokenCount == 0) // if we have no tokens just delete our list
    {
        free(tokenList);
        tokenList = NULL;
    }
    else // otherwise shrink to fit!
    {
        char **fittedList = realloc(tokenList, tokenCount * sizeof *tokenList);

        // If shrinking fails the larger array is still valid, so keep it
        if (fittedList != NULL)
        {
            tokenList = fittedList;
        }
    }

    if (numTokens != NULL)
    {
        *numTokens = tokenCount;
    }
    return tokenList;

fail:
    // Release everything gathered so far; free(NULL) is a no-op
    for (size_t t = 0; t < tokenCount; t++)
    {
        free(tokenList[t]);
    }
    free(tokenList);
    free(buffer);

    if (numTokens != NULL)
    {
        *numTokens = 0;
    }
    return NULL;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment