Skip to content

Instantly share code, notes, and snippets.

@mofosyne
Last active April 22, 2024 11:37
Show Gist options
  • Save mofosyne/81c94740c0f33259606afa823562914c to your computer and use it in GitHub Desktop.
Save mofosyne/81c94740c0f33259606afa823562914c to your computer and use it in GitHub Desktop.
An alternative to strtok with escaped-character support, for delimiting a string with a single character (e.g. CSV or PSV).
#include <stdio.h>
#include <string.h>
/**
 * Tokenise a string in place, splitting on delimiter characters while
 * honouring backslash-escaped delimiters (a strtok alternative intended for
 * single-character separated rows such as CSV or PSV).
 * https://gist.github.com/mofosyne/81c94740c0f33259606afa823562914c
 *
 * A backslash immediately followed by a delimiter character is replaced by
 * that literal delimiter and does not end the token. Every other backslash
 * (a trailing one, or one in front of a non-delimiter such as "\n" or "\\")
 * is copied through unchanged.
 *
 * Unlike strtok(), an empty field between two delimiters is returned as "".
 * An empty field at the very end of the string is not reported.
 *
 * @param str   Writable NUL-terminated string to start tokenising, or NULL to
 *              continue with the string passed on an earlier call.
 * @param delim NUL-terminated set of delimiter characters.
 * @return Pointer to the next token (inside the caller's buffer), or NULL
 *         when no tokens remain.
 *
 * Not reentrant: the resume position is kept in a static variable.
 */
char *strtok_escaped(char *str, const char *delim) {
    // Where a follow-up call with str == NULL picks up; NULL once exhausted.
    static char *resume_at = NULL;

    char *token_start = (str != NULL) ? str : resume_at;
    // Clear the saved position up front so that an empty or finished string
    // never leaves a pointer into an old buffer behind.
    resume_at = NULL;
    if (token_start == NULL || delim == NULL) {
        return NULL;
    }

    // Single pass with separate read/write cursors: removing an escape just
    // lets the write cursor lag behind, so no memmove of the tail is needed.
    char *read = token_start;
    char *write = token_start;
    while (*read != '\0') {
        if (*read == '\\' && read[1] != '\0' && strchr(delim, read[1]) != NULL) {
            // Escaped delimiter: drop the backslash and fall through to copy
            // the delimiter as an ordinary character. The next iteration then
            // inspects the character right after it.
            read++;
        } else if (strchr(delim, *read) != NULL) {
            // Unescaped delimiter ends the token.
            *write = '\0';
            resume_at = read + 1;
            return token_start;
        }
        // Only store when something was actually removed, so input without
        // delimiters or escapes is never written to.
        if (write != read) {
            *write = *read;
        }
        write++;
        read++;
    }

    // End of input: terminate the compacted token if it got shorter.
    if (write != read) {
        *write = '\0';
    }
    return (*token_start != '\0') ? token_start : NULL;
}
/**
 * Self-test driver for strtok_escaped().
 *
 * Each test case is copied into a scratch buffer (the tokenizer modifies its
 * input), tokenised on '|', and compared token by token against the expected
 * list. A case fails on any token mismatch, on too few tokens, or on surplus
 * tokens beyond the expected list.
 *
 * @return 0 when every case passes, 1 otherwise.
 */
int main(void) {
    enum { MAX_EXPECTED_TOKENS = 10 }; // Maximum of 10 tokens per test case

    // Define test cases
    typedef struct {
        const char *input;
        const char *expected_tokens[MAX_EXPECTED_TOKENS]; // NULL-terminated unless full
    } Test;

    const char *delim = "|";
    Test tests[] = {
        {"apple|banana|cherry", {"apple", "banana", "cherry", NULL}},
        {"apple\\|banana|cherry", {"apple|banana", "cherry", NULL}},
        {"apple|banana|cherry\\", {"apple", "banana", "cherry\\", NULL}},
        {"apple\\|banana|cherry\\", {"apple|banana", "cherry\\", NULL}},
        {"apple|banana\\|cherry", {"apple", "banana|cherry", NULL}},
        {"apple\\|banana\\|cherry", {"apple|banana|cherry", NULL}},
        {"apple\\\\|banana|cherry", {"apple\\|banana", "cherry", NULL}},
        {"apple\\|banana\\|cherry\\", {"apple|banana|cherry\\", NULL}},
        {"apple|ba\\nana|cherry", {"apple", "ba\\nana", "cherry", NULL}},
        {"", {NULL}} // Empty string test case
    };
    int num_tests = (int)(sizeof tests / sizeof tests[0]);
    int failed_tests = 0;

    // Iterate through each test case
    for (int i = 0; i < num_tests; i++) {
        printf("\nTest Case %d: '%s'\n", i, tests[i].input);
        int failed_sub_tests = 0;

        // Copy input string to a buffer for tokenization; the copy is bounded
        // so an oversized test input is reported instead of overflowing.
        char str[512];
        int copied = snprintf(str, sizeof str, "%s", tests[i].input);
        if (copied < 0 || (size_t)copied >= sizeof str) {
            printf("Input too long for test buffer\n");
            failed_sub_tests++;
        }

        // Tokenize the string and compare tokens with expected tokens
        int token_index = 0;
        char *token = strtok_escaped(str, delim);
        while (token != NULL && token_index < MAX_EXPECTED_TOKENS &&
               tests[i].expected_tokens[token_index] != NULL) {
            const char *expected = tests[i].expected_tokens[token_index];
            if (strcmp(token, expected) != 0) {
                printf("Token Mismatch - got '%s' but expecting '%s' - failed\n", token, expected);
                failed_sub_tests++;
            } else {
                printf("Token: '%s' - ok\n", token);
            }
            token_index++;
            token = strtok_escaped(NULL, delim);
        }

        // The counts disagree when expected tokens are left over (too few
        // produced) or the tokenizer still had a token to give (too many).
        int missing_tokens = token_index < MAX_EXPECTED_TOKENS &&
                             tests[i].expected_tokens[token_index] != NULL;
        if (missing_tokens || token != NULL) {
            printf("Incorrect number of tokens\n");
            failed_sub_tests++;
        }

        if (failed_sub_tests) {
            failed_tests++;
            printf("FAILED\n");
        } else {
            printf("PASSED\n");
        }
    }

    if (failed_tests > 0) {
        printf("\n%d test(s) failed.\n", failed_tests);
        return 1;
    }
    printf("\nAll tests passed.\n");
    return 0;
}
@mofosyne
Copy link
Author

mofosyne commented Apr 22, 2024

Running the above produces output like the following:

Test Case 0: 'apple|banana|cherry'
Token: 'apple' - ok
Token: 'banana' - ok
Token: 'cherry' - ok
PASSED

Test Case 1: 'apple\|banana|cherry'
Token: 'apple|banana' - ok
Token: 'cherry' - ok
PASSED

Test Case 2: 'apple|banana|cherry\'
Token: 'apple' - ok
Token: 'banana' - ok
Token: 'cherry\' - ok
PASSED

Test Case 3: 'apple\|banana|cherry\'
Token: 'apple|banana' - ok
Token: 'cherry\' - ok
PASSED

Test Case 4: 'apple|banana\|cherry'
Token: 'apple' - ok
Token: 'banana|cherry' - ok
PASSED

Test Case 5: 'apple\|banana\|cherry'
Token: 'apple|banana|cherry' - ok
PASSED

Test Case 6: 'apple\\|banana|cherry'
Token: 'apple\|banana' - ok
Token: 'cherry' - ok
PASSED

Test Case 7: 'apple\|banana\|cherry\'
Token: 'apple|banana|cherry\' - ok
PASSED

Test Case 8: 'apple|ba\nana|cherry'
Token: 'apple' - ok
Token: 'ba\nana' - ok
Token: 'cherry' - ok
PASSED

Test Case 9: ''
PASSED

All tests passed.

Additional note: I do not unescape any other sequences (such as \n), because in the intended application (specifically https://github.com/psv-format/psv.c ) I simply copy the resulting token into a JSON data field.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment