Created
June 14, 2017 15:37
-
-
Save bit-hack/f816a9b6f971ef4198b7aadd0bed3759 to your computer and use it in GitHub Desktop.
A very small expression tokenizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#include <string>
#include <vector>
// Returns true when `ch` is one of the single-character operator or
// parenthesis symbols the tokenizer emits as a token of its own:
// ( ) + - / * % & |
static bool is_operator(const char ch) {
  switch (ch) {
  case '(':
  case ')':
  case '+':
  case '-':
  case '/':
  case '*':
  case '%':
  case '&':
  case '|':
    return true;
  default:
    return false;
  }
}
// Returns true when `ch` may appear inside a value token: a lower- or
// upper-case letter, a decimal digit, '$' or '_'.
// The range comparisons rely on letters and digits being contiguous in the
// execution character set (true for ASCII).
static bool is_value(const char ch) {
  const bool is_lower = ch >= 'a' && ch <= 'z';
  const bool is_upper = ch >= 'A' && ch <= 'Z';
  const bool is_digit = ch >= '0' && ch <= '9';
  return is_lower || is_upper || is_digit || ch == '$' || ch == '_';
}
// Returns true when `ch` is a space, horizontal tab, carriage return or
// line feed. Other characters (including '\v' and '\f') are not treated as
// whitespace.
static bool is_whitespace(const char ch) {
  return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
}
static size_t tokenize(std::string input, std::vector<std::string> &out) { | |
out.clear(); | |
const char *h = input.c_str(); | |
const char *t = input.c_str(); | |
for (; *h != '\0'; ++h) { | |
const char ch = *h; | |
// if head == tail | |
if (h == t) { | |
// skip over whitespaces | |
if (is_whitespace(ch)) { | |
++t; | |
continue; | |
} | |
// push operators immediately | |
if (is_operator(ch)) { | |
out.push_back(std::string(1, ch)); | |
t = h + 1; | |
continue; | |
} | |
} | |
// if head != tail | |
else { | |
// non value types signal push point | |
if (!is_value(ch)) { | |
std::string tok = std::string(t, h); | |
out.push_back(std::move(tok)); | |
t = h; | |
h -= 1; | |
} | |
} | |
} | |
// push any remaining tokens | |
if (h != t) { | |
out.push_back(std::string{t, h}); | |
} | |
// return number of parsed tokens | |
return out.size(); | |
} | |
// Test helper: checks that the actual value `in` equals the expected value
// `exp`. On a mismatch it reports both values on stderr and terminates the
// program.
//
// The assert gives the usual diagnostic in debug builds; the explicit abort
// after it ensures a mismatch is still fatal when NDEBUG compiles the assert
// away.
void expect(size_t exp, size_t in) {
  if (exp == in) {
    return;
  }
  fprintf(stderr, "expect failed: expected %zu, got %zu\n", exp, in);
  assert(!"Fail");
  abort();
}
// Self-test: tokenizes a handful of sample expressions and checks the number
// of tokens produced for each. `out` is reused across calls; tokenize clears
// it every time.
int main() {
  std::vector<std::string> out;
  expect( 9, tokenize(std::string{"$thing + 4*( $var - 0x2345)"}, out));
  expect( 3, tokenize(std::string{"$thing + 1234"}, out));
  expect( 5, tokenize(std::string{"$aa ++ -0x3458"}, out));
  expect( 4, tokenize(std::string{"$a $b_c 1234 0xsdf "}, out));
  expect( 2, tokenize(std::string{" + - "}, out));
  expect(11, tokenize(std::string{"((1234+45)-1234)*10"}, out));
  return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment