Last active
November 3, 2015 12:39
-
-
Save artemkin/d2685305014b8559917e to your computer and use it in GitHub Desktop.
Simple tokenizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <string> | |
#include <functional> | |
#include <ctype.h> | |
// A tokenizer is a stateful callable: each call yields the next token, and an
// empty string signals that the input has been exhausted.
using Tokenizer = std::function<std::string()>;

/// Builds a tokenizer that splits `i_str` into maximal runs of characters
/// sharing the same class: alphabetic, digit, punctuation, or "unknown"
/// (anything else, e.g. whitespace).
///
/// @param i_str  Text to tokenize. The tokenizer stores its own copy, so the
///               argument may be a temporary or may be destroyed before the
///               tokenizer is used.
/// @return A callable producing consecutive tokens; once the input is consumed
///         (or if it was empty to begin with) every call returns an empty string.
Tokenizer CreateTokenizer(const std::string& i_str)
{
    enum Type
    {
        eUnknown,
        eAlpha,
        eDigit,
        ePunct
    };

    auto classify = [](char ch)
    {
        // The <ctype.h> predicates have undefined behaviour for negative
        // arguments other than EOF, so widen through unsigned char first.
        const auto uch = static_cast<unsigned char>(ch);
        if (isalpha(uch))
            return eAlpha;
        if (isdigit(uch))
            return eDigit;
        if (ispunct(uch))
            return ePunct;
        return eUnknown;
    };

    std::string::size_type idx = 0;

    // Capture the text by value: the closure is returned to the caller and must
    // not reference a string whose lifetime it does not control.
    return [classify, idx, text = i_str]() mutable
    {
        if (idx >= text.size())
            return std::string();

        // The class of the first unread character defines the current run.
        const std::string::size_type start = idx;
        const Type runType = classify(text[start]);

        // Advance past every following character of the same class.
        while (idx < text.size() && classify(text[idx]) == runType)
            ++idx;

        return text.substr(start, idx - start);
    };
}
int main() { | |
std::string str = "trades=4;pur1=short;typ1=put;pur2=long;typ2=put;sym2=sym1;exp2=exp1;qty2=qty1;str2>str1;pur3=long;typ3=call;sym3=sym1;exp3=exp1;qty3=qty1;str3>str2;pur4=short;typ4=call;sym4=sym1;exp4=exp1;qty4=qty1;str4>str3;enforce=all;"; | |
auto getNextToken = CreateTokenizer(str); | |
for (;;) | |
{ | |
const auto& token = getNextToken(); | |
if (token.empty()) | |
{ | |
break; | |
} | |
std::cout << "token(" << token << ")" << std::endl; | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
output: