Skip to content

Instantly share code, notes, and snippets.

@vittorioromeo
Created July 18, 2013 20:54
Show Gist options
  • Save vittorioromeo/6032999 to your computer and use it in GitHub Desktop.
Save vittorioromeo/6032999 to your computer and use it in GitHub Desktop.
Temp tokenizer
// Token categories produced by the demo tokenizer configured in main().
enum class TokenType
{
Whitespace, // Sequence of blanks, newlines and tabs
Keyword, // Hardcoded words
Identifier, // String of letters + digits starting with a letter
Number, // String of digits
ParenthesisRoundOpen, // '('
ParenthesisRoundClose, // ')'
ParenthesisSquareOpen, // '['
ParenthesisSquareClose, // ']'
ParenthesisCurlyOpen, // '{'
ParenthesisCurlyClose, // '}'
ParenthesisAngleOpen, // '<'
ParenthesisAngleClose, // '>'
Semicolon, // ';'
Operator // Run of characters taken from the operator table in main()
};
// Returns the fully qualified enumerator name of `type` as text
// (e.g. TokenType::Number -> "TokenType::Number"), for logging.
//
// A value that matches none of the enumerators (possible through a cast)
// yields "TokenType::Unknown" instead of flowing off the end of the function,
// which would be undefined behaviour.
string getTokenStr(TokenType type)
{
	// Deliberately no `default:` label, so the compiler can still warn when a
	// newly added enumerator is missing from this switch.
	switch(type)
	{
		case TokenType::Whitespace: return "TokenType::Whitespace";
		case TokenType::Keyword: return "TokenType::Keyword";
		case TokenType::Identifier: return "TokenType::Identifier";
		case TokenType::Number: return "TokenType::Number";
		case TokenType::ParenthesisRoundOpen: return "TokenType::ParenthesisRoundOpen";
		case TokenType::ParenthesisRoundClose: return "TokenType::ParenthesisRoundClose";
		case TokenType::ParenthesisSquareOpen: return "TokenType::ParenthesisSquareOpen";
		case TokenType::ParenthesisSquareClose: return "TokenType::ParenthesisSquareClose";
		case TokenType::ParenthesisCurlyOpen: return "TokenType::ParenthesisCurlyOpen";
		case TokenType::ParenthesisCurlyClose: return "TokenType::ParenthesisCurlyClose";
		case TokenType::ParenthesisAngleOpen: return "TokenType::ParenthesisAngleOpen";
		case TokenType::ParenthesisAngleClose: return "TokenType::ParenthesisAngleClose";
		case TokenType::Semicolon: return "TokenType::Semicolon";
		case TokenType::Operator: return "TokenType::Operator";
	}

	// Only reached for an out-of-range value.
	return "TokenType::Unknown";
}
// Demo driver: configures a Tokenizer<TokenType> with word and single-character
// rules, tokenizes a hard-coded snippet and logs every token's text together
// with the name of its type.
int main()
{
	using MyTokenizer = Tokenizer<TokenType>;
	MyTokenizer tokenizer;

	vector<string> keywords { "if", "else", "for", "while", "struct", "class", "enum" };

	// The mapped int is not read anywhere in this function
	// (presumably an operator precedence level -- TODO confirm).
	unordered_map<string, int> operators { {"*", 0}, {"/", 0}, {"%", 0}, {"+", 1}, {"-", 1}, {"<", 2}, {">", 2}, {"<=", 2}, {">=", 2}, {"==", 3}, {"!=", 3}, };

	// The <cctype> classifiers have undefined behaviour when handed a negative
	// value other than EOF, which a plain `char` can be. Every call therefore
	// goes through `unsigned char`, and the int result is turned into a bool.

	// Whitespace: a run of whitespace characters.
	tokenizer.createRuleWord
	(
		[](const string&) { return TokenType::Whitespace; },
		[](char mChar) { return isspace(static_cast<unsigned char>(mChar)) != 0; },
		[](char mChar) { return isspace(static_cast<unsigned char>(mChar)) != 0; }
	);

	// Keyword / Identifier: a letter followed by letters and digits; the
	// finished word is a Keyword when it appears in `keywords`.
	tokenizer.createRuleWord
	(
		[=](const string& mToken) { return contains(keywords, mToken) ? TokenType::Keyword : TokenType::Identifier; },
		[](char mChar) { return isalpha(static_cast<unsigned char>(mChar)) != 0; },
		[](char mChar) { return isalnum(static_cast<unsigned char>(mChar)) != 0; }
	);

	// Number: a run of digits.
	tokenizer.createRuleWord
	(
		[](const string&) { return TokenType::Number; },
		[](char mChar) { return isdigit(static_cast<unsigned char>(mChar)) != 0; },
		[](char mChar) { return isdigit(static_cast<unsigned char>(mChar)) != 0; }
	);

	// Operator: starts with the first character of any known operator and
	// continues while the character occurs in any known operator.
	tokenizer.createRuleWord
	(
		[](const string&) { return TokenType::Operator; },
		[=](char mChar) { for(const auto& o : operators) if(o.first[0] == mChar) return true; return false; },
		[=](char mChar) { for(const auto& o : operators) if(contains(o.first, mChar)) return true; return false; }
	);

	tokenizer.createRuleChar(TokenType::ParenthesisRoundOpen, '(');
	tokenizer.createRuleChar(TokenType::ParenthesisRoundClose, ')');
	tokenizer.createRuleChar(TokenType::ParenthesisSquareOpen, '[');
	tokenizer.createRuleChar(TokenType::ParenthesisSquareClose, ']');
	tokenizer.createRuleChar(TokenType::ParenthesisCurlyOpen, '{');
	tokenizer.createRuleChar(TokenType::ParenthesisCurlyClose, '}');

	// NOTE(review): '<' and '>' also start the Operator word rule above, and
	// Tokenizer::process tries word rules before char rules, so these two
	// rules never fire -- angle brackets are reported as TokenType::Operator.
	tokenizer.createRuleChar(TokenType::ParenthesisAngleOpen, '<');
	tokenizer.createRuleChar(TokenType::ParenthesisAngleClose, '>');

	tokenizer.createRuleChar(TokenType::Semicolon, ';');

	tokenizer.setSource("if(BananaPhone == Blah) { mmh(12345); } else { templateFailsToTokenize<TypeFFF>(hi); }");
	tokenizer.process();

	for(const auto& t : tokenizer.getTokens())
	{
		log("\t\t\t\t" + t.getStr(), getTokenStr(t.getType()));
	}

	return 0;
}
#ifndef SSVU_TOKENIZER
#define SSVU_TOKENIZER
#include <string>
#include <vector>
#include <unordered_map>
#include <functional>
#include "SSVUtils/Utils/UtilsContainers.h"
#include "SSVUtils/Log/Log.h"
#include "SSVUtils/Global/Typedefs.h"
namespace ssvu
{
	// A single lexical token: a user-defined type tag `T` plus the exact
	// source text the token covers.
	template<typename T> class Token
	{
		private:
			T type;
			std::string str;

		public:
			Token(T mType, const std::string& mStr) : type{mType}, str{mStr} { }

			// Type tag assigned by the rule that produced this token.
			inline const T& getType() const { return type; }

			// Source text covered by this token.
			inline const std::string& getStr() const { return str; }
	};

	// Rule-driven tokenizer. `T` is the user's token-type tag (usually an enum).
	//
	// Two kinds of rules exist:
	//  * word rules  - match a run of characters described by two predicates;
	//  * char rules  - match exactly one specific character.
	//
	// At every position word rules are tried first (in registration order),
	// then char rules (in registration order). The first rule that matches
	// wins. If no rule matches, tokenizing stops at that position and the
	// tokens collected so far are kept.
	template<typename T> class Tokenizer
	{
		public:
			// Multi-character rule.
			//  getType      - maps the matched text to its token type.
			//  isValidStart - true if a character may begin a token of this rule.
			//  isValid      - true if a character may continue the token.
			struct RuleWord
			{
				std::function<T(const std::string&)> getType;
				std::function<bool(char)> isValidStart, isValid;

				RuleWord(std::function<T(const std::string&)> mGetType, std::function<bool(char)> mIsValidStart, std::function<bool(char)> mIsValid) : getType(mGetType), isValidStart(mIsValidStart), isValid(mIsValid) { }
			};

			// Single-character rule: `tokenChar` always yields a token of `type`.
			struct RuleChar
			{
				T type;
				char tokenChar;

				RuleChar(T mType, char mTokenChar) : type{mType}, tokenChar{mTokenChar} { }
			};

		private:
			std::vector<RuleChar> ruleChars;
			std::vector<RuleWord> ruleWords;
			std::vector<Token<T>> tokens;
			std::string source;
			std::string::iterator itr, tokenStart; // Cursor and start of the token being scanned
			bool finished{false};

			// Moves the cursor one character forward and flags the end of input.
			void advance() { ++itr; if(itr >= std::end(source)) finished = true; }

			// Appends a token of type `mTokenType` covering [tokenStart, itr).
			void emitToken(T mTokenType) { tokens.emplace_back(mTokenType, std::string(tokenStart, itr)); }

			// Tries every word rule at the cursor; on the first match consumes
			// the word, emits its token and returns true.
			// Precondition: the cursor is not at the end of `source`.
			bool tryRuleWords()
			{
				for(const auto& r : ruleWords)
				{
					if(!r.isValidStart(*itr)) continue;

					// The start character is always consumed, so a rule whose
					// `isValid` rejects its own start character still makes
					// progress instead of emitting empty tokens forever.
					advance();

					// Check `finished` before dereferencing: the cursor may
					// now sit at the end of `source`.
					while(!finished && r.isValid(*itr)) advance();

					const std::string text(tokenStart, itr);
					tokens.emplace_back(r.getType(text), text);
					return true;
				}

				return false;
			}

			// Tries every char rule at the cursor; on the first match consumes
			// the character, emits its token and returns true.
			// Precondition: the cursor is not at the end of `source`.
			bool tryRuleChars()
			{
				for(const auto& r : ruleChars)
				{
					if(r.tokenChar != *itr) continue;

					advance();
					emitToken(r.type);
					return true;
				}

				return false;
			}

		public:
			// Replaces the text to tokenize. Call process() afterwards.
			inline void setSource(const std::string& mSource) { source = mSource; }

			// Tokens produced by the most recent process() call.
			inline const std::vector<Token<T>>& getTokens() const { return tokens; }

			// Registers a char rule; arguments are forwarded to RuleChar's constructor.
			template<typename... TArgs> inline void createRuleChar(TArgs&&... mArgs) { ruleChars.emplace_back(std::forward<TArgs>(mArgs)...); }

			// Registers a word rule; arguments are forwarded to RuleWord's constructor.
			template<typename... TArgs> inline void createRuleWord(TArgs&&... mArgs) { ruleWords.emplace_back(std::forward<TArgs>(mArgs)...); }

			// Tokenizes the current source from the beginning.
			//
			// All state of a previous run is discarded first, so the function
			// can be called repeatedly (e.g. after another setSource()).
			// An empty source produces no tokens. A character that no rule
			// accepts stops the run; tokens found before it remain available.
			void process()
			{
				tokens.clear();
				itr = std::begin(source);
				tokenStart = itr;

				// Never dereference the cursor of an empty source.
				finished = (itr == std::end(source));

				while(!finished)
				{
					tokenStart = itr;

					// Only a real match counts as progress; otherwise stop
					// rather than loop forever on the unmatched character.
					if(!tryRuleWords() && !tryRuleChars()) finished = true;
				}
			}
	};
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment