Skip to content

Instantly share code, notes, and snippets.

@abenkovskii
Created April 3, 2018 06:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save abenkovskii/c8a0e5084e399198c3bc50cef2602ac7 to your computer and use it in GitHub Desktop.
Save abenkovskii/c8a0e5084e399198c3bc50cef2602ac7 to your computer and use it in GitHub Desktop.
#include <algorithm>
#include <cctype>
#include <iostream>
#include <iterator>
#include <string>
#include <utility>
using std::string;
using std::istream;
using std::find;
using std::begin;
using std::end;
// A single lexical token: its kind plus an optional payload (identifier or
// string text, integer value, or real value, depending on the kind).
class Token {
public:
    // Every kind of token the lexer can produce.
    enum class Type
    {
        // keywords
        PROGRAM,
        INT,
        STRING,
        BOOL,
        REAL,
        IF,
        ELSE,
        CASE,
        OF,
        END,
        DO,
        WHILE,
        READ,
        WRITE,
        BREAK,
        NOT,
        AND,
        OR,
        TRUE,
        FALSE,
        // delimiters (single character)
        OPEN_CURLY,
        CLOSE_CURLY,
        SEMICOLON,
        COMMA,
        OPEN_ROUND,
        CLOSE_ROUND,
        ASSIGN,
        COLON,
        MUL,
        DIV,
        MOD,
        PLUS,
        MINUS,
        LT,
        GT,
        // delimiters (multiple characters)
        LE,
        GE,
        EQ,
        NEQ,
        // everything else
        ID,
        INT_CONST,
        STRING_CONST,
        REAL_CONST,
        SPACE,
        END_OF_FILE
    };

    // Payload-free token. Deliberately NOT explicit: next_token() returns bare
    // Token::Type values and relies on this implicit conversion.
    Token(Type t) : type_(t) {}

    // Token carrying text (identifier name or string literal contents).
    Token(Type t, std::string s) : type_(t), str_(std::move(s)) {}

    // Token carrying an integer value.
    Token(Type t, int i) : type_(t), int_(i) {}

    // Token carrying a real value.
    Token(Type t, double d) : type_(t), double_(d) {}

    // Kind of this token.
    Type type() const { return type_; }

    // Text payload; empty unless the token was built from a string.
    const std::string &str() const { return str_; }

    // Integer payload; 0 unless the token was built from an int.
    int int_value() const { return int_; }

    // Real payload; 0.0 unless the token was built from a double.
    double double_value() const { return double_; }

private:
    Type type_;
    // Payloads the chosen constructor does not set keep these defaults, so a
    // Token never holds (or copies) an indeterminate value.
    int int_ = 0;
    double double_ = 0.0;
    std::string str_;
};
namespace {
Token::Type kw_tok[] = {
Token::Type::PROGRAM,
Token::Type::INT,
Token::Type::STRING,
Token::Type::BOOL,
Token::Type::REAL,
Token::Type::IF,
Token::Type::ELSE,
Token::Type::CASE,
Token::Type::OF,
Token::Type::END,
Token::Type::DO,
Token::Type::WHILE,
Token::Type::READ,
Token::Type::WRITE,
Token::Type::BREAK,
Token::Type::NOT,
Token::Type::AND,
Token::Type::OR,
Token::Type::TRUE,
Token::Type::FALSE
};
string kw[] = {
"program",
"int",
"string",
"bool",
"real",
"if",
"else",
"case",
"of",
"end",
"do",
"while",
"read",
"write",
"break",
"not",
"and",
"or",
"true",
"false"
};
}
// XXX: stand-in error type. Lexing errors are reported by throwing the
// offending character code (possibly EOF) as a plain int.
using UnexpectedCharacter = int;

// Passes `c` through untouched unless it is EOF, in which case
// UnexpectedCharacter(EOF) is thrown instead of returning.
int no_eof(int c) {
    return c != EOF ? c : throw UnexpectedCharacter(c);
}
// True when `c` may appear in an identifier: a decimal digit or an unaccented
// Latin letter of either case. (Underscore is not accepted.)
bool is_id_char(int c) {
    if (std::isdigit(c))
        return true;
    const bool lower_case = c >= 'a' && c <= 'z';
    const bool upper_case = c >= 'A' && c <= 'Z';
    return lower_case || upper_case;
}
// True for the four characters the lexer treats as blank: space, horizontal
// tab, line feed and carriage return.
bool my_isspace(int c) {
    switch (c) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
        return true;
    default:
        return false;
    }
}
// XXX: can't find proof that one character putback is guarantied after reading
// XXX: are all this chars in the basic source char set?
Token next_token(istream &is) {
int c = is.get();
switch(c) {
case '{': return Token::Type::OPEN_CURLY;
case '}': return Token::Type::CLOSE_CURLY;
case ';': return Token::Type::SEMICOLON;
case ',': return Token::Type::COMMA;
case '(': return Token::Type::OPEN_ROUND;
case ')': return Token::Type::CLOSE_ROUND;
case ':': return Token::Type::COLON;
case '*': return Token::Type::MUL;
case '%': return Token::Type::MOD;
case '+': return Token::Type::PLUS;
case '-': return Token::Type::MINUS;
case EOF: return Token::Type::END_OF_FILE;
case '=':
if((c = no_eof(is.get())) == '=')
return Token::Type::EQ;
is.unget();
return Token::Type::ASSIGN;
case '<':
if((c = no_eof(is.get())) == '=')
return Token::Type::LE;
is.unget();
return Token::Type::LT;
case '>':
if((c = no_eof(is.get())) == '=')
return Token::Type::GE;
is.unget();
return Token::Type::GT;
case '!':
if((c = no_eof(is.get())) == '=')
return Token::Type::NEQ;
throw UnexpectedCharacter(c);
// "/*/" -- not a comment
// "/* * */" -- comment
// "/* * / */" -- comment
// "/* **/" -- comment
case '/':
if((c = no_eof(is.get())) != '*') {
is.unget();
return Token::Type::DIV;
}
goto comment;
comment:
if(no_eof(is.get()) == '*')
goto asterix_found;
else
goto comment;
asterix_found:
switch(no_eof(is.get())) {
case '/':
goto done;
case '*':
goto asterix_found;
default:
goto comment;
}
done:
return Token::Type::SPACE;
case '"':
{
string s;
while((c = no_eof(is.get())) != '"')
s.push_back(c);
return Token(Token::Type::STRING_CONST, s);
}
default:
if(my_isspace(c)) {
while(my_isspace(c = is.get()))
continue;
if(c != EOF)
is.unget();
return Token::Type::SPACE;
} else if(isdigit(c)) {
// TODO: when writing parser don't forget that constants can have signs
// bonus points: "- /* comment */ 3.14" is technically not a constant
string s;
for(;isdigit(c); c=no_eof(is.get()))
s.push_back(c);
if(c != '.') {
is.unget();
return Token(Token::Type::INT_CONST, stoi(s));
}
s.push_back('.');
// XXX "314." should not be valid
for(c=no_eof(is.get()); isdigit(c); c=no_eof(is.get()))
s.push_back(c);
is.unget();
return Token(Token::Type::REAL_CONST, stod(s));
} else if (is_id_char(c)) {
string s;
for(;is_id_char(c); c=no_eof(is.get()))
s.push_back(c);
is.unget();
auto k = find(begin(kw), end(kw), s);
if(k != end(kw))
return Token(kw_tok[k - begin(kw)]);
return Token(Token::Type::ID, s);
}
throw UnexpectedCharacter(c);
}
}
// Placeholder entry point: it does no work and exits with status 0
// (flowing off the end of main is equivalent to `return 0;`).
int main()
{
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment