Skip to content

Instantly share code, notes, and snippets.

@lascar-pacagi lascar-pacagi/lexer.cpp
Last active Dec 31, 2019

Embed
What would you like to do?
#include <iostream>
#include <vector>
#include <map>
#include <variant>
#include <exception>
#include <fstream>
using namespace std;
void debug(auto f) {
#ifdef DEBUG
f();
#endif
}
struct position {
int lnum = 0;
int cnum = 0;
int bol = 0;
};
ostream& operator<<(ostream& os, const position& pos) {
os << "(line: " << pos.lnum + 1 << ", char: " << pos.cnum - pos.bol + 1 << ")";
return os;
}
class lexbuf {
/*
0 lex_buffer_len - 1 lex_buffer.size() - 1
v v
+----------+------------------------------------------+----------------------+
| junk | valid data | junk |
+----------+------------------------------------------+----------------------+
^ ^ ^
lex_start_pos lex_last_pos lex_curr_pos
*/
static constexpr char eof = 128;
int lex_start_pos = 0;
int lex_last_pos = 0;
int lex_curr_pos = 0;
int lex_buffer_len = 0;
position lex_start_p;
position lex_last_p;
position lex_curr_p;
istream* input;
bool owns;
vector<char> lex_buffer;
static constexpr int refill_size = 8;
char aux_buffer[refill_size];
void refill() {
debug([]{cout << "refill" << endl;});
debug([this]{print_buffer();});
input->read(aux_buffer, refill_size);
int n = input->gcount();
debug([=]{cout << n << endl;});
if (input->eof()) {
aux_buffer[n++] = eof;
}
if (lex_buffer_len + n > static_cast<int>(lex_buffer.size())) {
// not enough space at the end of buffer
debug([]{cout << "not enough space" << endl;});
if (lex_buffer_len - lex_start_pos + n <= static_cast<int>(lex_buffer.size())) {
// enough space at the beginning of buffer
debug([]{cout << "enough space at beginning" << endl;});
copy(begin(lex_buffer) + lex_start_pos,
begin(lex_buffer) + lex_buffer_len,
begin(lex_buffer));
} else {
// need to resize buffer
debug([]{cout << "need to resize" << endl;});
lex_buffer.resize(lex_buffer.size() + n);
debug([&]{cout << "lex_buffer capacity " << lex_buffer.capacity() << endl;});
copy(begin(lex_buffer) + lex_start_pos,
begin(lex_buffer) + lex_buffer_len,
begin(lex_buffer));
}
int s = lex_start_pos;
lex_start_pos = 0;
lex_last_pos -= s;
lex_curr_pos -= s;
lex_buffer_len -= s;
}
copy(begin(aux_buffer),
begin(aux_buffer) + n,
begin(lex_buffer) + lex_buffer_len);
lex_buffer_len += n;
debug([this]{print_buffer();});
debug([]{cout << "end refill" << endl;});
}
void close() {
if (owns) delete input;
}
void set_exceptions() {
input->exceptions(istream::badbit);
}
public:
lexbuf(istream& is) : input(&is), owns(false) {
set_exceptions();
}
lexbuf(istream* is) : input(is), owns(true) {
set_exceptions();
}
lexbuf(const lexbuf&) = delete;
lexbuf& operator=(const lexbuf&) = delete;
lexbuf(lexbuf&&) = delete;
lexbuf& operator=(lexbuf&&) = delete;
void set_input(istream& is) {
close();
input = &is;
owns = false;
set_exceptions();
}
void set_input(istream* is) {
close();
input = is;
owns = true;
set_exceptions();
}
~lexbuf() {
close();
}
char next() {
if (lex_curr_pos == lex_buffer_len) {
refill();
}
if (lex_buffer[lex_curr_pos] == eof) return eof;
char res = lex_buffer[lex_curr_pos];
debug([this]{print_buffer();});
lex_curr_pos++;
lex_curr_p.cnum++;
return res;
}
string lexeme() const {
return {begin(lex_buffer) + lex_start_pos,
begin(lex_buffer) + lex_curr_pos};
}
position lexeme_start_p() const {
return lex_start_p;
}
position lexeme_end_p() const {
return lex_curr_p;
}
void set_start_position() {
lex_start_pos = lex_curr_pos;
lex_start_p = lex_curr_p;
}
void save_position() {
lex_last_pos = lex_curr_pos;
lex_last_p = lex_curr_p;
}
void backup_to_last_position() {
lex_curr_pos = lex_last_pos;
lex_curr_p = lex_last_p;
}
void newline() {
lex_curr_p.lnum++;
lex_curr_p.bol = lex_curr_p.cnum;
}
void print_buffer() {
string start;
string last;
string current;
for (int i = 0; i < lex_buffer_len; i++) {
int n = 0;
char c = lex_buffer[i];
if (c == '\n') n += 2, cout << "\\n";
else if (c == '\t') n += 2, cout << "\\t";
else if (c == eof) n += 3, cout << "eof";
else n++, cout << (c == ' ' ? '_' : c);
if (i < lex_start_pos) start += string(n, ' ');
if (i < lex_last_pos) last += string(n, ' ');
if (i < lex_curr_pos) current += string(n, ' ');
}
cout << "\n";
cout << start << "^ start pos\n";
cout << last << "^ last pos\n";
cout << current << "^ current pos\n";
}
};
enum class kind {
let, letrec, integer, identifier, eof,
};
struct token {
enum kind kind;
variant<string, int> attribute;
position start;
position end;
};
ostream& operator<<(ostream& os, const token& token) {
static const vector<string> kind_to_string {"LET", "LETREC", "INT", "ID", "EOF"};
os << "token(" << kind_to_string[static_cast<int>(token.kind)];
if (token.kind == kind::integer) {
os << ", " << get<int>(token.attribute);
} else {
os << ", " << get<string>(token.attribute);
}
os << ", start: " << token.start;
os << ", end: " << token.end << ")";
return os;
}
class lexical_error : public exception {
};
struct dfa {
using state = int;
static constexpr state initial_state = 0;
static constexpr state sink_state = 15;
private:
vector<int> char_to_index {21,21,21,21,21,21,21,21,21,18,19,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,17,21,21,21,21,21,21,21,21,21,16,21,21,21,21,15,5,6,7,8,9,10,11,12,13,14,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,4,21,1,21,21,21,21,21,21,0,21,21,21,21,21,3,21,2,21,21,21,21,21,21,21,21,21,21,21,20};
vector<vector<state>> transitions
{
// l e t r c 0 1 2 3 4 5 6 7 8 9 / * ' ' \t \n eof others
/*0*/ { 7, 13, 13, 13, 13, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 1, 15, 5, 5, 5, 14, 15, },
/*1*/ { 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 2, 15, 15, 15, 15, 15, },
/*2*/ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 15, 15, },
/*3*/ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 3, 2, 2, 2, 15, 15, },
/*4*/ { 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, },
/*5*/ { 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 5, 5, 5, 15, 15, },
/*6*/ { 15, 15, 15, 15, 15, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 15, 15, 15, 15, 15, 15, 15, },
/*7*/ { 13, 8, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, },
/*8*/ { 13, 13, 9, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, },
/*9*/ { 13, 13, 13, 10, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, },
/*10*/{ 13, 11, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, },
/*11*/{ 13, 13, 13, 13, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, },
/*12*/{ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, },
/*13*/{ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, },
/*14*/{ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, },
/*15*/{ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, },
};
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
vector<bool> accepting { false, false, false, false, true, true, true, true, true, true, true, true, true, true, true, false };
map<state, token> state_to_token
{
{6, {kind::integer}},
{7, {kind::identifier}},
{8, {kind::identifier}},
{9, {kind::let}},
{10, {kind::identifier}},
{11, {kind::identifier}},
{12, {kind::letrec}},
{13, {kind::identifier}},
{14, {kind::eof}},
};
public:
state transition(state s, char c) {
return transitions[s][char_to_index[static_cast<uint8_t>(c)]];
}
bool is_accepting(state s) {
return accepting[s];
}
bool ignore(state s) {
return state_to_token.count(s) == 0;
}
token get_token(state s, const string& lexeme,
const position& lexeme_start, const position& lexeme_end) {
token token = state_to_token[s];
token.start = lexeme_start;
token.end = lexeme_end;
switch (token.kind) {
case kind::integer:
token.attribute = stoi(lexeme);
break;
case kind::eof:
token.attribute = "eof";
break;
case kind::identifier:
case kind::let:
case kind::letrec:
token.attribute = lexeme;
break;
}
return token;
}
};
template <typename DFA>
class scanner {
DFA dfa;
lexbuf& lbuf;
public:
scanner(const DFA& dfa, lexbuf& lbuf) : dfa(dfa), lbuf(lbuf) {}
token next() {
typename DFA::state state = DFA::initial_state;
typename DFA::state last_final_state = DFA::sink_state;
lbuf.set_start_position();
while (true) {
char c = lbuf.next();
debug([=]{cout << "next: " << c << endl;});
if (c == '\n') lbuf.newline();
state = dfa.transition(state, c);
debug([=]{cout << "state: " << state << endl;});
if (dfa.is_accepting(state)) {
last_final_state = state;
lbuf.save_position();
} else if (state == DFA::sink_state) {
if (last_final_state == DFA::sink_state) throw lexical_error{};
state = last_final_state;
lbuf.backup_to_last_position();
if (dfa.ignore(state)) return debug([]{cout << "ignore" << endl;}), next();
return dfa.get_token(state, lbuf.lexeme(),
lbuf.lexeme_start_p(), lbuf.lexeme_end_p());
}
}
}
};
int main(int argc, char *argv[]) {
lexbuf lbuf{cin};
if (argc == 2) {
lbuf.set_input(new ifstream{argv[1], ifstream::in});
}
scanner scanner{dfa{}, lbuf};
while (true) {
token token = scanner.next();
cout << token << "\n";
if (token.kind == kind::eof) break;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.