#include <iostream>
#include <vector>
#include <map>
#include <variant>
#include <exception>
#include <fstream>
#include <string>
#include <algorithm>
#include <cstdint>

using namespace std;

// Runs f only when the program is compiled with -DDEBUG; used below to trace
// the buffer and the automaton while scanning.
void debug(auto f) {
#ifdef DEBUG
    f();
#endif
}
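
// Build note (an assumption about the toolchain, not part of the gist): the
// abbreviated function template above requires C++20, e.g.
//   g++ -std=c++20 -o lexer lexer.cpp        (add -DDEBUG for the traces)
// where the file and binary names are only examples.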

// A position in the input: lnum is the 0-based line number, cnum the number of
// characters read so far, and bol the value of cnum at the beginning of the
// current line, so cnum - bol is the 0-based column.
struct position {
    int lnum = 0;
    int cnum = 0;
    int bol = 0;
};

ostream& operator<<(ostream& os, const position& pos) {
    os << "(line: " << pos.lnum + 1 << ", char: " << pos.cnum - pos.bol + 1 << ")";
    return os;
}

class lexbuf {
    /*
        0                           lex_buffer_len - 1   lex_buffer.size() - 1
                                            v                     v
        +----------+------------------------+---------------------+
        |   junk   |       valid data       |        junk         |
        +----------+------------------------+---------------------+
             ^                ^            ^
        lex_start_pos    lex_last_pos   lex_curr_pos
    */
    static constexpr char eof = 128; // sentinel byte appended after the last input character
    int lex_start_pos = 0;           // start of the current lexeme
    int lex_last_pos = 0;            // last saved position (the scanner saves it at each accepting state)
    int lex_curr_pos = 0;            // next character to read from the buffer
    int lex_buffer_len = 0;          // end of the valid data
    position lex_start_p;            // source position corresponding to lex_start_pos
    position lex_last_p;             // source position corresponding to lex_last_pos
    position lex_curr_p;             // source position corresponding to lex_curr_pos
    istream* input;
    bool owns;                       // whether this lexbuf must delete input
    vector<char> lex_buffer;
    static constexpr int refill_size = 8;
    char aux_buffer[refill_size];
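
    // Refill strategy (as implemented below): read up to refill_size bytes into
    // aux_buffer, append the eof sentinel once the stream is exhausted, and make
    // room in lex_buffer either by sliding the valid data back to the front
    // (discarding the junk before lex_start_pos) or by growing the vector.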
    void refill() {
        debug([]{cout << "refill" << endl;});
        debug([this]{print_buffer();});
        input->read(aux_buffer, refill_size);
        int n = input->gcount();
        debug([=]{cout << n << endl;});
        if (input->eof()) {
            // the stream is exhausted: append the sentinel (gcount() < refill_size here)
            aux_buffer[n++] = eof;
        }
        if (lex_buffer_len + n > static_cast<int>(lex_buffer.size())) {
            // not enough space at the end of the buffer
            debug([]{cout << "not enough space" << endl;});
            if (lex_buffer_len - lex_start_pos + n <= static_cast<int>(lex_buffer.size())) {
                // enough space at the beginning of the buffer: slide the valid data back
                debug([]{cout << "enough space at beginning" << endl;});
                copy(begin(lex_buffer) + lex_start_pos,
                     begin(lex_buffer) + lex_buffer_len,
                     begin(lex_buffer));
            } else {
                // need to resize the buffer
                debug([]{cout << "need to resize" << endl;});
                lex_buffer.resize(lex_buffer.size() + n);
                debug([&]{cout << "lex_buffer capacity " << lex_buffer.capacity() << endl;});
                copy(begin(lex_buffer) + lex_start_pos,
                     begin(lex_buffer) + lex_buffer_len,
                     begin(lex_buffer));
            }
            // all positions are now relative to the beginning of the buffer
            int s = lex_start_pos;
            lex_start_pos = 0;
            lex_last_pos -= s;
            lex_curr_pos -= s;
            lex_buffer_len -= s;
        }
        copy(begin(aux_buffer),
             begin(aux_buffer) + n,
             begin(lex_buffer) + lex_buffer_len);
        lex_buffer_len += n;
        debug([this]{print_buffer();});
        debug([]{cout << "end refill" << endl;});
    }
    void close() {
        if (owns) delete input;
    }
    void set_exceptions() {
        input->exceptions(istream::badbit);
    }
public:
    lexbuf(istream& is) : input(&is), owns(false) {
        set_exceptions();
    }
    lexbuf(istream* is) : input(is), owns(true) {
        set_exceptions();
    }
    lexbuf(const lexbuf&) = delete;
    lexbuf& operator=(const lexbuf&) = delete;
    lexbuf(lexbuf&&) = delete;
    lexbuf& operator=(lexbuf&&) = delete;
    void set_input(istream& is) {
        close();
        input = &is;
        owns = false;
        set_exceptions();
    }
    void set_input(istream* is) {
        close();
        input = is;
        owns = true;
        set_exceptions();
    }
    ~lexbuf() {
        close();
    }
    // Returns the next character, refilling the buffer if needed. The eof
    // sentinel is returned without advancing, so it can be read repeatedly.
    char next() {
        if (lex_curr_pos == lex_buffer_len) {
            refill();
        }
        if (lex_buffer[lex_curr_pos] == eof) return eof;
        char res = lex_buffer[lex_curr_pos];
        debug([this]{print_buffer();});
        lex_curr_pos++;
        lex_curr_p.cnum++;
        return res;
    }
    // Text of the current lexeme, from the start position to the current position.
    string lexeme() const {
        return {begin(lex_buffer) + lex_start_pos,
                begin(lex_buffer) + lex_curr_pos};
    }
    position lexeme_start_p() const {
        return lex_start_p;
    }
    position lexeme_end_p() const {
        return lex_curr_p;
    }
    // Marks the beginning of a new lexeme.
    void set_start_position() {
        lex_start_pos = lex_curr_pos;
        lex_start_p = lex_curr_p;
    }
    // Remembers the current position (called at each accepting state).
    void save_position() {
        lex_last_pos = lex_curr_pos;
        lex_last_p = lex_curr_p;
    }
    // Rewinds to the last saved position (longest match).
    void backup_to_last_position() {
        lex_curr_pos = lex_last_pos;
        lex_curr_p = lex_last_p;
    }
    // Records that a newline has just been read.
    void newline() {
        lex_curr_p.lnum++;
        lex_curr_p.bol = lex_curr_p.cnum;
    }
    // Debug helper: prints the valid data with markers under the start, last and
    // current positions (spaces are shown as '_').
    void print_buffer() {
        string start;
        string last;
        string current;
        for (int i = 0; i < lex_buffer_len; i++) {
            int n = 0;
            char c = lex_buffer[i];
            if (c == '\n') n += 2, cout << "\\n";
            else if (c == '\t') n += 2, cout << "\\t";
            else if (c == eof) n += 3, cout << "eof";
            else n++, cout << (c == ' ' ? '_' : c);
            if (i < lex_start_pos) start += string(n, ' ');
            if (i < lex_last_pos) last += string(n, ' ');
            if (i < lex_curr_pos) current += string(n, ' ');
        }
        cout << "\n";
        cout << start << "^ start pos\n";
        cout << last << "^ last pos\n";
        cout << current << "^ current pos\n";
    }
};
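
// The token kinds of the toy language recognized below: the keywords `let` and
// `letrec`, identifiers, integer literals, and an explicit end-of-input token.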

enum class kind {
    let, letrec, integer, identifier, eof,
};

struct token {
    enum kind kind;
    variant<string, int> attribute; // lexeme for identifiers/keywords, value for integers
    position start;
    position end;
};

ostream& operator<<(ostream& os, const token& token) {
    static const vector<string> kind_to_string {"LET", "LETREC", "INT", "ID", "EOF"};
    os << "token(" << kind_to_string[static_cast<int>(token.kind)];
    if (token.kind == kind::integer) {
        os << ", " << get<int>(token.attribute);
    } else {
        os << ", " << get<string>(token.attribute);
    }
    os << ", start: " << token.start;
    os << ", end: " << token.end << ")";
    return os;
}

class lexical_error : public exception {
public:
    const char* what() const noexcept override {
        return "lexical error";
    }
};
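
// Hand-encoded DFA. Reading the tables below: the automaton skips whitespace
// (state 5) and C-style comments /* ... */ (states 1 to 4), and accepts integer
// literals (state 6), the keywords `let` (state 9) and `letrec` (state 12), the
// end-of-input sentinel (state 14), and identifiers that start with one of the
// letters l, e, t, r, c and continue with those letters or digits (states 7, 8,
// 10, 11, 13). State 15 is the sink.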

struct dfa {
    using state = int;
    static constexpr state initial_state = 0;
    static constexpr state sink_state = 15;
private:
    // Maps a byte (and the 128 sentinel produced for eof) to a column of the
    // transition table; column 21 gathers every other character.
    vector<int> char_to_index {
        21,21,21,21,21,21,21,21,21,18,19,21,21,21,21,21, //   0-15:  '\t' -> 18, '\n' -> 19
        21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21, //  16-31
        17,21,21,21,21,21,21,21,21,21,16,21,21,21,21,15, //  32-47:  ' ' -> 17, '*' -> 16, '/' -> 15
         5, 6, 7, 8, 9,10,11,12,13,14,21,21,21,21,21,21, //  48-63:  '0'..'9' -> 5..14
        21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21, //  64-79
        21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21, //  80-95
        21,21,21, 4,21, 1,21,21,21,21,21,21, 0,21,21,21, //  96-111: 'c' -> 4, 'e' -> 1, 'l' -> 0
        21,21, 3,21, 2,21,21,21,21,21,21,21,21,21,21,21, // 112-127: 'r' -> 3, 't' -> 2
        20,                                              // 128:     eof sentinel -> 20
    };
    vector<vector<state>> transitions
    {
        //        l   e   t   r   c   0   1   2   3   4   5   6   7   8   9   /   *  ' ' \t  \n  eof others
        /*0*/  {  7, 13, 13, 13, 13,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  1, 15,  5,  5,  5, 14, 15, },
        /*1*/  { 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  2, 15, 15, 15, 15, 15, },
        /*2*/  {  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,  2,  2,  2, 15, 15, },
        /*3*/  {  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  4,  3,  2,  2,  2, 15, 15, },
        /*4*/  { 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, },
        /*5*/  { 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  5,  5,  5, 15, 15, },
        /*6*/  { 15, 15, 15, 15, 15,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6, 15, 15, 15, 15, 15, 15, 15, },
        /*7*/  { 13,  8, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, },
        /*8*/  { 13, 13,  9, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, },
        /*9*/  { 13, 13, 13, 10, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, },
        /*10*/ { 13, 11, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, },
        /*11*/ { 13, 13, 13, 13, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, },
        /*12*/ { 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, },
        /*13*/ { 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, },
        /*14*/ { 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, },
        /*15*/ { 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, },
    };
    // Accepting states: 4 (end of comment), 5 (whitespace) and 6 to 14 (tokens).
    vector<bool> accepting { false, false, false, false, true, true, true, true, true, true, true, true, true, true, true, false };
    // Kind of token produced in each accepting state; accepting states that do
    // not appear here (whitespace and comments) are ignored by the scanner.
    map<state, token> state_to_token
    {
        {6, {kind::integer}},
        {7, {kind::identifier}},
        {8, {kind::identifier}},
        {9, {kind::let}},
        {10, {kind::identifier}},
        {11, {kind::identifier}},
        {12, {kind::letrec}},
        {13, {kind::identifier}},
        {14, {kind::eof}},
    };
public:
    state transition(state s, char c) {
        return transitions[s][char_to_index[static_cast<uint8_t>(c)]];
    }
    bool is_accepting(state s) {
        return accepting[s];
    }
    // True for accepting states that produce no token (whitespace, comments).
    bool ignore(state s) {
        return state_to_token.count(s) == 0;
    }
    // Builds the token associated with the accepting state s.
    token get_token(state s, const string& lexeme,
                    const position& lexeme_start, const position& lexeme_end) {
        token token = state_to_token[s];
        token.start = lexeme_start;
        token.end = lexeme_end;
        switch (token.kind) {
        case kind::integer:
            token.attribute = stoi(lexeme);
            break;
        case kind::eof:
            token.attribute = "eof";
            break;
        case kind::identifier:
        case kind::let:
        case kind::letrec:
            token.attribute = lexeme;
            break;
        }
        return token;
    }
};
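
// Generic driver over a DFA (here, the table-driven one above). It implements
// the usual maximal-munch rule: keep reading characters while remembering the
// last accepting state seen, and when the automaton falls into the sink state,
// back up to that last accepting state and emit the corresponding token.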

template <typename DFA>
class scanner {
    DFA dfa;
    lexbuf& lbuf;
public:
    scanner(const DFA& dfa, lexbuf& lbuf) : dfa(dfa), lbuf(lbuf) {}
    // Returns the next token, or throws lexical_error if the input matches nothing.
    token next() {
        typename DFA::state state = DFA::initial_state;
        typename DFA::state last_final_state = DFA::sink_state;
        lbuf.set_start_position();
        while (true) {
            char c = lbuf.next();
            debug([=]{cout << "next: " << c << endl;});
            if (c == '\n') lbuf.newline();
            state = dfa.transition(state, c);
            debug([=]{cout << "state: " << state << endl;});
            if (dfa.is_accepting(state)) {
                last_final_state = state;
                lbuf.save_position();
            } else if (state == DFA::sink_state) {
                // no further match is possible: back up to the longest match found so far
                if (last_final_state == DFA::sink_state) throw lexical_error{};
                state = last_final_state;
                lbuf.backup_to_last_position();
                // whitespace and comments produce no token: scan again
                if (dfa.ignore(state)) return debug([]{cout << "ignore" << endl;}), next();
                return dfa.get_token(state, lbuf.lexeme(),
                                     lbuf.lexeme_start_p(), lbuf.lexeme_end_p());
            }
        }
    }
};
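
// Entry point: tokens are read from the file given as the only argument, or from
// standard input when no argument is given, and printed one per line until EOF,
// e.g. `./lexer input.txt` (the binary name is just an example).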

int main(int argc, char *argv[]) {
    lexbuf lbuf{cin};
    if (argc == 2) {
        auto* file = new ifstream{argv[1], ifstream::in};
        if (!file->is_open()) {
            cerr << "cannot open " << argv[1] << "\n";
            delete file;
            return 1;
        }
        lbuf.set_input(file);
    }
    scanner scanner{dfa{}, lbuf};
    while (true) {
        token token = scanner.next();
        cout << token << "\n";
        if (token.kind == kind::eof) break;
    }
}