#include <iostream> | |
#include <vector> | |
#include <map> | |
#include <variant> | |
#include <exception> | |
#include <fstream> | |
using namespace std; | |
void debug(auto f) { | |
#ifdef DEBUG | |
f(); | |
#endif | |
} | |
struct position { | |
int lnum = 0; | |
int cnum = 0; | |
int bol = 0; | |
}; | |
ostream& operator<<(ostream& os, const position& pos) { | |
os << "(line: " << pos.lnum + 1 << ", char: " << pos.cnum - pos.bol + 1 << ")"; | |
return os; | |
} | |
class lexbuf { | |
/* | |
0 lex_buffer_len - 1 lex_buffer.size() - 1 | |
v v | |
+----------+------------------------------------------+----------------------+ | |
| junk | valid data | junk | | |
+----------+------------------------------------------+----------------------+ | |
^ ^ ^ | |
lex_start_pos lex_last_pos lex_curr_pos | |
*/ | |
static constexpr char eof = 128; | |
int lex_start_pos = 0; | |
int lex_last_pos = 0; | |
int lex_curr_pos = 0; | |
int lex_buffer_len = 0; | |
position lex_start_p; | |
position lex_last_p; | |
position lex_curr_p; | |
istream* input; | |
bool owns; | |
vector<char> lex_buffer; | |
static constexpr int refill_size = 8; | |
char aux_buffer[refill_size]; | |
void refill() { | |
debug([]{cout << "refill" << endl;}); | |
debug([this]{print_buffer();}); | |
input->read(aux_buffer, refill_size); | |
int n = input->gcount(); | |
debug([=]{cout << n << endl;}); | |
if (input->eof()) { | |
aux_buffer[n++] = eof; | |
} | |
if (lex_buffer_len + n > static_cast<int>(lex_buffer.size())) { | |
// not enough space at the end of buffer | |
debug([]{cout << "not enough space" << endl;}); | |
if (lex_buffer_len - lex_start_pos + n <= static_cast<int>(lex_buffer.size())) { | |
// enough space at the beginning of buffer | |
debug([]{cout << "enough space at beginning" << endl;}); | |
copy(begin(lex_buffer) + lex_start_pos, | |
begin(lex_buffer) + lex_buffer_len, | |
begin(lex_buffer)); | |
} else { | |
// need to resize buffer | |
debug([]{cout << "need to resize" << endl;}); | |
lex_buffer.resize(lex_buffer.size() + n); | |
debug([&]{cout << "lex_buffer capacity " << lex_buffer.capacity() << endl;}); | |
copy(begin(lex_buffer) + lex_start_pos, | |
begin(lex_buffer) + lex_buffer_len, | |
begin(lex_buffer)); | |
} | |
int s = lex_start_pos; | |
lex_start_pos = 0; | |
lex_last_pos -= s; | |
lex_curr_pos -= s; | |
lex_buffer_len -= s; | |
} | |
copy(begin(aux_buffer), | |
begin(aux_buffer) + n, | |
begin(lex_buffer) + lex_buffer_len); | |
lex_buffer_len += n; | |
debug([this]{print_buffer();}); | |
debug([]{cout << "end refill" << endl;}); | |
} | |
void close() { | |
if (owns) delete input; | |
} | |
void set_exceptions() { | |
input->exceptions(istream::badbit); | |
} | |
public: | |
lexbuf(istream& is) : input(&is), owns(false) { | |
set_exceptions(); | |
} | |
lexbuf(istream* is) : input(is), owns(true) { | |
set_exceptions(); | |
} | |
lexbuf(const lexbuf&) = delete; | |
lexbuf& operator=(const lexbuf&) = delete; | |
lexbuf(lexbuf&&) = delete; | |
lexbuf& operator=(lexbuf&&) = delete; | |
void set_input(istream& is) { | |
close(); | |
input = &is; | |
owns = false; | |
set_exceptions(); | |
} | |
void set_input(istream* is) { | |
close(); | |
input = is; | |
owns = true; | |
set_exceptions(); | |
} | |
~lexbuf() { | |
close(); | |
} | |
char next() { | |
if (lex_curr_pos == lex_buffer_len) { | |
refill(); | |
} | |
if (lex_buffer[lex_curr_pos] == eof) return eof; | |
char res = lex_buffer[lex_curr_pos]; | |
debug([this]{print_buffer();}); | |
lex_curr_pos++; | |
lex_curr_p.cnum++; | |
return res; | |
} | |
string lexeme() const { | |
return {begin(lex_buffer) + lex_start_pos, | |
begin(lex_buffer) + lex_curr_pos}; | |
} | |
position lexeme_start_p() const { | |
return lex_start_p; | |
} | |
position lexeme_end_p() const { | |
return lex_curr_p; | |
} | |
void set_start_position() { | |
lex_start_pos = lex_curr_pos; | |
lex_start_p = lex_curr_p; | |
} | |
void save_position() { | |
lex_last_pos = lex_curr_pos; | |
lex_last_p = lex_curr_p; | |
} | |
void backup_to_last_position() { | |
lex_curr_pos = lex_last_pos; | |
lex_curr_p = lex_last_p; | |
} | |
void newline() { | |
lex_curr_p.lnum++; | |
lex_curr_p.bol = lex_curr_p.cnum; | |
} | |
void print_buffer() { | |
string start; | |
string last; | |
string current; | |
for (int i = 0; i < lex_buffer_len; i++) { | |
int n = 0; | |
char c = lex_buffer[i]; | |
if (c == '\n') n += 2, cout << "\\n"; | |
else if (c == '\t') n += 2, cout << "\\t"; | |
else if (c == eof) n += 3, cout << "eof"; | |
else n++, cout << (c == ' ' ? '_' : c); | |
if (i < lex_start_pos) start += string(n, ' '); | |
if (i < lex_last_pos) last += string(n, ' '); | |
if (i < lex_curr_pos) current += string(n, ' '); | |
} | |
cout << "\n"; | |
cout << start << "^ start pos\n"; | |
cout << last << "^ last pos\n"; | |
cout << current << "^ current pos\n"; | |
} | |
}; | |
enum class kind { | |
let, letrec, integer, identifier, eof, | |
}; | |
struct token { | |
enum kind kind; | |
variant<string, int> attribute; | |
position start; | |
position end; | |
}; | |
ostream& operator<<(ostream& os, const token& token) { | |
static const vector<string> kind_to_string {"LET", "LETREC", "INT", "ID", "EOF"}; | |
os << "token(" << kind_to_string[static_cast<int>(token.kind)]; | |
if (token.kind == kind::integer) { | |
os << ", " << get<int>(token.attribute); | |
} else { | |
os << ", " << get<string>(token.attribute); | |
} | |
os << ", start: " << token.start; | |
os << ", end: " << token.end << ")"; | |
return os; | |
} | |
class lexical_error : public exception { | |
}; | |
struct dfa { | |
using state = int; | |
static constexpr state initial_state = 0; | |
static constexpr state sink_state = 15; | |
private: | |
vector<int> char_to_index {21,21,21,21,21,21,21,21,21,18,19,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,17,21,21,21,21,21,21,21,21,21,16,21,21,21,21,15,5,6,7,8,9,10,11,12,13,14,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,4,21,1,21,21,21,21,21,21,0,21,21,21,21,21,3,21,2,21,21,21,21,21,21,21,21,21,21,21,20}; | |
vector<vector<state>> transitions | |
{ | |
// l e t r c 0 1 2 3 4 5 6 7 8 9 / * ' ' \t \n eof others | |
/*0*/ { 7, 13, 13, 13, 13, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 1, 15, 5, 5, 5, 14, 15, }, | |
/*1*/ { 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 2, 15, 15, 15, 15, 15, }, | |
/*2*/ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 15, 15, }, | |
/*3*/ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 3, 2, 2, 2, 15, 15, }, | |
/*4*/ { 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, }, | |
/*5*/ { 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 5, 5, 5, 15, 15, }, | |
/*6*/ { 15, 15, 15, 15, 15, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 15, 15, 15, 15, 15, 15, 15, }, | |
/*7*/ { 13, 8, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, }, | |
/*8*/ { 13, 13, 9, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, }, | |
/*9*/ { 13, 13, 13, 10, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, }, | |
/*10*/{ 13, 11, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, }, | |
/*11*/{ 13, 13, 13, 13, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, }, | |
/*12*/{ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, }, | |
/*13*/{ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 15, }, | |
/*14*/{ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, }, | |
/*15*/{ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, }, | |
}; | |
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |
vector<bool> accepting { false, false, false, false, true, true, true, true, true, true, true, true, true, true, true, false }; | |
map<state, token> state_to_token | |
{ | |
{6, {kind::integer}}, | |
{7, {kind::identifier}}, | |
{8, {kind::identifier}}, | |
{9, {kind::let}}, | |
{10, {kind::identifier}}, | |
{11, {kind::identifier}}, | |
{12, {kind::letrec}}, | |
{13, {kind::identifier}}, | |
{14, {kind::eof}}, | |
}; | |
public: | |
state transition(state s, char c) { | |
return transitions[s][char_to_index[static_cast<uint8_t>(c)]]; | |
} | |
bool is_accepting(state s) { | |
return accepting[s]; | |
} | |
bool ignore(state s) { | |
return state_to_token.count(s) == 0; | |
} | |
token get_token(state s, const string& lexeme, | |
const position& lexeme_start, const position& lexeme_end) { | |
token token = state_to_token[s]; | |
token.start = lexeme_start; | |
token.end = lexeme_end; | |
switch (token.kind) { | |
case kind::integer: | |
token.attribute = stoi(lexeme); | |
break; | |
case kind::eof: | |
token.attribute = "eof"; | |
break; | |
case kind::identifier: | |
case kind::let: | |
case kind::letrec: | |
token.attribute = lexeme; | |
break; | |
} | |
return token; | |
} | |
}; | |
template <typename DFA> | |
class scanner { | |
DFA dfa; | |
lexbuf& lbuf; | |
public: | |
scanner(const DFA& dfa, lexbuf& lbuf) : dfa(dfa), lbuf(lbuf) {} | |
token next() { | |
typename DFA::state state = DFA::initial_state; | |
typename DFA::state last_final_state = DFA::sink_state; | |
lbuf.set_start_position(); | |
while (true) { | |
char c = lbuf.next(); | |
debug([=]{cout << "next: " << c << endl;}); | |
if (c == '\n') lbuf.newline(); | |
state = dfa.transition(state, c); | |
debug([=]{cout << "state: " << state << endl;}); | |
if (dfa.is_accepting(state)) { | |
last_final_state = state; | |
lbuf.save_position(); | |
} else if (state == DFA::sink_state) { | |
if (last_final_state == DFA::sink_state) throw lexical_error{}; | |
state = last_final_state; | |
lbuf.backup_to_last_position(); | |
if (dfa.ignore(state)) return debug([]{cout << "ignore" << endl;}), next(); | |
return dfa.get_token(state, lbuf.lexeme(), | |
lbuf.lexeme_start_p(), lbuf.lexeme_end_p()); | |
} | |
} | |
} | |
}; | |
int main(int argc, char *argv[]) { | |
lexbuf lbuf{cin}; | |
if (argc == 2) { | |
lbuf.set_input(new ifstream{argv[1], ifstream::in}); | |
} | |
scanner scanner{dfa{}, lbuf}; | |
while (true) { | |
token token = scanner.next(); | |
cout << token << "\n"; | |
if (token.kind == kind::eof) break; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment