Created
October 31, 2018 00:10
-
-
Save mikeymop/d1e5de6f29e9f41fb74246ce3958ec88 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <map> | |
#include <string.h> | |
#include <cctype> | |
#include "tokens.h" | |
//track all tokentypes from tokens.h | |
static std::map<TokenType, std::string> TokenMap { | |
{PRINT, "PRINT"}, | |
{IF, "IF"}, | |
{THEN, "THEN"}, | |
{TRUE, "TRUE"}, | |
{FALSE, "FALSE"}, | |
{IDENT, "IDENT"}, | |
{ICONST, "ICONST"}, | |
{SCONST, "SCONST"}, | |
{PLUS, "PLUS"}, | |
{MINUS, "MINUS"}, | |
{STAR, "STAR"}, | |
{SLASH, "SLASH"}, | |
{ASSIGN, "ASSIGN"}, | |
{EQ, "EQ"}, | |
{NEQ, "NEQ"}, | |
{LT, "LT"}, | |
{LEQ, "LEQ"}, | |
{GT, "GT"}, | |
{GEQ, "GEQ"}, | |
{LOGICAND, "LOGICAND"}, | |
{LOGICOR, "LOGICOR"}, | |
{LPAREN, "LPAREN"}, | |
{RPAREN, "RPAREN"}, | |
{SC, "SC"}, | |
{ERR, "ERR"}, | |
{DONE, "DONE"}, | |
}; | |
/*Token::TokenType detect_id(std::string const& iStr) { | |
if (iStr == "var") return VAR; | |
if (iStr == "set") return SET; | |
if (iStr == "print") return PRINT; | |
if (iStr == "repeat") return REPEAT; | |
}*/ | |
/*static std::map<std::string, TokenType> identures { | |
//I very inconveniently cannot use the tokenmap | |
{"var", VAR}, | |
{"set", SET}, | |
{"print", PRINT}, | |
{"repeat", REPEAT} | |
};*/ | |
//I have to treat identifiers differently, since I cant fit the ident tokens in the TokenMap | |
/*Token identificator(const string& lexeme, int lineNum) { | |
TokenType tt = IDENT; | |
switch(detect_id(lexeme)) { | |
case VAR: | |
tt = VAR; | |
return VAR; | |
case SET: | |
tt = SET; | |
return SET; | |
case PRINT: | |
tt = PRINT; | |
return PRINT; | |
case REPEAT: | |
tt = REPEAT; | |
return REPEAT; | |
} | |
return Token(tt, lexeme, lineNum); | |
}*/ | |
// Stream a Token in the assignment's display format: the token-type name,
// followed by "(lexeme)" for the token kinds whose lexeme carries information.
ostream& operator<<(ostream& out, const Token &tok) {
    TokenType tt = tok.GetTokenType();
    // Use find() rather than operator[]: operator[] would silently insert a
    // default-constructed (empty) entry into the shared static map for any
    // token type that is somehow missing.
    auto it = TokenMap.find(tt);
    out << (it != TokenMap.end() ? it->second : "UNKNOWN");
    // These token kinds are only meaningful together with their lexeme.
    if (tt == IDENT || tt == SCONST || tt == ICONST || tt == ERR) {
        out << "(" << tok.GetLexeme() << ")";
    }
    return out;
}
// Read characters from *in and return the next Token.
// *lineNum is incremented for every newline consumed; a token reports the
// line number current when the token is completed.
//
// Fixes over the previous version:
//  - two-char operators (==, !=, <=, >=, &&, ||) now consume the second
//    character instead of only peeking at it, and no longer append the
//    peeked char to a one-char lexeme;
//  - <=, >=, &&, || no longer have their token type unconditionally
//    overwritten with LT after being set;
//  - keywords are only recognized once the whole identifier has been read
//    ("printx" is IDENT, not PRINT followed by garbage);
//  - any delimiter (not just whitespace) ends an identifier; the delimiter
//    is put back exactly once, so the old infinite putback loop is gone;
//  - integer constants of any length are read fully before ICONST is
//    returned, and the delimiter is put back;
//  - a token still being built at EOF is returned instead of being dropped.
Token getNextToken(istream *in, int *lineNum) {
    enum LexState { BEGIN, INID, INSTRING, ININT, INCOMMENT };

    string lexeme;          // the lexeme being built
    char ch;                // current character
    LexState state = BEGIN;

    // Map a completed identifier lexeme to its keyword token, or IDENT.
    auto keywordOrIdent = [](const string &lex) -> TokenType {
        if (lex == "print") return PRINT;
        if (lex == "if")    return IF;
        if (lex == "then")  return THEN;
        if (lex == "true")  return TRUE;
        if (lex == "false") return FALSE;
        return IDENT;
    };

    while (in->get(ch)) {
        switch (state) {
        case BEGIN:
            if (ch == '\n') {
                (*lineNum)++;
            }
            if (isspace((unsigned char)ch)) {
                continue;                   // skip whitespace between tokens
            }
            lexeme = ch;
            if (isalpha((unsigned char)ch)) {
                state = INID;
            } else if (isdigit((unsigned char)ch)) {
                state = ININT;
            } else if (ch == '"') {
                state = INSTRING;
            } else if (ch == '#') {
                state = INCOMMENT;
            } else {
                // single- and two-character operators
                TokenType tt = ERR;
                switch (ch) {
                case '+': tt = PLUS;   break;
                case '*': tt = STAR;   break;
                case '/': tt = SLASH;  break;
                case '-': tt = MINUS;  break;
                case '(': tt = LPAREN; break;
                case ')': tt = RPAREN; break;
                case ';': tt = SC;     break;
                case '=':
                    if (in->peek() == '=') { in->get(ch); lexeme += ch; tt = EQ; }
                    else                   { tt = ASSIGN; }
                    break;
                case '!':
                    // '!' is only valid as part of "!="
                    if (in->peek() == '=') { in->get(ch); lexeme += ch; tt = NEQ; }
                    break;
                case '<':
                    if (in->peek() == '=') { in->get(ch); lexeme += ch; tt = LEQ; }
                    else                   { tt = LT; }
                    break;
                case '>':
                    if (in->peek() == '=') { in->get(ch); lexeme += ch; tt = GEQ; }
                    else                   { tt = GT; }
                    break;
                case '&':
                    // '&' is only valid as part of "&&"
                    if (in->peek() == '&') { in->get(ch); lexeme += ch; tt = LOGICAND; }
                    break;
                case '|':
                    // '|' is only valid as part of "||"
                    if (in->peek() == '|') { in->get(ch); lexeme += ch; tt = LOGICOR; }
                    break;
                }
                return Token(tt, lexeme, *lineNum);
            }
            break;

        case INID:
            if (isalpha((unsigned char)ch) || isdigit((unsigned char)ch)) {
                lexeme += ch;
            } else {
                // The delimiter belongs to the next token; put it back and
                // (since it was not consumed) do not count a newline here.
                in->putback(ch);
                return Token(keywordOrIdent(lexeme), lexeme, *lineNum);
            }
            break;

        case ININT:
            if (isdigit((unsigned char)ch)) {
                lexeme += ch;
            } else if (isalpha((unsigned char)ch)) {
                lexeme += ch;   // e.g. "12ab" — malformed numeric token
                return Token(ERR, lexeme, *lineNum);
            } else {
                in->putback(ch);
                return Token(ICONST, lexeme, *lineNum);
            }
            break;

        case INSTRING:
            if (ch == '\n') {
                // string constants may not span lines
                return Token(ERR, lexeme, *lineNum);
            }
            lexeme += ch;
            if (ch == '"') {
                // strip the surrounding quotes from the stored lexeme
                lexeme = lexeme.substr(1, lexeme.length() - 2);
                return Token(SCONST, lexeme, *lineNum);
            }
            break;

        case INCOMMENT:
            // '#' comments run to end of line
            if (ch == '\n') {
                (*lineNum)++;
                state = BEGIN;
            }
            break;
        }
    }

    // EOF: finish any token that was still being built.
    switch (state) {
    case INID:     return Token(keywordOrIdent(lexeme), lexeme, *lineNum);
    case ININT:    return Token(ICONST, lexeme, *lineNum);
    case INSTRING: return Token(ERR, lexeme, *lineNum);  // unterminated string
    default:       break;
    }
    if (in->eof())
        return Token(DONE, "", *lineNum);
    return Token(ERR, lexeme, *lineNum);   // stream failure other than EOF
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <fstream> | |
#include <string.h> | |
#include "tokens.h" | |
using namespace std; | |
/* | |
return Token(token type, lexeme, line#) | |
return Token(Done, "", lineNum) | |
-v every token is printed | |
-sum if present, summary information is printed | |
-allids a list of the lexemes for all identifiers printed in | |
alphabetical order | |
filename read from the filename; otherwise read from standard in | |
getNextToken(istream * ...) | |
istream *in; | |
in = & cin; | |
or | |
in = & some ifstream; | |
*/ | |
istream *in = &cin;                 // input stream; main() repoints this at an ifstream when a filename is given
Token tok;                          // most recently read token (default-constructed: ERR, line -1)
TokenType tt = tok.GetTokenType();  // NOTE(review): always ERR at static-init time and never read below — candidate for removal
int main(int argc, char* argv[]) { | |
//args | |
//int numfiles; | |
bool isfile; | |
bool v; | |
bool sum; | |
//bool allids; | |
int lineNum; | |
int stringCt; | |
//int identCt; | |
int tokenCt; | |
string arg1; | |
string source; | |
for(int i=1; i < argc; i++) { | |
//numfiles = 0; | |
arg1 = argv[i]; | |
//Done: arg tester | |
if(in) { | |
; | |
} | |
if(arg1[0] == '-') { | |
if(arg1 == "-v") { | |
//do -v | |
v = true; | |
} else if(arg1 == "-sum") { | |
//do sum | |
sum = true; | |
} else if(arg1 == "-allids") { | |
//do allids | |
; | |
} else { | |
cout << "INVALID FLAG " << arg1 << endl; | |
return 2; | |
} | |
} else { | |
//arg must be a filename test it | |
isfile = true; | |
source = argv[i]; | |
} | |
//handle files if you have them2 | |
ifstream iFile; | |
if(isfile) { | |
iFile.open(source); | |
if(i < argc - 1) { | |
cout << "TOO MANY FILE NAMES" << endl; | |
return 3; | |
} else if(!iFile.is_open()) { | |
cerr << "UNABLE TO OPEN " << source << endl; | |
} else { | |
in = &iFile; | |
//iterate the file and spit out tokens | |
while((tok = getNextToken(&iFile, &lineNum)) != ERR && tok != DONE) { | |
// handle verbose mode | |
if(v) { | |
cout << tok << endl; | |
} | |
tokenCt++; | |
//we can pick out tokens with GetTokenType() | |
if(tok.GetTokenType() == SCONST) { | |
stringCt++; | |
} | |
} //while | |
if(tok.GetTokenType() == ERR) { | |
cout << "Error on line " << lineNum << " ("; | |
cout << tok.GetLexeme() << ")" << endl; | |
} | |
//sum stuff | |
if(sum) { | |
cout << "Total lines: " << endl; | |
cout << "Total tokens: " << endl; | |
cout << "Total identifiers: " << endl; | |
cout << "Total strings: " << endl; | |
} | |
} | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* tokens.h | |
* | |
* CS280 | |
* Fall 2018 | |
*/ | |
#ifndef TOKENS_H_ | |
#define TOKENS_H_ | |
#include <string> | |
#include <iostream> | |
using std::string;
using std::istream;
using std::ostream;

// All token categories produced by the lexer (getNextToken).
enum TokenType {
    // keywords
    PRINT,
    IF,
    THEN,
    TRUE,
    FALSE,
    // an identifier
    IDENT,
    // an integer and string constant
    ICONST,
    SCONST,
    // the operators, parens and semicolon
    PLUS,
    MINUS,
    STAR,
    SLASH,
    ASSIGN,
    EQ,
    NEQ,
    LT,
    LEQ,
    GT,
    GEQ,
    LOGICAND,
    LOGICOR,
    LPAREN,
    RPAREN,
    SC,
    // any error returns this token
    ERR,
    // when completed (EOF), return this token
    DONE
};

// A single lexical token: its category, the matched lexeme, and the
// source line it was completed on.
class Token {
    TokenType tt;
    string lexeme;
    int lnum;
public:
    // Default token: an error with an empty lexeme and an invalid line.
    Token() : tt(ERR), lnum(-1) {}
    // Construct a token of the given type and lexeme at the given line.
    // (Member-initializer lists replace assignment in the ctor body.)
    Token(TokenType tt, string lexeme, int line)
        : tt(tt), lexeme(lexeme), lnum(line) {}
    // Allow comparing a Token directly against a TokenType.
    bool operator==(const TokenType tt) const { return this->tt == tt; }
    bool operator!=(const TokenType tt) const { return this->tt != tt; }
    TokenType GetTokenType() const { return tt; }
    string GetLexeme() const { return lexeme; }
    int GetLinenum() const { return lnum; }
};
extern ostream& operator<<(ostream& out, const Token& tok); | |
extern Token getNextToken(istream *in, int *linenum); | |
#endif /* TOKENS_H_ */ | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment