In this note I present a C++ tokenizer built with a finite-state-machine approach; it extracts tokens from a C++ source program.
I may have omitted some situations — if you find one, please leave a comment and let me know.
#include <iostream>
#include <cctype>
#include <vector>
using namespace std;
vector<int> parseNumber(int start) {
int number;
vector<int> token (1, start);
while (isdigit(cin.peek())) {
number = getchar();
token.push_back(number);
}
return token;
}
// True when `inch` may appear in an identifier: letters, digits, or '_'.
bool isword(int inch) {
    return isalnum(inch) || inch == '_';
}
vector<int> parseWord(int start) {
int word;
vector<int> token (1, start);
while (isword(cin.peek())) {
word = getchar();
token.push_back(word);
}
return token;
}
// Reads the rest of a comment from standard input. `start` is the '/'
// already consumed by the caller. Returns "//..." (up to but excluding the
// end of line), "/*...*/" (including the terminator), or just {'/'} when no
// comment follows (plain division operator).
// Fixes two defects of the goto-based original: it looped forever on EOF
// inside an unterminated comment (getchar() keeps returning EOF), and it
// mixed getchar() with cin.peek(), which is only safe under
// sync_with_stdio(true).
std::vector<int> parseComment(int start) {
    std::vector<int> token(1, start);
    int next = std::cin.peek();
    if (next == '/') {              // line comment: consume up to '\n'
        token.push_back(std::cin.get());
        int ch;
        while ((ch = std::cin.get()) != EOF && ch != '\n') {
            token.push_back(ch);
        }
    } else if (next == '*') {       // block comment: consume up to "*/"
        token.push_back(std::cin.get());
        int prev = 0;
        int ch;
        while ((ch = std::cin.get()) != EOF) {
            token.push_back(ch);
            if (prev == '*' && ch == '/') {
                break;              // terminator found
            }
            prev = ch;
        }
    }
    return token;                   // size 1 => bare '/'
}
// Reads a string or character literal from standard input. `start` is the
// opening quote, already consumed; the same character closes the literal.
// Backslash escapes are tracked properly: any escaped character (including
// the quote) does not terminate the literal. This fixes the original
// two-character lookback, which misread literals such as "\\\"" (escaped
// backslash followed by escaped quote), and adds an EOF guard so an
// unterminated literal no longer loops forever.
std::vector<int> parseSentence(int start) {
    std::vector<int> token(1, start);
    bool escaped = false;
    int ch;
    while ((ch = std::cin.get()) != EOF) {
        token.push_back(ch);
        if (escaped) {
            escaped = false;        // this character was escaped; consume it
        } else if (ch == '\\') {
            escaped = true;         // next character is escaped
        } else if (ch == start) {
            break;                  // unescaped closing quote
        }
    }
    return token;
}
// Reports an unrecognized character on stderr.
// A trailing newline keeps successive diagnostics on separate lines
// (the original ran them together).
void parse_error(std::string message, char ch) {
    std::cerr << message << ": " << ch << '\n';
}
vector<int> getToken() {
vector<int> token;
int inch = 0;
while ((inch = getchar()) != EOF) {
switch (inch) {
case '\t':
case '\n':
case '\r':
case ' ':
break;
case '#':
case '<':
case '>':
case '.':
case ',':
case '*':
case '=':
case '!':
case '(':
case ')':
case '[':
case ']':
case '{':
case '}':
case ';':
case '\\':
token.push_back(inch);
return token;
case '+':
case '-':
case ':':
case '|':
case '&':
token.push_back(inch);
if (cin.peek() == inch) {
inch = getchar();
token.push_back(inch);
}
return token;
case '/':
token = parseComment(inch);
if (token.size() == 1) { return token; }
else { token.clear(); continue; }
case '\'':
case '\"':
return parseSentence(inch);
default:
if (isdigit(inch)) {
return parseNumber(inch);
} else if (isword(inch)) {
return parseWord(inch);
} else {
parse_error("Illegal character", inch);
}
}
}
return token;
}
void printTokens()
{
cout << "-------------------------" << endl;
vector<int> token;
while((token = getToken()).size() != 0) {
for (int i = 0; i < token.size(); i++) {
cout << char(token[i]);
}
cout << endl;
}
cout << "-------------------------" << endl;
}
// Entry point: run the interactive tokenizer over standard input.
int main()
{
    printTokens();
    return 0;
}
#include <iostream>
#include <fstream>
#include <string>
#include <cctype>
#include <vector>
using namespace std;
// File streams shared by the whole tokenizer (opened and closed in main()):
// `ifile` reads the source file; `ofile` receives one token per line.
ifstream ifile;
ofstream ofile;
// Echoes a single token to stdout, followed by a newline.
void printToken(vector<int> token) {
    for (vector<int>::size_type i = 0; i != token.size(); ++i) {
        cout << static_cast<char>(token[i]);
    }
    cout << endl;
}
// Writes a single token to the output file, terminated by a newline.
void putVector(vector<int> token) {
    for (vector<int>::size_type i = 0; i != token.size(); ++i) {
        ofile.put(static_cast<char>(token[i]));
    }
    ofile.put('\n');
}
// Collects the remaining digits of a numeric literal from `ifile`;
// `start` is the first digit, already consumed by the caller.
// Returns the whole token as character codes.
vector<int> parseNumber(int start) {
    vector<int> digits(1, start);
    for (;;) {
        if (!isdigit(ifile.peek())) {
            break;
        }
        digits.push_back(ifile.get());
    }
    return digits;
}
// An identifier character is either an underscore or alphanumeric.
bool isword(int inch) {
    if (inch == '_') {
        return true;
    }
    return isalnum(inch) != 0;
}
// Collects the remainder of an identifier or keyword from `ifile`;
// `start` is its first character, already consumed by the caller.
vector<int> parseWord(int start) {
    vector<int> letters(1, start);
    for (int next = ifile.peek(); isword(next); next = ifile.peek()) {
        letters.push_back(ifile.get());
    }
    return letters;
}
// Reads the rest of a comment from `ifile`. `start` is the '/' already
// consumed by the caller. Returns "//..." (up to but excluding the end of
// line), "/*...*/" (including the terminator), or just {'/'} when no
// comment follows (plain division operator).
// Fixes the goto-based original, which looped forever at EOF inside an
// unterminated comment (ifile.get() keeps returning EOF).
vector<int> parseComment(int start) {
    vector<int> token(1, start);
    int next = ifile.peek();
    if (next == '/') {              // line comment: consume up to '\n'
        token.push_back(ifile.get());
        int ch;
        while ((ch = ifile.get()) != EOF && ch != '\n') {
            token.push_back(ch);
        }
    } else if (next == '*') {       // block comment: consume up to "*/"
        token.push_back(ifile.get());
        int prev = 0;
        int ch;
        while ((ch = ifile.get()) != EOF) {
            token.push_back(ch);
            if (prev == '*' && ch == '/') {
                break;              // terminator found
            }
            prev = ch;
        }
    }
    return token;                   // size 1 => bare '/'
}
// Reads a string or character literal from `ifile`. `start` is the opening
// quote, already consumed; the same character closes the literal.
// Backslash escapes are tracked properly: any escaped character (including
// the quote) does not terminate the literal. This fixes the original
// two-character lookback, which misread literals such as "\\\"" (escaped
// backslash followed by escaped quote), and adds an EOF guard so an
// unterminated literal no longer loops forever.
vector<int> parseSentence(int start) {
    vector<int> token(1, start);
    bool escaped = false;
    int ch;
    while ((ch = ifile.get()) != EOF) {
        token.push_back(ch);
        if (escaped) {
            escaped = false;        // this character was escaped; consume it
        } else if (ch == '\\') {
            escaped = true;         // next character is escaped
        } else if (ch == start) {
            break;                  // unescaped closing quote
        }
    }
    return token;
}
// Reports an unrecognized character on stderr, showing its numeric code
// (the character may be unprintable). A trailing newline keeps successive
// diagnostics on separate lines (the original ran them together).
void parse_error(std::string message, char ch) {
    std::cerr << message << ": " << (int) ch << '\n';
}
// Tokenizes the whole input: reads characters from the global `ifile`,
// echoes each token to stdout (printToken) and writes it to the output
// file (putVector), one token per line. Whitespace is skipped; comments
// are consumed and discarded.
void getToken() {
    vector<int> token;
    int inch = 0;
    while ((inch = ifile.get()) != EOF) {
        token.clear();
        switch (inch) {
        // Whitespace separates tokens but is not itself a token.
        case '\t':
        case '\n':
        case '\r':
        case ' ':
            break;
        // Single-character punctuation: the character is the whole token.
        case '#':
        case '<':
        case '>':
        case '.':
        case ',':
        case '*':
        case '=':
        case '!':
        case '(':
        case ')':
        case '[':
        case ']':
        case '{':
        case '}':
        case ';':
        case '\\':
            token.push_back(inch);
            printToken(token);
            putVector(token);
            break;
        // Characters that may double up: "++", "--", "::", "||", "&&".
        case '+':
        case '-':
        case ':':
        case '|':
        case '&':
            token.push_back(inch);
            if (ifile.peek() == inch) {
                token.push_back(ifile.get());
            }
            printToken(token);
            putVector(token);
            break;
        // '/' either starts a comment ("//" or "/*") or is division.
        case '/':
            token = parseComment(inch);
            if (token.size() == 1) {
                printToken(token);
                // FIX: the bare '/' operator was printed to the console but
                // never written to the output file, unlike every other token.
                putVector(token);
            }
            // A full comment is discarded; token is cleared at loop top.
            break;
        // String or character literal.
        case '\'':
        case '\"':
            token = parseSentence(inch);
            printToken(token);
            putVector(token);
            break;
        default:
            if (isdigit(inch)) {
                token = parseNumber(inch);      // numeric literal
                printToken(token);
                putVector(token);
            } else if (isword(inch)) {
                token = parseWord(inch);        // identifier/keyword
                printToken(token);
                putVector(token);
            } else {
                parse_error("Illegal character", inch);
            }
            break;
        }
    }
}
int main(int argc, char const *argv[])
{
if (argc != 2) {
cout << "Please give a file name when running." << endl;
return 0;
}
ifile.open(argv[1], ios::in);
ofile.open(((string) argv[1] + ".txt").c_str(), ios::out);
getToken();
ifile.close();
ofile.close();
return 0;
}
I think this has some usefulness, although it's neither a state machine nor a tokenizer. For special situations you simply jump into a separate function that operates on its own and then returns to the previous place.
It's also not really a tokenizer, because in C++ sequences like "==" or "+=" are single tokens, while your program reports them as two separate single-character tokens. Additionally, if you are going to parse the original source file (i.e. not the preprocessed one), and you tokenize preprocessor directives the same way as the rest of the language, you need to treat the end-of-line character inside a preprocessor directive as a token, unlike EOLs found elsewhere in the source. And this is where a state machine can be used: change the state once the # character is found, and with it the way things are interpreted. In that state, for example, the < ... > expressions should also be single tokens, just like strings.