Skip to content

Instantly share code, notes, and snippets.

@LovelyBuggies
Last active November 19, 2023 15:50
Show Gist options
  • Save LovelyBuggies/6dafc6f3ad6d27c17d8c0dd81d23a1b8 to your computer and use it in GitHub Desktop.
Save LovelyBuggies/6dafc6f3ad6d27c17d8c0dd81d23a1b8 to your computer and use it in GitHub Desktop.
Tokenizer for C++ Program

In this note, I wrote a C++ tokenizer using a finite-state machine, which can extract tokens from a C++ program.

I may have omitted some cases; if you find one, please leave a comment and let me know.

Tokenize c++ std input

#include <iostream>
#include <cctype>
#include <vector>
using namespace std;


vector<int> parseNumber(int start) {
	int number;
	vector<int> token (1, start);
	while (isdigit(cin.peek())) {
		number = getchar();
		token.push_back(number);
	}
	return token;
}


// A character belongs to an identifier if it is alphanumeric or '_'.
bool isword(int inch) {
	return isalnum(inch) != 0 || inch == '_';
}


vector<int> parseWord(int start) {
	int word;
	vector<int> token (1, start);
	while (isword(cin.peek())) {
		word = getchar();
		token.push_back(word);
	}
	return token;
}


vector<int> parseComment(int start) {
	int comment;
	vector<int> token (1, start);
	if (cin.peek() == '/') {
		comment = getchar();
		token.push_back(comment);
		while ((comment = getchar()) != '\n') {
			token.push_back(comment);
		}
	} else if (cin.peek() == '*') {
		comment = getchar();
		token.push_back(comment);
		PARSING_COMMENT: while ((comment = getchar()) != '*') {
			token.push_back(comment);
		}
		if (cin.peek() == '/') {
			token.push_back(comment);
			comment = getchar();
			token.push_back(comment);
		} else {
			token.push_back(comment);
			goto PARSING_COMMENT;
		}
	}
	return token;
}


vector<int> parseSentence(int start) {
	int sentence;
	vector<int> token (1, start);
	PARSING_SENTENCE: while ((sentence = getchar()) != start) {
		token.push_back(sentence);
	}
	if (token.back() == '\\' && token[token.size() - 2] != '\\') {
		token.push_back(sentence);
		goto PARSING_SENTENCE;
	}
	token.push_back(sentence);
	return token;
}


void parse_error(string message, char ch) {
	cerr << message << ": " << ch;
}


vector<int> getToken() {
	vector<int> token;
	int inch = 0;
	while ((inch = getchar()) != EOF) {
		switch (inch) {
			case '\t':
			case '\n':
			case '\r':
			case ' ':
				break;
			case '#':
			case '<':
			case '>':
			case '.':
			case ',':
			case '*':
			case '=':
			case '!':
			case '(':
			case ')':
			case '[':
			case ']':
			case '{':
			case '}': 
			case ';':
			case '\\':
				token.push_back(inch);
				return token;
			case '+':
			case '-':
			case ':':
			case '|':
			case '&':
				token.push_back(inch);
				if (cin.peek() == inch) {
					inch = getchar();
					token.push_back(inch);
				}
				return token;
			case '/':
				token = parseComment(inch);
				if (token.size() == 1) { return token; }
				else { token.clear(); continue; }
			case '\'':
			case '\"':
				return parseSentence(inch);
			default:
				if (isdigit(inch)) {
					return parseNumber(inch);
				} else if (isword(inch)) {
					return parseWord(inch);
				} else {
					parse_error("Illegal character", inch);
				}
		}
	}
	return token;
}


void printTokens()
{
	cout << "-------------------------" << endl;
	vector<int> token;
	while((token = getToken()).size() != 0) {
		for (int i = 0; i < token.size(); i++) {
			cout << char(token[i]);
		}
		cout << endl;
	}
	cout << "-------------------------" << endl;
}


// Entry point: tokenize standard input and echo every token found.
int main()
{
	printTokens();
	return 0;
}

Tokenize c++ file input

#include <iostream>
#include <fstream>
#include <string>
#include <cctype>
#include <vector>
using namespace std;


ifstream ifile;  // source file being tokenized (opened in main)
ofstream ofile;  // output file "<input>.txt" receiving one token per line


void printToken(vector<int> token) {
	for (int i = 0; i < token.size(); i++) {
		cout << (char) token[i];
	}
	cout << endl;
}


void putVector(vector<int> token) {
	for (int i = 0; i < token.size(); i++) {
		ofile.put((char) token[i]);
	}
	ofile.put('\n');
}


vector<int> parseNumber(int start) {
	int number;
	vector<int> token (1, start);
	while (isdigit(ifile.peek())) {
		number = ifile.get();
		token.push_back(number);
	}
	return token;
}


// A character belongs to an identifier if it is alphanumeric or '_'.
bool isword(int inch) {
	return isalnum(inch) != 0 || inch == '_';
}


vector<int> parseWord(int start) {
	int word;
	vector<int> token (1, start);
	while (isword(ifile.peek())) {
		word = ifile.get();
		token.push_back(word);
	}
	return token;
}


vector<int> parseComment(int start) {
	int comment;
	vector<int> token (1, start);
	if (ifile.peek() == '/') {
		comment = ifile.get();
		token.push_back(comment);
		while ((comment = ifile.get()) != '\n') {
			token.push_back(comment);
		}
	} else if (ifile.peek() == '*') {
		comment = ifile.get();
		token.push_back(comment);
		PARSING_COMMENT: while ((comment = ifile.get()) != '*') {
			token.push_back(comment);
		}
		if (ifile.peek() == '/') {
			token.push_back(comment);
			comment = ifile.get();
			token.push_back(comment);
		} else {
			token.push_back(comment);
			goto PARSING_COMMENT;
		}
	}
	return token;
}


vector<int> parseSentence(int start) {
	int sentence;
	vector<int> token (1, start);
	PARSING_SENTENCE: while ((sentence = ifile.get()) != start) {
		token.push_back(sentence);
	}
	if (token.back() == '\\' && token[token.size() - 2] != '\\') {
		token.push_back(sentence);
		goto PARSING_SENTENCE;
	}
	token.push_back(sentence);
	return token;
}


void parse_error(string message, char ch) {
	cerr << message << ": " << (int) ch;
}


/// Tokenizes the whole input file: each token is printed to stdout and
/// written to the output file. Whitespace is skipped; comments are
/// consumed and discarded. Illegal characters are reported on stderr.
/// Fix: a lone '/' (division operator) was printed to stdout but never
/// written to the output file — it now goes through putVector like every
/// other token. The unreachable `break` after `continue` was removed.
void getToken() {
	std::vector<int> token;
	int inch = 0;
	while ((inch = ifile.get()) != EOF) {
		token.clear();
		switch (inch) {
			// Whitespace separates tokens but is not itself a token.
			case '\t':
			case '\n':
			case '\r':
			case ' ':
				break;
			// Single-character tokens.
			case '#':
			case '<':
			case '>':
			case '.':
			case ',':
			case '*':
			case '=':
			case '!':
			case '(':
			case ')':
			case '[':
			case ']':
			case '{':
			case '}':
			case ';':
			case '\\':
				token.push_back(inch);
				printToken(token);
				putVector(token);
				break;
			// Characters that may double up: ++ -- :: || &&.
			case '+':
			case '-':
			case ':':
			case '|':
			case '&':
				token.push_back(inch);
				if (ifile.peek() == inch) {
					token.push_back(ifile.get());
				}
				printToken(token);
				putVector(token);
				break;
			case '/':
				// Either a comment (size > 1, discarded) or the division
				// operator (size == 1, emitted like any other token).
				token = parseComment(inch);
				if (token.size() == 1) {
					printToken(token);
					putVector(token);  // fix: was printed but never written
				}
				break;
			case '\'':
			case '\"':
				token = parseSentence(inch);
				printToken(token);
				putVector(token);
				break;
			default:
				if (isdigit(inch)) {
					token = parseNumber(inch);
					printToken(token);
					putVector(token);
				} else if (isword(inch)) {
					token = parseWord(inch);
					printToken(token);
					putVector(token);
				} else {
					parse_error("Illegal character", inch);
				}
				break;
		}
	}
}


int main(int argc, char const *argv[])
{
	if (argc != 2) {
		cout << "Please give a file name when running." << endl;
		return 0;
	}
	ifile.open(argv[1], ios::in);
	ofile.open(((string) argv[1] + ".txt").c_str(), ios::out);
	getToken();
	ifile.close();
	ofile.close();
	return 0;
}
@ethouris
Copy link

I think this has some usefulness, although it's neither a state machine, nor a tokenizer. For handling special situations you simply jump into a separate function that operates on its own, then returns to the previous place.

Also it's not really a tokenizer because in C++ sequences like "==" or "+=" are single tokens, while your program reports them by two separate single characters. Additionally, if you are going to parse the original source file (i.e. not preprocessed), and you tokenize preprocessor parts equally to the rest of the language, you need to treat the EOL character in the preprocessor directives as a token, unlike EOLs found in the rest of the source. And here is where the state machine can be used: change the state once found the # character and the way how things are interpreted. There, for example, the < ... > expressions should be also single tokens, just like strings.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment