In this note I present a C++ tokenizer built with a finite-state-machine approach; it extracts tokens from a C++ source program.
I may have omitted some situations — if you find one, please leave a comment and let me know.
#include <iostream>
#include <cctype>
#include <vector>
using namespace std;
vector<int> parseNumber(int start) {
int number;
vector<int> token (1, start);
while (isdigit(cin.peek())) {
number = getchar();
token.push_back(number);
}
return token;
}
// True when `inch` may appear in an identifier: letters, digits, or '_'.
bool isword(int inch) {
    return isalnum(inch) || inch == '_';
}
vector<int> parseWord(int start) {
int word;
vector<int> token (1, start);
while (isword(cin.peek())) {
word = getchar();
token.push_back(word);
}
return token;
}
// Reads the rest of a comment from standard input. `start` is the '/'
// already consumed by the caller. Returns "//..." (up to but excluding the
// end of line), "/*...*/" (including the terminator), or just {'/'} when no
// comment follows (plain division operator).
// Fixes two defects of the goto-based original: it looped forever on EOF
// inside an unterminated comment (getchar() keeps returning EOF), and it
// mixed getchar() with cin.peek(), which is only safe under
// sync_with_stdio(true).
std::vector<int> parseComment(int start) {
    std::vector<int> token(1, start);
    int next = std::cin.peek();
    if (next == '/') {              // line comment: consume up to '\n'
        token.push_back(std::cin.get());
        int ch;
        while ((ch = std::cin.get()) != EOF && ch != '\n') {
            token.push_back(ch);
        }
    } else if (next == '*') {       // block comment: consume up to "*/"
        token.push_back(std::cin.get());
        int prev = 0;
        int ch;
        while ((ch = std::cin.get()) != EOF) {
            token.push_back(ch);
            if (prev == '*' && ch == '/') {
                break;              // terminator found
            }
            prev = ch;
        }
    }
    return token;                   // size 1 => bare '/'
}
// Reads a string or character literal from standard input. `start` is the
// opening quote, already consumed; the same character closes the literal.
// Backslash escapes are tracked properly: any escaped character (including
// the quote) does not terminate the literal. This fixes the original
// two-character lookback, which misread literals such as "\\\"" (escaped
// backslash followed by escaped quote), and adds an EOF guard so an
// unterminated literal no longer loops forever.
std::vector<int> parseSentence(int start) {
    std::vector<int> token(1, start);
    bool escaped = false;
    int ch;
    while ((ch = std::cin.get()) != EOF) {
        token.push_back(ch);
        if (escaped) {
            escaped = false;        // this character was escaped; consume it
        } else if (ch == '\\') {
            escaped = true;         // next character is escaped
        } else if (ch == start) {
            break;                  // unescaped closing quote
        }
    }
    return token;
}
// Reports an unrecognized character on stderr.
// A trailing newline keeps successive diagnostics on separate lines
// (the original ran them together).
void parse_error(std::string message, char ch) {
    std::cerr << message << ": " << ch << '\n';
}
vector<int> getToken() {
vector<int> token;
int inch = 0;
while ((inch = getchar()) != EOF) {
switch (inch) {
case '\t':
case '\n':
case '\r':
case ' ':
break;
case '#':
case '<':
case '>':
case '.':
case ',':
case '*':
case '=':
case '!':
case '(':
case ')':
case '[':
case ']':
case '{':
case '}':
case ';':
case '\\':
token.push_back(inch);
return token;
case '+':
case '-':
case ':':
case '|':
case '&':
token.push_back(inch);
if (cin.peek() == inch) {
inch = getchar();
token.push_back(inch);
}
return token;
case '/':
token = parseComment(inch);
if (token.size() == 1) { return token; }
else { token.clear(); continue; }
case '\'':
case '\"':
return parseSentence(inch);
default:
if (isdigit(inch)) {
return parseNumber(inch);
} else if (isword(inch)) {
return parseWord(inch);
} else {
parse_error("Illegal character", inch);
}
}
}
return token;
}
void printTokens()
{
cout << "-------------------------" << endl;
vector<int> token;
while((token = getToken()).size() != 0) {
for (int i = 0; i < token.size(); i++) {
cout << char(token[i]);
}
cout << endl;
}
cout << "-------------------------" << endl;
}
// Entry point: run the interactive tokenizer over standard input.
int main()
{
    printTokens();
    return 0;
}
#include <iostream>
#include <fstream>
#include <string>
#include <cctype>
#include <vector>
using namespace std;
// File streams shared by the whole tokenizer (opened and closed in main()):
// `ifile` reads the source file; `ofile` receives one token per line.
ifstream ifile;
ofstream ofile;
// Echoes a single token to stdout, followed by a newline.
void printToken(vector<int> token) {
    for (vector<int>::size_type i = 0; i != token.size(); ++i) {
        cout << static_cast<char>(token[i]);
    }
    cout << endl;
}
// Writes a single token to the output file, terminated by a newline.
void putVector(vector<int> token) {
    for (vector<int>::size_type i = 0; i != token.size(); ++i) {
        ofile.put(static_cast<char>(token[i]));
    }
    ofile.put('\n');
}
// Collects the remaining digits of a numeric literal from `ifile`;
// `start` is the first digit, already consumed by the caller.
// Returns the whole token as character codes.
vector<int> parseNumber(int start) {
    vector<int> digits(1, start);
    for (;;) {
        if (!isdigit(ifile.peek())) {
            break;
        }
        digits.push_back(ifile.get());
    }
    return digits;
}
// An identifier character is either an underscore or alphanumeric.
bool isword(int inch) {
    if (inch == '_') {
        return true;
    }
    return isalnum(inch) != 0;
}
// Collects the remainder of an identifier or keyword from `ifile`;
// `start` is its first character, already consumed by the caller.
vector<int> parseWord(int start) {
    vector<int> letters(1, start);
    for (int next = ifile.peek(); isword(next); next = ifile.peek()) {
        letters.push_back(ifile.get());
    }
    return letters;
}
// Reads the rest of a comment from `ifile`. `start` is the '/' already
// consumed by the caller. Returns "//..." (up to but excluding the end of
// line), "/*...*/" (including the terminator), or just {'/'} when no
// comment follows (plain division operator).
// Fixes the goto-based original, which looped forever at EOF inside an
// unterminated comment (ifile.get() keeps returning EOF).
vector<int> parseComment(int start) {
    vector<int> token(1, start);
    int next = ifile.peek();
    if (next == '/') {              // line comment: consume up to '\n'
        token.push_back(ifile.get());
        int ch;
        while ((ch = ifile.get()) != EOF && ch != '\n') {
            token.push_back(ch);
        }
    } else if (next == '*') {       // block comment: consume up to "*/"
        token.push_back(ifile.get());
        int prev = 0;
        int ch;
        while ((ch = ifile.get()) != EOF) {
            token.push_back(ch);
            if (prev == '*' && ch == '/') {
                break;              // terminator found
            }
            prev = ch;
        }
    }
    return token;                   // size 1 => bare '/'
}
// Reads a string or character literal from `ifile`. `start` is the opening
// quote, already consumed; the same character closes the literal.
// Backslash escapes are tracked properly: any escaped character (including
// the quote) does not terminate the literal. This fixes the original
// two-character lookback, which misread literals such as "\\\"" (escaped
// backslash followed by escaped quote), and adds an EOF guard so an
// unterminated literal no longer loops forever.
vector<int> parseSentence(int start) {
    vector<int> token(1, start);
    bool escaped = false;
    int ch;
    while ((ch = ifile.get()) != EOF) {
        token.push_back(ch);
        if (escaped) {
            escaped = false;        // this character was escaped; consume it
        } else if (ch == '\\') {
            escaped = true;         // next character is escaped
        } else if (ch == start) {
            break;                  // unescaped closing quote
        }
    }
    return token;
}
// Reports an unrecognized character on stderr, showing its numeric code
// (the character may be unprintable). A trailing newline keeps successive
// diagnostics on separate lines (the original ran them together).
void parse_error(std::string message, char ch) {
    std::cerr << message << ": " << (int) ch << '\n';
}
// Tokenizes the whole input: reads characters from the global `ifile`,
// echoes each token to stdout (printToken) and writes it to the output
// file (putVector), one token per line. Whitespace is skipped; comments
// are consumed and discarded.
void getToken() {
    vector<int> token;
    int inch = 0;
    while ((inch = ifile.get()) != EOF) {
        token.clear();
        switch (inch) {
        // Whitespace separates tokens but is not itself a token.
        case '\t':
        case '\n':
        case '\r':
        case ' ':
            break;
        // Single-character punctuation: the character is the whole token.
        case '#':
        case '<':
        case '>':
        case '.':
        case ',':
        case '*':
        case '=':
        case '!':
        case '(':
        case ')':
        case '[':
        case ']':
        case '{':
        case '}':
        case ';':
        case '\\':
            token.push_back(inch);
            printToken(token);
            putVector(token);
            break;
        // Characters that may double up: "++", "--", "::", "||", "&&".
        case '+':
        case '-':
        case ':':
        case '|':
        case '&':
            token.push_back(inch);
            if (ifile.peek() == inch) {
                token.push_back(ifile.get());
            }
            printToken(token);
            putVector(token);
            break;
        // '/' either starts a comment ("//" or "/*") or is division.
        case '/':
            token = parseComment(inch);
            if (token.size() == 1) {
                printToken(token);
                // FIX: the bare '/' operator was printed to the console but
                // never written to the output file, unlike every other token.
                putVector(token);
            }
            // A full comment is discarded; token is cleared at loop top.
            break;
        // String or character literal.
        case '\'':
        case '\"':
            token = parseSentence(inch);
            printToken(token);
            putVector(token);
            break;
        default:
            if (isdigit(inch)) {
                token = parseNumber(inch);      // numeric literal
                printToken(token);
                putVector(token);
            } else if (isword(inch)) {
                token = parseWord(inch);        // identifier/keyword
                printToken(token);
                putVector(token);
            } else {
                parse_error("Illegal character", inch);
            }
            break;
        }
    }
}
int main(int argc, char const *argv[])
{
if (argc != 2) {
cout << "Please give a file name when running." << endl;
return 0;
}
ifile.open(argv[1], ios::in);
ofile.open(((string) argv[1] + ".txt").c_str(), ios::out);
getToken();
ifile.close();
ofile.close();
return 0;
}
I think this has some usefulness, although it's neither a state machine nor a tokenizer. For special situations you simply jump into a separate function that operates on its own and then returns to the previous place.
It's also not really a tokenizer, because in C++ sequences like "==" or "+=" are single tokens, while your program reports them as two separate single-character tokens. Additionally, if you are going to parse the original source file (i.e. not the preprocessed one), and you tokenize preprocessor directives the same way as the rest of the language, you need to treat the end-of-line character inside a preprocessor directive as a token, unlike EOLs found elsewhere in the source. And this is where a state machine can be used: change the state once the # character is found, and with it the way things are interpreted. In that state, for example, the < ... > expressions should also be single tokens, just like strings.