A pretty simple and rudimentary tokenizer, written in Java, using a rather non-intuitive java.util.Scanner management model (one Scanner per token type).
package flisboac;

import java.io.IOException;
import java.util.Scanner;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.regex.Pattern;

public class Tokenizer {

    /** Base class for every error raised during tokenization. */
    public static class TokenizerException extends Exception {
    }

    /** Thrown when the input contains text that no token type can match. */
    public static class AlphabetException extends TokenizerException {
    }

    /** Matches any (possibly empty) run of leading whitespace before a token. */
    public static final String AnyWhitespaceRegex = "\\p{javaWhitespace}*";
    /** The token types known to the tokenizer, each compiled to its own pattern. */
    public enum TokenType {
        Identifier("id", "[a-zA-Z_][a-zA-Z_0-9]*"),
        // '-' is escaped so it is matched literally instead of forming a character range.
        ArithmeticOperator("op", "[=+\\-/*]"),
        OpeningParens("par", "\\("),
        ClosingParens("par", "\\)"),
        NumericLiteral("lit", "[0-9]+");

        private final String name;
        private final String regex;
        private final Pattern pattern;

        private TokenType(String name, String regex) {
            this.name = name;
            this.regex = regex;
            this.pattern = Pattern.compile(regex);
        }

        public String getName() {
            return name;
        }

        public String getRegex() {
            return regex;
        }

        public Pattern getPattern() {
            return pattern;
        }
    }
    public static class Token {
        private TokenType tokenType;
        private String parsedText;
        private Integer identifierIndex;
        private String asString;
        private Double asDouble;

        public Token() {
        }

        public Token(TokenType tokenType, String parsedText) {
            this(tokenType, parsedText, null);
        }

        public Token(TokenType tokenType, String parsedText, Integer identifierIndex) {
            this.tokenType = tokenType;
            this.parsedText = parsedText;
            this.identifierIndex = identifierIndex;
            switch (this.tokenType) {
            case NumericLiteral:
                this.asString = parsedText;
                this.asDouble = Double.parseDouble(this.parsedText);
                break;
            default:
                this.asString = parsedText;
                break;
            }
        }

        public TokenType getTokenType() {
            return tokenType;
        }

        public String getParsedText() {
            return parsedText;
        }

        public Integer getIdentifierIndex() {
            return identifierIndex;
        }

        public String getAsString() {
            return asString;
        }

        public Double getAsDouble() {
            return asDouble;
        }
    }
    /**
     * Among the scanners that matched, returns the entry whose match starts at the
     * very beginning of the remaining input and ends earliest (the shortest match),
     * or null if no scanner matched at the current position.
     */
    public <K, V extends Scanner> Map.Entry<K, V> findSmallestMatch(Map<K, V> matches) {
        Map.Entry<K, V> smallest = null;
        for (Map.Entry<K, V> match : matches.entrySet()) {
            if (match.getValue().match().start() == 0) {
                if (smallest == null
                        || smallest.getValue().match().end() > match.getValue().match().end()) {
                    smallest = match;
                }
            }
        }
        return smallest;
    }
    /** Splits the expression into tokens, numbering identifiers by first appearance. */
    public List<Token> scan(String expression) throws TokenizerException, IOException {
        expression = expression.trim();
        int inputIndex = 0;
        List<Token> tokens = new ArrayList<>();
        Map<String, Integer> identifiers = new HashMap<>();
        while (inputIndex < expression.length()) {
            String subInput = expression.substring(inputIndex);
            // One Scanner per token type; keep only the scanners that found a match.
            Map<TokenType, Scanner> scanners = new HashMap<>();
            for (TokenType type : TokenType.values()) {
                String regex = AnyWhitespaceRegex + type.getRegex();
                Scanner typeScanner = new Scanner(subInput);
                String tokenText = typeScanner.findWithinHorizon(regex, 0);
                if (tokenText != null) {
                    scanners.put(type, typeScanner);
                } else {
                    typeScanner.close();
                }
            }
            Map.Entry<TokenType, Scanner> smallestMatch = findSmallestMatch(scanners);
            if (smallestMatch == null) {
                // Nothing matched at the current position: the input is not in the alphabet.
                throw new AlphabetException();
            } else {
                TokenType type = smallestMatch.getKey();
                String tokenText = smallestMatch.getValue().match().group().trim();
                Integer identifierIndex = null;
                inputIndex += smallestMatch.getValue().match().end();
                if (type.equals(TokenType.Identifier)) {
                    // Identifiers are numbered in order of first appearance, starting at 1.
                    if (identifiers.containsKey(tokenText)) {
                        identifierIndex = identifiers.get(tokenText);
                    } else {
                        identifierIndex = identifiers.size() + 1;
                        identifiers.put(tokenText, identifierIndex);
                    }
                }
                Token token = new Token(type, tokenText, identifierIndex);
                tokens.add(token);
            }
            // Release the scanners created for this position.
            for (Scanner scanner : scanners.values()) {
                scanner.close();
            }
        }
        return tokens;
    }
    public static void main(String[] args) {
        StringBuilder buffer = new StringBuilder();
        for (String arg : args) {
            buffer.append(arg);
            buffer.append(" ");
        }
        try {
            Tokenizer tokenizer = new Tokenizer();
            List<Token> tokens = tokenizer.scan(buffer.toString());
            for (Token token : tokens) {
                System.out.print("<");
                System.out.print(token.getTokenType().getName());
                System.out.print(":");
                Integer identifier = token.getIdentifierIndex();
                if (identifier != null) {
                    System.out.print(identifier);
                } else {
                    System.out.print(token.getAsString());
                }
                System.out.print(">");
            }
            System.out.println();
        } catch (TokenizerException ex) {
            System.err.println("Invalid syntax.");
            System.exit(1);
        } catch (IOException ex) {
            System.err.println("Error while reading input.");
            System.exit(2);
        }
        System.exit(0);
    }
}
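
As a quick usage sketch (file and package layout as in the listing above; the exact invocation depends on how you compile it):

    javac flisboac/Tokenizer.java
    java flisboac.Tokenizer "x = x + 1"

Each token is printed as <name:value>, with identifiers replaced by their first-appearance index, so the command above should print:

    <id:1><op:=><id:1><op:+><lit:1>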