Last active
November 15, 2023 10:31
-
-
Save coderodde/46bea3aa979bbd4e291b55aa43aeeafe to your computer and use it in GitHub Desktop.
Regex parsing research
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.ArrayDeque; | |
import java.util.ArrayList; | |
import java.util.Deque; | |
import java.util.List; | |
class RegexParserResearch { | |
public static Deque<Token> shuntingYardAlgorithm(String regex) { | |
List<Token> tokens = tokenize(regex); | |
Deque<Token> output = new ArrayDeque<>(); | |
Deque<Token> operatorStack = new ArrayDeque<>(); | |
for (Token token : tokens) { | |
switch (token.getTokenType()) { | |
case CHARACTER: | |
case KLEEN_STAR: | |
output.addLast(token); | |
break; | |
case CONCATENATION: | |
processConcatenation(token, | |
output, | |
operatorStack); | |
break; | |
case UNION: | |
processUnion(token, | |
output, | |
operatorStack); | |
break; | |
case LEFT_PARENTHESIS: | |
operatorStack.addLast(token); | |
break; | |
case RIGHT_PARENTHESIS: | |
processRightParenthesis(output, operatorStack); | |
break; | |
} | |
} | |
while (!operatorStack.isEmpty()) { | |
Token topToken = operatorStack.removeLast(); | |
if (topToken.getTokenType() == TokenType.LEFT_PARENTHESIS) { | |
throw new IllegalStateException( | |
"Left parenthesis not expected here."); | |
} | |
output.addLast(topToken); | |
} | |
return output; | |
} | |
private static void processConcatenation(Token token, | |
Deque<Token> output, | |
Deque<Token> operatorStack) { | |
int tokenPrecedence = getOperatorPrecedence(token); | |
while (!operatorStack.isEmpty()) { | |
Token topToken = operatorStack.getLast(); | |
if (topToken.getTokenType() == TokenType.LEFT_PARENTHESIS) { | |
break; | |
} | |
int topTokenPrecedence = getOperatorPrecedence(topToken); | |
if (topTokenPrecedence <= tokenPrecedence) { | |
break; | |
} | |
output.addLast(operatorStack.removeLast()); | |
} | |
operatorStack.addLast(token); | |
} | |
private static void processUnion(Token token, | |
Deque<Token> output, | |
Deque<Token> operatorStack) { | |
int tokenPrecedence = getOperatorPrecedence(token); | |
while (!operatorStack.isEmpty()) { | |
Token topToken = operatorStack.getLast(); | |
if (topToken.getTokenType() == TokenType.LEFT_PARENTHESIS) { | |
break; | |
} | |
int topTokenPrecedence = getOperatorPrecedence(token); | |
if (topTokenPrecedence <= tokenPrecedence) { | |
break; | |
} | |
output.addLast(operatorStack.removeLast()); | |
} | |
operatorStack.addLast(token); | |
} | |
private static void | |
processRightParenthesis( | |
Deque<Token> output, | |
Deque<Token> operatorStack) { | |
while (!operatorStack.isEmpty()) { | |
Token topToken = operatorStack.getLast(); | |
if (topToken.getTokenType() == TokenType.LEFT_PARENTHESIS) { | |
break; | |
} | |
output.addLast(operatorStack.removeLast()); | |
} | |
if (operatorStack.isEmpty()) { | |
throw new IllegalStateException("Operator stack is empty."); | |
} | |
Token topToken = operatorStack.removeLast(); | |
if (topToken.getTokenType() != TokenType.LEFT_PARENTHESIS) { | |
throw new IllegalStateException( | |
"Left parenthesis expected at the top of operator stack."); | |
} | |
} | |
private static int getOperatorPrecedence(Token token) { | |
switch (token.getTokenType()) { | |
case KLEEN_STAR: | |
return 2; | |
case CONCATENATION: | |
return 1; | |
case UNION: | |
return 0; | |
default: | |
throw new IllegalStateException("Should not get here."); | |
} | |
} | |
public static List<Character> parse(String regex) { | |
List<Token> tokens = tokenize(regex); | |
System.out.println(tokens); | |
return null; | |
} | |
private static List<Token> tokenize(String regex) { | |
List<Token> tokens = new ArrayList<>(); | |
char previousCharacter = '\0'; | |
for (char ch : regex.toCharArray()) { | |
switch (ch) { | |
case '*': | |
tokens.add(new Token(TokenType.KLEEN_STAR)); | |
break; | |
case '|': | |
tokens.add(new Token(TokenType.UNION)); | |
break; | |
case '(': | |
if (isTextCharacter(previousCharacter)) { | |
tokens.add(new Token(TokenType.CONCATENATION)); | |
} else if (previousCharacter == '*') { | |
tokens.add(new Token(TokenType.CONCATENATION)); | |
} else if (previousCharacter == ')') { | |
tokens.add(new Token(TokenType.CONCATENATION)); | |
} | |
tokens.add(new Token(TokenType.LEFT_PARENTHESIS)); | |
break; | |
case ')': | |
tokens.add(new Token(TokenType.RIGHT_PARENTHESIS)); | |
break; | |
default: | |
// Once here, the ch is an alphabet character: | |
if (isTextCharacter(previousCharacter)) { | |
tokens.add(new Token(TokenType.CONCATENATION)); | |
} else if (previousCharacter == '*') { | |
tokens.add(new Token(TokenType.CONCATENATION)); | |
} | |
tokens.add(new Token(TokenType.CHARACTER, ch)); | |
break; | |
} | |
previousCharacter = ch; | |
} | |
return tokens; | |
} | |
private static boolean isTextCharacter(char ch) { | |
switch (ch) { | |
case '*': | |
case '|': | |
case '(': | |
case ')': | |
case '\0': | |
return false; | |
default: | |
return true; | |
} | |
} | |
public static void main(String[] args) { | |
// parse("a|bc*d"); | |
// parse("a|b(cd)*(cda)*"); | |
// Deque<Token> postfix = shuntingYardAlgorithm("a|b"); | |
// System.out.println(postfix); | |
// | |
// postfix = shuntingYardAlgorithm("(a|b)*"); | |
// System.out.println(postfix); | |
// List<Token> tokens = tokenize("(a|b)*(cd*)*"); | |
// System.out.println(tokens); | |
List<Token> tokens = tokenize("(a|bc)"); | |
System.out.println(tokens); | |
Deque<Token> postfix = shuntingYardAlgorithm("((a|(bc)))"); | |
System.out.println(postfix); | |
} | |
} | |
enum TokenType { | |
KLEEN_STAR, | |
UNION, | |
CONCATENATION, | |
CHARACTER, | |
LEFT_PARENTHESIS, | |
RIGHT_PARENTHESIS; | |
} | |
class Token { | |
TokenType tokenType; | |
Character character; | |
Token(TokenType tokenType, Character character) { | |
this.tokenType = tokenType; | |
this.character = character; | |
} | |
Token(TokenType tokenType) { | |
this.tokenType = tokenType; | |
} | |
TokenType getTokenType() { | |
return tokenType; | |
} | |
public String toString() { | |
switch (tokenType) { | |
case CHARACTER: | |
return "" + character; | |
case CONCATENATION: | |
return "o"; | |
case KLEEN_STAR: | |
return "*"; | |
case LEFT_PARENTHESIS: | |
return "("; | |
case RIGHT_PARENTHESIS: | |
return ")"; | |
case UNION: | |
return "|"; | |
default: | |
throw new IllegalStateException("Unknown token type."); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment