Skip to content

Instantly share code, notes, and snippets.

@coderodde
Last active November 15, 2023 10:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save coderodde/46bea3aa979bbd4e291b55aa43aeeafe to your computer and use it in GitHub Desktop.
Save coderodde/46bea3aa979bbd4e291b55aa43aeeafe to your computer and use it in GitHub Desktop.
Regex parsing research
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.List;
class RegexParserResearch {
public static Deque<Token> shuntingYardAlgorithm(String regex) {
List<Token> tokens = tokenize(regex);
Deque<Token> output = new ArrayDeque<>();
Deque<Token> operatorStack = new ArrayDeque<>();
for (Token token : tokens) {
switch (token.getTokenType()) {
case CHARACTER:
case KLEEN_STAR:
output.addLast(token);
break;
case CONCATENATION:
processConcatenation(token,
output,
operatorStack);
break;
case UNION:
processUnion(token,
output,
operatorStack);
break;
case LEFT_PARENTHESIS:
operatorStack.addLast(token);
break;
case RIGHT_PARENTHESIS:
processRightParenthesis(output, operatorStack);
break;
}
}
while (!operatorStack.isEmpty()) {
Token topToken = operatorStack.removeLast();
if (topToken.getTokenType() == TokenType.LEFT_PARENTHESIS) {
throw new IllegalStateException(
"Left parenthesis not expected here.");
}
output.addLast(topToken);
}
return output;
}
private static void processConcatenation(Token token,
Deque<Token> output,
Deque<Token> operatorStack) {
int tokenPrecedence = getOperatorPrecedence(token);
while (!operatorStack.isEmpty()) {
Token topToken = operatorStack.getLast();
if (topToken.getTokenType() == TokenType.LEFT_PARENTHESIS) {
break;
}
int topTokenPrecedence = getOperatorPrecedence(topToken);
if (topTokenPrecedence <= tokenPrecedence) {
break;
}
output.addLast(operatorStack.removeLast());
}
operatorStack.addLast(token);
}
private static void processUnion(Token token,
Deque<Token> output,
Deque<Token> operatorStack) {
int tokenPrecedence = getOperatorPrecedence(token);
while (!operatorStack.isEmpty()) {
Token topToken = operatorStack.getLast();
if (topToken.getTokenType() == TokenType.LEFT_PARENTHESIS) {
break;
}
int topTokenPrecedence = getOperatorPrecedence(token);
if (topTokenPrecedence <= tokenPrecedence) {
break;
}
output.addLast(operatorStack.removeLast());
}
operatorStack.addLast(token);
}
private static void
processRightParenthesis(
Deque<Token> output,
Deque<Token> operatorStack) {
while (!operatorStack.isEmpty()) {
Token topToken = operatorStack.getLast();
if (topToken.getTokenType() == TokenType.LEFT_PARENTHESIS) {
break;
}
output.addLast(operatorStack.removeLast());
}
if (operatorStack.isEmpty()) {
throw new IllegalStateException("Operator stack is empty.");
}
Token topToken = operatorStack.removeLast();
if (topToken.getTokenType() != TokenType.LEFT_PARENTHESIS) {
throw new IllegalStateException(
"Left parenthesis expected at the top of operator stack.");
}
}
private static int getOperatorPrecedence(Token token) {
switch (token.getTokenType()) {
case KLEEN_STAR:
return 2;
case CONCATENATION:
return 1;
case UNION:
return 0;
default:
throw new IllegalStateException("Should not get here.");
}
}
public static List<Character> parse(String regex) {
List<Token> tokens = tokenize(regex);
System.out.println(tokens);
return null;
}
private static List<Token> tokenize(String regex) {
List<Token> tokens = new ArrayList<>();
char previousCharacter = '\0';
for (char ch : regex.toCharArray()) {
switch (ch) {
case '*':
tokens.add(new Token(TokenType.KLEEN_STAR));
break;
case '|':
tokens.add(new Token(TokenType.UNION));
break;
case '(':
if (isTextCharacter(previousCharacter)) {
tokens.add(new Token(TokenType.CONCATENATION));
} else if (previousCharacter == '*') {
tokens.add(new Token(TokenType.CONCATENATION));
} else if (previousCharacter == ')') {
tokens.add(new Token(TokenType.CONCATENATION));
}
tokens.add(new Token(TokenType.LEFT_PARENTHESIS));
break;
case ')':
tokens.add(new Token(TokenType.RIGHT_PARENTHESIS));
break;
default:
// Once here, the ch is an alphabet character:
if (isTextCharacter(previousCharacter)) {
tokens.add(new Token(TokenType.CONCATENATION));
} else if (previousCharacter == '*') {
tokens.add(new Token(TokenType.CONCATENATION));
}
tokens.add(new Token(TokenType.CHARACTER, ch));
break;
}
previousCharacter = ch;
}
return tokens;
}
private static boolean isTextCharacter(char ch) {
switch (ch) {
case '*':
case '|':
case '(':
case ')':
case '\0':
return false;
default:
return true;
}
}
public static void main(String[] args) {
// parse("a|bc*d");
// parse("a|b(cd)*(cda)*");
// Deque<Token> postfix = shuntingYardAlgorithm("a|b");
// System.out.println(postfix);
//
// postfix = shuntingYardAlgorithm("(a|b)*");
// System.out.println(postfix);
// List<Token> tokens = tokenize("(a|b)*(cd*)*");
// System.out.println(tokens);
List<Token> tokens = tokenize("(a|bc)");
System.out.println(tokens);
Deque<Token> postfix = shuntingYardAlgorithm("((a|(bc)))");
System.out.println(postfix);
}
}
enum TokenType {
KLEEN_STAR,
UNION,
CONCATENATION,
CHARACTER,
LEFT_PARENTHESIS,
RIGHT_PARENTHESIS;
}
class Token {
TokenType tokenType;
Character character;
Token(TokenType tokenType, Character character) {
this.tokenType = tokenType;
this.character = character;
}
Token(TokenType tokenType) {
this.tokenType = tokenType;
}
TokenType getTokenType() {
return tokenType;
}
public String toString() {
switch (tokenType) {
case CHARACTER:
return "" + character;
case CONCATENATION:
return "o";
case KLEEN_STAR:
return "*";
case LEFT_PARENTHESIS:
return "(";
case RIGHT_PARENTHESIS:
return ")";
case UNION:
return "|";
default:
throw new IllegalStateException("Unknown token type.");
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment