Skip to content

Instantly share code, notes, and snippets.

@briangordon
Last active December 20, 2015 18:29
Show Gist options
  • Save briangordon/6176536 to your computer and use it in GitHub Desktop.
Save briangordon/6176536 to your computer and use it in GitHub Desktop.
This is a speed run. It's not how I would implement a more fully-featured regex engine. I was tripped up by a couple of bugs so the first one took me a total of 90 minutes to write. Later I had an idea for a simpler version that doesn't work in all cases. It took me 25 minutes to write.
import java.util.ArrayList;
import java.util.List;
public class SimplifiedRegexMatcher {
private SimplifiedRegexMatcher() {}
public static boolean match(String regex, String needle) {
// Our algorithm will fail if an open parenthesis is the first character, so add a character to the beginning of
// both the regex and the needle.
regex = '!' + regex;
needle = '!' + needle;
// Trailer characters
regex = regex + '^';
needle = needle + '^';
// Keep track of a pointer to all possible current locations in the regex
List<Integer> possibleRegexIndices = new ArrayList<>();
// If the regex is empty then it can only match an empty string
if(regex.equals("")) {
return needle.equals("");
}
// First scan forward to the first letter in the regex
int i = 0;
while(regex.charAt(i) == '(') {
i++;
}
// Start with the first letter in our possible states
possibleRegexIndices.add(i);
// Scan through the needle one character at a time
for(int needle_index = 0; needle_index < needle.length() - 1; needle_index++) {
// If we have no more options, then it must not have been a match.
if(possibleRegexIndices.size() == 0) {
return false;
}
char currentNeedle = needle.charAt(needle_index);
List<Integer> nextPossibleRegexIndices = new ArrayList<>();
for(Integer regex_index : possibleRegexIndices) {
if(regex.charAt(regex_index) != currentNeedle) {
continue;
}
// We know that we're pointing to a letter right now. What we do next depends on whether the next character
// is a parenthesis or not.
char nextRegexCharacter = regex.charAt(regex_index + 1);
if(nextRegexCharacter == '(') {
// Scan forwards to the next letter and add it
int j = regex_index + 1;
while(regex.charAt(j) == '(' || regex.charAt(j) == '*' || regex.charAt(j) == ')') {
j++;
}
nextPossibleRegexIndices.add(j);
// Scan forward to the matching end parenthesis
j = regex_index + 2;
int depth = 0;
while(regex.charAt(j) != ')' || depth > 0) {
if(regex.charAt(j) == '(') {
depth++;
} else if(regex.charAt(j) == ')') {
depth--;
}
j++;
}
// Scan forwards to the next letter and add it
while(regex.charAt(j) == '(' || regex.charAt(j) == '*' || regex.charAt(j) == ')') {
j++;
}
nextPossibleRegexIndices.add(j);
} else if(nextRegexCharacter == ')') {
// Scan forwards to the next letter and add it
int j = regex_index + 1;
while(regex.charAt(j) == '(' || regex.charAt(j) == '*' || regex.charAt(j) == ')') {
j++;
// If we go off the end of the regex then this was clearly not a valid possibility
if(j == regex.length()) {
continue;
}
}
nextPossibleRegexIndices.add(j);
// Scan backwards to the opening parenthesis
j = regex_index;
int depth = 0;
while(regex.charAt(j) != '(' || depth > 0) {
if(regex.charAt(j) == ')') {
depth++;
} else if(regex.charAt(j) == '(') {
depth--;
}
j--;
}
// Scan forwards to the next letter and add it
while(regex.charAt(j) == '(' || regex.charAt(j) == '*' || regex.charAt(j) == ')') {
j++;
}
nextPossibleRegexIndices.add(j);
} else {
// Just add the next letter
nextPossibleRegexIndices.add(regex_index + 1);
}
}
// Replace our current possible regex locations with the next pass
possibleRegexIndices = nextPossibleRegexIndices;
}
// Return true iff one possible state is the terminating character
return possibleRegexIndices.contains(regex.length() - 1);
}
private static void shouldMatch(String regex, String needle) {
System.out.println("The regex " + regex + " should match the needle " + needle + " - " + (match(regex, needle) ? "OK" : "ERROR"));
}
private static void shouldNotMatch(String regex, String needle) {
System.out.println("The regex " + regex + " should not match the needle " + needle + " - " + (!match(regex, needle) ? "OK" : "ERROR"));
}
public static void main(String[] args) {
shouldMatch("abcd", "abcd");
shouldMatch("(abc)*", "abc");
shouldMatch("(abc)*(abc)*", "abcabc");
shouldMatch("b(oo)*p", "bp");
shouldMatch("(b(an)*a)*", "bananaba");
shouldMatch("b(a(a)*)*", "b");
shouldMatch("b(a(a)*)*", "ba");
shouldMatch("b(a(a)*)*", "baa");
shouldNotMatch("abcd", "abc");
shouldNotMatch("abc", "abcd");
shouldNotMatch("b(a(c)*a)*", "bacac");
}
}
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.List;
// started 1:40
// ended 2:04
public class CheatingRegexMatcher {
// Increase this to make failure cases rarer at the cost of performance.
private static final int BRANCHING_FACTOR = 5;
private static String replaceNTimes(String str, int startIdx, int endIdx, int n) {
String firstPart = str.substring(0, startIdx);
String endPart = str.substring(endIdx + 1, str.length());
String multiplyPart = str.substring(startIdx + 1, endIdx - 1);
StringBuilder sb = new StringBuilder(firstPart.length() + (endPart.length() * n) + multiplyPart.length());
sb.append(firstPart);
for(int i = 0; i < n; i++) {
sb.append(multiplyPart);
}
sb.append(endPart);
return sb.toString();
}
public static boolean match(String regex, String str) {
if(regex.equals(str)) return true;
if(!regex.contains("*")) return false;
Deque<String> buffer = new ArrayDeque<>();
buffer.add(regex);
while(!buffer.isEmpty()) {
String curString = buffer.remove();
// Find the leftmost open parenthesis
int j = 0;
while(curString.charAt(j) != '(') {
j++;
}
// Scan to the closing parenthesis
int k = j + 1, depth = 0;
while(curString.charAt(k) != ')' || depth > 0) {
if(curString.charAt(k) == '(') {
depth++;
} else if(curString.charAt(k) == ')') {
depth--;
}
k++;
}
// Go forward one to the asterisk
k++;
// Replace (abc)* with abc, abcabc, abcabcabc, abcabcabcabc, etc.
for(int i = 0; i < BRANCHING_FACTOR; i++) {
String replaced = replaceNTimes(curString, j, k, i);
if(replaced.equals(str)) {
return true;
}
if(replaced.contains("*")) {
buffer.add(replaced);
}
}
}
return false;
}
private static void shouldMatch(String regex, String needle) {
System.out.println("The regex " + regex + " should match the needle " + needle + " - " + (match(regex, needle) ? "OK" : "ERROR"));
}
private static void shouldNotMatch(String regex, String needle) {
System.out.println("The regex " + regex + " should not match the needle " + needle + " - " + (!match(regex, needle) ? "OK" : "ERROR"));
}
public static void main(String[] args) {
shouldMatch("abcd", "abcd");
shouldMatch("(abc)*", "abc");
shouldMatch("(abc)*(abc)*", "abcabc");
shouldMatch("b(oo)*p", "bp");
shouldMatch("(b(an)*a)*", "bananaba");
shouldMatch("b(a(a)*)*", "b");
shouldMatch("b(a(a)*)*", "ba");
shouldMatch("b(a(a)*)*", "baa");
// This is going to fail because it requires too much expansion. That's the downside of cheating.
shouldMatch("(a)*", "aaaaaaaaaaa");
shouldNotMatch("abcd", "abc");
shouldNotMatch("abc", "abcd");
shouldNotMatch("b(a(c)*a)*", "bacac");
}
}
Consider the subset of regular expressions that only use lowercase letters, parentheses, and the Kleene star *.
The presence of a Kleene star means that the preceding parenthesized phrase can appear zero or more times.
Parentheses are only valid around the text preceding a Kleene star. Write a program that takes such a regular
expression and a string and determines whether or not that entire string matches the regular expression.
For example, the string on the left matches the regular expression on the right:
hello hello
testtest (test)*
helllllllo he(l)*lo
bp b(oo)*p
abcd a(bc)*(bc)*d
bananaba (b(an)*a)*
Don’t use any existing regular expression utilities.
You may assume that the regex you’re given is syntactically valid.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment