Last active
December 20, 2015 18:29
-
-
Save briangordon/6176536 to your computer and use it in GitHub Desktop.
This is a speed run. It's not how I would implement a more fully-featured regex engine. I was tripped up by a couple of bugs so the first one took me a total of 90 minutes to write. Later I had an idea for a simpler version that doesn't work in all cases. It took me 25 minutes to write.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.ArrayList; | |
import java.util.List; | |
public class SimplifiedRegexMatcher { | |
private SimplifiedRegexMatcher() {} | |
public static boolean match(String regex, String needle) { | |
// Our algorithm will fail if an open parenthesis is the first character, so add a character to the beginning of | |
// both the regex and the needle. | |
regex = '!' + regex; | |
needle = '!' + needle; | |
// Trailer characters | |
regex = regex + '^'; | |
needle = needle + '^'; | |
// Keep track of a pointer to all possible current locations in the regex | |
List<Integer> possibleRegexIndices = new ArrayList<>(); | |
// If the regex is empty then it can only match an empty string | |
if(regex.equals("")) { | |
return needle.equals(""); | |
} | |
// First scan forward to the first letter in the regex | |
int i = 0; | |
while(regex.charAt(i) == '(') { | |
i++; | |
} | |
// Start with the first letter in our possible states | |
possibleRegexIndices.add(i); | |
// Scan through the needle one character at a time | |
for(int needle_index = 0; needle_index < needle.length() - 1; needle_index++) { | |
// If we have no more options, then it must not have been a match. | |
if(possibleRegexIndices.size() == 0) { | |
return false; | |
} | |
char currentNeedle = needle.charAt(needle_index); | |
List<Integer> nextPossibleRegexIndices = new ArrayList<>(); | |
for(Integer regex_index : possibleRegexIndices) { | |
if(regex.charAt(regex_index) != currentNeedle) { | |
continue; | |
} | |
// We know that we're pointing to a letter right now. What we do next depends on whether the next character | |
// is a parenthesis or not. | |
char nextRegexCharacter = regex.charAt(regex_index + 1); | |
if(nextRegexCharacter == '(') { | |
// Scan forwards to the next letter and add it | |
int j = regex_index + 1; | |
while(regex.charAt(j) == '(' || regex.charAt(j) == '*' || regex.charAt(j) == ')') { | |
j++; | |
} | |
nextPossibleRegexIndices.add(j); | |
// Scan forward to the matching end parenthesis | |
j = regex_index + 2; | |
int depth = 0; | |
while(regex.charAt(j) != ')' || depth > 0) { | |
if(regex.charAt(j) == '(') { | |
depth++; | |
} else if(regex.charAt(j) == ')') { | |
depth--; | |
} | |
j++; | |
} | |
// Scan forwards to the next letter and add it | |
while(regex.charAt(j) == '(' || regex.charAt(j) == '*' || regex.charAt(j) == ')') { | |
j++; | |
} | |
nextPossibleRegexIndices.add(j); | |
} else if(nextRegexCharacter == ')') { | |
// Scan forwards to the next letter and add it | |
int j = regex_index + 1; | |
while(regex.charAt(j) == '(' || regex.charAt(j) == '*' || regex.charAt(j) == ')') { | |
j++; | |
// If we go off the end of the regex then this was clearly not a valid possibility | |
if(j == regex.length()) { | |
continue; | |
} | |
} | |
nextPossibleRegexIndices.add(j); | |
// Scan backwards to the opening parenthesis | |
j = regex_index; | |
int depth = 0; | |
while(regex.charAt(j) != '(' || depth > 0) { | |
if(regex.charAt(j) == ')') { | |
depth++; | |
} else if(regex.charAt(j) == '(') { | |
depth--; | |
} | |
j--; | |
} | |
// Scan forwards to the next letter and add it | |
while(regex.charAt(j) == '(' || regex.charAt(j) == '*' || regex.charAt(j) == ')') { | |
j++; | |
} | |
nextPossibleRegexIndices.add(j); | |
} else { | |
// Just add the next letter | |
nextPossibleRegexIndices.add(regex_index + 1); | |
} | |
} | |
// Replace our current possible regex locations with the next pass | |
possibleRegexIndices = nextPossibleRegexIndices; | |
} | |
// Return true iff one possible state is the terminating character | |
return possibleRegexIndices.contains(regex.length() - 1); | |
} | |
private static void shouldMatch(String regex, String needle) { | |
System.out.println("The regex " + regex + " should match the needle " + needle + " - " + (match(regex, needle) ? "OK" : "ERROR")); | |
} | |
private static void shouldNotMatch(String regex, String needle) { | |
System.out.println("The regex " + regex + " should not match the needle " + needle + " - " + (!match(regex, needle) ? "OK" : "ERROR")); | |
} | |
public static void main(String[] args) { | |
shouldMatch("abcd", "abcd"); | |
shouldMatch("(abc)*", "abc"); | |
shouldMatch("(abc)*(abc)*", "abcabc"); | |
shouldMatch("b(oo)*p", "bp"); | |
shouldMatch("(b(an)*a)*", "bananaba"); | |
shouldMatch("b(a(a)*)*", "b"); | |
shouldMatch("b(a(a)*)*", "ba"); | |
shouldMatch("b(a(a)*)*", "baa"); | |
shouldNotMatch("abcd", "abc"); | |
shouldNotMatch("abc", "abcd"); | |
shouldNotMatch("b(a(c)*a)*", "bacac"); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.ArrayDeque; | |
import java.util.ArrayList; | |
import java.util.Deque; | |
import java.util.List; | |
// started 1:40 | |
// ended 2:04 | |
public class CheatingRegexMatcher { | |
// Increase this to make failure cases rarer at the cost of performance. | |
private static final int BRANCHING_FACTOR = 5; | |
private static String replaceNTimes(String str, int startIdx, int endIdx, int n) { | |
String firstPart = str.substring(0, startIdx); | |
String endPart = str.substring(endIdx + 1, str.length()); | |
String multiplyPart = str.substring(startIdx + 1, endIdx - 1); | |
StringBuilder sb = new StringBuilder(firstPart.length() + (endPart.length() * n) + multiplyPart.length()); | |
sb.append(firstPart); | |
for(int i = 0; i < n; i++) { | |
sb.append(multiplyPart); | |
} | |
sb.append(endPart); | |
return sb.toString(); | |
} | |
public static boolean match(String regex, String str) { | |
if(regex.equals(str)) return true; | |
if(!regex.contains("*")) return false; | |
Deque<String> buffer = new ArrayDeque<>(); | |
buffer.add(regex); | |
while(!buffer.isEmpty()) { | |
String curString = buffer.remove(); | |
// Find the leftmost open parenthesis | |
int j = 0; | |
while(curString.charAt(j) != '(') { | |
j++; | |
} | |
// Scan to the closing parenthesis | |
int k = j + 1, depth = 0; | |
while(curString.charAt(k) != ')' || depth > 0) { | |
if(curString.charAt(k) == '(') { | |
depth++; | |
} else if(curString.charAt(k) == ')') { | |
depth--; | |
} | |
k++; | |
} | |
// Go forward one to the asterisk | |
k++; | |
// Replace (abc)* with abc, abcabc, abcabcabc, abcabcabcabc, etc. | |
for(int i = 0; i < BRANCHING_FACTOR; i++) { | |
String replaced = replaceNTimes(curString, j, k, i); | |
if(replaced.equals(str)) { | |
return true; | |
} | |
if(replaced.contains("*")) { | |
buffer.add(replaced); | |
} | |
} | |
} | |
return false; | |
} | |
private static void shouldMatch(String regex, String needle) { | |
System.out.println("The regex " + regex + " should match the needle " + needle + " - " + (match(regex, needle) ? "OK" : "ERROR")); | |
} | |
private static void shouldNotMatch(String regex, String needle) { | |
System.out.println("The regex " + regex + " should not match the needle " + needle + " - " + (!match(regex, needle) ? "OK" : "ERROR")); | |
} | |
public static void main(String[] args) { | |
shouldMatch("abcd", "abcd"); | |
shouldMatch("(abc)*", "abc"); | |
shouldMatch("(abc)*(abc)*", "abcabc"); | |
shouldMatch("b(oo)*p", "bp"); | |
shouldMatch("(b(an)*a)*", "bananaba"); | |
shouldMatch("b(a(a)*)*", "b"); | |
shouldMatch("b(a(a)*)*", "ba"); | |
shouldMatch("b(a(a)*)*", "baa"); | |
// This is going to fail because it requires too much expansion. That's the downside of cheating. | |
shouldMatch("(a)*", "aaaaaaaaaaa"); | |
shouldNotMatch("abcd", "abc"); | |
shouldNotMatch("abc", "abcd"); | |
shouldNotMatch("b(a(c)*a)*", "bacac"); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Consider the subset of regular expressions that only use lowercase letters, parentheses, and the Kleene star *. | |
The presence of a Kleene star means that the preceding parenthesized phrase can appear zero or more times. | |
Parentheses are only valid around the text preceding a Kleene star. Write a program that takes such a regular | |
expression and a string and determines whether or not that entire string matches the regular expression. | |
For example, the string on the left matches the regular expression on the right: | |
hello hello | |
testtest (test)* | |
helllllllo he(l)*lo | |
bp b(oo)*p | |
abcd a(bc)*(bc)*d | |
bananaba (b(an)*a)* | |
Don’t use any existing regular expression utilities. | |
You may assume that the regex you’re given is syntactically valid. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment