Skip to content

Instantly share code, notes, and snippets.

@mumrah
Last active July 9, 2017 16:13
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mumrah/6104234 to your computer and use it in GitHub Desktop.
Iterative regular expression building with Lucene's RegExp and Automaton
package default;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.RegExp;
/**
 * Builds a Social Security Number matcher incrementally with Lucene's
 * {@code RegExp} and {@code Automaton}: start from the general
 * {@code ddd-dd-dddd} shape, subtract the disallowed shapes, then add back
 * two individual numbers as explicit exceptions.
 */
public class RegexAutomatonTest {

    /**
     * Constructs the SSN automaton and checks it against a handful of accepted
     * and rejected numbers using plain {@code assert} statements (which only
     * fire when the JVM runs with assertions enabled).
     */
    public void testSSN() {
        // General shape: three digits, two digits, four digits.
        Automaton anySsn = new RegExp("[0-9]{3}-[0-9]{2}-[0-9]{4}").toAutomaton();

        // Shapes that get subtracted from the general one.
        Automaton zeroArea = new RegExp("000-[0-9]{2}-[0-9]{4}").toAutomaton();
        Automaton zeroGroup = new RegExp("[0-9]{3}-00-[0-9]{4}").toAutomaton();
        Automaton zeroSerial = new RegExp("[0-9]{3}-[0-9]{2}-0000").toAutomaton();
        Automaton area666 = new RegExp("666-[0-9]{2}-[0-9]{4}").toAutomaton();
        // <900-999> is RegExp's numeric-interval syntax.
        Automaton area9xx = new RegExp("<900-999>-[0-9]{2}-[0-9]{4}").toAutomaton();

        // Individual numbers that are added back after the subtraction.
        Automaton allowed900 = new RegExp("900-00-1234").toAutomaton();
        Automaton allowed666 = new RegExp("666-45-6789").toAutomaton();

        // Apply the operations one at a time, in the same order as a fluent chain would.
        Automaton ssn = anySsn.minus(zeroArea);
        ssn = ssn.minus(zeroGroup);
        ssn = ssn.minus(zeroSerial);
        ssn = ssn.minus(area666);
        ssn = ssn.minus(area9xx);
        ssn = ssn.union(allowed900);
        ssn = ssn.union(allowed666);

        Automaton minimized = Automaton.minimize(ssn);

        // Positive assertions
        String[] accepted = {
            "078-05-1120", // A famous (and valid) stolen SSN
            "900-00-1234", // Normally invalid, but explicitly added (allowed900)
            "666-45-6789", // Normally invalid, but explicitly added (allowed666)
        };
        for (String candidate : accepted) {
            assert match(minimized, candidate);
        }

        // Negative assertions
        String[] rejected = {
            "987-65-4320",
            "000-65-4320",
            "987-00-4320",
            "987-65-0000",
            "000-65-0000",
            "000-00-0000",
            "666-12-1345",
        };
        for (String candidate : rejected) {
            assert !match(minimized, candidate);
        }
    }

    /**
     * Tests whether {@code a} accepts {@code test}.
     *
     * <p>The string is turned into an automaton of its own and intersected
     * with {@code a}; the result counts as a match only when that intersection
     * reports a singleton string equal to {@code test}.
     *
     * @param a    automaton to test against
     * @param test candidate string
     * @return {@code true} if the intersection is a singleton equal to {@code test}
     */
    static boolean match(Automaton a, String test) {
        Automaton overlap = a.intersection(BasicAutomata.makeString(test));
        String sole = overlap.getSingleton();
        return sole != null && sole.equals(test);
    }
}
// NOTE(review): the statements below repeat lines from RegexAutomatonTest.testSSN
// and sit outside any class or method; they look like page excerpts rather than
// part of the program — confirm before keeping them in a compilable file.

// Excerpt: the general ddd-dd-dddd pattern, the five patterns to subtract,
// and the two individual numbers to add back.
Automaton full = new RegExp("[0-9]{3}-[0-9]{2}-[0-9]{4}").toAutomaton();
Automaton invalid1 = new RegExp("000-[0-9]{2}-[0-9]{4}").toAutomaton();
Automaton invalid2 = new RegExp("[0-9]{3}-00-[0-9]{4}").toAutomaton();
Automaton invalid3 = new RegExp("[0-9]{3}-[0-9]{2}-0000").toAutomaton();
Automaton invalid4 = new RegExp("666-[0-9]{2}-[0-9]{4}").toAutomaton();
Automaton invalid5 = new RegExp("<900-999>-[0-9]{2}-[0-9]{4}").toAutomaton();
Automaton extra1 = new RegExp("900-00-1234").toAutomaton();
Automaton extra2 = new RegExp("666-45-6789").toAutomaton();
// Excerpt: subtraction step only. This statement has no terminating semicolon here.
Automaton re = full.minus(invalid1)
.minus(invalid2)
.minus(invalid3)
.minus(invalid4)
.minus(invalid5)
// Excerpt: the explicit exceptions (redeclares extra1/extra2 from above).
Automaton extra1 = new RegExp("900-00-1234").toAutomaton();
Automaton extra2 = new RegExp("666-45-6789").toAutomaton();
// Excerpt: full chain — subtract the five patterns, then union the two exceptions
// (redeclares re from above).
Automaton re = full.minus(invalid1)
.minus(invalid2)
.minus(invalid3)
.minus(invalid4)
.minus(invalid5)
.union(extra1)
.union(extra2);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment