Created
September 24, 2012 21:54
-
-
Save noel-yap/3778691 to your computer and use it in GitHub Desktop.
Regex matchers in Scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.siliconvalleypatterns | |
/** | |
* @author noel.yap@gmail.com | |
*/ | |
class CompiledRegexMatcher(val regex: String) { | |
def compile(): List[Token] = { | |
def compile(regex: String): List[Token] = { | |
if (regex.isEmpty) { | |
Nil | |
} else { | |
val token = regex.head match { | |
case '\\' => { | |
if (regex.length < 2) { | |
throw new Exception("malformed regex") | |
} else { | |
new EscapedToken(regex(1)) | |
} | |
} | |
case '^' => new BeginToken | |
case '$' => new EndToken | |
case '*' => throw new Exception("malformed regex") | |
case c => { | |
val underlying = if (c == '.') { | |
new AnyToken | |
} else { | |
new LiteralToken(c) | |
} | |
if (regex.length > 1 && regex(1) == '*') { | |
new ZeroOrMoreToken(underlying) | |
} else { | |
underlying | |
} | |
} | |
} | |
token +: compile(token.munch(regex)) | |
} | |
} | |
compile(regex) | |
} | |
def ~(text:String): Boolean = { | |
matches(compile(), text) | |
} | |
def matches(regex: List[Token], text: String): Boolean = { | |
def restOfRegexMatches(regex: List[Token], text: String): (Boolean, Boolean) = { | |
if (regex.isEmpty) { | |
(true, false) | |
} else { | |
val regexHead = regex.head | |
val (isMatch, restOfText) = regexHead.matches(text) | |
(isMatch && restOfRegexMatches(regex.tail, restOfText)._1, regexHead.tryAgain()) | |
} | |
} | |
val (isMatch, tryAgain) = restOfRegexMatches(regex, text) | |
isMatch || tryAgain && !text.isEmpty && matches(regex, text.tail) | |
} | |
trait TryAgainTrait { | |
def tryAgain(): Boolean | |
} | |
trait TryAgain extends TryAgainTrait { | |
override val tryAgain = true | |
} | |
trait DoNotTryAgain extends TryAgainTrait { | |
override val tryAgain = false | |
} | |
trait RegexMuncher { | |
val munchAmount: Int | |
def munch(regex: String): String = regex.substring(munchAmount) | |
} | |
trait OneCharMuncher extends RegexMuncher { | |
override val munchAmount = 1 | |
} | |
trait TwoCharMuncher extends RegexMuncher { | |
override val munchAmount = 2 | |
} | |
abstract class Token extends RegexMuncher with TryAgainTrait { | |
def canEqual(that: Any) = that.isInstanceOf[Token] | |
override def equals(that: Any) = that match { | |
case t: Token => t canEqual this | |
case _ => false | |
} | |
def matches(text: String): (Boolean, String) | |
} | |
abstract class CharToken extends Token with OneCharMuncher with TryAgain { | |
override def canEqual(that: Any) = that.isInstanceOf[CharToken] | |
override def matches(text: String) = { | |
if (text.isEmpty) { | |
(false, text) | |
} else { | |
(matchesChar(text.head), text.tail) | |
} | |
} | |
def matchesChar(char: => Char): Boolean | |
} | |
class LiteralToken(val literal: Char) extends CharToken { | |
override def canEqual(that: Any) = that.isInstanceOf[LiteralToken] | |
override def equals(that: Any) = that match { | |
case t: LiteralToken => super.equals(that) && (t canEqual this) && t.literal == this.literal | |
case _ => false | |
} | |
override def hashCode() = 41 + literal.hashCode | |
override def matchesChar(char: => Char) = literal == char | |
} | |
class EscapedToken(literal: Char) extends LiteralToken(literal) with TwoCharMuncher with TryAgain { | |
override def canEqual(that: Any) = that.isInstanceOf[EscapedToken] | |
} | |
class AnyToken extends CharToken { | |
override def canEqual(that: Any) = that.isInstanceOf[AnyToken] | |
override def matchesChar(char: => Char) = true | |
} | |
class BeginToken extends Token with OneCharMuncher with DoNotTryAgain { | |
override def canEqual(that: Any) = that.isInstanceOf[BeginToken] | |
override def matches(text: String) = (true, text) | |
} | |
class EndToken extends Token with OneCharMuncher with DoNotTryAgain { | |
override def canEqual(that: Any) = that.isInstanceOf[EndToken] | |
override def matches(text: String) = (text.isEmpty, text) | |
} | |
class ZeroOrMoreToken(val underlying: CharToken) extends Token with TwoCharMuncher with TryAgain { | |
override def canEqual(that: Any) = that.isInstanceOf[ZeroOrMoreToken] | |
override def equals(that: Any) = that match { | |
case t: ZeroOrMoreToken => super.equals(that) && (t canEqual this) && t.underlying == this.underlying | |
case _ => false | |
} | |
override def hashCode() = 41 + underlying.hashCode | |
override def matches(text: String) = { | |
def munch(t: String): String = { | |
if (underlying.matches(t)._1) { | |
munch(t.tail) | |
} else { | |
t | |
} | |
} | |
(true, munch(text)) | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.siliconvalleypatterns | |
import org.specs2.mutable.Specification | |
/** | |
* @author noel.yap@gmail.com | |
*/ | |
class CompiledRegexMatcherSpec extends Specification { | |
"$" should { | |
val regexMatcher = new CompiledRegexMatcher("$") | |
"compile to EndToken" in { | |
regexMatcher.compile() mustEqual List(new regexMatcher.EndToken) | |
} | |
"match empty text" in { | |
regexMatcher ~ "" mustEqual true | |
} | |
"not match non-empty text" in { | |
regexMatcher ~ "$" mustEqual false | |
} | |
} | |
"^" should { | |
val regexMatcher = new CompiledRegexMatcher("^") | |
"compile to BeginToken" in { | |
regexMatcher.compile() mustEqual List(new regexMatcher.BeginToken) | |
} | |
"match empty text" in { | |
regexMatcher ~ "" mustEqual true | |
} | |
"match non-empty text" in { | |
regexMatcher ~ "^" mustEqual true | |
} | |
} | |
"." should { | |
val regexMatcher = new CompiledRegexMatcher(".") | |
"compile to AnyToken" in { | |
regexMatcher.compile() mustEqual List(new regexMatcher.AnyToken) | |
} | |
"not match empty text" in { | |
regexMatcher ~ "" mustEqual false | |
} | |
"match non-empty text" in { | |
regexMatcher ~ "." mustEqual true | |
} | |
} | |
"literal" should { | |
val regexMatcher = new CompiledRegexMatcher("a") | |
"compile to LiteralToken" in { | |
regexMatcher.compile() mustEqual List(new regexMatcher.LiteralToken('a')) | |
} | |
"not match empty text" in { | |
regexMatcher ~ "" mustEqual false | |
} | |
"not match that text" in { | |
regexMatcher ~ "0" mustEqual false | |
} | |
"match itself" in { | |
regexMatcher ~ "a" mustEqual true | |
} | |
} | |
"literal*" should { | |
val regexMatcher = new CompiledRegexMatcher("^a*$") | |
"compile" in { | |
regexMatcher.compile() mustEqual List( | |
new regexMatcher.BeginToken, | |
new regexMatcher.ZeroOrMoreToken(new regexMatcher.LiteralToken('a')), | |
new regexMatcher.EndToken) | |
} | |
"match empty text" in { | |
regexMatcher ~ "" mustEqual true | |
} | |
"match one" in { | |
regexMatcher ~ "a" mustEqual true | |
} | |
"match multiple" in { | |
regexMatcher ~ "aa" mustEqual true | |
} | |
"not match" in { | |
regexMatcher ~ "0" mustEqual false | |
} | |
} | |
".*" should { | |
val regexMatcher = new CompiledRegexMatcher("^.*$") | |
"compile" in { | |
regexMatcher.compile() mustEqual List( | |
new regexMatcher.BeginToken, | |
new regexMatcher.ZeroOrMoreToken(new regexMatcher.AnyToken), | |
new regexMatcher.EndToken) | |
} | |
"match empty text" in { | |
regexMatcher ~ "" mustEqual true | |
} | |
"match one" in { | |
regexMatcher ~ "a" mustEqual true | |
} | |
"match multiple" in { | |
regexMatcher ~ "aa" mustEqual true | |
} | |
} | |
"*" should { | |
val regexMatcher = new CompiledRegexMatcher("*") | |
"not compile" in { | |
regexMatcher.compile() must throwAn[Exception](message = "malformed regex") | |
} | |
"not match empty text" in { | |
regexMatcher ~ "" must throwAn[Exception](message = "malformed regex") | |
} | |
"not match non-empty text" in { | |
regexMatcher ~ "" must throwAn[Exception](message = "malformed regex") | |
} | |
} | |
"bc*d" should { | |
val regexMatcher = new CompiledRegexMatcher("bc*d") | |
"compile" in { | |
import regexMatcher._ | |
regexMatcher.compile() mustEqual List( | |
new LiteralToken('b'), | |
new ZeroOrMoreToken(new LiteralToken('c')), | |
new LiteralToken('d')) | |
} | |
"match abde" in { | |
regexMatcher ~ "abde" mustEqual true | |
} | |
"match abcde" in { | |
regexMatcher ~ "abcde" mustEqual true | |
} | |
"match abccde" in { | |
regexMatcher ~ "abccde" mustEqual true | |
} | |
"not match abfde" in { | |
regexMatcher ~ "abfde" mustEqual false | |
} | |
} | |
"^a" should { | |
val regexMatcher = new CompiledRegexMatcher("^a") | |
"compile" in { | |
regexMatcher.compile() mustEqual List( | |
new regexMatcher.BeginToken, | |
new regexMatcher.LiteralToken('a')) | |
} | |
"match a" in { | |
regexMatcher ~ "a" mustEqual true | |
} | |
"match ab" in { | |
regexMatcher ~ "ab" mustEqual true | |
} | |
"not match ba" in { | |
regexMatcher ~ "ba" mustEqual false | |
} | |
} | |
"""\""" should { | |
"not compile" in { | |
val regexMatcher = new CompiledRegexMatcher("\\") | |
regexMatcher.compile() must throwAn[Exception](message = "malformed regex") | |
} | |
"compile" in { | |
val regexMatcher = new CompiledRegexMatcher( """^\\$""") | |
import regexMatcher._ | |
regexMatcher.compile() mustEqual List( | |
new BeginToken, | |
new EscapedToken('\\'), | |
new EndToken) | |
} | |
"""match \""" in { | |
val regexMatcher = new CompiledRegexMatcher( """^\\$""") | |
regexMatcher ~ """\""" mustEqual true | |
} | |
"match ^" in { | |
val regexMatcher = new CompiledRegexMatcher( """^\^$""") | |
regexMatcher ~ "^" mustEqual true | |
} | |
"match ." in { | |
val regexMatcher = new CompiledRegexMatcher( """^\.$""") | |
regexMatcher ~ "." mustEqual true | |
} | |
"match *" in { | |
val regexMatcher = new CompiledRegexMatcher( """^\*$""") | |
regexMatcher ~ "*" mustEqual true | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class CoreRegexMatcher(val regex: String) { | |
def ~(text: String): Boolean = { | |
def restOfRegexMatches(regex: String, text: String): (Boolean, Boolean) = { | |
if (regex.isEmpty) { | |
(true, false) | |
} else { | |
regex.head match { | |
case '\\' => (regex(1) == text.head, true) | |
case '*' => throw new Exception("malformed regex") | |
case '^' => { | |
(restOfRegexMatches(regex.tail, text)._1, false) | |
} | |
case '$' => (text.isEmpty, false) | |
case _ => { | |
val firstCharMatches = !text.isEmpty && (regex.head == text.head || regex.head == '.') | |
if (regex.length < 2 || regex(1) != '*') { | |
(firstCharMatches && restOfRegexMatches(regex.tail, text.tail)._1, true) | |
} else { | |
(firstCharMatches || restOfRegexMatches(regex.substring(2), text)._1, true) | |
} | |
} | |
} | |
} | |
} | |
val (isMatch, tryAgain) = restOfRegexMatches(regex, text) | |
isMatch || tryAgain && !text.isEmpty && this ~ text.tail | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.specs2.mutable.Specification | |
class CoreRegexMatcherSpec extends Specification { | |
"$" should { | |
val regexMatcher = new CoreRegexMatcher("$") | |
"match empty text" in { | |
regexMatcher ~ "" mustEqual true | |
} | |
"not match non-empty text" in { | |
regexMatcher ~ "$" mustEqual false | |
} | |
} | |
"^" should { | |
val regexMatcher = new CoreRegexMatcher("^") | |
"match empty text" in { | |
regexMatcher ~ "" mustEqual true | |
} | |
"match non-empty text" in { | |
regexMatcher ~ "^" mustEqual true | |
} | |
} | |
"." should { | |
val regexMatcher = new CoreRegexMatcher(".") | |
"not match empty text" in { | |
regexMatcher ~ "" mustEqual false | |
} | |
"match non-empty text" in { | |
regexMatcher ~ "." mustEqual true | |
} | |
} | |
"literal" should { | |
val regexMatcher = new CoreRegexMatcher("a") | |
"not match empty text" in { | |
regexMatcher ~ "" mustEqual false | |
} | |
"not match that text" in { | |
regexMatcher ~ "0" mustEqual false | |
} | |
"match itself" in { | |
regexMatcher ~ "a" mustEqual true | |
} | |
} | |
"literal*" should { | |
val regexMatcher = new CoreRegexMatcher("^a*$") | |
"match empty text" in { | |
regexMatcher ~ "" mustEqual true | |
} | |
"match one" in { | |
regexMatcher ~ "a" mustEqual true | |
} | |
"match multiple" in { | |
regexMatcher ~ "aa" mustEqual true | |
} | |
"not match" in { | |
regexMatcher ~ "0" mustEqual false | |
} | |
} | |
".*" should { | |
val regexMatcher = new CoreRegexMatcher("^.*$") | |
"match empty text" in { | |
regexMatcher ~ "" mustEqual true | |
} | |
"match one" in { | |
regexMatcher ~ "a" mustEqual true | |
} | |
"match multiple" in { | |
regexMatcher ~ "aa" mustEqual true | |
} | |
} | |
"*" should { | |
val regexMatcher = new CoreRegexMatcher("*") | |
"not match empty text" in { | |
regexMatcher ~ "" must throwA[Exception](message="malformed regex") | |
} | |
"not match non-empty text" in { | |
regexMatcher ~ "" must throwA[Exception](message="malformed regex") | |
} | |
} | |
"bc*d" should { | |
val regexMatcher = new CoreRegexMatcher("bc*d") | |
"match abde" in { | |
regexMatcher ~ "abde" mustEqual true | |
} | |
"match abcde" in { | |
regexMatcher ~ "abcde" mustEqual true | |
} | |
"match abccde" in { | |
regexMatcher ~ "abccde" mustEqual true | |
} | |
"not match abfde" in { | |
regexMatcher ~ "abfde" mustEqual false | |
} | |
} | |
"^a" should { | |
val regexMatcher = new CoreRegexMatcher("^a") | |
"match a" in { | |
regexMatcher ~ "a" mustEqual true | |
} | |
"match ab" in { | |
regexMatcher ~ "ab" mustEqual true | |
} | |
"not match ba" in { | |
regexMatcher ~ "ba" mustEqual false | |
} | |
} | |
"""\""" should { | |
"""match \""" in { | |
val regexMatcher = new CoreRegexMatcher("""^\\$""") | |
regexMatcher ~ """\""" mustEqual true | |
} | |
"match ^" in { | |
val regexMatcher = new CoreRegexMatcher("""^\^$""") | |
regexMatcher ~ "^" mustEqual true | |
} | |
"match ." in { | |
val regexMatcher = new CoreRegexMatcher("""^\.$""") | |
regexMatcher ~ "." mustEqual true | |
} | |
"match *" in { | |
val regexMatcher = new CoreRegexMatcher("""^\*$""") | |
regexMatcher ~ "*" mustEqual true | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment