Skip to content

Instantly share code, notes, and snippets.

@dacr
Last active May 27, 2023 06:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dacr/2b76d147f283b4b1145d823b603b9ebb to your computer and use it in GitHub Desktop.
Save dacr/2b76d147f283b4b1145d823b603b9ebb to your computer and use it in GitHub Desktop.
Advanced operations on strings with regular expressions / published by https://github.com/dacr/code-examples-manager #41e3e12c-dbbf-4f43-9c4b-5cf05869dffa/77004bcddf078e4af3af16edef916228621dc738
// summary : Advanced operations on strings with regular expressions
// keywords : scala, scalatest, regex, cheatsheet, @testable
// publish : gist
// authors : David Crosson
// license : Apache NON-AI License Version 2.0 (https://raw.githubusercontent.com/non-ai-licenses/non-ai-licenses/main/NON-AI-APACHE2)
// id : 41e3e12c-dbbf-4f43-9c4b-5cf05869dffa
// created-on : 2021-11-19T07:29:40+01:00
// managed-by : https://github.com/dacr/code-examples-manager
// run-with : scala-cli $file
// ---------------------
//> using scala "3.3.0"
//> using dep "org.scalatest::scalatest:3.2.16"
//> using objectWrapper
// ---------------------
import org.scalatest._
import flatspec._
import matchers._
import OptionValues._
import java.util.Locale
import java.text.{DecimalFormat, NumberFormat}
/** Executable cheat sheet for advanced regular expression usage on strings.
  *
  * Each test documents one technique: look-around based splitting, anchored versus
  * unanchored extractor matching, named groups, collection filtering, lazy quantifiers
  * and character class intersections. The suite is reported under the name
  * "AdvancedStringOperations".
  */
class AdvancedRegexOperations extends AnyFlatSpec with should.Matchers {
  override def suiteName = "AdvancedStringOperations"

  // ---------------------------------------------------------------------------------------------
  "split" should "be able to split in 2 parts using the last dot thanks to zero-width positive lookahead regexp" in {
    val aType = "ab.cd.de"
    // The lookahead only accepts a dot which is followed by no other dot up to the end of input.
    val Array(aPackage, aClassName) = aType.split("[.](?=[^.]*$)", 2)
    aPackage shouldBe "ab.cd"
    aClassName shouldBe "de"
  }

  it should "be able to split on characters while preserving those characters" in {
    val in = "abc. truc? blah, blu."
    // The lookbehind is zero-width: the punctuation is required but not consumed by the separator.
    in.split("""\s*(?<=[?.,])\s*""").toList shouldBe List("abc.", "truc?", "blah,", "blu.")
  }

  // ---------------------------------------------------------------------------------------------
  "regexp" should "fully match" in {
    val MyRE = "TO(.*)TA".r
    "TOTUTA" match {
      case MyRE(inside) => inside shouldBe "TU"
      case _            => fail("\"TOTUTA\" should be fully matched by TO(.*)TA")
    }
    // A regex used as an extractor must match the whole input, so "TA" alone is rejected.
    "TA" match {
      case MyRE(_) => fail("\"TA\" should not be matched by TO(.*)TA")
      case _       => succeed
    }
  }

  it should "partially match" in {
    // unanchored lets the extractor succeed when the pattern is found anywhere in the input.
    val MyRE = "TO(.*)TA".r.unanchored
    "xxTOTUTAxx" match {
      case MyRE(inside) => inside shouldBe "TU"
      case _            => fail("\"xxTOTUTAxx\" should contain a match for TO(.*)TA")
    }
  }

  it should "provide alternative styles" in {
    val MyRE = "TO(.*)TA".r
    val sample = "TOTUTA"
    // Pattern value definition (throws MatchError when the input does not match).
    val MyRE(sub) = sample
    sub shouldBe "TU"
    // Match expression.
    (sample match { case MyRE(in) => in }) shouldBe "TU"
    // Partial function on an Option.
    Some(sample).collect { case MyRE(in) => in }.value shouldBe "TU"
    // Explicit Match object.
    MyRE.findFirstMatchIn(sample).map(_.group(1)).value shouldBe "TU"
  }

  it should "match several arguments with explicit unitary groups" in {
    val EntryRE = """(\d+),(\d+),(\d+)""".r
    EntryRE.matches("1,2,3") shouldBe true
    val EntryRE(a, b, c) = "1,2,3"
    a shouldBe "1"
    b shouldBe "2"
    c shouldBe "3"
  }

  it should "no match several arguments using repeats unfortunately" in {
    val EntryRE = """(\d+)(?:,(\d+)){2}""".r
    EntryRE.matches("1,2,3") shouldBe true
    info("So take care as matching doesn't work with complex regular expression :(")
    // The pattern declares only two capturing groups, whatever the repeat count is,
    // so an extractor expecting three values can never succeed.
    intercept[MatchError] {
      val EntryRE(a, b, c) = "1,2,3"
      fail(s"unexpected extraction of three values: $a, $b, $c")
    }
    // What is really captured: the first group, then the last iteration of the repeated group.
    val EntryRE(first, last) = "1,2,3"
    first shouldBe "1"
    last shouldBe "3"
  }

  /*
  it should "be possible to create interpolated regexp" in {
  // requires : "dev.bgahagan" %% "scala-regex-interpolation" % "1.0.0"
  import dev.bgahagan.regex.intrpl._
  val key="truc"
  val MyRE=r"$key-(.*)-machin"
  val MyRE(word) = "truc-bidule-machin"
  word shouldBe "bidule"
  "truc-bidule-machin" match {
  case r"""\w+-(\w+)$centerWord-\w+""" => centerWord shouldBe "bidule"
  }
  "truc-bidule-machin" match {
  case r"""(\w+)$a-(\w+)$b-(\w+)$c""" => (a,b,c) shouldBe ("truc","bidule","machin")
  }
  }
  */

  it should "be possible to use named arguments" in {
    val MyRE = "TO(?<in>.*)TA".r
    val sample = "TOTUTA"
    MyRE.findFirstMatchIn(sample).map(_.group("in")).value shouldBe "TU"
  }

  it should "be easy to filter collections" in {
    // No capturing group: bind the whole element when it fully matches.
    val input = List("1", "a", "2")
    val NumRE = """\d+""".r
    input.collect { case d @ NumRE() => d } shouldBe List("1", "2")
    // One capturing group: extract it from fully matching elements.
    val input2 = List("1", "a", "2", "3", "b")
    val NumRE2 = """(\d+)""".r
    input2.collect { case NumRE2(d) => d } shouldBe List("1", "2", "3")
    // Unanchored: extract the first number found anywhere in each element.
    val input3 = List("t1", "a", "2w", "a3k", "b", "4")
    val NumRE3 = """(\d+)""".r.unanchored
    input3.collect { case NumRE3(d) => d } shouldBe List("1", "2", "3", "4")
  }

  it should "be possible to find all matches" in {
    val re = """(\[\w+\])""".r
    // Matches are produced in input order, so the exact list can be asserted.
    re.findAllMatchIn("A [1] [B] [CD] [123] truc")
      .map(_.group(1))
      .toList shouldBe List("[1]", "[B]", "[CD]", "[123]")
  }

  it should "be possible to minimize matches length using lazy quantifiers: ?? *? +? {m,n}?" in {
    val sample = """A12B123B"""
    info(s"default is take the most, here $sample")
    val re1 = """A.*B""".r
    re1.findFirstIn(sample).value shouldBe "A12B123B"
    info("now we want the smallest match; here A12B, this is done through lazy quantifiers")
    val re2 = """A.*?B""".r
    re2.findFirstIn(sample).value shouldBe "A12B"
    info("interesting blog post : https://mariusschulz.com/2014/06/03/why-using-in-regular-expressions-is-almost-never-what-you-actually-want")
    info("known in java Pattern doc as Reluctant quantifiers - but not well explained")
  }

  it should "support advanced characters intervals" in {
    val re1 = """[^a-h&&d-p]+""".r
    re1.matches("mnp") shouldBe true
    re1.matches("hij") shouldBe false
    val re2 = """[^a-h&&[^r-z]]+""".r
    re2.matches("mnop") shouldBe true
    re2.matches("hr") shouldBe false
    val re3 = """[\d\w]+""".r
    re3.matches("dave42") shouldBe true
    re3.matches("dAVe42") shouldBe true
    re3.matches("john-doe") shouldBe false
    // Negated class mixing literals and a predefined class: one character
    // which is neither 'a', nor 'c', nor a digit.
    val re4 = """[^a\dc]""".r
    re4.matches("b") shouldBe true
    re4.matches("a") shouldBe false
    re4.matches("c") shouldBe false
    re4.matches("5") shouldBe false
  }
}
// Run the suite through the ScalaTest command line runner:
//   -oDF : standard output reporter, showing durations (D) and full stack traces (F)
//   -s   : fully qualified name of the suite class to execute
val runnerArguments = Array("-oDF", "-s", classOf[AdvancedRegexOperations].getName)
org.scalatest.tools.Runner.main(runnerArguments)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment