dacr/regex-operations.sc

## regex-operations.sc
// summary : Advanced operations on strings with regular expressions
// keywords : scala, scalatest, regex, cheatsheet, @testable
// publish : gist
// authors : David Crosson
// license : Apache NON-AI License Version 2.0 (https://raw.githubusercontent.com/non-ai-licenses/non-ai-licenses/main/NON-AI-APACHE2)
// id : 41e3e12c-dbbf-4f43-9c4b-5cf05869dffa
// created-on : 2021-11-19T07:29:40+01:00
// managed-by : https://github.com/dacr/code-examples-manager
// run-with : scala-cli $file

// ---------------------
//> using scala "3.4.2"
//> using dep "org.scalatest::scalatest:3.2.16"
//> using objectWrapper
// ---------------------

import org.scalatest._
import flatspec._
import matchers._
import OptionValues._
import java.util.Locale
import java.text.{DecimalFormat, NumberFormat}

/** Cheat-sheet style ScalaTest suite demonstrating regular-expression usage on strings:
  * splitting with look-around assertions, anchored and unanchored extractor patterns,
  * named groups, collection filtering, lazy quantifiers and character-class operators.
  */
class AdvancedRegexOperations extends AnyFlatSpec with should.Matchers {
  // NOTE(review): the reported suite name differs from the class name
  // (AdvancedRegexOperations) - confirm this is intended.
  override def suiteName="AdvancedStringOperations"

  // ---------------------------------------------------------------------------------------------
  "split" should "be able to split in 2 parts using the last dot thanks to zero-width positive lookahead regexp" in {
    val aType="ab.cd.de"
    // The look-ahead (?=[^.]*$) only accepts a dot that is followed by no other dot,
    // i.e. the last one; the limit of 2 caps the result at two parts.
    val Array(aPackage, aClassName) = aType.split("[.](?=[^.]*$)", 2)
    aPackage shouldBe "ab.cd"
    aClassName shouldBe "de"
  }

  it should "be able to split on characters while preserving those characters" in {
    val in = "abc. truc? blah, blu."
    // The look-behind (?<=[?.,]) is zero-width: the punctuation is not consumed, only the
    // optional surrounding whitespace is, so every piece keeps its trailing punctuation.
    in.split("""\s*(?<=[?.,])\s*""").toList shouldBe List("abc.", "truc?", "blah,", "blu.")
  }

  // ---------------------------------------------------------------------------------------------
  "regexp" should "fully match" in {
    // Used as an extractor, a Regex must match the whole input.
    val MyRE="TO(.*)TA".r
    "TOTUTA" match {
      case MyRE(inside)=> inside shouldBe "TU"
      case _ => fail()
    }
    "TA" match {
      case MyRE(_) => fail("\"TA\" must not match the anchored regex TO(.*)TA")
      case _ => succeed
    }
  }

  it should "partially match" in {
    // .unanchored lets the extractor succeed when the pattern is found anywhere in the input.
    val MyRE="TO(.*)TA".r.unanchored
    "xxTOTUTAxx" match {
      case MyRE(inside)=> inside shouldBe "TU"
      case _ => fail()
    }
  }

  it should "provide alternative styles" in {
    val MyRE="TO(.*)TA".r
    val sample = "TOTUTA"
    // 1. pattern definition
    val MyRE(sub)=sample ; sub shouldBe "TU"
    // 2. match expression
    (sample match {case MyRE(in)=>in}) shouldBe "TU"
    // 3. partial function applied through collect
    Some(sample).collect{case MyRE(in)=>in}.value shouldBe "TU"
    // 4. explicit Match object and positional group access
    MyRE.findFirstMatchIn(sample).map(_.group(1)).value shouldBe "TU"
  }

  it should "match several arguments with explicit unitary groups" in {
    val EntryRE = """(\d+),(\d+),(\d+)""".r
    EntryRE.matches("1,2,3") shouldBe true
    // Three capturing groups, three extracted variables.
    val EntryRE(a,b,c) = "1,2,3"
    a shouldBe "1"
    b shouldBe "2"
    c shouldBe "3"
  }

  it should "no match several arguments using repeats unfortunately" in {
    val EntryRE = """(\d+)(?:,(\d+)){2}""".r
    EntryRE.matches("1,2,3") shouldBe true
    info("So take care as matching doesn't work with complex regular expression :(")
    // The regex declares only two capturing groups: repeating a group with {2} does not
    // create additional groups. A three-variable extractor pattern therefore cannot match
    // and the pattern definition itself throws the MatchError.
    intercept[MatchError] {
      val EntryRE(a, b, c) = "1,2,3"
      (a, b, c) // never reached
    }
  }
/*
  it should "be possible to create interpolated regexp" in {
    // requires : "dev.bgahagan" %% "scala-regex-interpolation" % "1.0.0"
    import dev.bgahagan.regex.intrpl._
    val key="truc"
    val MyRE=r"$key-(.*)-machin"
    val MyRE(word) = "truc-bidule-machin"
    word shouldBe "bidule"

    "truc-bidule-machin" match {
      case r"""\w+-(\w+)$centerWord-\w+""" => centerWord shouldBe "bidule"
    }
    "truc-bidule-machin" match {
      case r"""(\w+)$a-(\w+)$b-(\w+)$c""" => (a,b,c) shouldBe ("truc","bidule","machin")
    }
  }
*/

  it should "be possible to use named arguments" in {
    // (?<in>...) names the capturing group so it can be fetched by name.
    val MyRE="TO(?<in>.*)TA".r
    val sample = "TOTUTA"
    MyRE.findFirstMatchIn(sample).map(_.group("in")).value shouldBe "TU"
  }

  it should "be easy to filter collections" in {
    // No capturing group: bind the whole element with @ and use an empty extractor.
    val input = List("1","a","2")
    val NumRE = """\d+""".r
    input.collect{case d@NumRE()=>d} shouldBe List("1", "2")

    // One capturing group: extract it directly.
    val input2 = List("1","a","2", "3", "b")
    val NumRE2 = """(\d+)""".r
    input2.collect{case NumRE2(d)=>d} shouldBe List("1", "2", "3")

    // Unanchored: the digits may be surrounded by other characters.
    val input3 = List("t1","a","2w", "a3k", "b", "4")
    val NumRE3 = """(\d+)""".r.unanchored
    input3.collect{case NumRE3(d)=>d} shouldBe List("1", "2", "3", "4")
  }

  it should "be possible to find all matches" in {
    val re = """(\[\w+\])""".r
    val found = re.findAllMatchIn("A [1] [B] [CD] [123] truc").map(_.group(1)).toList
    // Matches are produced in input order, so the exact list can be asserted.
    found shouldBe List("[1]", "[B]", "[CD]", "[123]")
  }

  it should "be possible to minimize matches length using lazy quantifiers: ?? *? +? {m,n}?" in {
    val sample = """A12B123B"""
    info(s"default is take the most, here $sample")
    val re1 = """A.*B""".r
    re1.findFirstIn(sample).value shouldBe "A12B123B"
    info("now we want the smallest match; here A12B, this is done through lazy quantifiers")
    val re2 = """A.*?B""".r
    re2.findFirstIn(sample).value shouldBe "A12B"
    info("interesting blog post : https://mariusschulz.com/2014/06/03/why-using-in-regular-expressions-is-almost-never-what-you-actually-want")
    info("known in java Pattern doc as Reluctant quantifiers - but not well explained")
  }

  it should "support advanced characters intervals" in {
    // Character-class intersection (&&) combined with negation (^).
    val re1 = """[^a-h&&d-p]+""".r
    re1.matches("mnp") shouldBe true
    re1.matches("hij") shouldBe false

    // Intersection with a nested negated class.
    val re2 = """[^a-h&&[^r-z]]+""".r
    re2.matches("mnop") shouldBe true
    re2.matches("hr") shouldBe false

    // Predefined classes can be used inside brackets.
    val re3 = """[\d\w]+""".r
    re3.matches("dave42") shouldBe true
    re3.matches("dAVe42") shouldBe true
    re3.matches("john-doe") shouldBe false

    // Negated class mixing literals and a predefined class: exactly one character that is
    // neither 'a', nor 'c', nor a digit.
    val re4 = """[^a\dc]""".r
    re4.matches("b") shouldBe true
    re4.matches("a") shouldBe false
    re4.matches("c") shouldBe false
    re4.matches("7") shouldBe false
  }
}

// Launch ScalaTest's command-line runner for this suite only:
//   -oDF : standard-output reporter, showing durations (D) and full stack traces (F)
//   -s   : fully qualified name of the suite class to execute
org.scalatest.tools.Runner.main(
  Array(
    "-oDF",
    "-s",
    classOf[AdvancedRegexOperations].getName
  )
)
	// summary : Advanced operations on strings with regular expressions
	// keywords : scala, scalatest, regex, cheatsheet, @testable
	// publish : gist
	// authors : David Crosson
	// license : Apache NON-AI License Version 2.0 (https://raw.githubusercontent.com/non-ai-licenses/non-ai-licenses/main/NON-AI-APACHE2)
	// id : 41e3e12c-dbbf-4f43-9c4b-5cf05869dffa
	// created-on : 2021-11-19T07:29:40+01:00
	// managed-by : https://github.com/dacr/code-examples-manager
	// run-with : scala-cli $file

	// ---------------------
	//> using scala "3.4.2"
	//> using dep "org.scalatest::scalatest:3.2.16"
	//> using objectWrapper
	// ---------------------

	import org.scalatest._
	import flatspec._
	import matchers._
	import OptionValues._
	import java.util.Locale
	import java.text.{DecimalFormat, NumberFormat}

	/** Cheat-sheet style ScalaTest suite demonstrating regular-expression usage on strings:
	  * splitting with look-around assertions, anchored and unanchored extractor patterns,
	  * named groups, collection filtering, lazy quantifiers and character-class operators.
	  *
	  * NOTE(review): a class with the same name is also defined earlier in this file; this
	  * second copy looks like a paste artifact and the two definitions clash if they end up
	  * in the same scope - confirm which copy should be kept.
	  */
	class AdvancedRegexOperations extends AnyFlatSpec with should.Matchers {
	  // NOTE(review): the reported suite name differs from the class name
	  // (AdvancedRegexOperations) - confirm this is intended.
	  override def suiteName="AdvancedStringOperations"

	  // ---------------------------------------------------------------------------------------------
	  "split" should "be able to split in 2 parts using the last dot thanks to zero-width positive lookahead regexp" in {
	    val aType="ab.cd.de"
	    // The look-ahead (?=[^.]*$) only accepts a dot that is followed by no other dot,
	    // i.e. the last one; the limit of 2 caps the result at two parts.
	    val Array(aPackage, aClassName) = aType.split("[.](?=[^.]*$)", 2)
	    aPackage shouldBe "ab.cd"
	    aClassName shouldBe "de"
	  }

	  it should "be able to split on characters while preserving those characters" in {
	    val in = "abc. truc? blah, blu."
	    // The whitespace on both sides must be optional (\s*): with a mandatory \s before the
	    // look-behind, the character preceding the look-behind position would be that
	    // whitespace and never punctuation, so the regex could not match at all.
	    // The look-behind itself is zero-width, so every piece keeps its trailing punctuation.
	    in.split("""\s*(?<=[?.,])\s*""").toList shouldBe List("abc.", "truc?", "blah,", "blu.")
	  }

	  // ---------------------------------------------------------------------------------------------
	  "regexp" should "fully match" in {
	    // Used as an extractor, a Regex must match the whole input.
	    val MyRE="TO(.*)TA".r
	    "TOTUTA" match {
	      case MyRE(inside)=> inside shouldBe "TU"
	      case _ => fail()
	    }
	    "TA" match {
	      case MyRE(_) => fail("\"TA\" must not match the anchored regex TO(.*)TA")
	      case _ => succeed
	    }
	  }

	  it should "partially match" in {
	    // .unanchored lets the extractor succeed when the pattern is found anywhere in the input.
	    val MyRE="TO(.*)TA".r.unanchored
	    "xxTOTUTAxx" match {
	      case MyRE(inside)=> inside shouldBe "TU"
	      case _ => fail()
	    }
	  }

	  it should "provide alternative styles" in {
	    val MyRE="TO(.*)TA".r
	    val sample = "TOTUTA"
	    // 1. pattern definition
	    val MyRE(sub)=sample ; sub shouldBe "TU"
	    // 2. match expression
	    (sample match {case MyRE(in)=>in}) shouldBe "TU"
	    // 3. partial function applied through collect
	    Some(sample).collect{case MyRE(in)=>in}.value shouldBe "TU"
	    // 4. explicit Match object and positional group access
	    MyRE.findFirstMatchIn(sample).map(_.group(1)).value shouldBe "TU"
	  }

	  it should "match several arguments with explicit unitary groups" in {
	    val EntryRE = """(\d+),(\d+),(\d+)""".r
	    EntryRE.matches("1,2,3") shouldBe true
	    // Three capturing groups, three extracted variables.
	    val EntryRE(a,b,c) = "1,2,3"
	    a shouldBe "1"
	    b shouldBe "2"
	    c shouldBe "3"
	  }

	  it should "no match several arguments using repeats unfortunately" in {
	    val EntryRE = """(\d+)(?:,(\d+)){2}""".r
	    EntryRE.matches("1,2,3") shouldBe true
	    info("So take care as matching doesn't work with complex regular expression :(")
	    // The regex declares only two capturing groups: repeating a group with {2} does not
	    // create additional groups. A three-variable extractor pattern therefore cannot match
	    // and the pattern definition itself throws the MatchError.
	    intercept[MatchError] {
	      val EntryRE(a, b, c) = "1,2,3"
	      (a, b, c) // never reached
	    }
	  }
	/*
	it should "be possible to create interpolated regexp" in {
	// requires : "dev.bgahagan" %% "scala-regex-interpolation" % "1.0.0"
	import dev.bgahagan.regex.intrpl._
	val key="truc"
	val MyRE=r"$key-(.*)-machin"
	val MyRE(word) = "truc-bidule-machin"
	word shouldBe "bidule"

	"truc-bidule-machin" match {
	case r"""\w+-(\w+)$centerWord-\w+""" => centerWord shouldBe "bidule"
	}
	"truc-bidule-machin" match {
	case r"""(\w+)$a-(\w+)$b-(\w+)$c""" => (a,b,c) shouldBe ("truc","bidule","machin")
	}
	}
	*/

	  it should "be possible to use named arguments" in {
	    // (?<in>...) names the capturing group so it can be fetched by name.
	    val MyRE="TO(?<in>.*)TA".r
	    val sample = "TOTUTA"
	    MyRE.findFirstMatchIn(sample).map(_.group("in")).value shouldBe "TU"
	  }

	  it should "be easy to filter collections" in {
	    // No capturing group: bind the whole element with @ and use an empty extractor.
	    val input = List("1","a","2")
	    val NumRE = """\d+""".r
	    input.collect{case d@NumRE()=>d} shouldBe List("1", "2")

	    // One capturing group: extract it directly.
	    val input2 = List("1","a","2", "3", "b")
	    val NumRE2 = """(\d+)""".r
	    input2.collect{case NumRE2(d)=>d} shouldBe List("1", "2", "3")

	    // Unanchored: the digits may be surrounded by other characters.
	    val input3 = List("t1","a","2w", "a3k", "b", "4")
	    val NumRE3 = """(\d+)""".r.unanchored
	    input3.collect{case NumRE3(d)=>d} shouldBe List("1", "2", "3", "4")
	  }

	  it should "be possible to find all matches" in {
	    val re = """(\[\w+\])""".r
	    val found = re.findAllMatchIn("A [1] [B] [CD] [123] truc").map(_.group(1)).toList
	    // Matches are produced in input order, so the exact list can be asserted.
	    found shouldBe List("[1]", "[B]", "[CD]", "[123]")
	  }

	  it should "be possible to minimize matches length using lazy quantifiers: ?? *? +? {m,n}?" in {
	    val sample = """A12B123B"""
	    info(s"default is take the most, here $sample")
	    val re1 = """A.*B""".r
	    re1.findFirstIn(sample).value shouldBe "A12B123B"
	    info("now we want the smallest match; here A12B, this is done through lazy quantifiers")
	    val re2 = """A.*?B""".r
	    re2.findFirstIn(sample).value shouldBe "A12B"
	    info("interesting blog post : https://mariusschulz.com/2014/06/03/why-using-in-regular-expressions-is-almost-never-what-you-actually-want")
	    info("known in java Pattern doc as Reluctant quantifiers - but not well explained")
	  }

	  it should "support advanced characters intervals" in {
	    // Character-class intersection (&&) combined with negation (^).
	    val re1 = """[^a-h&&d-p]+""".r
	    re1.matches("mnp") shouldBe true
	    re1.matches("hij") shouldBe false

	    // Intersection with a nested negated class.
	    val re2 = """[^a-h&&[^r-z]]+""".r
	    re2.matches("mnop") shouldBe true
	    re2.matches("hr") shouldBe false

	    // Predefined classes can be used inside brackets.
	    val re3 = """[\d\w]+""".r
	    re3.matches("dave42") shouldBe true
	    re3.matches("dAVe42") shouldBe true
	    re3.matches("john-doe") shouldBe false

	    // Negated class mixing literals and a predefined class: exactly one character that is
	    // neither 'a', nor 'c', nor a digit.
	    val re4 = """[^a\dc]""".r
	    re4.matches("b") shouldBe true
	    re4.matches("a") shouldBe false
	    re4.matches("c") shouldBe false
	    re4.matches("7") shouldBe false
	  }
	}

	// Launch ScalaTest's command-line runner for this suite only:
	//   -oDF : standard-output reporter, showing durations (D) and full stack traces (F)
	//   -s   : fully qualified name of the suite class to execute
	org.scalatest.tools.Runner.main(
	  Array(
	    "-oDF",
	    "-s",
	    classOf[AdvancedRegexOperations].getName
	  )
	)