Skip to content

Instantly share code, notes, and snippets.

@agemooij
Last active September 22, 2022 11:02
Show Gist options
  • Save agemooij/15a0eaebc2c1ddd5ddf4 to your computer and use it in GitHub Desktop.
Save agemooij/15a0eaebc2c1ddd5ddf4 to your computer and use it in GitHub Desktop.
Scala text normalization
package rfs.rebb
package common
/**
 * Turns arbitrary text into a lowercase, ASCII-only, dash-separated token.
 *
 * Performs standard Java/Unicode normalization on the trimmed and lowercased form
 * of the input String and then adds a few extra tricks for dealing with special
 * characters.
 *
 * JVM/Unicode normalization references (warning: learning curve black hole, beware!):
 *
 * - http://docs.oracle.com/javase/7/docs/api/java/text/Normalizer.html
 * - http://stackoverflow.com/questions/5697171/regex-what-is-incombiningdiacriticalmarks
 * - http://stackoverflow.com/questions/1453171/%C5%84-%C7%B9-%C5%88-%C3%B1-%E1%B9%85-%C5%86-%E1%B9%87-%E1%B9%8B-%E1%B9%89-%CC%88-%C9%B2-%C6%9E-%E1%B6%87-%C9%B3-%C8%B5-n-or-remove-diacritical-marks-from-unicode-cha
 * - http://lipn.univ-paris13.fr/~cerin/BD/unicode.html
 * - http://www.unicode.org/reports/tr15/tr15-23.html
 * - http://www.unicode.org/reports/tr44/#Properties
 *
 * Some special cases, like "ø" and "ß", are not stripped/replaced by the
 * Java/Unicode normalizer, so we have to replace them ourselves.
 */
trait NormalizeSupport {
  import java.text.Normalizer
  import java.util.Locale

  /**
   * Normalizes `in` by applying the following steps, in order:
   *
   *  1. trim and lowercase the input (locale-independently);
   *  1. NFD-decompose it and remove combining marks, modifier letters and modifier symbols;
   *  1. remove every occurrence of "'s";
   *  1. replace "ß" with "ss" and "ø" with "o";
   *  1. replace every run of characters other than ASCII letters, digits and '-' with one '-';
   *  1. collapse runs of '-' into one '-' and drop a trailing '-'.
   *
   * A leading '-' is kept: "(aa)" becomes "-aa".
   *
   * @param in the text to normalize; must not be null
   * @return the normalized text
   */
  def normalize(in: String): String = {
    // Locale.ROOT keeps the result independent of the JVM default locale. With a Turkish
    // default locale "I" would lowercase to the dotless "ı", which NFD does not decompose
    // and which would therefore end up replaced by '-'.
    val cleaned = in.trim.toLowerCase(Locale.ROOT)

    // NFD splits accented characters into base character + combining mark(s); the marks
    // (and modifier letters/symbols) are then dropped.
    val normalized = Normalizer
      .normalize(cleaned, Normalizer.Form.NFD)
      .replaceAll("[\\p{InCombiningDiacriticalMarks}\\p{IsM}\\p{IsLm}\\p{IsSk}]+", "")

    normalized
      // Literal (non-regex) replacements.
      .replace("'s", "")
      .replace("ß", "ss")
      .replace("ø", "o")
      .replaceAll("[^a-zA-Z0-9-]+", "-")
      .replaceAll("-+", "-")
      // Runs of '-' were collapsed above, so there is at most one trailing '-'.
      .stripSuffix("-")
  }
}

/** Ready-made instance for callers that prefer `NormalizeSupport.normalize(...)` over mixing in the trait. */
object NormalizeSupport extends NormalizeSupport
package rfs.rebb
package common
import org.scalatest._
import Matchers._
import common._
/**
 * Specification for [[NormalizeSupport.normalize]].
 *
 * Besides the hand-written cases below, one test is registered per row of the
 * `normalization-checks.csv` classpath resource (see `normalizationTestCasesSharedWithNl`).
 */
class NormalizeSupportSpec extends UnitSpec with NormalizeSupport {
  "NormalizeSupport" should {
    "correctly normalize non-ASCII characters" in {
      normalize("ÀÁÂÃĀĂȦÄẢÅǍȀȂĄẠḀẦẤàáâä") shouldBe "aaaaaaaaaaaaaaaaaaaaaa"
      normalize("ÉÊẼĒĔËȆȄȨĖèéêẽēȅë") shouldBe "eeeeeeeeeeeeeeeee"
      normalize("ÌÍÏïØøÒÖÔöÜüŇñÇçß") shouldBe "iiiioooooouunnccss"
    }

    "normalize 's to nothing" in {
      normalize("aa'sbba") shouldBe "aabba"
    }

    "normalize & for -" in {
      normalize("aa & bb") shouldBe "aa-bb"
      normalize("aa&& & &&& bb") shouldBe "aa-bb"
    }

    "normalize brackets to -" in {
      normalize("aa(bb)cc") shouldBe "aa-bb-cc"
      normalize("aa((((bb)))cc") shouldBe "aa-bb-cc"
    }

    "normalize multiples of '-' to a single '-'" in {
      normalize("a----a--b-b-------a") shouldBe "a-a-b-b-a"
    }

    "normalize to lowercase" in {
      normalize("AAbAbbB") shouldBe "aababbb"
    }

    // The input mixes "'s", brackets, symbols, underscores, dashes, spaces and upper case.
    "normalize a string with several special characters" in {
      normalize("a'sa((%%$ & b___--BB a") shouldBe "aa-b-bb-a"
    }

    // One generated test per (input, expected) row of the shared CSV resource.
    normalizationTestCasesSharedWithNl.foreach {
      case (input, expectedOutput) =>
        s"""normalize "${input}" to "${expectedOutput}".""" in {
          normalize(input) shouldBe expectedOutput
        }
    }
  }

  /**
   * Loads the `(input, expectedOutput)` pairs from the `normalization-checks.csv`
   * classpath resource. Each non-blank line has the form `input||expectedOutput`;
   * any further `||`-separated columns are ignored.
   *
   * @throws IllegalStateException    if the resource cannot be read
   * @throws IllegalArgumentException if a non-blank line contains no `||` separator
   */
  private def normalizationTestCasesSharedWithNl: List[(String, String)] = {
    import org.parboiled.common._

    val resource = "normalization-checks.csv"

    // Option(...) turns a null result into a descriptive failure instead of an NPE on `.trim`.
    // NOTE(review): presumably parboiled returns null for a missing resource - confirm.
    val data = Option(FileUtils.readAllTextFromResource(resource)).getOrElse {
      throw new IllegalStateException(s"Test resource '$resource' could not be read from the classpath")
    }

    data.trim
      .split("""\r?\n""")
      .toList
      .filter(_.trim.nonEmpty)
      .map { line =>
        // Limit -1 keeps trailing empty fields, so a row such as `x||` yields an empty
        // expected output instead of a one-element array.
        line.split("""\|\|""", -1) match {
          case Array(input, expected, _*) => (input, expected)
          case _ =>
            throw new IllegalArgumentException(
              s"Malformed line in '$resource' (expected 'input||expectedOutput'): $line")
        }
      }
  }
}
@nasazh
Copy link

nasazh commented Apr 11, 2022

thanks for sharing this! a lifesaver :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment