agemooij/NormalizeSupport.scala

## NormalizeSupport.scala
package rfs.rebb
package common

/**
 * Performs standard Java/unicode normalization on the trimmed and lowercased form
 * of the input String and then adds a few extra tricks for dealing with special
 * characters.
 *
 * JVM/Unicode normalization references (warning: learning curve black hole, beware!):
 *
 * - http://docs.oracle.com/javase/7/docs/api/java/text/Normalizer.html
 * - http://stackoverflow.com/questions/5697171/regex-what-is-incombiningdiacriticalmarks
 * - http://stackoverflow.com/questions/1453171/%C5%84-%C7%B9-%C5%88-%C3%B1-%E1%B9%85-%C5%86-%E1%B9%87-%E1%B9%8B-%E1%B9%89-%CC%88-%C9%B2-%C6%9E-%E1%B6%87-%C9%B3-%C8%B5-n-or-remove-diacritical-marks-from-unicode-cha
 * - http://lipn.univ-paris13.fr/~cerin/BD/unicode.html
 * - http://www.unicode.org/reports/tr15/tr15-23.html
 * - http://www.unicode.org/reports/tr44/#Properties
 *
 * Some special cases, like "ø" and "ß" are not being stripped/replaced by the
 * Java/Unicode normalizer so we have to replace them ourselves.
 */
trait NormalizeSupport {
  import java.text.Normalizer
  import java.util.Locale

  /**
   * Reduces `in` to a lowercase, hyphen-separated token made of `a-z`, `0-9` and `-`.
   *
   * Steps, in order:
   *  1. trim and lowercase the input (locale-independently);
   *  1. decompose to NFD and drop combining marks, modifier letters and modifier symbols;
   *  1. drop every `'s`, expand `ß` to `ss` and replace `ø` with `o`;
   *  1. replace every run of other characters with a single `-`, collapse repeated `-`
   *     and remove one trailing `-`.
   *
   * A leading `-` is kept as is; only the suffix is stripped.
   *
   * @param in the text to normalize
   * @return the normalized form of `in`
   */
  def normalize(in: String): String = {
    // Locale.ROOT keeps case folding independent of the JVM default locale; with a
    // Turkish default, "I" would otherwise become a dotless "ı" and end up as "-".
    val cleaned = in.trim.toLowerCase(Locale.ROOT)

    // NFD splits accented characters into base character + combining marks, which
    // the character class below then removes.
    val withoutMarks = Normalizer
      .normalize(cleaned, Normalizer.Form.NFD)
      .replaceAll("[\\p{InCombiningDiacriticalMarks}\\p{IsM}\\p{IsLm}\\p{IsSk}]+", "")

    withoutMarks
      .replace("'s", "") // plain-text replacements: no regex needed
      .replace("ß", "ss") // not decomposed by NFD, so handled by hand
      .replace("ø", "o") // idem
      .replaceAll("[^a-zA-Z0-9-]+", "-")
      .replaceAll("-+", "-")
      .stripSuffix("-")
  }
}

/** Standalone instance of the [[NormalizeSupport]] trait, so that `normalize` can be used without mixing the trait in. */
object NormalizeSupport extends NormalizeSupport

## NormalizeSupportSpec.scala
package rfs.rebb
package common

import org.scalatest._
import Matchers._

import common._

/**
 * Specification for `NormalizeSupport.normalize`: a set of hand-written examples plus
 * one generated test per row of the `normalization-checks.csv` fixture.
 */
class NormalizeSupportSpec extends UnitSpec with NormalizeSupport {

  "NormalizeSupport" should {
    "correctly normalize non -ASCII characters" in {
      normalize("ÀÁÂÃĀĂȦÄẢÅǍȀȂĄẠḀẦẤàáâä") shouldBe "aaaaaaaaaaaaaaaaaaaaaa"
      normalize("ÉÊẼĒĔËȆȄȨĖèéêẽēȅë") shouldBe "eeeeeeeeeeeeeeeee"
      normalize("ÌÍÏïØøÒÖÔöÜüŇñÇçß") shouldBe "iiiioooooouunnccss"
    }

    "normalize 's to nothing" in {
      normalize("aa'sbba") shouldBe "aabba"
    }

    "normalize & for -" in {
      normalize("aa & bb") shouldBe "aa-bb"
      normalize("aa&& & &&& bb") shouldBe "aa-bb"
    }

    "normalize brackets to -" in {
      normalize("aa(bb)cc") shouldBe "aa-bb-cc"
      normalize("aa((((bb)))cc") shouldBe "aa-bb-cc"
    }

    "normalize multiples of '-' to a single '-'" in {
      normalize("a----a--b-b-------a") shouldBe "a-a-b-b-a"
    }

    "normalize to lowercase" in {
      normalize("AAbAbbB") shouldBe "aababbb"
    }

    "normalize a string with several diacritical marks" in {
      normalize("a'sa((%%$ & b___--BB a") shouldBe "aa-b-bb-a"
    }

    // One generated test per fixture row.
    normalizationTestCasesSharedWithNl.foreach {
      case (input, expectedOutput) =>
        s"""normalize "${input}" to "${expectedOutput}".""" in {
          normalize(input) shouldBe expectedOutput
        }
    }
  }

  /**
   * Reads `normalization-checks.csv` and turns every line into an
   * `(input, expectedOutput)` pair; the two fields are separated by `||`.
   *
   * @throws IllegalArgumentException if a line does not contain the `||` separator
   */
  private def normalizationTestCasesSharedWithNl: List[(String, String)] = {
    import org.parboiled.common._
    val data = FileUtils.readAllTextFromResource("normalization-checks.csv")
    val lines = data.trim.split("""\r?\n""").toList

    lines.map { line =>
      // Limit -1 keeps trailing empty fields, so a row whose expected output is the
      // empty string ("&&&||") still yields two parts instead of failing on parts(1).
      line.split("""\|\|""", -1) match {
        case Array(input, expectedOutput, _*) => (input, expectedOutput)
        case _ =>
          throw new IllegalArgumentException(
            s"""Malformed row in normalization-checks.csv (expected "input||expected"): $line"""
          )
      }
    }
  }
}
	package rfs.rebb
	package common

	/**
	* Performs standard Java/unicode normalization on the trimmed and lowercased form
	* of the input String and then adds a few extra tricks for dealing with special
	* characters.
	*
	* JVM/Unicode normalization references (warning: learning curve black hole, beware!):
	*
	* - http://docs.oracle.com/javase/7/docs/api/java/text/Normalizer.html
	* - http://stackoverflow.com/questions/5697171/regex-what-is-incombiningdiacriticalmarks
	* - http://stackoverflow.com/questions/1453171/%C5%84-%C7%B9-%C5%88-%C3%B1-%E1%B9%85-%C5%86-%E1%B9%87-%E1%B9%8B-%E1%B9%89-%CC%88-%C9%B2-%C6%9E-%E1%B6%87-%C9%B3-%C8%B5-n-or-remove-diacritical-marks-from-unicode-cha
	* - http://lipn.univ-paris13.fr/~cerin/BD/unicode.html
	* - http://www.unicode.org/reports/tr15/tr15-23.html
	* - http://www.unicode.org/reports/tr44/#Properties
	*
	* Some special cases, like "ø" and "ß" are not being stripped/replaced by the
	* Java/Unicode normalizer so we have to replace them ourselves.
	*/
	trait NormalizeSupport {
	  import java.text.Normalizer
	  import java.util.Locale

	  /**
	   * Converts `in` into a lowercase token that only contains `a-z`, `0-9` and `-`.
	   *
	   * The input is trimmed and lowercased, decomposed to NFD so that combining marks
	   * (plus modifier letters and modifier symbols) can be removed, after which `'s`
	   * is dropped, `ß` becomes `ss` and `ø` becomes `o`. Every remaining run of other
	   * characters is replaced by one `-`, repeated `-` are collapsed and a single
	   * trailing `-` is removed. A leading `-` is left in place.
	   *
	   * @param in the text to normalize
	   * @return the normalized form of `in`
	   */
	  def normalize(in: String): String = {
	    // Lowercase with Locale.ROOT: the no-argument toLowerCase follows the JVM default
	    // locale, which under e.g. Turkish maps "I" to a dotless "ı" that is later
	    // replaced by "-".
	    val lowered = in.trim.toLowerCase(Locale.ROOT)
	    val decomposed = Normalizer.normalize(lowered, Normalizer.Form.NFD)
	    val stripped =
	      decomposed.replaceAll("[\\p{InCombiningDiacriticalMarks}\\p{IsM}\\p{IsLm}\\p{IsSk}]+", "")

	    stripped
	      .replace("'s", "") // literal replacements, no regex involved
	      .replace("ß", "ss") // NFD does not decompose this one
	      .replace("ø", "o") // nor this one
	      .replaceAll("[^a-zA-Z0-9-]+", "-")
	      .replaceAll("-+", "-")
	      .stripSuffix("-")
	  }
	}

	/** Standalone instance of the [[NormalizeSupport]] trait, so that `normalize` can be used without mixing the trait in. */
	object NormalizeSupport extends NormalizeSupport
	package rfs.rebb
	package common

	import org.scalatest._
	import Matchers._

	import common._

	/**
	 * Specification for `NormalizeSupport.normalize`: a set of hand-written examples plus
	 * one generated test per row of the `normalization-checks.csv` fixture.
	 */
	class NormalizeSupportSpec extends UnitSpec with NormalizeSupport {

	  "NormalizeSupport" should {
	    "correctly normalize non -ASCII characters" in {
	      normalize("ÀÁÂÃĀĂȦÄẢÅǍȀȂĄẠḀẦẤàáâä") shouldBe "aaaaaaaaaaaaaaaaaaaaaa"
	      normalize("ÉÊẼĒĔËȆȄȨĖèéêẽēȅë") shouldBe "eeeeeeeeeeeeeeeee"
	      normalize("ÌÍÏïØøÒÖÔöÜüŇñÇçß") shouldBe "iiiioooooouunnccss"
	    }

	    "normalize 's to nothing" in {
	      normalize("aa'sbba") shouldBe "aabba"
	    }

	    "normalize & for -" in {
	      normalize("aa & bb") shouldBe "aa-bb"
	      normalize("aa&& & &&& bb") shouldBe "aa-bb"
	    }

	    "normalize brackets to -" in {
	      normalize("aa(bb)cc") shouldBe "aa-bb-cc"
	      normalize("aa((((bb)))cc") shouldBe "aa-bb-cc"
	    }

	    "normalize multiples of '-' to a single '-'" in {
	      normalize("a----a--b-b-------a") shouldBe "a-a-b-b-a"
	    }

	    "normalize to lowercase" in {
	      normalize("AAbAbbB") shouldBe "aababbb"
	    }

	    "normalize a string with several diacritical marks" in {
	      normalize("a'sa((%%$ & b___--BB a") shouldBe "aa-b-bb-a"
	    }

	    // One generated test per fixture row.
	    normalizationTestCasesSharedWithNl.foreach {
	      case (input, expectedOutput) =>
	        s"""normalize "${input}" to "${expectedOutput}".""" in {
	          normalize(input) shouldBe expectedOutput
	        }
	    }
	  }

	  /**
	   * Reads `normalization-checks.csv` and turns every line into an
	   * `(input, expectedOutput)` pair; the two fields are separated by `||`.
	   *
	   * @throws IllegalArgumentException if a line does not contain the `||` separator
	   */
	  private def normalizationTestCasesSharedWithNl: List[(String, String)] = {
	    import org.parboiled.common._
	    val data = FileUtils.readAllTextFromResource("normalization-checks.csv")
	    val lines = data.trim.split("""\r?\n""").toList

	    // Quote the separator so it is matched literally. Inside a triple-quoted string
	    // the former pattern \\|\\| meant "backslash OR backslash OR empty", which
	    // matches the empty string and therefore split every row into single characters.
	    val separator = java.util.regex.Pattern.quote("||")

	    lines.map { line =>
	      // Limit -1 keeps trailing empty fields, so a row with an empty expected
	      // output still yields two parts.
	      line.split(separator, -1) match {
	        case Array(input, expectedOutput, _*) => (input, expectedOutput)
	        case _ =>
	          throw new IllegalArgumentException(
	            s"""Malformed row in normalization-checks.csv (expected "input||expected"): $line"""
	          )
	      }
	    }
	  }
	}