// URL Normalizer (gist created November 18, 2015 19:38)
import java.net.URI
import java.util.Locale
import java.util.regex.{Pattern, Matcher}

import scala.annotation.tailrec
import scala.util.control.NonFatal
object UrlNormalizer {

  /** Matches a single percent-encoded octet such as `%2c` or `%2C`. */
  private val escapePattern = Pattern.compile("%[0-9a-fA-F]{2}")

  /**
   * Normalizes a URL without changing what it refers to.
   *
   * `java.net.URI.normalize` handles path normalization, e.g. `/a/b/../c`
   * becomes `/a/c`. The URL is then rebuilt from its components, which also
   * removes the fragment.
   *
   * Further normalization that does not change semantics:
   *  1. lower casing scheme and host name
   *  2. upper casing letters in escape sequences (e.g. `%2c` to `%2C`)
   *  3. decoding unnecessarily encoded characters (e.g. `%2E` to `.`, `%5F` to `_`)
   *  4. removing the default port (80 for http, 443 for https)
   *
   * If the input cannot be parsed as a URI, or has no scheme or no host, it is
   * not rebuilt: steps 2 and 3 are still applied to the input text, everything
   * else (including any fragment) is left as is.
   *
   * @param url            the URL to normalize
   * @param removeFragment currently has no effect; the fragment is dropped
   *                       whenever the URL is rebuilt. Kept so existing callers
   *                       keep compiling.
   * @return the normalized URL
   */
  def normalize(url: String, removeFragment: Boolean = false): String = {
    val semiNormalized =
      try rebuild(new URI(url).normalize()).getOrElse(url)
      catch {
        // Best effort: anything that fails to parse is passed through untouched.
        case NonFatal(_) => url
      }
    normalizeEscapes(semiNormalized)
  }

  /**
   * Reassembles `scheme://[userinfo@]host[:port]path[?query]` from a parsed URI.
   * Returns `None` when the URI has no scheme or no host.
   */
  private def rebuild(uri: URI): Option[String] =
    for {
      rawScheme <- Option(uri.getScheme)
      rawHost <- Option(uri.getHost)
    } yield {
      // Locale.ROOT keeps the result independent of the JVM default locale
      // (e.g. a Turkish locale would lower-case 'I' to a dotless 'ı').
      val scheme = rawScheme.toLowerCase(Locale.ROOT)
      val host = rawHost.toLowerCase(Locale.ROOT)
      // Credentials are part of the URL's meaning, so they are carried over verbatim.
      val userInfo = Option(uri.getRawUserInfo).fold("")(info => s"$info@")
      val port = (scheme, uri.getPort) match {
        // getPort returns -1 when the URL does not specify a port.
        case ("http", 80) | ("https", 443) | (_, -1) => ""
        case (_, explicitPort) => s":$explicitPort"
      }
      val path = Option(uri.getRawPath).getOrElse("")
      val query = Option(uri.getRawQuery).fold("")(q => s"?$q")
      s"$scheme://$userInfo$host$port$path$query"
    }

  /**
   * Rewrites every percent-escape in `input`: escapes of unreserved characters
   * are decoded, all other escapes get upper-case hex digits.
   */
  private def normalizeEscapes(input: String): String = {
    val matcher = escapePattern.matcher(input)
    val out = new java.lang.StringBuilder(input.length)

    @tailrec
    def loop(lastIndex: Int): String =
      if (matcher.find()) {
        // Copy the text between the previous escape and this one, then the rewritten escape.
        out.append(input, lastIndex, matcher.start())
        out.append(canonicalEscape(matcher.group()))
        loop(matcher.end())
      } else {
        out.append(input, lastIndex, input.length)
        out.toString
      }

    loop(0)
  }

  /** Canonical form of one escape sequence of the shape `%XX`. */
  private def canonicalEscape(escape: String): String = {
    val code = Integer.parseInt(escape.substring(1), 16)
    if (isUnreserved(code)) code.toChar.toString
    else escape.toUpperCase(Locale.ROOT)
  }

  /** True for ASCII letters, digits and `-`, `.`, `_`, `~`, which never need escaping. */
  private def isUnreserved(code: Int): Boolean =
    (code >= 0x41 && code <= 0x5a) ||
      (code >= 0x61 && code <= 0x7a) ||
      (code >= 0x30 && code <= 0x39) ||
      code == 0x2d || code == 0x2e || code == 0x5f || code == 0x7e
}
import org.junit.runner.RunWith
import org.scalatest.{BeforeAndAfterEach, MustMatchers, WordSpec}
import org.scalatest.junit.JUnitRunner
/**
 * Behavioural spec for `UrlNormalizer.normalize`.
 *
 * Each case feeds a concrete URL and checks the exact normalized string, so a
 * failure points at the specific normalization rule that regressed.
 */
@RunWith(classOf[JUnitRunner])
class UrlNormalizerSpec
  extends WordSpec
  with MustMatchers
  with BeforeAndAfterEach {

  "normalize" should {
    "return normalized url" when {
      "scheme or host name has upper case letter" in {
        UrlNormalizer.normalize("hTTp://www.Twitter.Com") must equal("http://www.twitter.com")
        UrlNormalizer.normalize("hTTp://www.Twitter.Com/") must equal("http://www.twitter.com/")
      }

      "path contains . or .." in {
        UrlNormalizer.normalize("http://www.twitter.com/a/b/../c/./d") must equal(
          "http://www.twitter.com/a/c/d")
      }

      "port is default" in {
        UrlNormalizer.normalize("http://www.twitter.com:80/") must equal("http://www.twitter.com/")
        UrlNormalizer.normalize("https://www.twitter.com:443/") must equal("https://www.twitter.com/")
      }

      "fragment should be removed" in {
        UrlNormalizer.normalize("http://www.twitter.com/a#frag") must equal("http://www.twitter.com/a")
      }

      "query string has lower cased escape" in {
        UrlNormalizer.normalize("http://www.twitter.com/search?q=a%2cb") must equal(
          "http://www.twitter.com/search?q=a%2Cb")
        UrlNormalizer.normalize("http://www.twitter.com/search?q=%e3%81%82") must equal(
          "http://www.twitter.com/search?q=%E3%81%82")
      }

      "query string has un-necessary escaped letters" in {
        // Letters, digits and - . _ ~ never need escaping, so they are decoded.
        UrlNormalizer.normalize("http://www.twitter.com/search?q=%41%5a%61%7A%30%39%2D%2e%5F%7e") must equal(
          "http://www.twitter.com/search?q=AZaz09-._~")
      }
    }

    "return url unchanged" when {
      "url is invalid" in {
        UrlNormalizer.normalize(":ABCD") must equal(":ABCD")
        UrlNormalizer.normalize("") must equal("")
      }
    }
  }
}
