jiulongw/UrlNormalizer.scala

## UrlNormalizer.scala
import java.net.URI
import java.util.regex.{Pattern, Matcher}

import scala.annotation.tailrec

object UrlNormalizer {
  import java.util.Locale
  import scala.util.Try

  /** Matches a single percent-encoded octet such as `%2f` or `%2F`. */
  private val escapePattern = Pattern.compile("%[0-9a-fA-F]{2}")

  /**
   * Normalizes a URL without changing what it identifies.
   *
   * URI.normalize handles path normalization. Doc: http://docs.oracle.com/javase/7/docs/api/java/net/URI.html#normalize()
   * e.g. /a/b/../c will become /a/c
   * Fragment is also removed.
   *
   * Further normalization that does not change semantics done according to: https://en.wikipedia.org/wiki/URL_normalization
   * 1. lower casing scheme and host name
   * 2. upper casing letters in escape sequences. (e.g. %2c to %2C)
   * 3. decode unnecessary encoded characters. (e.g. %2E to ., %5F to _)
   * 4. remove default port
   *
   * User info (`user:password@`) is kept verbatim. When the input cannot be parsed, or has no
   * scheme or no host, steps 2 and 3 are still applied to the original text and everything else
   * is left untouched.
   *
   * @param url            the URL to normalize; must not be null
   * @param removeFragment currently ignored: the fragment is always dropped for URLs that can be
   *                       rebuilt. Kept so existing call sites keep compiling.
   * @return the normalized URL
   */
  def normalize(url: String, removeFragment: Boolean = false): String =
    normalizeEscapes(rebuild(url).getOrElse(url))

  /**
   * Rebuilds `url` from its parsed components with a lower-cased scheme and host, the default
   * port removed and the fragment dropped. Returns None when the URL cannot be parsed or has no
   * scheme or host.
   */
  private def rebuild(url: String): Option[String] =
    Try(new URI(url).normalize()).toOption.flatMap { uri =>
      for {
        rawScheme <- Option(uri.getScheme)
        rawHost <- Option(uri.getHost)
      } yield {
        // Locale.ROOT keeps ASCII case mapping stable (e.g. 'I' -> 'i' under a Turkish locale).
        val scheme = rawScheme.toLowerCase(Locale.ROOT)
        val host = rawHost.toLowerCase(Locale.ROOT)
        val userInfo = Option(uri.getRawUserInfo).fold("")(info => s"$info@")
        val port = (scheme, uri.getPort) match {
          // -1 means no port was given.
          case ("http", 80) | ("https", 443) | (_, -1) => ""
          case (_, explicit) => s":$explicit"
        }
        val path = Option(uri.getRawPath).getOrElse("")
        val query = Option(uri.getRawQuery).fold("")(q => s"?$q")

        s"$scheme://$userInfo$host$port$path$query"
      }
    }

  /** True for the code points of ASCII letters, digits, '-', '.', '_' and '~'. */
  private def isUnreserved(code: Int): Boolean = {
    val c = code.toChar
    (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') ||
      c == '-' || c == '.' || c == '_' || c == '~'
  }

  /**
   * Rewrites every percent-escape in `input`: escapes of unreserved characters are decoded,
   * all other escapes get upper-case hex digits.
   */
  private def normalizeEscapes(input: String): String = {
    val matcher = escapePattern.matcher(input)
    val out = new java.lang.StringBuilder(input.length)

    @tailrec
    def loop(lastIndex: Int): String =
      if (matcher.find()) {
        val escape = matcher.group()
        val code = Integer.parseInt(escape.substring(1), 16)
        val replacement =
          if (isUnreserved(code)) code.toChar.toString
          else escape.toUpperCase(Locale.ROOT)

        // Copy the untouched text before the escape, then the rewritten escape.
        out.append(input, lastIndex, matcher.start()).append(replacement)
        loop(matcher.end())
      } else {
        out.append(input, lastIndex, input.length).toString
      }

    loop(0)
  }
}

## UrlNormalizerSpec.scala
import org.junit.runner.RunWith
import org.scalatest.{BeforeAndAfterEach, MustMatchers, WordSpec}
import org.scalatest.junit.JUnitRunner

/**
 * Behavioural spec for `UrlNormalizer.normalize`: each test lists raw URLs together with the
 * exact string the normalizer is expected to return for them.
 */
@RunWith(classOf[JUnitRunner])
class UrlNormalizerSpec
  extends WordSpec
  with MustMatchers
  with BeforeAndAfterEach {

  /** Checks, in order, that every `(input, expected)` pair normalizes as stated. */
  private def assertNormalizesTo(expectations: (String, String)*): Unit =
    expectations.foreach { case (input, expected) =>
      UrlNormalizer.normalize(input) must equal(expected)
    }

  "normalize" should {
    "return normalized url" when {
      "scheme or host name has upper case letter" in {
        assertNormalizesTo(
          ("hTTp://www.Twitter.Com", "http://www.twitter.com"),
          ("hTTp://www.Twitter.Com/", "http://www.twitter.com/"))
      }
      "path contains . or .." in {
        assertNormalizesTo(
          ("http://www.twitter.com/a/b/../c/./d", "http://www.twitter.com/a/c/d"))
      }
      "port is default" in {
        assertNormalizesTo(
          ("http://www.twitter.com:80/", "http://www.twitter.com/"),
          ("https://www.twitter.com:443/", "https://www.twitter.com/"))
      }
      "fragment should be removed" in {
        assertNormalizesTo(
          ("http://www.twitter.com/?q=abc#frag", "http://www.twitter.com/?q=abc"))
      }
      "query string has lower cased escape" in {
        assertNormalizesTo(
          ("http://www.twitter.com/?q=abc%2fdef%3f", "http://www.twitter.com/?q=abc%2Fdef%3F"),
          ("http://www.twitter.com/?q=abc%2fdef%3f%Aa", "http://www.twitter.com/?q=abc%2Fdef%3F%AA"))
      }
      "query string has un-necessary escaped letters" in {
        assertNormalizesTo(
          ("http://www.twitter.com/?q=%43%5a%61%7a%30%39%2d%2e%5F%7E-normal",
            "http://www.twitter.com/?q=CZaz09-._~-normal"))
      }
    }

    "return url unchanged" when {
      "url is invalid" in {
        // Unparseable or scheme-less inputs come back exactly as given.
        assertNormalizesTo(
          (":ABCD", ":ABCD"),
          ("", ""))
      }
    }
  }
}
	import java.net.URI
	import java.util.regex.{Pattern, Matcher}

	import scala.annotation.tailrec

	object UrlNormalizer {
	  import java.util.Locale
	  import scala.util.Try

	  /** Matches a single percent-encoded octet such as `%2f` or `%2F`. */
	  private val escapePattern = Pattern.compile("%[0-9a-fA-F]{2}")

	  /**
	   * Normalizes a URL without changing what it identifies.
	   *
	   * URI.normalize handles path normalization. Doc: http://docs.oracle.com/javase/7/docs/api/java/net/URI.html#normalize()
	   * e.g. /a/b/../c will become /a/c
	   * Fragment is also removed.
	   *
	   * Further normalization that does not change semantics done according to: https://en.wikipedia.org/wiki/URL_normalization
	   * 1. lower casing scheme and host name
	   * 2. upper casing letters in escape sequences. (e.g. %2c to %2C)
	   * 3. decode unnecessary encoded characters. (e.g. %2E to ., %5F to _)
	   * 4. remove default port
	   *
	   * User info (`user:password@`) is kept verbatim. When the input cannot be parsed, or has no
	   * scheme or no host, steps 2 and 3 are still applied to the original text and everything else
	   * is left untouched.
	   *
	   * @param url            the URL to normalize; must not be null
	   * @param removeFragment currently ignored: the fragment is always dropped for URLs that can be
	   *                       rebuilt. Kept so existing call sites keep compiling.
	   * @return the normalized URL
	   */
	  def normalize(url: String, removeFragment: Boolean = false): String =
	    normalizeEscapes(rebuild(url).getOrElse(url))

	  /**
	   * Rebuilds `url` from its parsed components with a lower-cased scheme and host, the default
	   * port removed and the fragment dropped. Returns None when the URL cannot be parsed or has no
	   * scheme or host.
	   */
	  private def rebuild(url: String): Option[String] =
	    Try(new URI(url).normalize()).toOption.flatMap { uri =>
	      for {
	        rawScheme <- Option(uri.getScheme)
	        rawHost <- Option(uri.getHost)
	      } yield {
	        // Locale.ROOT keeps ASCII case mapping stable (e.g. 'I' -> 'i' under a Turkish locale).
	        val scheme = rawScheme.toLowerCase(Locale.ROOT)
	        val host = rawHost.toLowerCase(Locale.ROOT)
	        val userInfo = Option(uri.getRawUserInfo).fold("")(info => s"$info@")
	        val port = (scheme, uri.getPort) match {
	          // -1 means no port was given.
	          case ("http", 80) | ("https", 443) | (_, -1) => ""
	          case (_, explicit) => s":$explicit"
	        }
	        val path = Option(uri.getRawPath).getOrElse("")
	        val query = Option(uri.getRawQuery).fold("")(q => s"?$q")

	        s"$scheme://$userInfo$host$port$path$query"
	      }
	    }

	  /** True for the code points of ASCII letters, digits, '-', '.', '_' and '~'. */
	  private def isUnreserved(code: Int): Boolean = {
	    val c = code.toChar
	    (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') ||
	      c == '-' || c == '.' || c == '_' || c == '~'
	  }

	  /**
	   * Rewrites every percent-escape in `input`: escapes of unreserved characters are decoded,
	   * all other escapes get upper-case hex digits.
	   */
	  private def normalizeEscapes(input: String): String = {
	    val matcher = escapePattern.matcher(input)
	    val out = new java.lang.StringBuilder(input.length)

	    @tailrec
	    def loop(lastIndex: Int): String =
	      if (matcher.find()) {
	        val escape = matcher.group()
	        val code = Integer.parseInt(escape.substring(1), 16)
	        val replacement =
	          if (isUnreserved(code)) code.toChar.toString
	          else escape.toUpperCase(Locale.ROOT)

	        // Copy the untouched text before the escape, then the rewritten escape.
	        out.append(input, lastIndex, matcher.start()).append(replacement)
	        loop(matcher.end())
	      } else {
	        out.append(input, lastIndex, input.length).toString
	      }

	    loop(0)
	  }
	}
	import org.junit.runner.RunWith
	import org.scalatest.{BeforeAndAfterEach, MustMatchers, WordSpec}
	import org.scalatest.junit.JUnitRunner

	/**
	 * Behavioural spec for `UrlNormalizer.normalize`: each test lists raw URLs together with the
	 * exact string the normalizer is expected to return for them.
	 */
	@RunWith(classOf[JUnitRunner])
	class UrlNormalizerSpec
	  extends WordSpec
	  with MustMatchers
	  with BeforeAndAfterEach {

	  /** Checks, in order, that every `(input, expected)` pair normalizes as stated. */
	  private def assertNormalizesTo(expectations: (String, String)*): Unit =
	    expectations.foreach { case (input, expected) =>
	      UrlNormalizer.normalize(input) must equal(expected)
	    }

	  "normalize" should {
	    "return normalized url" when {
	      "scheme or host name has upper case letter" in {
	        assertNormalizesTo(
	          ("hTTp://www.Twitter.Com", "http://www.twitter.com"),
	          ("hTTp://www.Twitter.Com/", "http://www.twitter.com/"))
	      }
	      "path contains . or .." in {
	        assertNormalizesTo(
	          ("http://www.twitter.com/a/b/../c/./d", "http://www.twitter.com/a/c/d"))
	      }
	      "port is default" in {
	        assertNormalizesTo(
	          ("http://www.twitter.com:80/", "http://www.twitter.com/"),
	          ("https://www.twitter.com:443/", "https://www.twitter.com/"))
	      }
	      "fragment should be removed" in {
	        assertNormalizesTo(
	          ("http://www.twitter.com/?q=abc#frag", "http://www.twitter.com/?q=abc"))
	      }
	      "query string has lower cased escape" in {
	        assertNormalizesTo(
	          ("http://www.twitter.com/?q=abc%2fdef%3f", "http://www.twitter.com/?q=abc%2Fdef%3F"),
	          ("http://www.twitter.com/?q=abc%2fdef%3f%Aa", "http://www.twitter.com/?q=abc%2Fdef%3F%AA"))
	      }
	      "query string has un-necessary escaped letters" in {
	        assertNormalizesTo(
	          ("http://www.twitter.com/?q=%43%5a%61%7a%30%39%2d%2e%5F%7E-normal",
	            "http://www.twitter.com/?q=CZaz09-._~-normal"))
	      }
	    }

	    "return url unchanged" when {
	      "url is invalid" in {
	        // Unparseable or scheme-less inputs come back exactly as given.
	        assertNormalizesTo(
	          (":ABCD", ":ABCD"),
	          ("", ""))
	      }
	    }
	  }
	}