Skip to content

Instantly share code, notes, and snippets.

@jiulongw
Created November 18, 2015 19:38
Show Gist options
  • Save jiulongw/d380476237d03c616679 to your computer and use it in GitHub Desktop.
Save jiulongw/d380476237d03c616679 to your computer and use it in GitHub Desktop.
URL Normalizer
import java.net.URI
import java.util.regex.{Pattern, Matcher}
import scala.annotation.tailrec
/**
 * Semantics-preserving URL normalization.
 *
 * Path normalization is delegated to `java.net.URI.normalize`
 * (e.g. `/a/b/../c` becomes `/a/c`). Further steps follow
 * https://en.wikipedia.org/wiki/URL_normalization :
 *
 *  1. lower-case the scheme and host name
 *  1. upper-case the hex digits of escape sequences (`%2c` to `%2C`)
 *  1. decode escapes of unreserved characters (`%2E` to `.`, `%5F` to `_`)
 *  1. remove the default port (80 for http, 443 for https)
 *
 * The fragment is always removed.
 */
object UrlNormalizer {

  /** Matches a single percent-encoded octet such as `%2f` or `%2F`. */
  private val escapePattern = Pattern.compile("%[0-9a-fA-F]{2}")

  /**
   * Normalizes `url`.
   *
   * Escape sequences are normalized first, so that an encoded dot segment such as
   * `%2e%2e` is seen (and removed) by the subsequent path normalization. The URL is
   * then parsed and rebuilt from its raw components as
   * `scheme://[userinfo@]host[:port][path][?query]`.
   *
   * If the input cannot be parsed as a URI, or it has no scheme or no host
   * (relative references, `mailto:` style URIs, ...), only the escape-sequence
   * normalization is applied and the rest of the string is returned as it was given.
   *
   * @param url            the URL to normalize; `null` results in a `NullPointerException`
   * @param removeFragment currently ignored: the fragment is always removed. The
   *                       parameter is kept so that existing callers keep compiling.
   * @return the normalized URL
   */
  def normalize(url: String, removeFragment: Boolean = false): String = {
    val escaped = normalizeEscapes(url)
    try {
      val uri = new URI(escaped).normalize()
      // getScheme / getHost return null for relative or non-server-based URIs.
      (Option(uri.getScheme), Option(uri.getHost)) match {
        case (Some(rawScheme), Some(rawHost)) =>
          // Locale.ROOT keeps ASCII case mapping stable (e.g. no dotless i under tr-TR).
          val scheme = rawScheme.toLowerCase(java.util.Locale.ROOT)
          val host = rawHost.toLowerCase(java.util.Locale.ROOT)
          // Credentials are part of the URL's meaning, so they must survive the rebuild.
          val userInfo = Option(uri.getRawUserInfo).fold("")(info => s"$info@")
          val port = (scheme, uri.getPort) match {
            // -1 means that no port was given.
            case ("http", 80) | ("https", 443) | (_, -1) => ""
            case (_, explicit) => s":$explicit"
          }
          val path = Option(uri.getRawPath).getOrElse("")
          val query = Option(uri.getRawQuery).fold("")(q => s"?$q")
          s"$scheme://$userInfo$host$port$path$query"
        case _ => escaped
      }
    } catch {
      case _: java.net.URISyntaxException => escaped
    }
  }

  /**
   * Rewrites every percent-escape in `input`: escapes of unreserved characters are
   * decoded, all other escapes get upper-case hex digits.
   */
  private def normalizeEscapes(input: String): String = {
    val matcher = escapePattern.matcher(input)
    val out = new java.lang.StringBuilder(input.length)

    @tailrec
    def loop(lastIndex: Int): String =
      if (matcher.find()) {
        // Copy the literal text between the previous escape and this one.
        out.append(input, lastIndex, matcher.start())
        val escape = matcher.group()
        val code = Integer.parseInt(escape.substring(1), 16)
        if (isUnreserved(code)) out.append(code.toChar)
        else out.append(escape.toUpperCase(java.util.Locale.ROOT))
        loop(matcher.end())
      } else {
        out.append(input, lastIndex, input.length).toString
      }

    loop(0)
  }

  /** True for the code points that never need escaping: letters, digits, `-`, `.`, `_`, `~`. */
  private def isUnreserved(code: Int): Boolean =
    (code >= 'A' && code <= 'Z') ||
      (code >= 'a' && code <= 'z') ||
      (code >= '0' && code <= '9') ||
      code == '-' || code == '.' || code == '_' || code == '~'
}
import org.junit.runner.RunWith
import org.scalatest.{BeforeAndAfterEach, MustMatchers, WordSpec}
import org.scalatest.junit.JUnitRunner
/**
 * Behavioural specification for `UrlNormalizer.normalize`.
 *
 * Each test lists `input -> expected` pairs; every pair is normalized with the
 * default arguments and compared against the expected string.
 */
@RunWith(classOf[JUnitRunner])
class UrlNormalizerSpec
  extends WordSpec
  with MustMatchers
  with BeforeAndAfterEach {

  /** Asserts, in order, that each input normalizes to its paired expected value. */
  private def verify(cases: (String, String)*): Unit =
    cases.foreach { case (input, expected) =>
      UrlNormalizer.normalize(input) must equal(expected)
    }

  "normalize" should {

    "return normalized url" when {

      "scheme or host name has upper case letter" in {
        verify(
          "hTTp://www.Twitter.Com" -> "http://www.twitter.com",
          "hTTp://www.Twitter.Com/" -> "http://www.twitter.com/"
        )
      }

      "path contains . or .." in {
        verify("http://www.twitter.com/a/b/../c/./d" -> "http://www.twitter.com/a/c/d")
      }

      "port is default" in {
        verify(
          "http://www.twitter.com:80/" -> "http://www.twitter.com/",
          "https://www.twitter.com:443/" -> "https://www.twitter.com/"
        )
      }

      "fragment should be removed" in {
        verify("http://www.twitter.com/?q=abc#frag" -> "http://www.twitter.com/?q=abc")
      }

      "query string has lower cased escape" in {
        verify(
          "http://www.twitter.com/?q=abc%2fdef%3f" -> "http://www.twitter.com/?q=abc%2Fdef%3F",
          "http://www.twitter.com/?q=abc%2fdef%3f%Aa" -> "http://www.twitter.com/?q=abc%2Fdef%3F%AA"
        )
      }

      "query string has un-necessary escaped letters" in {
        verify(
          "http://www.twitter.com/?q=%43%5a%61%7a%30%39%2d%2e%5F%7E-normal" ->
            "http://www.twitter.com/?q=CZaz09-._~-normal"
        )
      }
    }

    "return url unchanged" when {

      "url is invalid" in {
        // Unparseable or scheme-less inputs come back exactly as given.
        verify(
          ":ABCD" -> ":ABCD",
          "" -> ""
        )
      }
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment