Created
November 18, 2015 19:38
-
-
Save jiulongw/d380476237d03c616679 to your computer and use it in GitHub Desktop.
URL Normalizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.net.URI | |
import java.util.regex.{Pattern, Matcher} | |
import scala.annotation.tailrec | |
/** Semantics-preserving normalization of absolute, server-based URLs. */
object UrlNormalizer {
  import scala.util.control.NonFatal

  /** Matches a single percent-encoded octet such as `%2c` or `%2C`. */
  private val escapePattern = Pattern.compile("%[0-9a-fA-F]{2}")

  /**
   * Normalizes `url` without changing what it refers to.
   *
   * `URI.normalize` handles path normalization
   * (http://docs.oracle.com/javase/7/docs/api/java/net/URI.html#normalize()),
   * e.g. `/a/b/../c` becomes `/a/c`. Further steps follow
   * https://en.wikipedia.org/wiki/URL_normalization:
   *  1. lower casing scheme and host name
   *  2. upper casing letters in escape sequences (e.g. `%2c` to `%2C`)
   *  3. decoding unnecessarily encoded characters (e.g. `%2E` to `.`, `%5F` to `_`)
   *  4. removing the default port (80 for http, 443 for https)
   *
   * User-info (`user:pass@`) is kept verbatim. The fragment is dropped whenever the URL
   * can be rebuilt from its components.
   *
   * If `url` cannot be parsed, or has no scheme or no host (relative or opaque URIs),
   * only steps 2 and 3 are applied to the original text, so a fragment survives in
   * that case.
   *
   * @param url            the URL to normalize; a `null` value results in a `NullPointerException`
   * @param removeFragment accepted for source compatibility; it currently has no effect,
   *                       the fragment handling described above applies for either value
   * @return the normalized URL
   */
  def normalize(url: String, removeFragment: Boolean = false): String = {
    val semiNormalized =
      try rebuild(new URI(url).normalize()).getOrElse(url)
      catch { case NonFatal(_) => url }
    normalizeEscapes(semiNormalized)
  }

  /**
   * Reassembles `uri` as `scheme://[userinfo@]host[:port]path[?query]`, or returns `None`
   * when the URI has no scheme or no host.
   */
  private def rebuild(uri: URI): Option[String] =
    for {
      rawScheme <- Option(uri.getScheme)
      rawHost <- Option(uri.getHost)
    } yield {
      // Locale.ROOT keeps case mapping locale-independent (e.g. Turkish 'I' -> dotless 'ı').
      val scheme = rawScheme.toLowerCase(java.util.Locale.ROOT)
      val host = rawHost.toLowerCase(java.util.Locale.ROOT)
      val userInfo = Option(uri.getRawUserInfo).fold("")(info => s"$info@")
      val port = (scheme, uri.getPort) match {
        // -1 means no port was given, i.e. the scheme's default port.
        case ("http", 80) | ("https", 443) | (_, -1) => ""
        case (_, explicit) => s":$explicit"
      }
      val path = Option(uri.getRawPath).getOrElse("")
      val query = Option(uri.getRawQuery).fold("")(q => s"?$q")
      s"$scheme://$userInfo$host$port$path$query"
    }

  /**
   * Rewrites every percent-escape in `input`: escapes of unreserved characters are decoded,
   * all other escapes get upper-case hex digits. Text outside escapes is copied unchanged.
   */
  private def normalizeEscapes(input: String): String = {
    val matcher = escapePattern.matcher(input)
    val out = new java.lang.StringBuilder(input.length)

    @tailrec
    def loop(from: Int): String =
      if (matcher.find()) {
        out.append(input, from, matcher.start())
        val escape = matcher.group()
        val code = Integer.parseInt(escape.substring(1), 16)
        if (isUnreserved(code)) out.append(code.toChar)
        else out.append(escape.toUpperCase(java.util.Locale.ROOT))
        loop(matcher.end())
      } else {
        out.append(input, from, input.length).toString
      }

    loop(0)
  }

  /** True for the characters that never need escaping: ASCII letters, digits, `-`, `.`, `_`, `~`. */
  private def isUnreserved(code: Int): Boolean = {
    val c = code.toChar
    (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') ||
      c == '-' || c == '.' || c == '_' || c == '~'
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.junit.runner.RunWith | |
import org.scalatest.{BeforeAndAfterEach, MustMatchers, WordSpec} | |
import org.scalatest.junit.JUnitRunner | |
/** Behavioural specification for `UrlNormalizer.normalize`. */
@RunWith(classOf[JUnitRunner])
class UrlNormalizerSpec extends WordSpec with MustMatchers with BeforeAndAfterEach {

  /** Checks, in order, that each input URL normalizes to the expected output paired with it. */
  private def expectNormalized(cases: (String, String)*): Unit =
    cases.foreach { case (input, expected) =>
      UrlNormalizer.normalize(input) must equal(expected)
    }

  "normalize" should {

    "return normalized url" when {

      "scheme or host name has upper case letter" in {
        expectNormalized(
          "hTTp://www.Twitter.Com" -> "http://www.twitter.com",
          "hTTp://www.Twitter.Com/" -> "http://www.twitter.com/"
        )
      }

      "path contains . or .." in {
        expectNormalized(
          "http://www.twitter.com/a/b/../c/./d" -> "http://www.twitter.com/a/c/d"
        )
      }

      "port is default" in {
        expectNormalized(
          "http://www.twitter.com:80/" -> "http://www.twitter.com/",
          "https://www.twitter.com:443/" -> "https://www.twitter.com/"
        )
      }

      "fragment should be removed" in {
        expectNormalized(
          "http://www.twitter.com/?q=abc#frag" -> "http://www.twitter.com/?q=abc"
        )
      }

      "query string has lower cased escape" in {
        expectNormalized(
          "http://www.twitter.com/?q=abc%2fdef%3f" -> "http://www.twitter.com/?q=abc%2Fdef%3F",
          "http://www.twitter.com/?q=abc%2fdef%3f%Aa" -> "http://www.twitter.com/?q=abc%2Fdef%3F%AA"
        )
      }

      "query string has un-necessary escaped letters" in {
        expectNormalized(
          "http://www.twitter.com/?q=%43%5a%61%7a%30%39%2d%2e%5F%7E-normal" ->
            "http://www.twitter.com/?q=CZaz09-._~-normal"
        )
      }
    }

    "return url unchanged" when {

      "url is invalid" in {
        // Unparseable or empty input is expected to come back exactly as given.
        expectNormalized(
          ":ABCD" -> ":ABCD",
          "" -> ""
        )
      }
    }
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment