Last active
August 29, 2015 13:57
-
-
Save fivesmallq/9893244 to your computer and use it in GitHub Desktop.
用来处理URL相对地址到绝对地址的转换 https://zaproxy.googlecode.com/svn-history/r2698/trunk/src/org/zaproxy/zap/spider/URLCanonicalizer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Zed Attack Proxy (ZAP) and its related class files.
 *
 * ZAP is an HTTP/HTTPS proxy for assessing web application security.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * ZAP: Based on work by Yasser Ganjisaffar <lastname at gmail dot com>
 * from project http://code.google.com/p/crawler4j/
 */
package org.zaproxy.zap.spider; | |
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.commons.httpclient.URIException;
import org.apache.log4j.Logger;
import org.zaproxy.zap.spider.SpiderParam.HandleParametersOption;
/** | |
* The URLCanonicalizer is used for the process of converting an URL into a canonical (normalized) form. See | |
* <a href="http://en.wikipedia.org/wiki/URL_normalization">URL Normalization</a> for a reference. <br/> | |
* <br/> | |
* | |
* Note: some parts of the code are adapted from: <a | |
* href="http://stackoverflow.com/a/4057470/405418">stackoverflow</a> | |
*/ | |
public final class URLCanonicalizer { | |
/** The Constant log. */ | |
private static final Logger log = Logger.getLogger(URLCanonicalizer.class); | |
/** The Constant IRRELEVANT_PARAMETERS defining the parameter names which are ignored in the URL. */ | |
private static final Set<String> IRRELEVANT_PARAMETERS = new HashSet<>(3); | |
static { | |
IRRELEVANT_PARAMETERS.add("jsessionid"); | |
IRRELEVANT_PARAMETERS.add("phpsessid"); | |
IRRELEVANT_PARAMETERS.add("aspsessionid"); | |
} | |
/** | |
* Private constructor to avoid initialization of object. | |
*/ | |
private URLCanonicalizer(){} | |
/** | |
* Gets the canonical url. | |
* | |
* @param url the url | |
* @return the canonical url | |
*/ | |
public static String getCanonicalURL(String url) { | |
return getCanonicalURL(url, null); | |
} | |
/** | |
* Gets the canonical url, starting from a relative or absolute url found in a given context (baseURL). | |
* | |
* @param url the url string defining the reference | |
* @param baseURL the context in which this url was found | |
* @return the canonical url | |
*/ | |
public static String getCanonicalURL(String url, String baseURL) { | |
try { | |
/* Build the absolute URL, from the url and the baseURL */ | |
String resolvedURL = URLResolver.resolveUrl(baseURL == null ? "" : baseURL, url); | |
log.debug("Resolved URL: " + resolvedURL); | |
URI canonicalURI = new URI(resolvedURL); | |
/* Some checking. */ | |
if (canonicalURI.getScheme() == null) { | |
throw new MalformedURLException("Protocol could not be reliably evaluated from uri: " + canonicalURI | |
+ " and base url: " + baseURL); | |
} | |
if (canonicalURI.getHost() == null) { | |
throw new MalformedURLException("Host could not be reliably evaluated from: " + canonicalURI); | |
} | |
/* | |
* Normalize: no empty segments (i.e., "//"), no segments equal to ".", and no segments equal to | |
* ".." that are preceded by a segment not equal to "..". | |
*/ | |
String path = canonicalURI.normalize().getPath(); | |
/* Convert '//' -> '/' */ | |
int idx = path.indexOf("//"); | |
while (idx >= 0) { | |
path = path.replace("//", "/"); | |
idx = path.indexOf("//"); | |
} | |
/* Drop starting '/../' */ | |
while (path.startsWith("/../")) { | |
path = path.substring(3); | |
} | |
/* Trim */ | |
path = path.trim(); | |
/* Process parameters and sort them. */ | |
final SortedMap<String, String> params = createParameterMap(canonicalURI.getQuery()); | |
final String queryString; | |
String canonicalParams = canonicalize(params); | |
queryString = (canonicalParams.isEmpty() ? "" : "?" + canonicalParams); | |
/* Add starting slash if needed */ | |
if (path.length() == 0) { | |
path = "/" + path; | |
} | |
/* Drop default port: example.com:80 -> example.com */ | |
int port = canonicalURI.getPort(); | |
if (port == 80) { | |
port = -1; | |
} | |
/* Lowercasing protocol and host */ | |
String protocol = canonicalURI.getScheme().toLowerCase(); | |
String host = canonicalURI.getHost().toLowerCase(); | |
String pathAndQueryString = normalizePath(path) + queryString; | |
URL result = new URL(protocol, host, port, pathAndQueryString); | |
return result.toExternalForm(); | |
} catch (MalformedURLException ex) { | |
log.warn("Error while Processing URL in the spidering process (on base " + baseURL + "): " | |
+ ex.getMessage()); | |
return null; | |
} catch (URISyntaxException ex) { | |
log.warn("Error while Processing URI in the spidering process (on base " + baseURL + "): " | |
+ ex.getMessage()); | |
return null; | |
} | |
} | |
/** | |
* Builds a String representation of the URI with cleaned parameters, that can be used when checking if an | |
* URI was already visited. The URI provided as a parameter should be already cleaned and canonicalized, | |
* so it should be build with a result from {@link #getCanonicalURL(String)}. | |
* | |
* <p> | |
* When building the URI representation, the same format should be used for all the cases, as it may | |
* affect the number of times the pages are visited and reported if the option HandleParametersOption is | |
* changed while the spider is running. | |
* </p> | |
* | |
* @param uri the uri | |
* @param handleParameters the handle parameters option | |
* @return the string representation of the URI | |
* @throws URIException the URI exception | |
*/ | |
public static String buildCleanedParametersURIRepresentation(org.apache.commons.httpclient.URI uri, | |
SpiderParam.HandleParametersOption handleParameters) throws URIException { | |
// If the option is set to use all the information, just use the default string representation | |
if (handleParameters.equals(HandleParametersOption.USE_ALL)) { | |
return uri.toString(); | |
} | |
// If the option is set to ignore parameters completely, ignore the query completely | |
if (handleParameters.equals(HandleParametersOption.IGNORE_COMPLETELY)) { | |
StringBuilder retVal = new StringBuilder(); | |
retVal.append(uri.getScheme()).append("://").append(uri.getHost()); | |
if (uri.getPort() != -1) { | |
retVal.append(':').append(uri.getPort()); | |
} | |
if (uri.getPath() != null) { | |
retVal.append(uri.getPath()); | |
} | |
return retVal.toString(); | |
} | |
// If the option is set to ignore the value, we get the parameters and we only add their name to the | |
// query | |
if (handleParameters.equals(HandleParametersOption.IGNORE_VALUE)) { | |
StringBuilder retVal = new StringBuilder(); | |
retVal.append(uri.getScheme()).append("://").append(uri.getHost()); | |
if (uri.getPort() != -1) { | |
retVal.append(':').append(uri.getPort()); | |
} | |
if (uri.getPath() != null) { | |
retVal.append(uri.getPath()); | |
} | |
// Get the parameters' names | |
SortedMap<String, String> params = createParameterMap(uri.getQuery()); | |
StringBuilder sb = new StringBuilder(); | |
if (params != null && !params.isEmpty()) { | |
for (String key : params.keySet()) { | |
// Ignore irrelevant parameters | |
if (IRRELEVANT_PARAMETERS.contains(key) || key.startsWith("utm_")) { | |
continue; | |
} | |
if (sb.length() > 0) { | |
sb.append('&'); | |
} | |
sb.append(key); | |
} | |
} | |
// Add the parameters' names to the uri representation. | |
if(sb.length()>0) { | |
retVal.append("?").append(sb); | |
} | |
return retVal.toString(); | |
} | |
// Should not be reached | |
return uri.toString(); | |
} | |
/** | |
* Takes a query string, separates the constituent name-value pairs, and stores them in a SortedMap | |
* ordered by lexicographical order. | |
* | |
* @param queryString the query string | |
* @return Null if there is no query string. | |
*/ | |
private static SortedMap<String, String> createParameterMap(final String queryString) { | |
if (queryString == null || queryString.isEmpty()) { | |
return null; | |
} | |
final String[] pairs = queryString.split("&"); | |
final SortedMap<String, String> params = new TreeMap<>(); | |
for (final String pair : pairs) { | |
if (pair.length() == 0) { | |
continue; | |
} | |
String[] tokens = pair.split("=", 2); | |
switch (tokens.length) { | |
case 1: | |
if (pair.charAt(0) == '=') { | |
params.put("", tokens[0]); | |
} else { | |
params.put(tokens[0], ""); | |
} | |
break; | |
case 2: | |
params.put(tokens[0], tokens[1]); | |
break; | |
} | |
} | |
return params; | |
} | |
/** | |
* Canonicalize the query string. | |
* | |
* @param sortedParamMap Parameter name-value pairs in lexicographical order. | |
* @return Canonical form of query string. | |
*/ | |
private static String canonicalize(final SortedMap<String, String> sortedParamMap) { | |
if (sortedParamMap == null || sortedParamMap.isEmpty()) { | |
return ""; | |
} | |
final StringBuilder sb = new StringBuilder(100); | |
for (Map.Entry<String, String> pair : sortedParamMap.entrySet()) { | |
final String key = pair.getKey().toLowerCase(); | |
// Ignore irrelevant parameters | |
if (IRRELEVANT_PARAMETERS.contains(key) || key.startsWith("utm_")) { | |
continue; | |
} | |
if (sb.length() > 0) { | |
sb.append('&'); | |
} | |
sb.append(percentEncodeRfc3986(pair.getKey())); | |
if (!pair.getValue().isEmpty()) { | |
sb.append('='); | |
sb.append(percentEncodeRfc3986(pair.getValue())); | |
} | |
} | |
return sb.toString(); | |
} | |
/** | |
* Percent-encode values according the RFC 3986. The built-in Java URLEncoder does not encode according to | |
* the RFC, so we make the extra replacements. | |
* | |
* @param string Decoded string. | |
* @return Encoded string per RFC 3986. | |
*/ | |
private static String percentEncodeRfc3986(String string) { | |
try { | |
string = string.replace("+", "%2B"); | |
string = URLDecoder.decode(string, "UTF-8"); | |
string = URLEncoder.encode(string, "UTF-8"); | |
return string.replace("+", "%20").replace("*", "%2A").replace("%7E", "~"); | |
} catch (Exception e) { | |
return string; | |
} | |
} | |
/** | |
* Normalize path. | |
* | |
* @param path the path | |
* @return the string | |
*/ | |
private static String normalizePath(final String path) { | |
return path.replace("%7E", "~").replace(" ", "%20"); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * This class is adopted from Htmlunit with the following copyright:
 *
 * Copyright (c) 2002-2012 Gargoyle Software Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * ZAP: Based on work by Gargoyle Software and adopted from Htmlunit (as stated above).
 */
package org.zaproxy.zap.spider; | |
/**
 * The URLResolver handles the process of resolving a relative URL against a base URL (context), following
 * the algorithm of <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a> with a few browser-like
 * leniencies (documented where they are applied).
 */
public final class URLResolver {

    /**
     * Private constructor to avoid initialization of object.
     */
    private URLResolver() {
    }

    /**
     * Resolves a given relative URL against a base URL. See <a
     * href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a> Section 4 for more details. Both arguments
     * are trimmed before being used.
     *
     * @param baseUrl The base URL in which to resolve the specification.
     * @param relativeUrl The relative URL to resolve against the base URL.
     * @return the resolved specification.
     * @throws IllegalArgumentException if any of the arguments is {@code null}
     */
    public static String resolveUrl(final String baseUrl, final String relativeUrl) {
        // Parameter check
        if (baseUrl == null) {
            throw new IllegalArgumentException("Base URL must not be null");
        }
        if (relativeUrl == null) {
            throw new IllegalArgumentException("Relative URL must not be null");
        }
        final Url url = resolveUrl(parseUrl(baseUrl.trim()), relativeUrl.trim());
        return url.toString();
    }

    /**
     * Returns the index within the specified string of the first occurrence of the specified search
     * character.
     *
     * @param s the string to search
     * @param searchChar the character to search for
     * @param beginIndex the index at which to start the search (inclusive)
     * @param endIndex the index at which to stop the search (exclusive)
     * @return the index of the first occurrence of the character in the string or {@code -1}
     */
    private static int indexOf(final String s, final char searchChar, final int beginIndex, final int endIndex) {
        for (int i = beginIndex; i < endIndex; i++) {
            if (s.charAt(i) == searchChar) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Parses a given specification using the algorithm depicted in <a
     * href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>:
     *
     * <p>
     * Section 2.4: Parsing a URL
     * </p>
     *
     * An accepted method for parsing URLs is useful to clarify the generic-RL syntax of Section 2.2
     * and to describe the algorithm for resolving relative URLs presented in Section 4. This
     * section describes the parsing rules for breaking down a URL (relative or absolute) into the
     * component parts described in Section 2.1. The rules assume that the URL has already been
     * separated from any surrounding text and copied to a "parse string". The rules are listed in
     * the order in which they would be applied by the parser.
     *
     * <p>
     * Implementation note: instead of cutting the parse string, the window {@code [startIndex, endIndex)}
     * over {@code spec} is narrowed after each step.
     * </p>
     *
     * @param spec The specification to parse.
     * @return the parsed specification; components that are not present are left {@code null}.
     */
    private static Url parseUrl(final String spec) {
        final Url url = new Url();
        int startIndex = 0;
        int endIndex = spec.length();

        // Section 2.4.1: Parsing the Fragment Identifier
        //
        // If the parse string contains a crosshatch "#" character, then the
        // substring after the first (left-most) crosshatch "#" and up to the
        // end of the parse string is the <fragment> identifier. If the
        // crosshatch is the last character, or no crosshatch is present, then
        // the fragment identifier is empty. The matched substring, including
        // the crosshatch character, is removed from the parse string before
        // continuing.
        //
        // Note that the fragment identifier is not considered part of the URL.
        // However, since it is often attached to the URL, parsers must be able
        // to recognize and set aside fragment identifiers as part of the
        // process.
        final int crosshatchIndex = indexOf(spec, '#', startIndex, endIndex);
        if (crosshatchIndex >= 0) {
            url.fragment_ = spec.substring(crosshatchIndex + 1, endIndex);
            endIndex = crosshatchIndex;
        }

        // Section 2.4.2: Parsing the Scheme
        //
        // If the parse string contains a colon ":" after the first character
        // and before any characters not allowed as part of a scheme name (i.e.,
        // any not an alphanumeric, plus "+", period ".", or hyphen "-"), the
        // <scheme> of the URL is the substring of characters up to but not
        // including the first colon. These characters and the colon are then
        // removed from the parse string before continuing.
        final int colonIndex = indexOf(spec, ':', startIndex, endIndex);
        if (colonIndex > 0) {
            final String scheme = spec.substring(startIndex, colonIndex);
            if (isValidScheme(scheme)) {
                url.scheme_ = scheme;
                startIndex = colonIndex + 1;
            }
        }

        // Section 2.4.3: Parsing the Network Location/Login
        //
        // If the parse string begins with a double-slash "//", then the
        // substring of characters after the double-slash and up to, but not
        // including, the next slash "/" character is the network location/login
        // (<net_loc>) of the URL. If no trailing slash "/" is present, the
        // entire remaining parse string is assigned to <net_loc>. The double-
        // slash and <net_loc> are removed from the parse string before
        // continuing.
        //
        // Note: We also accept a question mark "?" or a semicolon ";" character as
        // delimiters for the network location/login (<net_loc>) of the URL.
        final int locationStartIndex;
        int locationEndIndex;
        if (spec.startsWith("//", startIndex)) {
            locationStartIndex = startIndex + 2;
            locationEndIndex = indexOf(spec, '/', locationStartIndex, endIndex);
            if (locationEndIndex >= 0) {
                startIndex = locationEndIndex;
            }
        } else {
            locationStartIndex = -1;
            locationEndIndex = -1;
        }

        // Section 2.4.4: Parsing the Query Information
        //
        // If the parse string contains a question mark "?" character, then the
        // substring after the first (left-most) question mark "?" and up to the
        // end of the parse string is the <query> information. If the question
        // mark is the last character, or no question mark is present, then the
        // query information is empty. The matched substring, including the
        // question mark character, is removed from the parse string before
        // continuing.
        final int questionMarkIndex = indexOf(spec, '?', startIndex, endIndex);
        if (questionMarkIndex >= 0) {
            if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
                // The substring of characters after the double-slash and up to, but not
                // including, the question mark "?" character is the network location/login
                // (<net_loc>) of the URL.
                locationEndIndex = questionMarkIndex;
                startIndex = questionMarkIndex;
            }
            url.query_ = spec.substring(questionMarkIndex + 1, endIndex);
            endIndex = questionMarkIndex;
        }

        // Section 2.4.5: Parsing the Parameters
        //
        // If the parse string contains a semicolon ";" character, then the
        // substring after the first (left-most) semicolon ";" and up to the end
        // of the parse string is the parameters (<params>). If the semicolon
        // is the last character, or no semicolon is present, then <params> is
        // empty. The matched substring, including the semicolon character, is
        // removed from the parse string before continuing.
        final int semicolonIndex = indexOf(spec, ';', startIndex, endIndex);
        if (semicolonIndex >= 0) {
            if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
                // The substring of characters after the double-slash and up to, but not
                // including, the semicolon ";" character is the network location/login
                // (<net_loc>) of the URL.
                locationEndIndex = semicolonIndex;
                startIndex = semicolonIndex;
            }
            url.parameters_ = spec.substring(semicolonIndex + 1, endIndex);
            endIndex = semicolonIndex;
        }

        // Section 2.4.6: Parsing the Path
        //
        // After the above steps, all that is left of the parse string is the URL <path> and the
        // slash "/" that may precede it. Even though the initial slash is not part of the URL path,
        // the parser must remember whether or not it was present so that later processes can
        // differentiate between relative and absolute paths. Often this is done by simply storing
        // the preceding slash along with the path.
        if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
            // The entire remaining parse string is assigned to the network location/login
            // (<net_loc>) of the URL.
            locationEndIndex = endIndex;
        } else if (startIndex < endIndex) {
            url.path_ = spec.substring(startIndex, endIndex);
        }

        // Set the network location/login (<net_loc>) of the URL.
        if ((locationStartIndex >= 0) && (locationEndIndex >= 0)) {
            url.location_ = spec.substring(locationStartIndex, locationEndIndex);
        }
        return url;
    }

    /**
     * Checks if it is a valid scheme: a letter followed by any number of letters, digits, {@code '.'},
     * {@code '+'} or {@code '-'}.
     *
     * @param scheme the scheme
     * @return true, if is a valid scheme name
     */
    private static boolean isValidScheme(final String scheme) {
        final int length = scheme.length();
        if (length <= 0) {
            return false;
        }
        char c = scheme.charAt(0);
        if (!Character.isLetter(c)) {
            return false;
        }
        for (int i = 1; i < length; i++) {
            c = scheme.charAt(i);
            if (!Character.isLetterOrDigit(c) && c != '.' && c != '+' && c != '-') {
                return false;
            }
        }
        return true;
    }

    /**
     * Resolves a given relative URL against a base URL using the algorithm depicted in <a
     * href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>:
     *
     * <p>
     * Section 4: Resolving Relative URLs
     * </p>
     *
     * This section describes an example algorithm for resolving URLs within a context in which the
     * URLs may be relative, such that the result is always a URL in absolute form. Although this
     * algorithm cannot guarantee that the resulting URL will equal that intended by the original
     * author, it does guarantee that any valid URL (relative or absolute) can be consistently
     * transformed to an absolute form given a valid base URL.
     *
     * @param baseUrl The base URL in which to resolve the specification.
     * @param relativeUrl The relative URL to resolve against the base URL.
     * @return the resolved specification.
     */
    private static Url resolveUrl(final Url baseUrl, final String relativeUrl) {
        final Url url = parseUrl(relativeUrl);

        // Step 1: The base URL is established according to the rules of
        // Section 3. If the base URL is the empty string (unknown),
        // the embedded URL is interpreted as an absolute URL and
        // we are done.
        if (baseUrl == null) {
            return url;
        }

        // Step 2: Both the base and embedded URLs are parsed into their component parts as
        // described in Section 2.4.
        // a) If the embedded URL is entirely empty, it inherits the
        // entire base URL (i.e., is set equal to the base URL) and we are done.
        if (relativeUrl.length() == 0) {
            return new Url(baseUrl);
        }
        // b) If the embedded URL starts with a scheme name, it is interpreted as an absolute URL
        // and we are done.
        if (url.scheme_ != null) {
            return url;
        }
        // c) Otherwise, the embedded URL inherits the scheme of the base URL.
        url.scheme_ = baseUrl.scheme_;

        // Step 3: If the embedded URL's <net_loc> is non-empty, we skip to Step 7. Otherwise, the
        // embedded URL inherits the <net_loc> (if any) of the base URL.
        if (url.location_ != null) {
            return url;
        }
        url.location_ = baseUrl.location_;

        // Step 4: If the embedded URL path is preceded by a slash "/", the
        // path is not relative and we skip to Step 7.
        if ((url.path_ != null) && ((url.path_.length() > 0) && ('/' == url.path_.charAt(0)))) {
            url.path_ = removeLeadingSlashPoints(url.path_);
            return url;
        }

        // Step 5: If the embedded URL path is empty (and not preceded by a
        // slash), then the embedded URL inherits the base URL path,
        // and
        if (url.path_ == null) {
            url.path_ = baseUrl.path_;
            // a) if the embedded URL's <params> is non-empty, we skip to
            // step 7; otherwise, it inherits the <params> of the base
            // URL (if any) and
            if (url.parameters_ != null) {
                return url;
            }
            url.parameters_ = baseUrl.parameters_;
            // b) if the embedded URL's <query> is non-empty, we skip to
            // step 7; otherwise, it inherits the <query> of the base
            // URL (if any) and we skip to step 7.
            if (url.query_ != null) {
                return url;
            }
            url.query_ = baseUrl.query_;
            return url;
        }

        // Step 6: The last segment of the base URL's path (anything
        // following the rightmost slash "/", or the entire path if no
        // slash is present) is removed and the embedded URL's path is
        // appended in its place.
        final String basePath = baseUrl.path_;
        String path = "";
        if (basePath != null) {
            final int lastSlashIndex = basePath.lastIndexOf('/');
            if (lastSlashIndex >= 0) {
                path = basePath.substring(0, lastSlashIndex + 1);
            }
        } else {
            path = "/";
        }
        path = path.concat(url.path_);

        // The following operations are then applied, in order, to the new path:
        // a) All occurrences of "./", where "." is a complete path segment, are removed.
        int pathSegmentIndex;
        while ((pathSegmentIndex = path.indexOf("/./")) >= 0) {
            path = path.substring(0, pathSegmentIndex + 1).concat(path.substring(pathSegmentIndex + 3));
        }
        // b) If the path ends with "." as a complete path segment, that "." is removed.
        if (path.endsWith("/.")) {
            path = path.substring(0, path.length() - 1);
        }
        // c) All occurrences of "<segment>/../", where <segment> is a complete path segment not
        // equal to "..", are removed. Removal of these path segments is performed iteratively,
        // removing the leftmost matching pattern on each iteration, until no matching pattern
        // remains.
        path = removeParentSegments(path);
        // d) If the path ends with "<segment>/..", where <segment> is a complete path segment not
        // equal to "..", that "<segment>/.." is removed.
        if (path.endsWith("/..")) {
            final String pathSegment = path.substring(0, path.length() - 3);
            final int slashIndex = pathSegment.lastIndexOf('/');
            if (slashIndex >= 0) {
                path = path.substring(0, slashIndex + 1);
            }
        }
        path = removeLeadingSlashPoints(path);
        url.path_ = path;

        // Step 7: The resulting URL components, including any inherited from
        // the base URL, are recombined to give the absolute form of
        // the embedded URL.
        return url;
    }

    /**
     * Removes all occurrences of {@code "<segment>/../"} from the given path, where {@code <segment>} is a
     * complete path segment not equal to {@code ".."} (RFC1808, Section 4, Step 6c). The leftmost matching
     * pattern is removed on each iteration, until no matching pattern remains.
     *
     * <p>
     * An occurrence of {@code "/../"} that has nothing to remove (it is at the very beginning of the path,
     * or its preceding segment is itself {@code ".."}) is skipped instead of being looked at again, which
     * guarantees that the method terminates for every input, including paths that do not start with a
     * slash.
     * </p>
     *
     * @param path the path to clean
     * @return the path without removable {@code "<segment>/../"} patterns
     */
    private static String removeParentSegments(String path) {
        int searchFrom = 0;
        int parentIndex;
        while ((parentIndex = path.indexOf("/../", searchFrom)) >= 0) {
            // The segment starts after the previous slash, or at the beginning if there is none.
            final int segmentStart = path.lastIndexOf('/', parentIndex - 1) + 1;
            final String segment = path.substring(segmentStart, parentIndex);
            if (parentIndex > 0 && !"..".equals(segment)) {
                path = path.substring(0, segmentStart).concat(path.substring(parentIndex + 4));
                // The removal may have created a new match further left, so start over. The path
                // got shorter, hence this can only happen a finite number of times.
                searchFrom = 0;
            } else {
                // Nothing to remove here. Continue at the trailing slash of this occurrence, as it
                // may be the leading slash of the next one ("/../../").
                searchFrom = parentIndex + 3;
            }
        }
        return path;
    }

    /**
     * "/.." at the beginning should be removed as browsers do (not in RFC). Only complete {@code ".."}
     * segments are removed, a segment that merely starts with two dots (e.g. {@code "/..foo"}) is kept.
     *
     * @param path the path
     * @return the cleaned string defining the path.
     */
    private static String removeLeadingSlashPoints(String path) {
        while (path.equals("/..") || path.startsWith("/../")) {
            path = path.substring(3);
        }
        return path;
    }

    /**
     * Class {@code Url} represents a Uniform Resource Locator, split into its components. A component that
     * is not present is {@code null}.
     *
     * @author Martin Tamme
     */
    private static class Url {

        /** The scheme, without the trailing colon. */
        private String scheme_;

        /** The network location/login, without the leading double-slash. */
        private String location_;

        /** The path, including its leading slash (if any). */
        private String path_;

        /** The parameters, without the leading semicolon. */
        private String parameters_;

        /** The query, without the leading question mark. */
        private String query_;

        /** The fragment, without the leading crosshatch. */
        private String fragment_;

        /**
         * Creates an {@code Url} object.
         */
        public Url() {
        }

        /**
         * Creates an {@code Url} object from the specified {@code Url} object.
         *
         * @param url an {@code Url} object.
         */
        public Url(final Url url) {
            scheme_ = url.scheme_;
            location_ = url.location_;
            path_ = url.path_;
            parameters_ = url.parameters_;
            query_ = url.query_;
            fragment_ = url.fragment_;
        }

        /**
         * Returns a string representation of the {@code Url} object, built from the components that are
         * present, each preceded (or followed, for the scheme) by its delimiter.
         *
         * @return a string representation of the {@code Url} object.
         */
        @Override
        public String toString() {
            final StringBuilder sb = new StringBuilder();
            if (scheme_ != null) {
                sb.append(scheme_);
                sb.append(':');
            }
            if (location_ != null) {
                sb.append("//");
                sb.append(location_);
            }
            if (path_ != null) {
                sb.append(path_);
            }
            if (parameters_ != null) {
                sb.append(';');
                sb.append(parameters_);
            }
            if (query_ != null) {
                sb.append('?');
                sb.append(query_);
            }
            if (fragment_ != null) {
                sb.append('#');
                sb.append(fragment_);
            }
            return sb.toString();
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment