Created
September 8, 2012 12:25
-
-
Save agentgt/3674391 to your computer and use it in GitHub Desktop.
URIExtractor extracts URIs and URLs from a character stream.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.snaphop.util; | |
import static com.google.common.collect.Iterators.transform; | |
import java.io.IOException; | |
import java.io.Reader; | |
import java.net.URI; | |
import java.net.URISyntaxException; | |
import java.util.Iterator; | |
import java.util.List; | |
import java.util.Set; | |
import com.google.common.base.Function; | |
import com.google.common.collect.AbstractIterator; | |
import com.google.common.collect.ImmutableList; | |
import com.google.common.collect.ImmutableSet; | |
public class URIExtractor { | |
public static Iterator<URI> extractURIs(final Reader reader, String ... schemes) { | |
return transform(extractURIs(reader, DEFAULT_STRATEGY_CHAIN, schemes), new Function<ExtractedURI, URI>() { | |
@Override | |
public URI apply(ExtractedURI input) { | |
return input.getUri(); | |
} | |
}); | |
} | |
public static Iterator<ExtractedURI> extractURIs( | |
final Reader reader, | |
final Iterable<ToURIStrategy> strategies, | |
String ... schemes) { | |
final Set<String> schemeSet = ImmutableSet.copyOf(schemes); | |
return new AbstractIterator<ExtractedURI>() { | |
private long position = 0; | |
@Override | |
protected ExtractedURI computeNext() { | |
try { | |
return doNextURI(); | |
} catch (IOException e) { | |
throw new RuntimeException(e); | |
} | |
} | |
protected ExtractedURI doNextURI() throws IOException { | |
StringBuilder b = new StringBuilder(); | |
int current; | |
ExtractedURI uri = null; | |
while ( (current = reader.read()) != -1) { | |
if (isWhitespace(current) && b.length() > 2) { | |
uri = tryForURI(b); | |
b.setLength(0); | |
if (uri != null) break; | |
} | |
else if (isWhitespace (current)) { | |
b.setLength(0); | |
} | |
else if (b.length() > 2000) { | |
b.setLength(0); | |
} | |
else { | |
b.appendCodePoint(current); | |
} | |
position++; | |
} | |
//Handle the case if the URI is at the end. | |
if (uri == null && b.length() > 2) { | |
uri = tryForURI(b); | |
} | |
if (uri != null) return uri; | |
return endOfData(); | |
} | |
private ExtractedURI tryForURI(StringBuilder b) { | |
String potentialUri = b.toString(); | |
URI uri; | |
if ( (uri = doURI(potentialUri)) != null) | |
return new ExtractedURI(uri, this.position, potentialUri); | |
else return null; | |
} | |
private URI doURI(String potentialUri) { | |
URI uri = toURI(potentialUri); | |
for (ToURIStrategy s : strategies) { | |
uri = s.convert(potentialUri); | |
if (uri != null) break; | |
} | |
if (uri == null) return null; | |
if ( ! schemeSet.contains(uri.getScheme()) ) return null; | |
return uri; | |
} | |
}; | |
} | |
public static class ExtractedURI { | |
private final URI uri; | |
private final long position; | |
private final String preprocessed; | |
public ExtractedURI(URI uri, long position, String preprocessed) { | |
super(); | |
this.uri = uri; | |
this.position = position; | |
this.preprocessed = preprocessed; | |
} | |
public URI getUri() { | |
return uri; | |
} | |
public long getPosition() { | |
return position; | |
} | |
public String getPreprocessed() { | |
return preprocessed; | |
} | |
} | |
public static ToURIStrategy DEFAULT_STRATEGY = new ToURIStrategy() { | |
@Override | |
public URI convert(String potentialUri) { | |
return toURI(potentialUri); | |
} | |
}; | |
public static ToURIStrategy REMOVE_LAST_STRATEGY = new ToURIStrategy() { | |
@Override | |
public URI convert(String potentialUri) { | |
if (potentialUri.length() > 1) { | |
return toURI(potentialUri.substring(0, potentialUri.length() - 1)); | |
} | |
return null; | |
} | |
}; | |
public static List<ToURIStrategy> DEFAULT_STRATEGY_CHAIN = ImmutableList.of( | |
new RemoveSurroundsWithToURIStrategy("'"), | |
new RemoveSurroundsWithToURIStrategy("\""), | |
new RemoveSurroundsWithToURIStrategy("(", ")"), | |
new RemoveEndsWithToURIStrategy("."), | |
DEFAULT_STRATEGY, | |
REMOVE_LAST_STRATEGY); | |
public static class RemoveEndsWithToURIStrategy implements ToURIStrategy { | |
private final String endsWith; | |
public RemoveEndsWithToURIStrategy(String endsWith) { | |
super(); | |
this.endsWith = endsWith; | |
} | |
@Override | |
public URI convert(String potentialUri) { | |
URI u = null; | |
if (potentialUri.endsWith(endsWith)) { | |
String t = potentialUri.substring(0,potentialUri.length() - endsWith.length()); | |
u = toURI(t); | |
} | |
return u; | |
} | |
} | |
//private static List<ToURIStrategy> strategies; | |
public static class RemoveSurroundsWithToURIStrategy implements ToURIStrategy { | |
private final String startsWith; | |
private final String endsWith; | |
public RemoveSurroundsWithToURIStrategy(String surroundWith) { | |
super(); | |
this.startsWith = surroundWith; | |
this.endsWith = surroundWith; | |
} | |
public RemoveSurroundsWithToURIStrategy(String startsWith, String endsWith) { | |
super(); | |
this.startsWith = startsWith; | |
this.endsWith = endsWith; | |
} | |
@Override | |
public URI convert(String potentialUri) { | |
URI u = null; | |
if (potentialUri.startsWith(startsWith) && potentialUri.endsWith(endsWith)) { | |
String t = potentialUri.substring(startsWith.length(), potentialUri.length() - endsWith.length()); | |
u = toURI(t); | |
} | |
return u; | |
} | |
} | |
public static interface ToURIStrategy { | |
public URI convert(String potentialUri); | |
} | |
private static URI toURI(String potentialUri) { | |
try { | |
return new URI(potentialUri); | |
} catch (URISyntaxException e) { | |
return null; | |
} | |
} | |
//hmm what is whitespace https://spreadsheets.google.com/a/evocatus.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ | |
public static boolean isWhitespace(int c) { | |
return Character.isWhitespace(c); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment