Skip to content

Instantly share code, notes, and snippets.

@agentgt
Created September 8, 2012 12:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save agentgt/3674391 to your computer and use it in GitHub Desktop.
Save agentgt/3674391 to your computer and use it in GitHub Desktop.
URIExtractor extracts URIs and URLs from a character stream.
package com.snaphop.util;
import static com.google.common.collect.Iterators.transform;
import java.io.IOException;
import java.io.Reader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import com.google.common.base.Function;
import com.google.common.collect.AbstractIterator;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
public class URIExtractor {
public static Iterator<URI> extractURIs(final Reader reader, String ... schemes) {
return transform(extractURIs(reader, DEFAULT_STRATEGY_CHAIN, schemes), new Function<ExtractedURI, URI>() {
@Override
public URI apply(ExtractedURI input) {
return input.getUri();
}
});
}
public static Iterator<ExtractedURI> extractURIs(
final Reader reader,
final Iterable<ToURIStrategy> strategies,
String ... schemes) {
final Set<String> schemeSet = ImmutableSet.copyOf(schemes);
return new AbstractIterator<ExtractedURI>() {
private long position = 0;
@Override
protected ExtractedURI computeNext() {
try {
return doNextURI();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
protected ExtractedURI doNextURI() throws IOException {
StringBuilder b = new StringBuilder();
int current;
ExtractedURI uri = null;
while ( (current = reader.read()) != -1) {
if (isWhitespace(current) && b.length() > 2) {
uri = tryForURI(b);
b.setLength(0);
if (uri != null) break;
}
else if (isWhitespace (current)) {
b.setLength(0);
}
else if (b.length() > 2000) {
b.setLength(0);
}
else {
b.appendCodePoint(current);
}
position++;
}
//Handle the case if the URI is at the end.
if (uri == null && b.length() > 2) {
uri = tryForURI(b);
}
if (uri != null) return uri;
return endOfData();
}
private ExtractedURI tryForURI(StringBuilder b) {
String potentialUri = b.toString();
URI uri;
if ( (uri = doURI(potentialUri)) != null)
return new ExtractedURI(uri, this.position, potentialUri);
else return null;
}
private URI doURI(String potentialUri) {
URI uri = toURI(potentialUri);
for (ToURIStrategy s : strategies) {
uri = s.convert(potentialUri);
if (uri != null) break;
}
if (uri == null) return null;
if ( ! schemeSet.contains(uri.getScheme()) ) return null;
return uri;
}
};
}
public static class ExtractedURI {
private final URI uri;
private final long position;
private final String preprocessed;
public ExtractedURI(URI uri, long position, String preprocessed) {
super();
this.uri = uri;
this.position = position;
this.preprocessed = preprocessed;
}
public URI getUri() {
return uri;
}
public long getPosition() {
return position;
}
public String getPreprocessed() {
return preprocessed;
}
}
public static ToURIStrategy DEFAULT_STRATEGY = new ToURIStrategy() {
@Override
public URI convert(String potentialUri) {
return toURI(potentialUri);
}
};
public static ToURIStrategy REMOVE_LAST_STRATEGY = new ToURIStrategy() {
@Override
public URI convert(String potentialUri) {
if (potentialUri.length() > 1) {
return toURI(potentialUri.substring(0, potentialUri.length() - 1));
}
return null;
}
};
public static List<ToURIStrategy> DEFAULT_STRATEGY_CHAIN = ImmutableList.of(
new RemoveSurroundsWithToURIStrategy("'"),
new RemoveSurroundsWithToURIStrategy("\""),
new RemoveSurroundsWithToURIStrategy("(", ")"),
new RemoveEndsWithToURIStrategy("."),
DEFAULT_STRATEGY,
REMOVE_LAST_STRATEGY);
public static class RemoveEndsWithToURIStrategy implements ToURIStrategy {
private final String endsWith;
public RemoveEndsWithToURIStrategy(String endsWith) {
super();
this.endsWith = endsWith;
}
@Override
public URI convert(String potentialUri) {
URI u = null;
if (potentialUri.endsWith(endsWith)) {
String t = potentialUri.substring(0,potentialUri.length() - endsWith.length());
u = toURI(t);
}
return u;
}
}
//private static List<ToURIStrategy> strategies;
public static class RemoveSurroundsWithToURIStrategy implements ToURIStrategy {
private final String startsWith;
private final String endsWith;
public RemoveSurroundsWithToURIStrategy(String surroundWith) {
super();
this.startsWith = surroundWith;
this.endsWith = surroundWith;
}
public RemoveSurroundsWithToURIStrategy(String startsWith, String endsWith) {
super();
this.startsWith = startsWith;
this.endsWith = endsWith;
}
@Override
public URI convert(String potentialUri) {
URI u = null;
if (potentialUri.startsWith(startsWith) && potentialUri.endsWith(endsWith)) {
String t = potentialUri.substring(startsWith.length(), potentialUri.length() - endsWith.length());
u = toURI(t);
}
return u;
}
}
public static interface ToURIStrategy {
public URI convert(String potentialUri);
}
private static URI toURI(String potentialUri) {
try {
return new URI(potentialUri);
} catch (URISyntaxException e) {
return null;
}
}
//hmm what is whitespace https://spreadsheets.google.com/a/evocatus.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ
public static boolean isWhitespace(int c) {
return Character.isWhitespace(c);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment