Skip to content

Instantly share code, notes, and snippets.

@juliano
Created July 31, 2012 23:43
Show Gist options
  • Save juliano/3221751 to your computer and use it in GitHub Desktop.
Save juliano/3221751 to your computer and use it in GitHub Desktop.
Spider
package br.com.spider;
import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Scanner;
import java.util.Set;
/**
 * Downloads a page over a {@link URLConnection} and returns the links that
 * {@link LinksExtractor} finds in its text.
 */
public class ContentReader {

    /**
     * Fetches the document at {@code baseUrl} and extracts its links.
     *
     * @param baseUrl absolute URL of the page to download
     * @return whatever {@code LinksExtractor.getLinks} returns for the page text; an empty
     *         response body is passed to the extractor as the empty string
     * @throws RuntimeException wrapping the {@link IOException} raised when the URL is malformed,
     *         the connection cannot be opened, or reading the body fails part-way
     */
    public Set<String> linksFor(final String baseUrl) {
        String content;
        try {
            URL url = new URL(baseUrl);
            URLConnection conn = url.openConnection();
            // NOTE(review): the body is decoded with the platform default charset, as before;
            // confirm whether the response's declared charset should be honoured instead.
            Scanner scanner = new Scanner(conn.getInputStream());
            try {
                // "\\A" only matches at the start of input, so the single token is the whole body.
                scanner.useDelimiter("\\A");
                // An empty body has no token at all; next() would throw NoSuchElementException.
                content = scanner.hasNext() ? scanner.next() : "";
                // Scanner swallows read errors and just stops; surface them instead of
                // silently working with a truncated page.
                IOException readFailure = scanner.ioException();
                if (readFailure != null) {
                    throw readFailure;
                }
            } finally {
                // Closing the scanner also closes the underlying connection stream.
                scanner.close();
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return new LinksExtractor().getLinks(content);
    }
}
package br.com.spider;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts site-local {@code .html} links from the anchor tags of an HTML document.
 */
public class LinksExtractor {

    /**
     * Matches a single {@code <a ...>} opening tag and captures the value of its
     * double-quoted {@code href} attribute.
     * <ul>
     * <li>{@code a\s+} requires whitespace right after the tag name, so tags that merely start
     * with "a" ({@code <abbr>}, {@code <area>}) are not treated as anchors;</li>
     * <li>{@code [^>]} keeps the match inside one tag, so the {@code href} of a later tag is
     * never attributed to an earlier one;</li>
     * <li>{@code (?:[^>]*?\s)?} requires whitespace before {@code href}, so attributes such as
     * {@code data-href} are ignored.</li>
     * </ul>
     * Compiled once because {@link Pattern} is immutable and safe to share between threads.
     */
    private static final Pattern ANCHOR_HREF = Pattern.compile(
            "<\\s*a\\s+(?:[^>]*?\\s)?href=\"([^\"]*)\"[^>]*>", Pattern.CASE_INSENSITIVE);

    /**
     * Returns the distinct {@code href} values of the anchors in {@code content} that end with
     * {@code .html} and do not start with {@code http}.
     *
     * @param content HTML text to scan; {@code null} is treated as an empty document
     * @return a new mutable set of matching links, empty when there are none
     */
    public Set<String> getLinks(final String content) {
        Set<String> links = new HashSet<String>();
        if (content == null) {
            return links;
        }
        Matcher matcher = ANCHOR_HREF.matcher(content);
        while (matcher.find()) {
            String href = matcher.group(1);
            if (isLocalHtml(href)) {
                links.add(href);
            }
        }
        return links;
    }

    /** Tells whether the link ends with {@code .html} and does not start with {@code http}. */
    private static boolean isLocalHtml(final String link) {
        return link.endsWith(".html") && !link.startsWith("http");
    }
}
package br.com.spider;
import static java.util.concurrent.TimeUnit.SECONDS;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Entry point of the crawler: submits the first {@link SpiderRunnable} for a base URL to a
 * six-thread pool and waits until the shared in-flight counter drops back to zero.
 */
public class Spider {

    private final String baseUrl;

    /**
     * Counter shared with every {@link SpiderRunnable} created for this crawl; {@link #go()}
     * treats a value of zero as "crawl finished".
     */
    public final AtomicInteger executorCounter;

    private final ThreadPoolExecutor executor;

    /**
     * Creates a spider with a fixed pool of six threads and an unbounded work queue.
     *
     * @param baseUrl URL the crawl starts from
     */
    public Spider(final String baseUrl) {
        this.baseUrl = baseUrl;
        executorCounter = new AtomicInteger(0);
        executor = new ThreadPoolExecutor(6, 6, 0, SECONDS, new LinkedBlockingDeque<Runnable>(Integer.MAX_VALUE));
    }

    /**
     * Starts the crawl and blocks, polling once per second, until the counter reaches zero.
     * The executor is shut down on every exit path.
     *
     * @throws RuntimeException if the calling thread is interrupted while waiting
     */
    public void go() {
        try {
            // The SpiderRunnable constructor increments the counter, so it is already
            // non-zero by the time the polling loop first reads it.
            executor.execute(new SpiderRunnable(baseUrl, new Visitor(), executor, executorCounter));
            while (executorCounter.get() != 0) {
                System.out.println("Executores rodando: " + executorCounter.get());
                sleep();
            }
        } finally {
            executor.shutdown();
        }
    }

    /** Pauses the calling thread for one second between polls. */
    private void sleep() {
        try {
            Thread.sleep(1000);
        } catch (InterruptedException e) {
            // Catching InterruptedException clears the interrupt flag; restore it so code
            // further up the stack can still observe that the thread was interrupted.
            Thread.currentThread().interrupt();
            throw new RuntimeException("main thread died. ", e);
        }
    }

    /** Crawls a fixed site and prints the elapsed time in milliseconds. */
    public static void main(final String[] args) {
        long initialTime = System.currentTimeMillis();
        new Spider("http://www.javaperformance.com.br").go();
        System.out.println("Executado em: " + (System.currentTimeMillis() - initialTime));
    }
}
package br.com.spider;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * One unit of crawl work: visits a single page, reads its links, and schedules a new
 * {@code SpiderRunnable} on the same executor for every link not seen before.
 *
 * <p>Each instance increments the shared counter when it is constructed and decrements it
 * when {@link #run()} ends, so the counter reflects tasks that are queued or running.
 */
public class SpiderRunnable implements Runnable {

    private final String baseUrl;
    private final String localLink;
    private final Visitor visitor;
    private final ContentReader reader;
    private final ThreadPoolExecutor executor;
    private final AtomicInteger executorCounter;

    /**
     * Creates the task for {@code baseUrl + link} and registers it in the shared counter.
     *
     * @param baseUrl         prefix prepended to every link
     * @param link            link to crawl, concatenated to {@code baseUrl} as-is
     * @param visitor         shared record of visited URLs
     * @param executor        pool on which follow-up tasks are scheduled
     * @param executorCounter shared count of queued or running tasks
     */
    private SpiderRunnable(final String baseUrl, final String link, final Visitor visitor,
            final ThreadPoolExecutor executor, final AtomicInteger executorCounter) {
        this.baseUrl = baseUrl;
        this.localLink = baseUrl + link;
        this.visitor = visitor;
        this.executor = executor;
        this.executorCounter = executorCounter;
        this.reader = new ContentReader();
        // Counted at construction time so the task is already accounted for while it is
        // still waiting in the executor's queue.
        executorCounter.incrementAndGet();
    }

    /**
     * Creates the initial task of a crawl, targeting {@code url + "/"}.
     *
     * @param url             base URL of the site
     * @param visitor         shared record of visited URLs
     * @param executor        pool on which follow-up tasks are scheduled
     * @param executorCounter shared count of queued or running tasks
     */
    public SpiderRunnable(final String url, final Visitor visitor, final ThreadPoolExecutor executor,
            final AtomicInteger executorCounter) {
        this(url, "/", visitor, executor, executorCounter);
    }

    /**
     * Visits this task's page and schedules a task for each link the visitor has not seen.
     * The counter is decremented even when the page cannot be read, so a single failed
     * download cannot leave the counter above zero forever.
     */
    @Override
    public void run() {
        try {
            visitor.visit(localLink);
            for (String link : reader.linksFor(localLink)) {
                if (!visitor.visited(baseUrl + link)) {
                    executor.execute(new SpiderRunnable(baseUrl, link, visitor, executor, executorCounter));
                }
            }
        } finally {
            executorCounter.decrementAndGet();
        }
    }
}
package br.com.spider;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Records which URLs have already been claimed for crawling. A single instance is shared by
 * all crawler tasks, so the bookkeeping relies on an atomic map operation.
 */
public class Visitor {

    // Declared with the concrete type so the atomic putIfAbsent is available on the field.
    private final ConcurrentHashMap<String, Boolean> visitedLinks;

    /** Creates a visitor with no URLs recorded. */
    public Visitor() {
        visitedLinks = new ConcurrentHashMap<String, Boolean>();
    }

    /**
     * Reports a page visit on standard output. Does not record the URL as visited.
     *
     * @param url URL being visited
     */
    public void visit(final String url) {
        System.out.println("visiting " + url);
    }

    /**
     * Checks whether {@code url} was seen before and records it if it was not, as one atomic
     * step: when several threads ask about the same new URL at the same time, exactly one of
     * them gets {@code false}.
     *
     * @param url URL to check and record
     * @return {@code false} the first time a URL is passed in, {@code true} on every later call
     */
    public boolean visited(final String url) {
        // putIfAbsent returns the previous mapping, which is null only for the first caller.
        return visitedLinks.putIfAbsent(url, Boolean.TRUE) != null;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment