Created
July 31, 2012 23:43
-
-
Save juliano/3221751 to your computer and use it in GitHub Desktop.
Spider
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package br.com.spider; | |
import java.io.IOException; | |
import java.net.URL; | |
import java.net.URLConnection; | |
import java.util.Scanner; | |
import java.util.Set; | |
public class ContentReader { | |
public Set<String> linksFor(final String baseUrl) { | |
String content = null; | |
try { | |
URL url = new URL(baseUrl); | |
URLConnection conn = url.openConnection(); | |
content = new Scanner(conn.getInputStream()).useDelimiter("$$").next(); | |
} catch (IOException e) { | |
throw new RuntimeException(e); | |
} | |
return new LinksExtractor().getLinks(content); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package br.com.spider; | |
import java.util.ArrayList; | |
import java.util.HashSet; | |
import java.util.List; | |
import java.util.Set; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/**
 * Extracts site-local HTML links from the anchor tags of a page.
 */
public class LinksExtractor {

    /**
     * Matches an {@code <a ...>} opening tag and captures the value of its
     * double-quoted {@code href} attribute in group 1.
     *
     * <p>{@code \s+} after the tag name keeps tags such as {@code <abbr>} or
     * {@code <area>} from matching, and {@code [^>]} keeps the match inside a single
     * tag so an {@code href} belonging to a later tag is never picked up.
     * Compiled once because {@link Pattern} instances are immutable and reusable.
     */
    private static final Pattern ANCHOR_HREF =
            Pattern.compile("(?i)<\\s*a\\s+(?:[^>]*?\\s)?href=\"([^\"]*)\"[^>]*>");

    /**
     * Returns the distinct {@code href} values of all anchor tags in {@code content}
     * that end with {@code .html} and do not start with {@code http}.
     *
     * @param content page markup to scan; {@code null} is treated as an empty page
     * @return a new mutable set with the matching links, empty when there are none
     */
    public Set<String> getLinks(final String content) {
        Set<String> links = new HashSet<String>();
        if (content == null) {
            return links;
        }
        Matcher matcher = ANCHOR_HREF.matcher(content);
        while (matcher.find()) {
            String link = matcher.group(1);
            if (isLocalHtml(link)) {
                links.add(link);
            }
        }
        return links;
    }

    /** Tells whether the link points at an {@code .html} page and is not an absolute http(s) URL. */
    private static boolean isLocalHtml(final String link) {
        return link.endsWith(".html") && !link.startsWith("http");
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package br.com.spider; | |
import static java.util.concurrent.TimeUnit.SECONDS; | |
import java.util.concurrent.LinkedBlockingDeque; | |
import java.util.concurrent.ThreadPoolExecutor; | |
import java.util.concurrent.atomic.AtomicInteger; | |
public class Spider { | |
private final String baseUrl; | |
public final AtomicInteger executorCounter; | |
private final ThreadPoolExecutor executor; | |
public Spider(final String baseUrl) { | |
this.baseUrl = baseUrl; | |
executorCounter = new AtomicInteger(0); | |
executor = new ThreadPoolExecutor(6, 6, 0, SECONDS, new LinkedBlockingDeque<Runnable>(Integer.MAX_VALUE)); | |
} | |
public void go() { | |
try { | |
executor.execute(new SpiderRunnable(baseUrl, new Visitor(), executor, executorCounter)); | |
while (executorCounter.get() != 0) { | |
System.out.println("Executores rodando: " + executorCounter.get()); | |
sleep(); | |
} | |
} finally { | |
executor.shutdown(); | |
} | |
} | |
private void sleep() { | |
try { | |
Thread.sleep(1000); | |
} catch (InterruptedException e) { | |
throw new RuntimeException("main thread died. ", e); | |
} | |
} | |
public static void main(final String[] args) { | |
long initialTime = System.currentTimeMillis(); | |
new Spider("http://www.javaperformance.com.br").go(); | |
System.out.println("Executado em: " + (System.currentTimeMillis() - initialTime)); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package br.com.spider; | |
import java.util.concurrent.ThreadPoolExecutor; | |
import java.util.concurrent.atomic.AtomicInteger; | |
public class SpiderRunnable implements Runnable { | |
private final String baseUrl; | |
private final String localLink; | |
private final Visitor visitor; | |
private final ContentReader reader; | |
private final ThreadPoolExecutor executor; | |
private final AtomicInteger executorCounter; | |
private SpiderRunnable(final String baseUrl, final String link, final Visitor visitor, | |
final ThreadPoolExecutor executor, final AtomicInteger executorCounter) { | |
this.baseUrl = baseUrl; | |
this.localLink = baseUrl + link; | |
this.visitor = visitor; | |
this.executor = executor; | |
this.executorCounter = executorCounter; | |
this.reader = new ContentReader(); | |
executorCounter.incrementAndGet(); | |
} | |
public SpiderRunnable(final String url, final Visitor visitor, final ThreadPoolExecutor executor, | |
final AtomicInteger executorCounter) { | |
this(url, "/", visitor, executor, executorCounter); | |
} | |
public void run() { | |
visitor.visit(localLink); | |
for (String link : reader.linksFor(localLink)) { | |
if (!visitor.visited(baseUrl + link)) { | |
executor.execute(new SpiderRunnable(baseUrl, link, visitor, executor, executorCounter)); | |
} | |
} | |
executorCounter.decrementAndGet(); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package br.com.spider; | |
import java.util.Map; | |
import java.util.concurrent.ConcurrentHashMap; | |
/**
 * Registry of URLs already claimed by the crawler. Backed by a
 * {@link ConcurrentHashMap} so it can be shared between crawl tasks.
 */
public class Visitor {

    // Declared with the concrete type so the atomic putIfAbsent is available.
    private final ConcurrentHashMap<String, Boolean> visitedLinks;

    public Visitor() {
        visitedLinks = new ConcurrentHashMap<String, Boolean>();
    }

    /**
     * Reports that a page is being processed by printing its URL.
     * This does not mark the URL as visited; only {@link #visited(String)} does.
     *
     * @param url URL of the page being processed
     */
    public void visit(final String url) {
        System.out.println("visiting " + url);
    }

    /**
     * Checks whether {@code url} was seen before and marks it as seen in a single
     * atomic step, so that of several concurrent callers asking about the same URL
     * exactly one receives {@code false}.
     *
     * @param url URL to check and register
     * @return {@code false} the first time a URL is passed in, {@code true} afterwards
     */
    public boolean visited(final String url) {
        // putIfAbsent returns the previous mapping, or null if this call created it.
        return visitedLinks.putIfAbsent(url, Boolean.TRUE) != null;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment