@austinschwartz
Last active October 21, 2016 23:08
In-memory crawler: fetches pages under purdue.edu with jsoup and builds an inverted index (word → set of page IDs) held in concurrent maps.
import org.jsoup.Jsoup;
import org.jsoup.nodes.*;
import org.jsoup.select.Elements;
import java.io.*;
import java.util.*;
import java.util.concurrent.*;
import java.util.regex.*;

public class Crawler {
    public int urlID;                                      // next ID to hand out
    public ConcurrentHashMap<String, Set<Integer>> ii;     // inverted index: word -> IDs of pages containing it
    public ConcurrentHashMap<Integer, String> urls;        // ID -> URL
    public ConcurrentHashMap<String, Integer> urlToIndex;  // URL -> ID
    public Set<Integer> crawled;                           // IDs of pages already fetched
    public ConcurrentLinkedQueue<Integer> queue;           // IDs waiting to be fetched
    public Crawler() {
        urlID = 0;
        ii = new ConcurrentHashMap<>();
        urls = new ConcurrentHashMap<>();
        urlToIndex = new ConcurrentHashMap<>();
        queue = new ConcurrentLinkedQueue<>();
        crawled = Collections.newSetFromMap(new ConcurrentHashMap<>());
    }
    public boolean urlInDB(String url) {
        // crawled holds integer IDs, so map the URL to its ID before checking
        Integer id = urlToIndex.get(url);
        return id != null && crawled.contains(id);
    }
    public synchronized void addURL(String url) {
        // Skip URLs that already have an ID, and stay on purdue.edu over HTTP(S)
        if (urlToIndex.containsKey(url) || !url.contains("purdue.edu") || !url.startsWith("http"))
            return;
        int currID = urlID;
        urlID += 1;
        urls.put(currID, url);
        urlToIndex.put(url, currID);
        queue.offer(currID);
    }
    public void crawl() {
        while (!queue.isEmpty()) {
            Integer currID = queue.poll();
            if (currID == null)  // another worker drained the queue between the check and the poll
                break;
            String url = urls.get(currID);
            if (urlInDB(url))
                continue;
            try {
                crawl(url);
                crawled.add(currID);  // mark this page as done, not the global counter
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    public void crawl(String urlScanned) throws IOException {
        System.out.println(Thread.currentThread() + " " + crawled.size() + " - Crawling " + urlScanned);
        Document doc = Jsoup.connect(urlScanned).get();

        // Enqueue every absolute link found on the page
        Elements links = doc.select("a[href]");
        links.forEach((Element a) -> addURL(a.attr("abs:href")));

        // Index every purely alphabetic word in the page body
        String text = doc.body().text();
        Pattern p = Pattern.compile("[A-Za-z]+");
        int pageID = urlToIndex.get(urlScanned);
        for (String str : text.split(" ")) {
            Matcher m = p.matcher(str);
            if (m.matches()) {
                String word = str.toLowerCase();
                // computeIfAbsent makes the insert atomic across worker threads
                ii.computeIfAbsent(word, k -> ConcurrentHashMap.newKeySet()).add(pageID);
            }
        }
    }
    public static void main(String[] args) throws InterruptedException {
        Crawler crawler = new Crawler();
        String root = "http://cs.purdue.edu";
        crawler.addURL(root);  // register the root so it has an ID before it is indexed
        try {
            crawler.crawl(root);                               // seed the queue with the root's links
            crawler.crawled.add(crawler.urlToIndex.get(root)); // mark the root done so workers skip it
        } catch (Exception e) {
            e.printStackTrace();
        }
        Thread.sleep(5000);
        for (int i = 0; i < 8; i++) {
            Thread thread = new Thread(crawler::crawl);
            thread.start();
        }
    }
}
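
For illustration, a minimal sketch of how the workers started in main could be joined and the finished index queried afterwards; the workers list, the join calls, and the query word "purdue" are assumptions added here, not part of the gist:

        // Possible tail end of main(): keep references to the workers so we can join them
        List<Thread> workers = new ArrayList<>();
        for (int i = 0; i < 8; i++) {
            Thread t = new Thread(crawler::crawl);
            workers.add(t);
            t.start();
        }
        for (Thread t : workers)
            t.join();  // block until every worker has returned

        // Print the URL of every crawled page whose body contained the word "purdue"
        Set<Integer> hits = crawler.ii.getOrDefault("purdue", Collections.emptySet());
        for (int id : hits)
            System.out.println(crawler.urls.get(id));

Note that a worker exits as soon as it observes an empty queue, so the join can return while other workers still have pages in flight whose links have not yet been enqueued; the sketch inherits that behavior.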