Last active
October 21, 2016 23:08
-
-
Save austinschwartz/32c8681794e0cbf06c114175f63f13b3 to your computer and use it in GitHub Desktop.
In-memory crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.*; | |
import org.jsoup.select.Elements; | |
import java.io.*; | |
import java.util.*; | |
import java.util.concurrent.*; | |
import java.util.regex.*; | |
public class Crawler | |
{ | |
public int urlID; | |
public ConcurrentHashMap<String, Set<Integer>> ii; | |
public ConcurrentHashMap<Integer, String> urls; | |
public ConcurrentHashMap<String, Integer> urlToIndex; | |
public Set<Integer> crawled; | |
public ConcurrentLinkedQueue<Integer> queue; | |
public Crawler() { | |
urlID = 0; | |
ii = new ConcurrentHashMap<>(); | |
urls = new ConcurrentHashMap<>(); | |
urlToIndex = new ConcurrentHashMap<>(); | |
queue = new ConcurrentLinkedQueue<>(); | |
crawled = Collections.newSetFromMap(new ConcurrentHashMap<>()); | |
} | |
public boolean urlInDB(String url) { | |
return crawled.contains(url); | |
} | |
public synchronized void addURL(String url) { | |
if (crawled.contains(url) || !url.contains("purdue.edu") || !url.contains("http")) | |
return; | |
int currID = urlID; | |
urlID += 1; | |
urls.put(currID, url); | |
urlToIndex.put(url, currID); | |
queue.offer(currID); | |
} | |
public void crawl() { | |
while (!queue.isEmpty()) { | |
int currID = queue.poll(); | |
String url = urls.get(currID); | |
if (urlInDB(url)) | |
continue; | |
try { | |
crawl(url); | |
crawled.add(urlID); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
} | |
public void crawl(String urlScanned) throws IOException { | |
System.out.println(Thread.currentThread() + " " + crawled.size() + " - Crawling " + urlScanned); | |
Document doc = Jsoup.connect(urlScanned).get(); | |
Elements links = doc.select("a[href]"); | |
links.forEach((Element a) -> addURL(a.attr("abs:href"))); | |
String text = doc.body().text(); | |
Pattern p = Pattern.compile("[A-Za-z]*"); | |
for (String str : text.split(" ")) { | |
Matcher m = p.matcher(str); | |
if (m.matches()) { | |
String word = str.toLowerCase().replace("\"", "\'"); | |
Set<Integer> set = ii.getOrDefault(word, new HashSet<>()); | |
set.add(urlToIndex.get(urlScanned)); | |
ii.put(word, set); | |
} | |
} | |
} | |
public static void main(String[] args) throws InterruptedException { | |
Crawler crawler = new Crawler(); | |
String root = "http://cs.purdue.edu"; | |
try { | |
crawler.crawl(root); | |
} catch (Exception e) {} | |
Thread.sleep(5000); | |
for (int i = 0; i < 8; i++) { | |
Thread thread = new Thread(() -> crawler.crawl()); | |
thread.start(); | |
} | |
} | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment