Skip to content

Instantly share code, notes, and snippets.

@ssaurel
Created November 30, 2019 21:32
Show Gist options
  • Save ssaurel/f7def9bc6f4299fd6c5c6ebfe04e2fd1 to your computer and use it in GitHub Desktop.
Save ssaurel/f7def9bc6f4299fd6c5c6ebfe04e2fd1 to your computer and use it in GitHub Desktop.
MyCrawler implements a simple Web Crawler in Java
package com.ssaurel.mycrawler;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.collections4.MultiValuedMap;
import org.apache.commons.collections4.multimap.HashSetValuedHashMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * A simple recursive, depth-first web crawler built on jsoup.
 *
 * <p>Starting from a seed URL, every {@code a[href]} link found on a fetched page is stored
 * in {@link #inOut} (source page URL mapped to its outgoing links) and then crawled in turn,
 * subject to the {@link #MAX_DEPTH} limit and, when {@link #SAME_DOMAIN} is set, to the
 * {@link #DOMAIN} restriction.
 */
public class MyCrawler {

    /** Host (without a leading {@code www.}) that the crawl is restricted to. */
    private static final String DOMAIN = "toutsurlebitcoin.fr";

    /** When {@code true}, only pages whose host matches {@link #DOMAIN} are fetched. */
    private static final boolean SAME_DOMAIN = true;

    /** Pages at this depth or deeper are not fetched. */
    private static final int MAX_DEPTH = 10;

    /** Outgoing links per fetched page, keyed by the URL of the page that contains them. */
    private final MultiValuedMap<String, Out> inOut;

    /**
     * URLs for which a fetch has already been attempted. Kept separately from {@link #inOut}
     * because a page without links (or a page whose download failed) never becomes a key of
     * that map and would otherwise be fetched again each time it is linked to.
     */
    private final Set<String> visited = new HashSet<>();

    /** Creates a crawler with an empty link graph. */
    public MyCrawler() {
        inOut = new HashSetValuedHashMap<>();
    }

    /**
     * Extracts the host of a URL, without a leading {@code www.} prefix.
     *
     * @param url the URL to inspect; may be {@code null}
     * @return the host with any leading {@code www.} removed (matched case-insensitively), or
     *     {@code null} when {@code url} is {@code null}, is not a syntactically valid URI, or
     *     has no host component (for example {@code mailto:} links, relative links or the
     *     empty string)
     */
    public static String getDomainName(String url) {
        if (url == null) {
            return null;
        }
        try {
            String host = new URI(url).getHost();
            if (host == null) {
                // URI.getHost() is null for opaque URIs (mailto:, tel:, ...) and for
                // relative references; there is no domain to report.
                return null;
            }
            return host.regionMatches(true, 0, "www.", 0, 4) ? host.substring(4) : host;
        } catch (URISyntaxException e) {
            // A malformed link is simply treated as having no domain.
            return null;
        }
    }

    /**
     * Fetches {@code URL}, records all of its outgoing links and crawls each of them
     * recursively at {@code depth + 1}.
     *
     * <p>The URL is skipped (nothing is fetched, printed or recorded) when it has no host,
     * when {@code depth} has reached {@link #MAX_DEPTH}, when it lies outside
     * {@link #DOMAIN} while {@link #SAME_DOMAIN} is set, or when a fetch of the same URL
     * has already been attempted. A page that cannot be downloaded is reported on
     * {@code System.err} and the crawl continues with the remaining links.
     *
     * @param URL absolute URL of the page to crawl
     * @param depth depth of this page relative to the seed URL (the seed is at depth 0)
     */
    public void crawl(String URL, int depth) {
        String domain = getDomainName(URL);
        if (domain == null || depth >= MAX_DEPTH) {
            return;
        }
        // Host names are case-insensitive, so compare them that way.
        if (SAME_DOMAIN && !domain.equalsIgnoreCase(DOMAIN)) {
            return;
        }
        // add() returns false when the URL was already present, i.e. already attempted.
        if (!visited.add(URL)) {
            return;
        }

        System.out.println("Crawling Depth = " + depth + " | URL = " + URL);

        Document document;
        try {
            document = Jsoup.connect(URL).get();
        } catch (Exception e) {
            // Deliberately broad: crawling is best-effort, and one unreachable or unparsable
            // page must not abort the whole crawl. Report it instead of hiding it.
            System.err.println("Failed to fetch " + URL + ": " + e);
            return;
        }

        Elements ahrefs = document.select("a[href]");
        int nextDepth = depth + 1;
        for (Element ahref : ahrefs) {
            // "abs:href" makes jsoup resolve the link against the page's base URI.
            String dest = ahref.attr("abs:href");
            String anchor = ahref.text();
            inOut.put(URL, new Out(dest, anchor));
            crawl(dest, nextDepth);
        }
    }

    /**
     * Crawls the site from its home page.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        MyCrawler crawler = new MyCrawler();
        crawler.crawl("https://www.toutsurlebitcoin.fr/", 0);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment