Created
November 30, 2019 21:32
-
-
Save ssaurel/f7def9bc6f4299fd6c5c6ebfe04e2fd1 to your computer and use it in GitHub Desktop.
MyCrawler implements a simple web crawler in Java.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.ssaurel.mycrawler;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.collections4.MultiValuedMap;
import org.apache.commons.collections4.multimap.HashSetValuedHashMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class MyCrawler { | |
private static final String DOMAIN = "toutsurlebitcoin.fr"; | |
private static final boolean SAME_DOMAIN = true; | |
private static final int MAX_DEPTH = 10; | |
private MultiValuedMap < String, Out > inOut; | |
public MyCrawler() { | |
inOut = new HashSetValuedHashMap < > (); | |
} | |
public static String getDomainName(String url) { | |
String domain = null; | |
try { | |
URI uri = new URI(url); | |
domain = uri.getHost(); | |
domain = domain.startsWith("www.") ? domain.substring(4) : domain; | |
if (domain.startsWith("mailto:")) { | |
domain = null; | |
} | |
} catch (URISyntaxException e) { | |
} | |
return domain; | |
} | |
public void crawl(String URL, int depth) { | |
System.out.println("Crawling Depth = " + depth + " | URL = " + URL); | |
try { | |
String domain = getDomainName(URL); | |
if (domain != null && !inOut.containsKey(URL) | |
&& (!SAME_DOMAIN || domain.equals(DOMAIN)) && | |
(depth < MAX_DEPTH)) { | |
Document document = Jsoup.connect(URL).get(); | |
Elements ahrefs = document.select("a[href]"); | |
depth++; | |
for (Element ahref: ahrefs) { | |
String dest = ahref.attr("abs:href"); | |
String anchor = ahref.text(); | |
inOut.put(URL, new Out(dest, anchor)); | |
crawl(dest, depth); | |
} | |
} | |
} catch (Exception e) { | |
// ... Error for URL ... | |
} | |
} | |
public static void main(String[] args) { | |
MyCrawler crawler = new MyCrawler(); | |
crawler.crawl("https://www.toutsurlebitcoin.fr/", 0); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment