Last active
December 1, 2019 11:02
-
-
Save ssaurel/d0d9d0fba866ae324ad9ace81832f3d3 to your computer and use it in GitHub Desktop.
Backlink Checker in Java on the SSaurel's Blog
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.ssaurel.mycrawler; | |
import java.net.URI; | |
import java.net.URISyntaxException; | |
import java.util.ArrayList; | |
import java.util.Collection; | |
import java.util.List; | |
import java.util.Map.Entry; | |
import org.apache.commons.collections4.MultiValuedMap; | |
import org.apache.commons.collections4.multimap.HashSetValuedHashMap; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.select.Elements; | |
public class MyCrawler { | |
private static final String DOMAIN = "toutsurlebitcoin.fr"; | |
private static final boolean SAME_DOMAIN = true; | |
private static final int MAX_DEPTH = 10; | |
private MultiValuedMap < String, Out > inOut; | |
public MyCrawler() { | |
inOut = new HashSetValuedHashMap < > (); | |
} | |
public static String getDomainName(String url) { | |
String domain = null; | |
try { | |
URI uri = new URI(url); | |
domain = uri.getHost(); | |
domain = domain.startsWith("www.") ? domain.substring(4) : domain; | |
if (domain.startsWith("mailto:")) { | |
domain = null; | |
} | |
} catch (URISyntaxException e) { | |
} | |
return domain; | |
} | |
public void crawl(String URL, int depth) { | |
System.out.println("Crawling Depth = " + depth + " | URL = " + URL); | |
try { | |
String domain = getDomainName(URL); | |
if (domain != null && !inOut.containsKey(URL) | |
&& (!SAME_DOMAIN || domain.equals(DOMAIN)) && | |
(depth < MAX_DEPTH)) { | |
Document document = Jsoup.connect(URL).get(); | |
Elements ahrefs = document.select("a[href]"); | |
depth++; | |
for (Element ahref: ahrefs) { | |
String dest = ahref.attr("abs:href"); | |
String anchor = ahref.text(); | |
inOut.put(URL, new Out(dest, anchor)); | |
crawl(dest, depth); | |
} | |
} | |
} catch (Exception e) { | |
// ... Error for URL ... | |
} | |
} | |
public void print() { | |
for (Entry < String, Collection < Out >> links: inOut.asMap().entrySet()) { | |
System.out.println("\n" + links.getKey() + "\n"); | |
for (Out link: links.getValue()) { | |
System.out.println("\t" + link.href + " | Anchor = " + link.anchor); | |
} | |
} | |
} | |
public void printBacklinks(String url) { | |
List < Tuple > list = new ArrayList < > (); | |
for (Entry < String, Collection < Out >> links: inOut.asMap().entrySet()) { | |
String in = links.getKey(); | |
for (Out out: links.getValue()) { | |
if (out.href.equals(url)) { | |
Tuple t = Tuple.GENERIC; | |
t.href = in ; | |
if (list.contains(t)) { | |
int index = list.indexOf(t); | |
t = list.get(index); | |
} else { | |
t = new Tuple( in ); | |
} | |
t.addValue(out.anchor); | |
if (!list.contains(t)) { | |
list.add(t); | |
} | |
} | |
} | |
} | |
System.out.println("\nURL = " + url); | |
System.out.println("Nb backlinks = " + list.size()); | |
for (Tuple link: list) { | |
System.out.print("\t" + link.href + " | Keywords = "); | |
for (String w: link.anchors) { | |
System.out.print(w + ", "); | |
} | |
System.out.println(); | |
} | |
} | |
public static void main(String[] args) { | |
MyCrawler crawler = new MyCrawler(); | |
crawler.crawl("https://www.toutsurlebitcoin.fr/", 0); | |
//crawler.print(); | |
crawler.printBacklinks("https://www.toutsurlebitcoin.fr/quest-ce-que-le-bitcoin.htm"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment