Created
November 22, 2019 14:38
-
-
Save jackson-rz/718751c30dc267a85c3b4f0aa130f2ea to your computer and use it in GitHub Desktop.
Multithreaded web Crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Solution { | |
public List<String> crawl(String startUrl, HtmlParser htmlParser) { | |
// find hostname | |
int index = startUrl.indexOf('/', 7); | |
String hostname = (index != -1) ? startUrl.substring(0, index) : startUrl; | |
// multi-thread | |
Crawler crawler = new Crawler(startUrl, hostname, htmlParser); | |
crawler.result = new HashSet<>(); // reset result as static property belongs to class, it will go through all of the test cases | |
Thread thread = new Thread(crawler); | |
thread.start(); | |
crawler.joinThread(thread); // wait for thread to complete | |
return new ArrayList<>(crawler.result); | |
} | |
} | |
class Crawler implements Runnable { | |
String startUrl; | |
String hostname; | |
HtmlParser htmlParser; | |
public static volatile Set<String> result = new HashSet<>(); | |
public Crawler(String startUrl, String hostname, HtmlParser htmlParser){ | |
this.startUrl = startUrl; | |
this.hostname = hostname; | |
this.htmlParser = htmlParser; | |
} | |
@Override | |
public void run(){ | |
if(this.startUrl.startsWith(hostname) && !this.result.contains(this.startUrl)){ | |
addUrl(this.result, this.startUrl); | |
List<Thread> threads = new ArrayList<>(); | |
for(String s: htmlParser.getUrls(startUrl)){ | |
Crawler crawler = new Crawler(s, hostname, htmlParser); | |
Thread thread = new Thread(crawler); | |
thread.start(); | |
threads.add(thread); | |
} | |
for(Thread t: threads){ | |
joinThread(t); // wait for all threads to complete | |
} | |
} | |
} | |
public static synchronized void addUrl(Set<String> result, String url){ | |
result.add(url); | |
} | |
public static void joinThread(Thread thread){ | |
try{ | |
thread.join(); | |
} catch(InterruptedException e){ | |
e.printStackTrace(); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment