@maxandersen · Created August 31, 2022 13:53
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.TimeUnit;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;
//JAVA 19+
//JAVAC_OPTIONS --enable-preview --source 19
//JAVA_OPTIONS --enable-preview
//DEPS org.jsoup:jsoup:1.15.3
//DESCRIPTION Web crawler using Java 19 and Loom virtual threads.
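// The //DEPS, //JAVA*, and //DESCRIPTION lines above are JBang directives:
// they pull in the jsoup dependency and enable the Java 19 preview features
// that virtual threads (Project Loom) require.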
public class crawl {

    // Thread-safe set of discovered URIs; crawl tasks running on many virtual
    // threads add to it concurrently.
    Set<URI> foundURIs = ConcurrentHashMap.newKeySet();
    // Work queue of URIs that still need to be fetched.
    LinkedBlockingDeque<URI> deque = new LinkedBlockingDeque<>();

    public void start(URI startURI) {
        deque.add(startURI);
        // One virtual thread per task, both for the HttpClient and for the crawl tasks.
        try (ExecutorService httpClientExecutorService = Executors.newVirtualThreadPerTaskExecutor();
             ExecutorService executor = Executors.newVirtualThreadPerTaskExecutor()) {
            HttpClient client = HttpClient.newBuilder()
                    .followRedirects(HttpClient.Redirect.ALWAYS)
                    .connectTimeout(Duration.ofSeconds(1))
                    .executor(httpClientExecutorService)
                    .build();
            while (true) {
                try {
                    // Stop once no new URI has shown up for five seconds; tasks still
                    // in flight finish when the try-with-resources closes the executors.
                    URI uri = deque.poll(5, TimeUnit.SECONDS);
                    if (uri == null) {
                        break;
                    }
                    executor.submit(() -> crawl(uri, client));
                } catch (InterruptedException e) {
                    throw new RuntimeException(e);
                }
            }
        }
        System.out.println("foundURIs = " + foundURIs.size());
    }
    private void crawl(URI uri, HttpClient client) {
        var request = HttpRequest.newBuilder()
                .uri(uri)
                .GET()
                .build();
        try {
            var response = client.send(request, HttpResponse.BodyHandlers.ofString());
            var doc = Jsoup.parse(response.body(), uri.toString());
            Elements links = doc.select("a[href]");
            //Elements media = doc.select("[src]");
            //Elements imports = doc.select("link[href]");
            // Queue only links under the current page's URI prefix that have not been seen yet.
            links.stream().map(l -> l.absUrl("href"))
                    .filter(s -> !s.isBlank())
                    .forEach(s -> {
                        if (s.startsWith(uri.toASCIIString()) && foundURIs.add(URI.create(s))) {
                            deque.add(URI.create(s));
                        }
                    });
        } catch (Exception e) {
            System.out.println("Failed to fetch or parse URI: " + uri);
            e.printStackTrace();
        }
    }
    public static void main(String[] args) {
        crawl webCrawler = new crawl();
        webCrawler.start(URI.create("https://quarkus.io/"));
    }
}
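A minimal way to run this sketch, assuming the file is saved as crawl.java (JBang reads the //DEPS and //JAVA* directives itself, so the jsoup dependency and the preview flags need no extra setup):

jbang crawl.java

The crawler starts at https://quarkus.io/, follows only links under each page's URI prefix, and prints the number of unique URIs found once the work queue has been idle for five seconds.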