Skip to content

Instantly share code, notes, and snippets.

@maxandersen
Created August 30, 2022 14:35
Show Gist options
  • Save maxandersen/da43dfe63e4964af263fb0696ab2d35c to your computer and use it in GitHub Desktop.
Save maxandersen/da43dfe63e4964af263fb0696ab2d35c to your computer and use it in GitHub Desktop.
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
//JAVA 19+
//JAVAC_OPTIONS --enable-preview --source 19
//JAVA_OPTIONS --enable-preview
/**
 * Small web crawler that fetches every queued page on its own virtual thread.
 *
 * <p>Starting from a single URI, each downloaded body is scanned with {@link #urlRegex}. Every
 * match is resolved against the URI the response was finally served from and is queued for
 * crawling when it (a) starts with the URI of the page it was found on and (b) has not been seen
 * before. The crawl ends once {@value #MAX_FOUND_URIS} distinct URIs have been discovered or when
 * there is no queued or in-flight work left, whichever comes first.
 *
 * <p>The collections below are shared between the dispatching thread and the crawl tasks, so all
 * of them are concurrent implementations.
 */
public class crawl {

    /** The dispatcher stops handing out work once this many distinct URIs have been discovered. */
    private static final int MAX_FOUND_URIS = 600;

    /** How long the dispatcher waits for new work before re-checking whether the crawl drained. */
    private static final long POLL_TIMEOUT_MILLIS = 50;

    /** Loose matcher for URL-looking text (host-like token, dot, short suffix, optional path). */
    final Pattern urlRegex = Pattern.compile("[-a-zA-Z\\d@:%._+~#=]{1,256}\\.[a-zA-Z\\d()]{1,6}\\b([-a-zA-Z\\d()@:%_+.~#?&/=]*)");

    /** Every URI discovered so far; a concurrent set because crawl tasks add to it in parallel. */
    final Set<URI> foundURIs = ConcurrentHashMap.newKeySet();

    /** URIs waiting to be handed to a crawl task. */
    final LinkedBlockingDeque<URI> deque = new LinkedBlockingDeque<>();

    /**
     * Number of URIs that are queued or currently being crawled. It is incremented before a URI is
     * queued and decremented only after its crawl task has finished, so a value of zero means the
     * queue is empty and no task can add to it any more.
     */
    private final AtomicInteger pending = new AtomicInteger();

    /**
     * Crawls from {@code startURI} and blocks until the crawl is finished, then prints the number
     * of discovered URIs.
     *
     * @param startURI the first page to fetch
     * @throws RuntimeException wrapping the {@link InterruptedException} if the calling thread is
     *     interrupted while waiting for work; the interrupt flag is restored before throwing
     */
    public void start(URI startURI) {
        enqueue(startURI);
        // Closing the executors (in reverse order) waits for all submitted crawl tasks to finish.
        try (ExecutorService httpClientExecutorService = Executors.newVirtualThreadPerTaskExecutor();
             ExecutorService executor = Executors.newVirtualThreadPerTaskExecutor()) {
            HttpClient client = HttpClient.newBuilder()
                    .followRedirects(HttpClient.Redirect.ALWAYS)
                    .connectTimeout(Duration.ofSeconds(1))
                    .executor(httpClientExecutorService)
                    .build();
            while (foundURIs.size() < MAX_FOUND_URIS) {
                URI uri = pollNext();
                if (uri == null) {
                    if (pending.get() == 0) {
                        // Nothing queued and nothing in flight: the crawl ran dry before the
                        // limit was reached. Blocking for more work here would hang forever.
                        break;
                    }
                    continue;
                }
                System.out.println("uri = " + uri);
                executor.submit(() -> {
                    try {
                        crawl(uri, client);
                    } finally {
                        // Decrement only after the task queued everything it found.
                        pending.decrementAndGet();
                    }
                });
            }
        }
        System.out.println("foundURIs = " + foundURIs.size());
    }

    /**
     * Waits briefly for the next queued URI.
     *
     * @return the next URI, or {@code null} if none arrived within the poll timeout
     */
    private URI pollNext() {
        try {
            return deque.poll(POLL_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS);
        } catch (InterruptedException e) {
            // Restore the flag so the executors' close() also sees the interruption.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    /** Registers {@code uri} as outstanding work and queues it for the dispatcher. */
    private void enqueue(URI uri) {
        pending.incrementAndGet();
        deque.add(uri);
    }

    /**
     * Downloads {@code uri} and queues every new link found in the response body.
     *
     * <p>Failures are reported on standard output and never propagate, so one bad page cannot
     * stop the crawl.
     *
     * @param uri the page to fetch
     * @param client the shared HTTP client used for the request
     */
    private void crawl(URI uri, HttpClient client) {
        try {
            // Built inside the try: the builder rejects non-HTTP(S) URIs with an
            // IllegalArgumentException, which must be reported like any other failure.
            var request = HttpRequest.newBuilder()
                    .uri(uri)
                    .GET()
                    .build();
            var response = client.send(request, HttpResponse.BodyHandlers.ofString());
            urlRegex.matcher(response.body())
                    .results()
                    .map(m -> m.group(0))
                    .forEach(link -> visit(link, response.uri(), uri));
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        } catch (Exception e) {
            // Task boundary: report the real cause instead of losing it in the discarded Future.
            System.out.println("Failed to crawl URI: " + uri + " (" + e + ")");
        }
    }

    /**
     * Resolves one regex match and queues it when it is new and lies below the current page.
     *
     * @param link the raw text matched in the page body
     * @param base the URI the response was served from, used to resolve {@code link}
     * @param page the URI that was requested; only results starting with it are followed
     */
    private void visit(String link, URI base, URI page) {
        URI resolved;
        try {
            resolved = base.resolve(link);
        } catch (IllegalArgumentException ignored) {
            // The regex is loose and can match text that is not a valid URI reference;
            // skip that match instead of abandoning the rest of the page.
            return;
        }
        if (resolved.toString().startsWith(page.toString()) && foundURIs.add(resolved)) {
            System.out.printf("%s - Found '%s' in '%s'\n", Thread.currentThread(), resolved, page);
            enqueue(resolved);
        }
    }

    /** Crawls https://quarkus.io/ and prints what was found. */
    public static void main(String[] args) {
        crawl webCrawler = new crawl();
        webCrawler.start(URI.create("https://quarkus.io/"));
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment