Created
August 30, 2022 14:35
-
-
Save maxandersen/da43dfe63e4964af263fb0696ab2d35c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
//JAVA 19+
//JAVAC_OPTIONS --enable-preview --source 19
//JAVA_OPTIONS --enable-preview
/**
 * Minimal web crawler that fetches pages on virtual threads.
 *
 * <p>Starting from one URI, each fetched page body is scanned with {@link #urlRegex}. Every match
 * is resolved against the URI the response was served from, and a resolved link is queued for
 * crawling only when its text form starts with the text form of the page URI that was requested
 * and it has not been recorded before.
 *
 * <p>Crawl tasks run concurrently, so the shared collections are concurrent implementations.
 */
public class crawl {

    /** The dispatch loop stops handing out work once this many links have been recorded. */
    private static final int MAX_FOUND_URIS = 600;

    /** How long the dispatch loop waits for new work before re-checking whether the crawl is done. */
    private static final long POLL_INTERVAL_MILLIS = 100;

    /** Loose pattern for URL-looking text ({@code name.tld} followed by optional path/query characters). */
    final Pattern urlRegex = Pattern.compile("[-a-zA-Z\\d@:%._+~#=]{1,256}\\.[a-zA-Z\\d()]{1,6}\\b([-a-zA-Z\\d()@:%_+.~#?&/=]*)");

    /** Links recorded so far; written by crawl tasks and read by the dispatch loop. */
    final Set<URI> foundURIs = ConcurrentHashMap.newKeySet();

    /** Work queue: crawl tasks append newly recorded links, the dispatch loop consumes them. */
    final LinkedBlockingDeque<URI> deque = new LinkedBlockingDeque<>();

    /** Number of crawl tasks submitted by the dispatch loop that have not finished yet. */
    private final AtomicInteger inFlight = new AtomicInteger();

    /**
     * Crawls from {@code startURI} until {@value #MAX_FOUND_URIS} links have been recorded or no
     * work is left, then prints the number of recorded links.
     *
     * <p>The method returns only after every submitted crawl task has finished, because leaving the
     * try-with-resources block closes both executors.
     *
     * @param startURI first page to fetch
     * @throws RuntimeException wrapping the {@link InterruptedException} if the calling thread is
     *     interrupted while waiting for work; the interrupt flag is restored first
     */
    public void start(URI startURI) {
        deque.add(startURI);
        try (ExecutorService httpClientExecutorService = Executors.newVirtualThreadPerTaskExecutor();
             ExecutorService executor = Executors.newVirtualThreadPerTaskExecutor()) {
            HttpClient client = HttpClient.newBuilder()
                    .followRedirects(HttpClient.Redirect.ALWAYS)
                    .connectTimeout(Duration.ofSeconds(1))
                    .executor(httpClientExecutorService)
                    .build();
            while (foundURIs.size() < MAX_FOUND_URIS) {
                URI uri;
                try {
                    // Timed poll instead of take(): an unbounded wait would hang forever once the
                    // site runs out of new links before the limit is reached.
                    uri = deque.poll(POLL_INTERVAL_MILLIS, TimeUnit.MILLISECONDS);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    throw new RuntimeException(e);
                }
                if (uri == null) {
                    // Tasks enqueue links before they decrement inFlight, so reading the counter
                    // first and the queue second cannot miss late additions.
                    if (inFlight.get() == 0 && deque.isEmpty()) {
                        break;
                    }
                    continue;
                }
                System.out.println("uri = " + uri);
                inFlight.incrementAndGet();
                executor.submit(() -> {
                    try {
                        crawl(uri, client);
                    } finally {
                        inFlight.decrementAndGet();
                    }
                });
            }
        }
        System.out.println("foundURIs = " + foundURIs.size());
    }

    /**
     * Fetches {@code uri}, extracts link candidates from the body and queues the new ones.
     *
     * <p>Failures are reported on standard output and never propagate: a single unreachable or
     * malformed page must not stop the crawl.
     *
     * @param uri page to fetch
     * @param client shared HTTP client used for the request
     */
    private void crawl(URI uri, HttpClient client) {
        try {
            // Built inside the try so a URI the client rejects is reported like any other failure.
            var request = HttpRequest.newBuilder()
                    .uri(uri)
                    .GET()
                    .build();
            var response = client.send(request, HttpResponse.BodyHandlers.ofString());
            String pagePrefix = uri.toString();
            urlRegex.matcher(response.body())
                    .results()
                    .map(m -> m.group(0))
                    .map(candidate -> resolveLink(response.uri(), candidate))
                    .flatMap(Optional::stream)
                    .forEach(s -> {
                        if (s.toString().startsWith(pagePrefix) && foundURIs.add(s)) {
                            System.out.printf("%s - Found '%s' in '%s'\n", Thread.currentThread(), s, uri);
                            deque.add(s);
                        }
                    });
        } catch (InterruptedException e) {
            // Keep the interrupt visible to whoever owns this thread.
            Thread.currentThread().interrupt();
            System.out.println("Interrupted while crawling: " + uri);
        } catch (IOException | RuntimeException e) {
            System.out.println("Failed to parse URI: " + uri + " (" + e + ")");
        }
    }

    /**
     * Resolves a regex match against the page it was found on.
     *
     * @param base URI the page was served from
     * @param candidate raw text matched by {@link #urlRegex}
     * @return the resolved URI, or empty when {@code candidate} is not a syntactically valid URI
     */
    private static Optional<URI> resolveLink(URI base, String candidate) {
        try {
            return Optional.of(base.resolve(candidate));
        } catch (IllegalArgumentException ignored) {
            // The pattern is loose and also matches text that is not a valid URI; skip just this
            // candidate instead of abandoning the remaining links on the page.
            return Optional.empty();
        }
    }

    /**
     * Crawls {@code https://quarkus.io/}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        crawl webCrawler = new crawl();
        webCrawler.start(URI.create("https://quarkus.io/"));
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment