Skip to content

Instantly share code, notes, and snippets.

@nvurgaft
Created August 1, 2015 09:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nvurgaft/132cc0fb4c37d7767ef8 to your computer and use it in GitHub Desktop.
Save nvurgaft/132cc0fb4c37d7767ef8 to your computer and use it in GitHub Desktop.
simple web crawler functionality
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal web-crawler demo: downloads one page over HTTP and prints every URL
 * found in its text to standard output, numbered from 1.
 */
public class Main {

    /** Address of the page that is downloaded and scanned. */
    private static final String START_URL = "http://perlmonks.org";

    /**
     * Upper bound, in milliseconds, for both connecting to the server and
     * waiting on a read, so an unresponsive host cannot block the program
     * indefinitely.
     */
    private static final int TIMEOUT_MILLIS = 10000;

    // taken from : http://stackoverflow.com/questions/163360/regular-expression-to-match-urls-in-java
    /** Matches http, https, ftp and file URLs; compiled once and reused. */
    private static final Pattern URL_PATTERN = Pattern.compile(
            "\\b(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]");

    /**
     * Fetches {@link #START_URL} and prints the URLs it contains.
     *
     * <p>Any I/O failure (malformed address, connection error, timeout, HTTP
     * error status) is reported on the console together with its stack trace;
     * the method then returns normally.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        try {
            printUrls(fetch(START_URL));
        } catch (IOException ex) {
            System.out.println("Exception occured");
            ex.printStackTrace();
        }
    }

    /**
     * Downloads the body of {@code address} with an HTTP GET request.
     *
     * @param address absolute HTTP(S) address of the page to download
     * @return the response body, with every line terminated by {@code '\n'}
     * @throws IOException if the address is malformed, the connection fails or
     *                     times out, or the body cannot be read
     */
    private static String fetch(String address) throws IOException {
        HttpURLConnection http = (HttpURLConnection) new URL(address).openConnection();
        http.setRequestMethod("GET");
        http.setConnectTimeout(TIMEOUT_MILLIS);
        http.setReadTimeout(TIMEOUT_MILLIS);

        // Decode with a fixed charset instead of the platform default so the
        // result does not depend on the machine running the program.
        // NOTE(review): the charset declared by the server is not consulted;
        // URLs are ASCII, so this does not affect matching.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(http.getInputStream(), StandardCharsets.UTF_8))) {
            StringBuilder body = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                // readLine() strips the line terminator; put one back so that a
                // URL ending one line is not glued to the start of the next
                // line and over-matched by the pattern.
                body.append(line).append('\n');
            }
            return body.toString();
        } finally {
            http.disconnect();
        }
    }

    /**
     * Prints a header followed by each URL found in {@code html}, one per
     * line, prefixed with its 1-based position.
     *
     * @param html text to scan for URLs
     */
    private static void printUrls(String html) {
        Matcher matcher = URL_PATTERN.matcher(html);
        System.out.println("Website URL's: ");
        int count = 0;
        while (matcher.find()) {
            ++count;
            // group() returns exactly the matched text without copying the
            // whole page for every match.
            System.out.println(count + ". " + matcher.group());
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment