Skip to content

Instantly share code, notes, and snippets.

@mubbashir10
Created July 13, 2016 09:52
Show Gist options
  • Save mubbashir10/bcb5c85628aabc1b03115d0a527030d7 to your computer and use it in GitHub Desktop.
Save mubbashir10/bcb5c85628aabc1b03115d0a527030d7 to your computer and use it in GitHub Desktop.
Extract images and anchor tag (hyperlinks) from a given website using JAVA and jSoup. Raw
/*
To run this program you need jSoup in your classpath.
Author: Mubbashir10
URL: http://mubbashir10.com
*/
import java.util.*;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Small command-line utility that downloads a web page and prints every
 * anchor ({@code a[href]}) and image ({@code img[src]}) element found in it.
 *
 * <p>The URL is read as a single line from standard input. Fetching and
 * parsing is delegated to jsoup, which must be on the classpath.
 */
public class WebSpider {

    /**
     * Prompts for a URL, fetches the page and prints its anchor and image tags.
     *
     * <p>Problems (no input, blank URL, malformed URL, network or HTTP failure)
     * are reported on standard error; the method then returns normally.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        String path = readUrl();
        if (path == null) {
            System.err.println("No URL was provided.");
            return;
        }

        try {
            // Connect over HTTP(S) and parse the response body into a DOM.
            Document doc = Jsoup.connect(path).get();

            printElements("Page contains following anchor tags", "\nLink: ", doc.select("a[href]"));
            printElements("Page contains following image tags", "\nImage : ", doc.select("img[src]"));
        } catch (IllegalArgumentException e) {
            // jsoup rejects malformed URLs (e.g. a missing "http://") with this
            // unchecked exception, so it has to be caught separately from IOException.
            System.err.println("Invalid URL '" + path + "': " + e.getMessage());
        } catch (IOException e) {
            // Covers network failures, HTTP error statuses and unsupported content types.
            System.err.println("Could not fetch '" + path + "': " + e);
        }
    }

    /**
     * Prompts on standard output and reads one line from standard input.
     *
     * @return the trimmed line, or {@code null} if standard input had no line
     *         or the line was blank
     */
    private static String readUrl() {
        // Closing the scanner also closes System.in; nothing reads from it afterwards.
        try (Scanner input = new Scanner(System.in)) {
            System.out.println("Kindly enter URL:(e.g http://google.com)");
            if (!input.hasNextLine()) {
                return null;
            }
            String line = input.nextLine().trim();
            return line.isEmpty() ? null : line;
        }
    }

    /**
     * Prints a header line followed by each element on its own line.
     *
     * @param header   line printed once before the elements
     * @param label    prefix printed in front of every element
     * @param elements elements to print; each is rendered via its {@code toString()}
     */
    private static void printElements(String header, String label, Elements elements) {
        System.out.println(header);
        for (Element element : elements) {
            System.out.println(label + element);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment