namkyu/JsoupTest.java

## JsoupTest.java
package com.kyu.app;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
import org.junit.Test;

import java.io.IOException;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.jsoup.Jsoup.connect;

public class JSoupTest {

    /**
     * Parses an in-memory HTML snippet and prints its title, its body text and the
     * text of the element whose id is {@code mydiv}.
     */
    @Test
    public void test() {
        final String markup = "<html><head><title>My title</title></head>"
                + "<body>Body content</body></html>"
                + "<div id='mydiv'>Contents of a div element</div>";

        final Document parsed = Jsoup.parse(markup);

        // Extract everything first, then report.
        final String pageTitle = parsed.title();
        final String bodyText = parsed.body().text();
        final Element targetDiv = parsed.getElementById("mydiv");

        System.out.format("title : %s%n", pageTitle);
        System.out.format("body : %s%n", bodyText);
        System.out.format("divTag : %s", targetDiv.text());
    }

    /**
     * Fetches https://www.google.com and prints the page title followed by the
     * document's HTML.
     *
     * @throws IOException propagated from the jsoup connection
     */
    @Test
    public void connect테스트() throws IOException {
        final Document page = connect("https://www.google.com").get();

        final String pageTitle = page.title();
        System.out.format("title : %s%n", pageTitle);

        final String markup = page.html();
        System.out.format("html : %s%n", markup);
    }

    /**
     * Fetches http://www.jsoup.org and prints the {@code content} attribute of its
     * {@code description} and {@code keywords} meta tags.
     *
     * <p>{@code Elements.first()} returns {@code null} when the selector matches
     * nothing, so a missing meta tag is printed as an empty value instead of
     * aborting with a {@link NullPointerException}.
     *
     * @throws IOException propagated from the jsoup connection
     */
    @Test
    public void query() throws IOException {
        String url = "http://www.jsoup.org";
        Document document = connect(url).get();

        Element descriptionMeta = document.select("meta[name=description]").first();
        String description = (descriptionMeta == null) ? "" : descriptionMeta.attr("content");
        System.out.println("description : " + description);

        Element keywordsMeta = document.select("meta[name=keywords]").first();
        String keywords = (keywordsMeta == null) ? "" : keywordsMeta.attr("content");
        System.out.println("keywords : " + keywords);
    }

    /**
     * Fetches http://jsoup.org and prints the {@code href} attribute and the text
     * of every anchor that carries an {@code href}.
     *
     * @throws IOException propagated from the jsoup connection
     */
    @Test
    public void link파싱() throws IOException {
        final Elements anchors = connect("http://jsoup.org").get().select("a[href]");

        for (int i = 0; i < anchors.size(); i++) {
            final Element anchor = anchors.get(i);
            final String target = anchor.attr("href");
            System.out.println("link : " + target);
            final String label = anchor.text();
            System.out.println("text : " + label);
        }
    }

    /**
     * Validates an HTML snippet against {@code Whitelist.basic()} and, when it is
     * rejected, prints the version produced by a {@link Cleaner} using the same
     * whitelist.
     */
    @Test
    public void html문법검증() {
        final String markup = "<html><head><title>My title</title></head>"
                + "<body><center>Body content</center></body></html>";

        // Nothing to clean when the snippet already passes validation.
        if (Jsoup.isValid(markup, Whitelist.basic())) {
            System.out.println("The document is valid");
            return;
        }

        System.out.println("The document is not valid.");
        System.out.println("Cleaned document");

        final Document unclean = Jsoup.parse(markup);
        final Cleaner cleaner = new Cleaner(Whitelist.basic());
        final Document sanitized = cleaner.clean(unclean);

        System.out.println(sanitized.html());
    }

    /**
     * Fetches https://naver.com and prints {@code src}, {@code height},
     * {@code width} and {@code alt} for every {@code img} whose {@code src}
     * matches a png/jpg/jpeg/gif extension (case-insensitive).
     *
     * @throws IOException propagated from the jsoup connection
     */
    @Test
    public void 모든이미지정보() throws IOException {
        final Document page = connect("https://naver.com").get();
        final Elements pictures = page.select("img[src~=(?i)\\.(png|jpe?g|gif)]");

        for (int i = 0; i < pictures.size(); i++) {
            final Element picture = pictures.get(i);
            // The leading newline separates consecutive image records.
            System.out.println("\nsrc : " + picture.attr("src"));
            System.out.println("height : " + picture.attr("height"));
            System.out.println("width : " + picture.attr("width"));
            System.out.println("alt : " + picture.attr("alt"));
        }
    }

    /**
     * Fetches https://naver.com and prints the {@code name} and {@code value}
     * attributes of every {@code input} element under the element whose id is
     * {@code sform}.
     *
     * <p>{@code getElementById} returns {@code null} when no element carries the
     * id, so the absence is reported and the method returns instead of failing
     * with a {@link NullPointerException}.
     *
     * @throws IOException propagated from the jsoup connection
     */
    @Test
    public void form의input정보추출() throws IOException {
        Document doc = connect("https://naver.com").get();
        Element formElement = doc.getElementById("sform");
        if (formElement == null) {
            // The remote markup is outside our control; report instead of crashing.
            System.out.println("Element not found: #sform");
            return;
        }

        Elements inputElements = formElement.getElementsByTag("input");
        for (Element inputElement : inputElements) {
            String key = inputElement.attr("name");
            String value = inputElement.attr("value");
            System.out.println("Param name: "+key+" \nParam value: "+value);
        }
    }

    /**
     * Regex for a dotted host name: one or more labels (alphanumeric, hyphens
     * allowed inside, up to 63 characters), each followed by a dot, then a
     * 2-15 letter suffix.
     */
    private static final String DOMAIN_NAME_PATTERN = "([a-zA-Z0-9]([a-zA-Z0-9\\-]{0,61}[a-zA-Z0-9])?\\.)+[a-zA-Z]{2,15}";

    /** Compiled once; a {@link Pattern} is immutable and can be shared freely. */
    private static final Pattern DOMAIN_PATTERN = Pattern.compile(DOMAIN_NAME_PATTERN);

    /**
     * Extracts the first host-name-like token from the given text.
     *
     * @param url text to search, typically a URL or an href value; may be {@code null}
     * @return the first match of {@link #DOMAIN_NAME_PATTERN} in lower case, or an
     *         empty string when {@code url} is {@code null} or contains no match
     */
    public static String getDomainName(String url) {
        if (url == null) {
            return "";
        }

        // A Matcher is stateful and not thread-safe, so it stays local to the call.
        Matcher domainMatcher = DOMAIN_PATTERN.matcher(url);
        if (!domainMatcher.find()) {
            return "";
        }

        // Locale.ROOT keeps the case mapping stable regardless of the default locale
        // (e.g. under a Turkish locale "I" would otherwise become a dotless "ı").
        // The pattern cannot match whitespace, so no trimming is needed.
        return domainMatcher.group().toLowerCase(java.util.Locale.ROOT);
    }

    /**
     * Runs a Google search for "spring cloud" and prints the distinct domain names
     * found in the result links.
     *
     * <p>Only anchors whose {@code href} starts with {@code /url?q=} and whose
     * {@code class} does not start with {@code _Zkb} are considered.
     *
     * @throws IOException propagated from the jsoup connection
     */
    @Test
    public void 구글search() throws IOException {
        final String query = "spring%20cloud";
        final String searchUrl = "https://www.google.com/search?q=" + query;

        final Connection request = Jsoup.connect(searchUrl)
                .userAgent("Jsoup client")
                .timeout(5000);
        final Document page = request.get();

        final Set<String> domains = new HashSet<>();
        for (Element anchor : page.select("a[href]")) {
            final String href = anchor.attr("href");
            final String cssClass = anchor.attr("class");

            // Skip anchors that are not plain result redirects.
            if (cssClass.startsWith("_Zkb") || !href.startsWith("/url?q=")) {
                continue;
            }
            domains.add(getDomainName(href));
        }

        for (String domain : domains) {
            System.out.println(domain);
        }
    }

    /**
     * Issues a GET to https://naver.com with redirect-following disabled and
     * prints the response headers.
     *
     * @throws IOException propagated from the jsoup connection
     */
    @Test
    public void 헤더정보() throws IOException {
        final Connection request = Jsoup.connect("https://naver.com")
                .method(Connection.Method.GET)
                .followRedirects(false);

        final Connection.Response reply = request.execute();
        final Map<String, String> headerMap = reply.headers();

        System.out.println(headerMap);
    }

}
	package com.kyu.app;

	import org.jsoup.Connection;
	import org.jsoup.Jsoup;
	import org.jsoup.nodes.Document;
	import org.jsoup.nodes.Element;
	import org.jsoup.safety.Cleaner;
	import org.jsoup.safety.Whitelist;
	import org.jsoup.select.Elements;
	import org.junit.Test;

	import java.io.IOException;
	import java.util.HashSet;
	import java.util.Map;
	import java.util.Set;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	import static org.jsoup.Jsoup.connect;

	public class JSoupTest {

	/**
	 * Parses an in-memory HTML snippet and prints its title, its body text and the
	 * text of the element whose id is {@code mydiv}.
	 */
	@Test
	public void test() {
		final String markup = "<html><head><title>My title</title></head>"
				+ "<body>Body content</body></html>"
				+ "<div id='mydiv'>Contents of a div element</div>";

		final Document parsed = Jsoup.parse(markup);

		// Extract everything first, then report.
		final String pageTitle = parsed.title();
		final String bodyText = parsed.body().text();
		final Element targetDiv = parsed.getElementById("mydiv");

		System.out.format("title : %s%n", pageTitle);
		System.out.format("body : %s%n", bodyText);
		System.out.format("divTag : %s", targetDiv.text());
	}

	/**
	 * Fetches https://www.google.com and prints the page title followed by the
	 * document's HTML.
	 *
	 * @throws IOException propagated from the jsoup connection
	 */
	@Test
	public void connect테스트() throws IOException {
		final Document page = connect("https://www.google.com").get();

		final String pageTitle = page.title();
		System.out.format("title : %s%n", pageTitle);

		final String markup = page.html();
		System.out.format("html : %s%n", markup);
	}

	/**
	 * Fetches http://www.jsoup.org and prints the {@code content} attribute of its
	 * {@code description} and {@code keywords} meta tags.
	 *
	 * <p>{@code Elements.first()} returns {@code null} when the selector matches
	 * nothing, so a missing meta tag is printed as an empty value instead of
	 * aborting with a {@link NullPointerException}.
	 *
	 * @throws IOException propagated from the jsoup connection
	 */
	@Test
	public void query() throws IOException {
		String url = "http://www.jsoup.org";
		Document document = connect(url).get();

		Element descriptionMeta = document.select("meta[name=description]").first();
		String description = (descriptionMeta == null) ? "" : descriptionMeta.attr("content");
		System.out.println("description : " + description);

		Element keywordsMeta = document.select("meta[name=keywords]").first();
		String keywords = (keywordsMeta == null) ? "" : keywordsMeta.attr("content");
		System.out.println("keywords : " + keywords);
	}

	/**
	 * Fetches http://jsoup.org and prints the {@code href} attribute and the text
	 * of every anchor that carries an {@code href}.
	 *
	 * @throws IOException propagated from the jsoup connection
	 */
	@Test
	public void link파싱() throws IOException {
		final Elements anchors = connect("http://jsoup.org").get().select("a[href]");

		for (int i = 0; i < anchors.size(); i++) {
			final Element anchor = anchors.get(i);
			final String target = anchor.attr("href");
			System.out.println("link : " + target);
			final String label = anchor.text();
			System.out.println("text : " + label);
		}
	}

	/**
	 * Validates an HTML snippet against {@code Whitelist.basic()} and, when it is
	 * rejected, prints the version produced by a {@link Cleaner} using the same
	 * whitelist.
	 */
	@Test
	public void html문법검증() {
		final String markup = "<html><head><title>My title</title></head>"
				+ "<body><center>Body content</center></body></html>";

		// Nothing to clean when the snippet already passes validation.
		if (Jsoup.isValid(markup, Whitelist.basic())) {
			System.out.println("The document is valid");
			return;
		}

		System.out.println("The document is not valid.");
		System.out.println("Cleaned document");

		final Document unclean = Jsoup.parse(markup);
		final Cleaner cleaner = new Cleaner(Whitelist.basic());
		final Document sanitized = cleaner.clean(unclean);

		System.out.println(sanitized.html());
	}

	@Test
	public void 모든이미지정보() throws IOException {
	Document doc = connect("https://naver.com").get();
	Elements images = doc.select("img[src~=(?i)\\.(png\|jpe?g\|gif)]");

	for (Element image : images) {
	System.out.println("\nsrc : " + image.attr("src"));
	System.out.println("height : " + image.attr("height"));
	System.out.println("width : " + image.attr("width"));
	System.out.println("alt : " + image.attr("alt"));

	}
	}

	/**
	 * Fetches https://naver.com and prints the {@code name} and {@code value}
	 * attributes of every {@code input} element under the element whose id is
	 * {@code sform}.
	 *
	 * <p>{@code getElementById} returns {@code null} when no element carries the
	 * id, so the absence is reported and the method returns instead of failing
	 * with a {@link NullPointerException}.
	 *
	 * @throws IOException propagated from the jsoup connection
	 */
	@Test
	public void form의input정보추출() throws IOException {
		Document doc = connect("https://naver.com").get();
		Element formElement = doc.getElementById("sform");
		if (formElement == null) {
			// The remote markup is outside our control; report instead of crashing.
			System.out.println("Element not found: #sform");
			return;
		}

		Elements inputElements = formElement.getElementsByTag("input");
		for (Element inputElement : inputElements) {
			String key = inputElement.attr("name");
			String value = inputElement.attr("value");
			System.out.println("Param name: "+key+" \nParam value: "+value);
		}
	}

	/**
	 * Regex for a dotted host name: one or more labels (alphanumeric, hyphens
	 * allowed inside, up to 63 characters), each followed by a dot, then a
	 * 2-15 letter suffix.
	 */
	private static final String DOMAIN_NAME_PATTERN = "([a-zA-Z0-9]([a-zA-Z0-9\\-]{0,61}[a-zA-Z0-9])?\\.)+[a-zA-Z]{2,15}";

	/** Compiled once; a {@link Pattern} is immutable and can be shared freely. */
	private static final Pattern DOMAIN_PATTERN = Pattern.compile(DOMAIN_NAME_PATTERN);

	/**
	 * Extracts the first host-name-like token from the given text.
	 *
	 * @param url text to search, typically a URL or an href value; may be {@code null}
	 * @return the first match of {@link #DOMAIN_NAME_PATTERN} in lower case, or an
	 *         empty string when {@code url} is {@code null} or contains no match
	 */
	public static String getDomainName(String url) {
		if (url == null) {
			return "";
		}

		// A Matcher is stateful and not thread-safe, so it stays local to the call.
		Matcher domainMatcher = DOMAIN_PATTERN.matcher(url);
		if (!domainMatcher.find()) {
			return "";
		}

		// Locale.ROOT keeps the case mapping stable regardless of the default locale
		// (e.g. under a Turkish locale "I" would otherwise become a dotless "ı").
		// The pattern cannot match whitespace, so no trimming is needed.
		return domainMatcher.group().toLowerCase(java.util.Locale.ROOT);
	}

	/**
	 * Runs a Google search for "spring cloud" and prints the distinct domain names
	 * found in the result links.
	 *
	 * <p>Only anchors whose {@code href} starts with {@code /url?q=} and whose
	 * {@code class} does not start with {@code _Zkb} are considered.
	 *
	 * @throws IOException propagated from the jsoup connection
	 */
	@Test
	public void 구글search() throws IOException {
		final String query = "spring%20cloud";
		final String searchUrl = "https://www.google.com/search?q=" + query;

		final Connection request = Jsoup.connect(searchUrl)
				.userAgent("Jsoup client")
				.timeout(5000);
		final Document page = request.get();

		final Set<String> domains = new HashSet<>();
		for (Element anchor : page.select("a[href]")) {
			final String href = anchor.attr("href");
			final String cssClass = anchor.attr("class");

			// Skip anchors that are not plain result redirects.
			if (cssClass.startsWith("_Zkb") || !href.startsWith("/url?q=")) {
				continue;
			}
			domains.add(getDomainName(href));
		}

		for (String domain : domains) {
			System.out.println(domain);
		}
	}

	/**
	 * Issues a GET to https://naver.com with redirect-following disabled and
	 * prints the response headers.
	 *
	 * @throws IOException propagated from the jsoup connection
	 */
	@Test
	public void 헤더정보() throws IOException {
		final Connection request = Jsoup.connect("https://naver.com")
				.method(Connection.Method.GET)
				.followRedirects(false);

		final Connection.Response reply = request.execute();
		final Map<String, String> headerMap = reply.headers();

		System.out.println(headerMap);
	}

	}