Created May 16, 2018 00:50
-
-
Save namkyu/3c4e6c6511daba0a300301d86c7cae2b to your computer and use it in GitHub Desktop.
Jsoup #jsoup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.kyu.app; | |
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
import org.junit.Test;

import java.io.IOException;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.jsoup.Jsoup.connect;
public class JSoupTest { | |
@Test | |
public void test() { | |
String htmlString = "<html><head><title>My title</title></head>" | |
+ "<body>Body content</body></html>" | |
+ "<div id='mydiv'>Contents of a div element</div>"; | |
Document doc = Jsoup.parse(htmlString); | |
String title = doc.title(); | |
String body = doc.body().text(); | |
Element divTag = doc.getElementById("mydiv"); | |
System.out.printf("title : %s%n", title); | |
System.out.printf("body : %s%n", body); | |
System.out.printf("divTag : %s", divTag.text()); | |
} | |
@Test | |
public void connect테스트() throws IOException { | |
String url = "https://www.google.com"; | |
Document doc = connect(url).get(); | |
String title = doc.title(); | |
System.out.printf("title : %s%n", title); | |
String html = doc.html(); | |
System.out.printf("html : %s%n", html); | |
} | |
@Test | |
public void query() throws IOException { | |
String url = "http://www.jsoup.org"; | |
Document document = connect(url).get(); | |
String description = document.select("meta[name=description]").first().attr("content"); | |
System.out.println("description : " + description); | |
String keywords = document.select("meta[name=keywords]").first().attr("content"); | |
System.out.println("keywords : " + keywords); | |
} | |
@Test | |
public void link파싱() throws IOException { | |
String url = "http://jsoup.org"; | |
Document document = connect(url).get(); | |
Elements links = document.select("a[href]"); | |
for (Element link : links) { | |
System.out.println("link : " + link.attr("href")); | |
System.out.println("text : " + link.text()); | |
} | |
} | |
@Test | |
public void html문법검증() { | |
String htmlString = "<html><head><title>My title</title></head>" | |
+ "<body><center>Body content</center></body></html>"; | |
boolean valid = Jsoup.isValid(htmlString, Whitelist.basic()); | |
if (valid) { | |
System.out.println("The document is valid"); | |
} else { | |
System.out.println("The document is not valid."); | |
System.out.println("Cleaned document"); | |
Document dirtyDoc = Jsoup.parse(htmlString); | |
Document cleanDoc = new Cleaner(Whitelist.basic()).clean(dirtyDoc); | |
System.out.println(cleanDoc.html()); | |
} | |
} | |
@Test | |
public void 모든이미지정보() throws IOException { | |
Document doc = connect("https://naver.com").get(); | |
Elements images = doc.select("img[src~=(?i)\\.(png|jpe?g|gif)]"); | |
for (Element image : images) { | |
System.out.println("\nsrc : " + image.attr("src")); | |
System.out.println("height : " + image.attr("height")); | |
System.out.println("width : " + image.attr("width")); | |
System.out.println("alt : " + image.attr("alt")); | |
} | |
} | |
@Test | |
public void form의input정보추출() throws IOException { | |
Document doc = connect("https://naver.com").get(); | |
Element formElement = doc.getElementById("sform"); | |
Elements inputElements = formElement.getElementsByTag("input"); | |
for (Element inputElement : inputElements) { | |
String key = inputElement.attr("name"); | |
String value = inputElement.attr("value"); | |
System.out.println("Param name: "+key+" \nParam value: "+value); | |
} | |
} | |
private static Matcher matcher; | |
private static final String DOMAIN_NAME_PATTERN = "([a-zA-Z0-9]([a-zA-Z0-9\\-]{0,61}[a-zA-Z0-9])?\\.)+[a-zA-Z]{2,15}"; | |
private static Pattern patrn = Pattern.compile(DOMAIN_NAME_PATTERN); | |
public static String getDomainName(String url) { | |
String domainName = ""; | |
matcher = patrn.matcher(url); | |
if (matcher.find()) { | |
domainName = matcher.group(0).toLowerCase().trim(); | |
} | |
return domainName; | |
} | |
@Test | |
public void 구글search() throws IOException { | |
String query = "spring%20cloud"; | |
String url = "https://www.google.com/search?q=" + query; | |
Document doc = Jsoup | |
.connect(url) | |
.userAgent("Jsoup client") | |
.timeout(5000).get(); | |
Elements links = doc.select("a[href]"); | |
Set<String> result = new HashSet<>(); | |
for (Element link : links) { | |
String attr1 = link.attr("href"); | |
String attr2 = link.attr("class"); | |
if (!attr2.startsWith("_Zkb") && attr1.startsWith("/url?q=")) { | |
result.add(getDomainName(attr1)); | |
} | |
} | |
for (String el : result) { | |
System.out.println(el); | |
} | |
} | |
@Test | |
public void 헤더정보() throws IOException { | |
Connection.Response response = Jsoup | |
.connect("https://naver.com") | |
.method(Connection.Method.GET) | |
.followRedirects(false) | |
.execute(); | |
Map<String, String> headers = response.headers(); | |
System.out.println(headers); | |
} | |
} |
Sign up for free to join this conversation on GitHub.
Already have an account?
Sign in to comment