Skip to content

Instantly share code, notes, and snippets.

@namkyu
Created May 16, 2018 00:50
Show Gist options
  • Save namkyu/3c4e6c6511daba0a300301d86c7cae2b to your computer and use it in GitHub Desktop.
Save namkyu/3c4e6c6511daba0a300301d86c7cae2b to your computer and use it in GitHub Desktop.
Jsoup #jsoup
package com.kyu.app;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
import org.junit.Test;
import java.io.IOException;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static org.jsoup.Jsoup.connect;
/**
 * Exploratory JUnit tests that demonstrate common jsoup operations: parsing an
 * HTML string, fetching remote pages, CSS-style selection, whitelist based
 * cleaning and reading response headers.
 *
 * <p>Most tests fetch live pages and only print what they find; they do not
 * assert on the content.
 */
public class JSoupTest {

    /** Regular expression source for a dotted host name ending in a 2-15 letter alphabetic label. */
    private static final String DOMAIN_NAME_PATTERN = "([a-zA-Z0-9]([a-zA-Z0-9\\-]{0,61}[a-zA-Z0-9])?\\.)+[a-zA-Z]{2,15}";

    /** Compiled once; {@link Pattern} instances are immutable and can be shared freely. */
    private static final Pattern DOMAIN_NAME_REGEX = Pattern.compile(DOMAIN_NAME_PATTERN);

    /**
     * Extracts the first domain name that occurs in the given text.
     *
     * @param url text to search, typically a URL or an href value; may be {@code null}
     * @return the first match of {@link #DOMAIN_NAME_PATTERN}, lower-cased, or an empty
     *         string when {@code url} is {@code null} or contains no match
     */
    public static String getDomainName(String url) {
        if (url == null) {
            return "";
        }
        // The matcher is kept local: a Matcher is stateful and must not be shared.
        Matcher domainMatcher = DOMAIN_NAME_REGEX.matcher(url);
        if (!domainMatcher.find()) {
            return "";
        }
        // Locale.ROOT keeps the result independent of the default locale
        // (for example, a Turkish locale would map 'I' to a dotless 'ı').
        return domainMatcher.group(0).toLowerCase(java.util.Locale.ROOT);
    }

    /**
     * Parses an in-memory HTML string and prints its title, body text and the
     * text of the element with id {@code mydiv}.
     */
    @Test
    public void test() {
        // Note that the div is placed after the closing </html> tag in this input.
        String htmlString = "<html><head><title>My title</title></head>"
                + "<body>Body content</body></html>"
                + "<div id='mydiv'>Contents of a div element</div>";
        Document doc = Jsoup.parse(htmlString);

        String title = doc.title();
        String body = doc.body().text();
        Element divTag = doc.getElementById("mydiv");

        System.out.printf("title : %s%n", title);
        System.out.printf("body : %s%n", body);
        // Terminated with %n like the lines above so later output starts on its own line.
        System.out.printf("divTag : %s%n", divTag.text());
    }

    /**
     * Fetches https://www.google.com and prints the page title followed by the
     * full HTML of the parsed document.
     *
     * @throws IOException if the page cannot be fetched
     */
    @Test
    public void connect테스트() throws IOException {
        String url = "https://www.google.com";
        Document doc = connect(url).get();

        String title = doc.title();
        System.out.printf("title : %s%n", title);

        String html = doc.html();
        System.out.printf("html : %s%n", html);
    }

    /**
     * Fetches http://www.jsoup.org and prints the {@code content} attribute of
     * its description and keywords meta tags.
     *
     * @throws IOException if the page cannot be fetched
     */
    @Test
    public void query() throws IOException {
        String url = "http://www.jsoup.org";
        Document document = connect(url).get();

        // Elements.attr(...) returns an empty string when nothing matched, whereas
        // first() returns null for an empty selection and would fail with an NPE.
        String description = document.select("meta[name=description]").attr("content");
        System.out.println("description : " + description);

        String keywords = document.select("meta[name=keywords]").attr("content");
        System.out.println("keywords : " + keywords);
    }

    /**
     * Fetches http://jsoup.org and prints the href and text of every anchor
     * that carries an {@code href} attribute.
     *
     * @throws IOException if the page cannot be fetched
     */
    @Test
    public void link파싱() throws IOException {
        String url = "http://jsoup.org";
        Document document = connect(url).get();

        Elements links = document.select("a[href]");
        for (Element link : links) {
            System.out.println("link : " + link.attr("href"));
            System.out.println("text : " + link.text());
        }
    }

    /**
     * Checks an HTML string against the basic whitelist and, when it is not
     * valid, prints a copy cleaned with the same whitelist.
     */
    @Test
    public void html문법검증() {
        String htmlString = "<html><head><title>My title</title></head>"
                + "<body><center>Body content</center></body></html>";

        boolean valid = Jsoup.isValid(htmlString, Whitelist.basic());
        if (valid) {
            System.out.println("The document is valid");
        } else {
            System.out.println("The document is not valid.");
            System.out.println("Cleaned document");
            Document dirtyDoc = Jsoup.parse(htmlString);
            Document cleanDoc = new Cleaner(Whitelist.basic()).clean(dirtyDoc);
            System.out.println(cleanDoc.html());
        }
    }

    /**
     * Fetches https://naver.com and prints src, height, width and alt for every
     * {@code img} whose src matches, case-insensitively, a png, jpg, jpeg or gif
     * extension.
     *
     * @throws IOException if the page cannot be fetched
     */
    @Test
    public void 모든이미지정보() throws IOException {
        Document doc = connect("https://naver.com").get();

        Elements images = doc.select("img[src~=(?i)\\.(png|jpe?g|gif)]");
        for (Element image : images) {
            System.out.println("\nsrc : " + image.attr("src"));
            System.out.println("height : " + image.attr("height"));
            System.out.println("width : " + image.attr("width"));
            System.out.println("alt : " + image.attr("alt"));
        }
    }

    /**
     * Fetches https://naver.com and prints the name and value of every
     * {@code input} found under the element with id {@code sform}.
     *
     * @throws IOException if the page cannot be fetched
     */
    @Test
    public void form의input정보추출() throws IOException {
        Document doc = connect("https://naver.com").get();

        Element formElement = doc.getElementById("sform");
        if (formElement == null) {
            // getElementById returns null when the id is absent; the remote markup
            // is outside our control, so report it instead of failing with an NPE.
            System.out.println("No element with id 'sform' found; nothing to list.");
            return;
        }

        Elements inputElements = formElement.getElementsByTag("input");
        for (Element inputElement : inputElements) {
            String key = inputElement.attr("name");
            String value = inputElement.attr("value");
            System.out.println("Param name: "+key+" \nParam value: "+value);
        }
    }

    /**
     * Runs a Google search for "spring cloud" and prints the distinct domain
     * names found in the result links.
     *
     * @throws IOException if the page cannot be fetched
     */
    @Test
    public void 구글search() throws IOException {
        // The query is already percent-encoded by hand.
        String query = "spring%20cloud";
        String url = "https://www.google.com/search?q=" + query;

        Document doc = Jsoup
                .connect(url)
                .userAgent("Jsoup client")
                .timeout(5000).get();

        Elements links = doc.select("a[href]");
        Set<String> result = new HashSet<>();
        for (Element link : links) {
            String attr1 = link.attr("href");
            String attr2 = link.attr("class");
            // Keep only redirect style hrefs and skip anchors whose class starts with "_Zkb".
            // NOTE(review): the meaning of the "_Zkb" class is not visible here - confirm.
            if (!attr2.startsWith("_Zkb") && attr1.startsWith("/url?q=")) {
                result.add(getDomainName(attr1));
            }
        }

        for (String el : result) {
            System.out.println(el);
        }
    }

    /**
     * Issues a GET request to https://naver.com with redirect following
     * disabled and prints the response headers.
     *
     * @throws IOException if the request fails
     */
    @Test
    public void 헤더정보() throws IOException {
        Connection.Response response = Jsoup
                .connect("https://naver.com")
                .method(Connection.Method.GET)
                .followRedirects(false)
                .execute();

        Map<String, String> headers = response.headers();
        System.out.println(headers);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment