Skip to content

Instantly share code, notes, and snippets.

@ayaysir
Last active October 29, 2018 13:11
Show Gist options
  • Save ayaysir/69e97a04db5df992f4f3465e93aa7737 to your computer and use it in GitHub Desktop.
Save ayaysir/69e97a04db5df992f4f3465e93aa7737 to your computer and use it in GitHub Desktop.
Jsoup example
package com.springboot.morse;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.ResponseBody;
import com.springboot.morse.dto.NamuLink;
/**
 * Spring MVC controller that scrapes a song list and song lyrics from
 * namu.wiki with jsoup and returns them as HTML fragments.
 */
@Controller
public class NamuController {

    /** Scheme and host every scraped page is fetched from. */
    private static final String BASE_URL = "https://namu.wiki";

    /** Percent-encoded address of the wiki page whose tables list the songs. */
    private static final String SONG_LIST_URL = BASE_URL + "/w/"
            + "%EC%95%84%EC%9D%B4%EB%8F%8C%20%EB%A7%88%"
            + "EC%8A%A4%ED%84%B0%20%EC%8B%A0%EB%8D%B0%EB"
            + "%A0%90%EB%9D%BC%20%EA%B1%B8%EC%A6%88%20%E"
            + "C%8A%A4%ED%83%80%EB%9D%BC%EC%9D%B4%ED%8A%"
            + "B8%20%EC%8A%A4%ED%85%8C%EC%9D%B4%EC%A7%80/"
            + "%EC%88%98%EB%A1%9D%EA%B3%A1";

    /** Text that must appear somewhere on a page for it to be accepted by {@link #findLyrics}. */
    private static final String SONG_PAGE_MARKER = "아이돌 마스터 신데렐라 걸즈/음악";

    /** Body returned by {@link #findLyrics} when a page cannot be processed. */
    private static final String ERROR_RESPONSE = "**ERROR**";

    /** Pause between two consecutive page fetches in {@link #viewAllLyrics()}. */
    private static final long REQUEST_DELAY_MILLIS = 2500L;

    /**
     * Literal backslash-n marker injected at every {@code <br>} so that line breaks
     * survive jsoup's whitespace-normalising {@code text()}.
     */
    private static final String LINE_BREAK_MARKER = "\\n";

    /** Bracketed annotations such as {@code [abc]} that are stripped from lyric lines. */
    private static final Pattern ANNOTATION = Pattern.compile("\\[([a-zA-Z0-9ㄱ-힣]*)\\]");

    /** Delimiters used to cut the generated lyrics HTML back into individual lines. */
    private static final Pattern LINE_DELIMITER = Pattern.compile("(<br>|</h3>)");

    /**
     * Renders the song list view.
     *
     * @param model receives the scraped links under the attribute {@code "list"}
     * @return the view name {@code "namuList"}
     * @throws IOException if the song list page cannot be fetched
     */
    @RequestMapping("/namuList")
    public String namuListWeb(Model model) throws IOException {
        model.addAttribute("list", namuList());
        return "namuList";
    }

    /**
     * Fetches the song list page and collects every internal wiki link found in
     * its {@code wiki-table} elements, except those of the first table.
     *
     * @return link text / href pairs in document order; never {@code null}
     * @throws IOException if the page cannot be fetched
     */
    public List<NamuLink> namuList() throws IOException {
        Document doc = Jsoup.connect(SONG_LIST_URL).get();
        Elements tables = doc.getElementsByClass("wiki-table");
        List<NamuLink> links = new ArrayList<>();
        // The first table is skipped on purpose (presumably it is not a song table -- confirm).
        for (int i = 1; i < tables.size(); i++) {
            for (Element anchor : tables.get(i).getElementsByClass("wiki-link-internal")) {
                links.add(new NamuLink(anchor.text(), anchor.attr("href")));
            }
        }
        return links;
    }

    /**
     * Fetches the lyrics of every song returned by {@link #namuList()} and
     * concatenates them. A page that fails with an {@link IOException} is reported
     * on stderr and skipped. If the thread is interrupted while waiting between
     * requests, the interrupt flag is restored and the lyrics gathered so far are
     * returned.
     *
     * @return the concatenated HTML of all pages that could be processed
     * @throws IOException if the song list itself cannot be fetched
     */
    @ResponseBody
    @RequestMapping("/viewAllLyrics")
    public String viewAllLyrics() throws IOException {
        StringBuilder html = new StringBuilder();
        boolean firstRequest = true;
        for (NamuLink link : namuList()) {
            if (!firstRequest) {
                // Throttle between requests only; there is nothing to wait for after the last one.
                try {
                    Thread.sleep(REQUEST_DELAY_MILLIS);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    System.err.println("viewAllLyrics: interrupted, returning partial result");
                    break;
                }
            }
            firstRequest = false;
            try {
                html.append(findLyrics(link.getUrl(), false));
            } catch (IOException e) {
                System.err.println("viewAllLyrics: " + e);
            }
        }
        return html.toString();
    }

    /**
     * Fetches one wiki page and extracts the lyrics sections from it.
     *
     * @param url        site-relative path of the page (must start with {@code /});
     *                   it is appended to {@code https://namu.wiki}
     * @param forQuizlet when {@code false} the lyrics are returned as HTML; when
     *                   {@code true} they are regrouped into tab-separated rows
     *                   preceded by a link to Quizlet's create-set page
     * @return the page title as {@code <h1>} followed by the lyrics, or
     *         {@code "**ERROR**"} if {@code url} is not a site-relative path, the
     *         page lacks the expected marker text, or parsing fails
     * @throws IOException if the page cannot be fetched
     */
    @ResponseBody
    @RequestMapping("/findLyrics")
    public String findLyrics(String url, boolean forQuizlet) throws IOException {
        // The value comes from the request. Anything not starting with '/' (for example
        // "@other.host/" or ".other.host/") would change the host that is contacted.
        if (url == null || !url.startsWith("/")) {
            System.err.println("findLyrics: url must be a site-relative path starting with '/'");
            return ERROR_RESPONSE;
        }

        Document doc = Jsoup.connect(BASE_URL + url).get();
        String title = "<h1>"
                + escapeHtml(doc.getElementsByTag("title").text().replace(" - 나무위키", ""))
                + "</h1>";

        StringBuilder lyrics = new StringBuilder();
        try {
            doc.select("br").append(LINE_BREAK_MARKER); // to line break
            if (doc.getElementsContainingOwnText(SONG_PAGE_MARKER).isEmpty()) {
                System.err.println("findLyrics: marker text not found on page");
                return ERROR_RESPONSE;
            }
            for (Element heading : doc.getElementsByClass("wiki-heading")) {
                if (heading.text().contains("가사[편집]")) {
                    appendLyricsSection(doc, heading, lyrics);
                }
            }
        } catch (RuntimeException e) {
            // Boundary catch: any unexpected page structure yields the error body.
            System.err.println(e);
            return ERROR_RESPONSE;
        }

        if (!forQuizlet) {
            return title + lyrics;
        }
        return title
                + "<p><a href='https://quizlet.com/create-set' target=_blank>Create-set</a></p>"
                + toQuizletRows(lyrics.toString()).replace("\n", "<br>");
    }

    /**
     * Appends the lyrics that belong to one lyrics heading. If the section has
     * numbered sub-headings (element ids {@code s-<section>1}, {@code s-<section>2}, ...)
     * each one is emitted as {@code <h3>} followed by its lyrics with bracketed
     * annotations removed; otherwise the tables following the heading itself are
     * emitted with annotations left in place.
     */
    private static void appendLyricsSection(Document doc, Element heading, StringBuilder out) {
        // The first token of the heading text is its section number.
        String idPrefix = "s-" + heading.text().split(" ")[0];
        int subIndex = 1;
        Element anchor = doc.getElementById(idPrefix + subIndex);

        if (anchor == null) {
            // nextElementSibling() skips text nodes, which a cast of nextSibling() cannot handle.
            Element body = heading.nextElementSibling();
            if (body != null) {
                for (Element tbody : body.getElementsByTag("tbody")) {
                    out.append(toHtmlLines(tbody.text()));
                }
            }
            return;
        }

        while (anchor != null) {
            // The node right after the anchor is used as the sub-heading caption (rendered as HTML).
            out.append("<h3>" + anchor.nextSibling() + "</h3>");
            Element body = anchor.parent().nextElementSibling();
            if (body != null) {
                String text = body.getElementsByTag("tbody").text();
                out.append(ANNOTATION.matcher(toHtmlLines(text)).replaceAll(""));
            }
            subIndex++;
            anchor = doc.getElementById(idPrefix + subIndex);
        }
    }

    /**
     * Regroups lyrics HTML into rows for Quizlet. Lines are split on {@code <br>}
     * and {@code </h3>}; a paragraph is a run of consecutive non-blank lines.
     * <ul>
     * <li>Paragraphs shorter than 3 lines are dropped (a 2-line paragraph is
     *     probably English lyrics or a data-entry mistake).</li>
     * <li>Paragraphs whose length is a multiple of 3 are emitted per triplet as
     *     {@code line1 TAB line3 [line2]}.</li>
     * <li>Any other paragraph of 4 or more lines is emitted verbatim inside a
     *     "CHECK SHIYO" block so it can be reviewed by hand.</li>
     * </ul>
     */
    private static String toQuizletRows(String lyricsHtml) {
        List<String> lines = new ArrayList<>(Arrays.asList(LINE_DELIMITER.split(lyricsHtml.trim())));
        lines.add(" "); // sentinel separator so the last paragraph is flushed

        StringBuilder rows = new StringBuilder();
        int run = 0; // number of consecutive non-blank lines immediately before index i
        for (int i = 0; i < lines.size(); i++) {
            String line = lines.get(i);
            if (line.contains("<h3>")) {
                run = 0;
                continue;
            }
            if (!isSeparator(line)) {
                run++;
                continue;
            }
            if (run >= 3 && run % 3 == 0) {
                for (int start = i - run; start < i; start += 3) {
                    rows.append(clean(lines.get(start))).append("\t")
                            .append(clean(lines.get(start + 2)))
                            .append(" [").append(clean(lines.get(start + 1))).append("]\n");
                }
            } else if (run > 3) {
                rows.append("\n========= CHECK SHIYO =========\n");
                for (int j = i - run; j < i; j++) {
                    rows.append(clean(lines.get(j))).append("\n");
                }
                rows.append("===============================\n\n");
            }
            run = 0;
        }
        return rows.toString();
    }

    /** Returns whether a split line marks the end of a paragraph (empty or a single space). */
    private static boolean isSeparator(String line) {
        return line.isEmpty() || " ".equals(line);
    }

    /** Trims a line and removes bracketed annotations from it. */
    private static String clean(String line) {
        return ANNOTATION.matcher(line.trim()).replaceAll("");
    }

    /** Escapes scraped text and turns the injected line-break markers into {@code <br>}. */
    private static String toHtmlLines(String text) {
        return escapeHtml(text).replace(LINE_BREAK_MARKER, "<br>");
    }

    /** Escapes the characters that are significant in HTML element content. */
    private static String escapeHtml(String text) {
        return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment