Last active
October 29, 2018 13:11
-
-
Save ayaysir/69e97a04db5df992f4f3465e93aa7737 to your computer and use it in GitHub Desktop.
Jsoup example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.springboot.morse; | |
import java.io.IOException; | |
import java.util.ArrayList; | |
import java.util.Arrays; | |
import java.util.List; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.select.Elements; | |
import org.springframework.stereotype.Controller; | |
import org.springframework.ui.Model; | |
import org.springframework.web.bind.annotation.RequestMapping; | |
import org.springframework.web.bind.annotation.ResponseBody; | |
import com.springboot.morse.dto.NamuLink; | |
@Controller | |
public class NamuController { | |
@RequestMapping("/namuList") | |
public String namuListWeb(Model model) throws IOException { | |
model.addAttribute("list", namuList()); | |
return "namuList"; | |
} | |
public List<NamuLink> namuList() throws IOException{ | |
List<NamuLink> list = new ArrayList<>(); | |
Document doc = Jsoup.connect("https://namu.wiki/w/" | |
+ "%EC%95%84%EC%9D%B4%EB%8F%8C%20%EB%A7%88%" | |
+ "EC%8A%A4%ED%84%B0%20%EC%8B%A0%EB%8D%B0%EB" | |
+ "%A0%90%EB%9D%BC%20%EA%B1%B8%EC%A6%88%20%E" | |
+ "C%8A%A4%ED%83%80%EB%9D%BC%EC%9D%B4%ED%8A%" | |
+ "B8%20%EC%8A%A4%ED%85%8C%EC%9D%B4%EC%A7%80/" | |
+ "%EC%88%98%EB%A1%9D%EA%B3%A1").get(); | |
Elements els = doc.getElementsByClass("wiki-table"); | |
int ix = 1; | |
for(Element el : els) { | |
Elements subEls = el.getElementsByClass("wiki-link-internal"); | |
if(ix != 1) { | |
for(Element subEl : subEls) { | |
list.add(new NamuLink(subEl.text(), subEl.attr("href"))); | |
} | |
} | |
ix++; | |
} | |
return list; | |
} | |
@ResponseBody | |
@RequestMapping("/viewAllLyrics") | |
public String viewAllLyrics() throws IOException { | |
StringBuilder sb = new StringBuilder(); | |
List<NamuLink> list = namuList(); | |
list.forEach(el -> { | |
try { | |
sb.append(findLyrics(el.getUrl(), false)); | |
} catch (IOException e) { | |
System.err.println("viewAllLyrics: " + e); | |
} | |
try { | |
Thread.sleep(2500); | |
} catch (InterruptedException e) { | |
e.printStackTrace(); | |
} | |
}); | |
return sb.toString(); | |
} | |
@ResponseBody | |
@RequestMapping("/findLyrics") | |
public String findLyrics(String url, boolean forQuizlet) throws IOException { | |
StringBuilder sb = new StringBuilder(); | |
Document doc = Jsoup.connect("https://namu.wiki" + url).get(); | |
String title = "<h1>" + doc.getElementsByTag("title").text().replace(" - 나무위키", "") + "</h1>"; | |
try { | |
doc.select("br").append("\\n"); // to line break | |
doc.getElementsContainingOwnText("아이돌 마스터 신데렐라 걸즈/음악").get(0).text(); | |
Elements els = doc.getElementsByClass("wiki-heading"); | |
els.forEach(el -> { | |
// System.out.println(el.text()); | |
if(el.text().contains("가사[편집]")) { | |
String[] arr = el.text().split(" "); | |
// System.out.println(arr[0]); | |
// System.out.println(el); | |
int subIndex = 1; | |
while(true) { | |
if(doc.getElementById("s-" + arr[0] + subIndex) == null) { | |
if(subIndex == 1) { | |
Element sibling = (Element) el.nextSibling(); | |
Elements subEl = sibling.getElementsByTag("tbody"); | |
subEl.forEach(t -> { | |
// System.out.println(t.text()); | |
sb.append(t.text().replace("\\n", "<br>")); | |
}); | |
break; | |
} | |
else break; | |
} | |
Element subEl = doc.getElementById("s-" + arr[0] + subIndex); | |
// System.out.println(subEl); | |
System.out.println(subEl.nextSibling()); | |
sb.append("<h3>" + subEl.nextSibling() + "</h3>"); | |
Element sib = (Element) subEl.parent().nextSibling(); | |
String lyric = sib.getElementsByTag("tbody").text(); | |
// System.out.println(lyric); | |
sb.append(lyric.replace("\\n", "<br>").replaceAll("\\[([a-zA-Z0-9ㄱ-힣]*)\\]", "")); | |
subIndex++; | |
} | |
} | |
}); | |
} catch (Exception e) { | |
System.err.println(e); | |
return "**ERROR**"; | |
} | |
String outputStr = null; | |
System.out.println("Int: " + forQuizlet); | |
if(!forQuizlet) { | |
outputStr = title + sb.toString(); | |
} else { | |
List<String> cha = new ArrayList<>(Arrays.asList(sb.toString().trim().split("(<br>|</h3>)"))); | |
cha.add(" "); | |
StringBuilder subSb = new StringBuilder(); | |
String regex = "\\[([a-zA-Z0-9ㄱ-힣]*)\\]"; | |
/* | |
* 문단이 2줄인 경우 영어 가사일 가능성이 높거나 잘못 기입되었을수도 있음. | |
* 문단이 4줄 이상인 경우 다시 분석. | |
*/ | |
int para = 0; | |
for(int i = 0; i < cha.size(); i++) { | |
if(cha.get(i).contains("<h3>")) { | |
para = 0; | |
continue; | |
} | |
if(cha.get(i).equals(" ") || cha.get(i).equals(" ") || cha.get(i).equals("") || cha.get(i) == null) { | |
if(para == 3) { | |
System.out.println("PARA 3"); | |
subSb.append(cha.get(i - 3).trim().replaceAll(regex, "") + "\t"); | |
subSb.append(cha.get(i - 1).trim().replaceAll(regex, "") + " [" + cha.get(i - 2).trim().replaceAll(regex, "") + "]\n"); | |
} else if (para < 3) { | |
para = 0; | |
continue; | |
} else if (para > 3) { | |
if(para % 3 != 0) { | |
subSb.append("\n========= CHECK SHIYO =========\n"); | |
for(int x = para; x > 0; x--) { | |
subSb.append(cha.get(i - x).trim().replaceAll(regex, "") + "\n"); | |
} | |
subSb.append("===============================\n\n"); | |
} else { | |
System.out.println("PARA " + para); | |
int divi = para / 3; | |
for(int x = divi; x > 0; x--) { | |
subSb.append(cha.get(i - (3*x)).trim().replaceAll(regex, "") + "\t"); | |
subSb.append(cha.get(i - (3*x-2)).trim().replaceAll(regex, "") + " [" + cha.get(i - (3*x-1)).trim().replaceAll(regex, "") + "]\n"); | |
} | |
} | |
} | |
para = 0; | |
} else { | |
System.out.println(cha.get(i).replaceAll(regex, "")); | |
para++; | |
} | |
} | |
System.out.println(subSb.toString()); | |
outputStr = title + "<p><a href='https://quizlet.com/create-set' target=_blank>Create-set</a></p>" | |
+ subSb.toString().replace("\n", "<br>"); | |
} | |
return outputStr; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment