Last active
November 27, 2017 03:20
-
-
Save seraekim/d08a69c3feb375e5f9b97ee956c2549d to your computer and use it in GitHub Desktop.
Search both the row and column headers of multiple tables in an HTML page fetched by a request
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.srkim.test; | |
import java.io.IOException; | |
import java.util.ArrayList; | |
import java.util.List; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.select.Elements; | |
/** | |
* Search both row and col header of multiple tables in a html request. | |
* even can search spans. | |
* | |
* there are some functions to add that are currently not applicable. see the following: | |
* | |
* <blockquote> | |
* <ol> | |
* <li>thead, tbody and th</li> | |
* <li>vertical and horizontal scroll fixed row and col header resulting in one shown table having multiple tables</li> | |
* </ol> | |
* </blockquote> | |
* @author srkim | |
* | |
*/ | |
public class TableParser { | |
static String[] rowHeaders = {"정정일자","1. 정정관련 공시서류","2. 정정관련 공시서류제출일","3. 정정사유","4. 정정사항" | |
,"1. 일시","3. 의안 주요내용","4. 이사회결의일(결정일)","2. 장소","-사외이사 참석여부","-감사(사외이사가 아닌 감사위원) 참석여부" | |
,"-주주총회 구분","※관련공시","사업목적 추가","사업목적 변경","사업목적 삭제", "날짜", "시간", "참석(명)", "불참(명)"}; | |
static String[] colHeaders = {"정정항목","정정전","정정후","성명","출생년월","임기","신규선임여부","주요경력(현직포함)" | |
,"이사 등으로 재직 중인 다른 법인명(직위)","상근여부","구분","내용","이유","변경전","변경후","5. 기타 투자판단에 참고할 사항"}; | |
public static void main(String[] args) throws IOException { | |
// String html = "<!doctype html><html><head><meta charset=\"UTF-8\">" | |
// + "<title>Untitled Document</title><style>table * { border: solid 1px red;}</style></head>" | |
// + "<body><table><tr><td rowspan=\"2\">l1</td><td colspan=\"2\">t1</td><td colspan=\"2\">t2</td></tr>" | |
// + "<tr><td>1</td><td>2</td><td>3</td><td>4</td></tr><tr><td>l2</td><td>6</td><td>7</td><td>8</td><td>9</td>" | |
// + "</tr></table></body></html>"; | |
String urlStr = "http://dart.fss.or.kr/report/viewer.do?rcpNo=20171121900601&dcmNo=5855132&eleId=0&offset=0&length=0&dtd=HTML"; | |
Document document = Jsoup.connect(urlStr).get(); | |
Elements tables = document.select("table"); | |
for(int i = 0; i < tables.size(); i++) { | |
System.out.println("start table idx : " + i); | |
Elements rows = tables.eq(i).select("tr"); | |
List<List<Object>> parsedSingleTable = splitSpans(rows); | |
// 검색 해보기 | |
search(parsedSingleTable, "- 제2-8호 의안", false); | |
search(parsedSingleTable, "임시주주총회 소집일시 및 안건 변경", true); // there is only row header. | |
search(parsedSingleTable, "전) 주식회사 SBT 투자 사장", false); // there are both headers. | |
} | |
} | |
/** | |
* split spans scanning rows | |
* | |
* @param rows | |
* @return parsed double list of split colspans and colspans | |
* | |
*/ | |
public static List<List<Object>> splitSpans(Elements rows) { | |
int rowCount = rows.size(); | |
// System.out.println("rowCount : " + rowCount); | |
List<List<Object>> r = new ArrayList<List<Object>>(); | |
for (int i = 0; i < rowCount; i++) { | |
r.add(new ArrayList<Object>()); | |
} | |
for (int i = 0; i < rowCount; i++) { | |
Elements cols = rows.get(i).select("td"); | |
int colCount = cols.size(); | |
for (int j = 0; j < colCount; j++) { | |
String text = cols.get(j).text(); | |
String rSpanStr = cols.get(j).attr("rowspan"); | |
int rSpan = 1; | |
if (rSpanStr.length() != 0) { | |
rSpan = Integer.parseInt(rSpanStr); | |
} | |
String cSpanStr = cols.get(j).attr("colspan"); | |
int cSpan = 1; | |
if (cSpanStr.length() != 0) { | |
cSpan = Integer.parseInt(cSpanStr); | |
} | |
r.get(i).add(text); | |
for (int k = 1; k < rSpan; k++) { | |
r.get(i + k).add(text); | |
} | |
for (int k = 1; k < cSpan; k++) { | |
r.get(i).add(text); | |
} | |
} | |
} | |
return r; | |
} | |
/** | |
* search row and col header of a keyword in tds | |
* | |
* @param r double list of rows and cols of a table | |
* @param keyword search word | |
* @param rowHeaderIdx designate row header index | |
* @param colHeaderIdx designate col header index | |
* @param isPerfectMatch perfect equals literally otherwise contains filter applied | |
*/ | |
public static void search(List<List<Object>> r, String keyword, boolean isPerfectMatch) { | |
// scan and print all elements of rows | |
if(keyword == null || keyword.equals("")) { | |
for (int i = 0; i < r.size(); i++) { | |
for (int j = 0; j < r.get(0).size(); j++) { | |
System.out.print(r.get(i).get(j) + "(" + i + "," + j + ") "); | |
} | |
System.out.println(); | |
} | |
} else { | |
for (int i = 0; i < r.size(); i++) { | |
for (int j = 0; j < r.get(0).size(); j++) { | |
String v = (String) r.get(i).get(j); | |
if(isPerfectMatch) { | |
if(v.equals(keyword)) { | |
printSearchResult(r, i, j); | |
break; // prevent from duplicate search by split colspans | |
} | |
} else { | |
if(v.contains(keyword)) { | |
printSearchResult(r, i, j); | |
break; // prevent from duplicate search by split colspans | |
} | |
} | |
} | |
} | |
} | |
} | |
/** | |
* print headers with their indexes | |
* | |
* @param r | |
* @param i | |
* @param j | |
* @param rowHeaderIdx | |
* @param colHeaderIdx | |
*/ | |
public static void printSearchResult(List<List<Object>> r, int i, int j, Integer rowHeaderIdx, Integer colHeaderIdx) { | |
String rowHeader = ""; | |
String colHeader = ""; | |
if(rowHeaderIdx != null) { | |
rowHeader = "[rowhead " + r.get(i).get(rowHeaderIdx) + "]"; | |
} | |
if(colHeaderIdx != null) { | |
colHeader = "[colhead " + r.get(colHeaderIdx).get(j) + "]"; | |
} | |
System.out.println("(" + i + "," + j + ") " + rowHeader + colHeader + " ====> " + r.get(i).get(j)); | |
} | |
/** | |
* print headers with predefined dics. | |
* @param r | |
* @param i | |
* @param j | |
* @param rowHeaderIdx | |
* @param colHeaderIdx | |
*/ | |
public static void printSearchResult(List<List<Object>> r, int i, int j) { | |
String rowHeader = "[]"; | |
String colHeader = "[]"; | |
// row scan | |
for (String rowName : rowHeaders) { | |
if (r.get(i).get(0).equals(rowName)) { | |
rowHeader = "["+rowName+"]"; | |
break; | |
} | |
} | |
/* | |
* col san. | |
* scan the past indexes of row and col with i,j as starting position. | |
*/ | |
for (int k = i - 1; k >= 0; k--) { | |
if (colHeader.equals("[]")) { | |
for (String colName : colHeaders) { | |
if (r.get(k).get(j).equals(colName)) { | |
colHeader = "["+colName+"]"; | |
break; | |
} | |
} | |
} else { | |
// System.out.println(colHeader); | |
break; | |
} | |
} | |
System.out.println("(" + i + "," + j + ") " + rowHeader + colHeader + " ====> " + r.get(i).get(j)); | |
} | |
/* | |
* execution result.... | |
* | |
* start table idx : 0 | |
start table idx : 1 | |
(6,1) [3. 의안 주요내용][정정전] ====> <부의안건> 제1호 의안 : 정관 일부 변경의 건 - ... | |
(2,1) [3. 정정사유][] ====> 임시주주총회 소집일시 및 안건 변경 | |
start table idx : 2 | |
start table idx : 3 | |
start table idx : 4 | |
(3,4) [][주요경력(현직포함)] ====> 전) 주식회사 SBT 투자 사장 현) 한국감사협회 이사 현) GRC코리아 감사 | |
start table idx : 5 | |
start table idx : 6 | |
start table idx : 7 | |
*/ | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment