Skip to content

Instantly share code, notes, and snippets.

@seraekim
Last active November 27, 2017 03:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save seraekim/d08a69c3feb375e5f9b97ee956c2549d to your computer and use it in GitHub Desktop.
Save seraekim/d08a69c3feb375e5f9b97ee956c2549d to your computer and use it in GitHub Desktop.
Search both the row and column headers of multiple tables in the HTML returned by a request
package org.srkim.test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Searches the cells of every table in an HTML document and reports, for each
 * hit, the row header and column header it belongs to.
 *
 * <p>Each table is first flattened into a grid (a list of rows, each a list of
 * cell texts) in which a cell carrying a {@code rowspan} and/or {@code colspan}
 * is copied into every grid slot it covers. Headers are then looked up in that
 * grid, either by explicit index or through the predefined label dictionaries
 * {@link #rowHeaders} and {@link #colHeaders}.
 *
 * <p>Known limitations:
 *
 * <blockquote>
 * <ol>
 * <li>thead, tbody and th get no special treatment; only {@code td} cells are read</li>
 * <li>layouts that fake fixed (scroll-locked) row and column headers by rendering
 * one visible table as several {@code table} elements are handled as separate tables</li>
 * </ol>
 * </blockquote>
 *
 * @author srkim
 */
public class TableParser {

    /**
     * Known row-header labels. A hit's row header is reported when the first
     * cell of its row equals one of these labels.
     */
    static String[] rowHeaders = {"정정일자","1. 정정관련 공시서류","2. 정정관련 공시서류제출일","3. 정정사유","4. 정정사항"
            ,"1. 일시","3. 의안 주요내용","4. 이사회결의일(결정일)","2. 장소","-사외이사 참석여부","-감사(사외이사가 아닌 감사위원) 참석여부"
            ,"-주주총회 구분","※관련공시","사업목적 추가","사업목적 변경","사업목적 삭제", "날짜", "시간", "참석(명)", "불참(명)"};

    /**
     * Known column-header labels. A hit's column header is the nearest cell
     * above it, in the same column, that equals one of these labels.
     */
    static String[] colHeaders = {"정정항목","정정전","정정후","성명","출생년월","임기","신규선임여부","주요경력(현직포함)"
            ,"이사 등으로 재직 중인 다른 법인명(직위)","상근여부","구분","내용","이유","변경전","변경후","5. 기타 투자판단에 참고할 사항"};

    /** Document fetched by {@link #main(String[])} when no URL is given on the command line. */
    private static final String DEFAULT_URL =
            "http://dart.fss.or.kr/report/viewer.do?rcpNo=20171121900601&dcmNo=5855132&eleId=0&offset=0&length=0&dtd=HTML";

    /**
     * Upper bound applied to {@code colspan}; the HTML standard clamps the
     * attribute to 1000, and the cap keeps a bogus value from exhausting memory.
     */
    private static final int MAX_COL_SPAN = 1000;

    /**
     * Fetches a document, flattens each of its tables and runs three sample
     * searches against every table.
     *
     * @param args optional; {@code args[0]} is the URL to fetch, otherwise the
     *             built-in sample URL is used
     * @throws IOException if the document cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        // Offline sample for experimenting with span handling (feed it to Jsoup.parse):
        // String html = "<!doctype html><html><head><meta charset=\"UTF-8\">"
        // + "<title>Untitled Document</title><style>table * { border: solid 1px red;}</style></head>"
        // + "<body><table><tr><td rowspan=\"2\">l1</td><td colspan=\"2\">t1</td><td colspan=\"2\">t2</td></tr>"
        // + "<tr><td>1</td><td>2</td><td>3</td><td>4</td></tr><tr><td>l2</td><td>6</td><td>7</td><td>8</td><td>9</td>"
        // + "</tr></table></body></html>";
        String urlStr = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;
        Document document = Jsoup.connect(urlStr).get();
        Elements tables = document.select("table");
        for (int i = 0; i < tables.size(); i++) {
            System.out.println("start table idx : " + i);
            Elements rows = tables.eq(i).select("tr");
            List<List<Object>> parsedSingleTable = splitSpans(rows);
            // try a few searches
            search(parsedSingleTable, "- 제2-8호 의안", false);
            search(parsedSingleTable, "임시주주총회 소집일시 및 안건 변경", true); // there is only row header.
            search(parsedSingleTable, "전) 주식회사 SBT 투자 사장", false); // there are both headers.
        }
    }

    /**
     * Flattens table rows into a grid, copying the text of every spanning cell
     * into each slot it covers.
     *
     * <p>Every cell is placed at the first free column of its row, so slots
     * already reserved by a {@code rowspan} from a row above are skipped. Spans
     * that run past the last row are cut off, missing or malformed span values
     * count as 1, and slots that no cell covers end up as empty strings.
     *
     * @param rows the {@code tr} elements of one table
     * @return one list per row holding the cell texts; rows may differ in length
     *         when the table itself is ragged
     */
    public static List<List<Object>> splitSpans(Elements rows) {
        List<List<Object>> grid = new ArrayList<List<Object>>();
        if (rows == null) {
            return grid;
        }
        int rowCount = rows.size();
        for (int i = 0; i < rowCount; i++) {
            grid.add(new ArrayList<Object>());
        }
        for (int i = 0; i < rowCount; i++) {
            Elements cells = rows.get(i).select("td");
            for (int j = 0; j < cells.size(); j++) {
                String text = cells.get(j).text();
                int rowSpan = parseSpan(cells.get(j).attr("rowspan"), Integer.MAX_VALUE);
                int colSpan = parseSpan(cells.get(j).attr("colspan"), MAX_COL_SPAN);
                placeCell(grid, i, text, rowSpan, colSpan);
            }
        }
        fillHoles(grid);
        return grid;
    }

    /** Parses a span attribute; blank, non-numeric or non-positive values yield 1, large ones are capped at max. */
    private static int parseSpan(String raw, int max) {
        if (raw == null) {
            return 1;
        }
        String trimmed = raw.trim();
        if (trimmed.length() == 0) {
            return 1;
        }
        try {
            int span = Integer.parseInt(trimmed);
            if (span < 1) {
                return 1;
            }
            return Math.min(span, max);
        } catch (NumberFormatException ignored) {
            // a malformed attribute (e.g. "2px") is treated like a missing one
            return 1;
        }
    }

    /** Writes value into the rowSpan x colSpan block starting at the first free column of row rowIdx. */
    private static void placeCell(List<List<Object>> grid, int rowIdx, Object value, int rowSpan, int colSpan) {
        List<Object> firstRow = grid.get(rowIdx);
        // null marks a slot that was padded but not yet claimed by any cell
        int startCol = 0;
        while (startCol < firstRow.size() && firstRow.get(startCol) != null) {
            startCol++;
        }
        // clamp without computing rowIdx + rowSpan, which could overflow
        int coveredRows = Math.min(rowSpan, grid.size() - rowIdx);
        for (int dr = 0; dr < coveredRows; dr++) {
            List<Object> target = grid.get(rowIdx + dr);
            for (int dc = 0; dc < colSpan; dc++) {
                int col = startCol + dc;
                while (target.size() <= col) {
                    target.add(null);
                }
                // overlapping spans are a markup error; the cell that came first keeps the slot
                if (target.get(col) == null) {
                    target.set(col, value);
                }
            }
        }
    }

    /** Replaces slots that no cell claimed with empty strings so that callers never meet null. */
    private static void fillHoles(List<List<Object>> grid) {
        for (List<Object> row : grid) {
            for (int c = 0; c < row.size(); c++) {
                if (row.get(c) == null) {
                    row.set(c, "");
                }
            }
        }
    }

    /**
     * Searches a flattened table for a keyword and prints every hit together
     * with the headers found through the predefined dictionaries.
     *
     * <p>At most one hit is reported per row: scanning of a row stops at its
     * first match so that the copies produced by a split colspan are not
     * reported repeatedly. With a {@code null} or empty keyword nothing is
     * searched; instead every cell is printed with its coordinates.
     *
     * @param r double list of rows and cols of a table
     * @param keyword search word
     * @param isPerfectMatch {@code true} to require the cell text to equal the
     *        keyword, {@code false} to accept cells that merely contain it
     */
    public static void search(List<List<Object>> r, String keyword, boolean isPerfectMatch) {
        if (r == null) {
            return;
        }
        boolean dumpAll = keyword == null || keyword.equals("");
        for (int i = 0; i < r.size(); i++) {
            List<Object> row = r.get(i);
            // each row is walked over its own width: rows can be ragged
            if (dumpAll) {
                for (int j = 0; j < row.size(); j++) {
                    System.out.print(row.get(j) + "(" + i + "," + j + ") ");
                }
                System.out.println();
                continue;
            }
            for (int j = 0; j < row.size(); j++) {
                Object cell = row.get(j);
                if (cell == null) {
                    continue;
                }
                String v = cell.toString();
                boolean hit = isPerfectMatch ? v.equals(keyword) : v.contains(keyword);
                if (hit) {
                    printSearchResult(r, i, j);
                    break; // prevent from duplicate search by split colspans
                }
            }
        }
    }

    /**
     * Prints one cell with the headers found at explicitly given indexes.
     *
     * @param r double list of rows and cols of a table
     * @param i row index of the cell
     * @param j column index of the cell
     * @param rowHeaderIdx column that holds the row headers, or {@code null} to omit the row header
     * @param colHeaderIdx row that holds the column headers, or {@code null} to omit the column header
     */
    public static void printSearchResult(List<List<Object>> r, int i, int j, Integer rowHeaderIdx, Integer colHeaderIdx) {
        String rowHeader = "";
        String colHeader = "";
        if (rowHeaderIdx != null) {
            rowHeader = "[rowhead " + r.get(i).get(rowHeaderIdx) + "]";
        }
        if (colHeaderIdx != null) {
            colHeader = "[colhead " + r.get(colHeaderIdx).get(j) + "]";
        }
        System.out.println("(" + i + "," + j + ") " + rowHeader + colHeader + " ====> " + r.get(i).get(j));
    }

    /**
     * Prints one cell with the headers found through the predefined dictionaries.
     *
     * <p>The row header is the first cell of the row when it equals an entry of
     * {@link #rowHeaders}. The column header is the nearest cell above, in the
     * same column, that equals an entry of {@link #colHeaders}. A header that
     * cannot be determined is printed as {@code []}.
     *
     * @param r double list of rows and cols of a table
     * @param i row index of the cell
     * @param j column index of the cell
     */
    public static void printSearchResult(List<List<Object>> r, int i, int j) {
        String rowHeader = "[]";
        String colHeader = "[]";
        List<Object> row = r.get(i);

        // row scan
        if (!row.isEmpty()) {
            String rowName = findHeader(row.get(0), rowHeaders);
            if (rowName != null) {
                rowHeader = "[" + rowName + "]";
            }
        }

        // col scan: walk upwards from the row just above the hit
        for (int k = i - 1; k >= 0; k--) {
            List<Object> above = r.get(k);
            if (j >= above.size()) {
                continue; // a shorter row has no cell in this column
            }
            String colName = findHeader(above.get(j), colHeaders);
            if (colName != null) {
                colHeader = "[" + colName + "]";
                break;
            }
        }
        System.out.println("(" + i + "," + j + ") " + rowHeader + colHeader + " ====> " + row.get(j));
    }

    /** Returns the entry of names that equals cell, or null when there is none. */
    private static String findHeader(Object cell, String[] names) {
        for (String name : names) {
            if (name.equals(cell)) {
                return name;
            }
        }
        return null;
    }

    /*
     * Sample output recorded by the original author for the default URL
     * (the live page may have changed since):
     *
     * start table idx : 0
     * start table idx : 1
     * (6,1) [3. 의안 주요내용][정정전] ====> <부의안건> 제1호 의안 : 정관 일부 변경의 건 - ...
     * (2,1) [3. 정정사유][] ====> 임시주주총회 소집일시 및 안건 변경
     * start table idx : 2
     * start table idx : 3
     * start table idx : 4
     * (3,4) [][주요경력(현직포함)] ====> 전) 주식회사 SBT 투자 사장 현) 한국감사협회 이사 현) GRC코리아 감사
     * start table idx : 5
     * start table idx : 6
     * start table idx : 7
     */
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment