Created
October 5, 2015 07:07
-
-
Save doyonghoon/2ad751098907ab3f061d to your computer and use it in GitHub Desktop.
sample parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package dyh.parser; | |
import java.io.File; | |
import java.io.IOException; | |
import java.util.ArrayList; | |
import java.util.List; | |
import org.apache.commons.lang3.StringUtils; | |
import org.apache.commons.lang3.math.NumberUtils; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import util.WLog; | |
/** | |
* Created by Spencer Do on 2015. 10. 5.. | |
* | |
* http://www.assist.org/cgi-bin/REPORT_2/Rep2.pl?aay=15-16&dora=EECS&oia=UCB&ay=15-16&event=19&ria=UCB&agreement=aa&ia=DAC&sia=DAC&dir=1&&sidebar=false&rinst=left&mver=2&kind=5&dt=2 | |
*/ | |
public class Parser { | |
private static final int NUMBER_OF_HYPHENS = 80; | |
public File getFile(String fileName) { | |
StringBuilder result = new StringBuilder(); | |
//Get file from resources folder | |
ClassLoader classLoader = getClass().getClassLoader(); | |
return new File(classLoader.getResource(fileName).getFile()); | |
} | |
public void parse(File in) { | |
try { | |
Document d = Jsoup.parse(in, "UTF-8"); | |
String body = d.getElementsByTag("body").get(0).text(); | |
String[] rows = getRows(body); | |
for (String r : rows) { | |
parseCourses(r); | |
} | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
} | |
private void parseCourses(String row) { | |
String code = null; | |
List<String> collegeCourses = new ArrayList<>(); | |
String[] lines = row.split("\n"); | |
for (String l : lines) { | |
String[] split = l.split("\\|"); | |
for (int i = 0; i < split.length; i++) { | |
String value = getCourseCode(split[i].trim()); | |
if (StringUtils.isEmpty(code)) { | |
code = value; | |
} else { | |
if (!StringUtils.isEmpty(value)) { | |
collegeCourses.add(value); | |
} | |
} | |
} | |
} | |
WLog.i("code: " + code + ", courses: " + collegeCourses.toString()); | |
} | |
private String getCourseCode(String raw) { | |
if (!StringUtils.isEmpty(raw)) { | |
int index = 0; | |
for (int i = 0; i < raw.toCharArray().length; i++) { | |
char c = raw.toCharArray()[i]; | |
if (c >= 'a' && c <= 'z') { | |
index = i; | |
break; | |
} | |
} | |
if (index > 2) { | |
String result = raw.substring(0, index - 2).trim(); | |
return result; | |
} | |
} | |
return null; | |
} | |
private String[] getRows(String body) { | |
List<String> result = new ArrayList<>(); | |
String[] lines = body.split(createHyphen(NUMBER_OF_HYPHENS)); | |
if (lines != null && lines.length > 0) { | |
for (String l : lines) { | |
if (l.contains("|")) { | |
result.add(l.trim()); | |
} | |
} | |
} | |
return result.toArray(new String[result.size()]); | |
} | |
private String createHyphen(int length) { | |
String result = ""; | |
for (int i = 0; i < length; i++) { | |
result += "-"; | |
} | |
return result; | |
} | |
private String[] getCourses(String line) { | |
return line.split("\\|"); | |
} | |
private boolean hasUnits(String line) { | |
return line.matches("\\((.*?)\\)"); | |
} | |
private float getUnits(String raw) { | |
raw.replaceAll("\\(", "").replaceAll("\\)", ""); | |
return NumberUtils.toFloat(raw, 0f); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment