Skip to content

Instantly share code, notes, and snippets.

@alexxv
Created March 1, 2012 08:34
Show Gist options
  • Save alexxv/1948330 to your computer and use it in GitHub Desktop.
Save alexxv/1948330 to your computer and use it in GitHub Desktop.
Date parser
// Date-parser exploration script: runs the parser over every task title and dumps each date
// expression found at the very beginning of a title, together with context columns, to a TSV.
//
// Files (hard-coded paths, platform default charset):
//   task_titles.tsv                 - input, one task title per line
//   out.txt                         - every normalized title, one per line
//   task_dates_at_the_beginning.tsv - one row per leading date match with the columns:
//       matched text, title, first word after the match, first word of the match,
//       last word of the match, character right after the match, syntax tree,
//       normalized syntax tree
final Parser p = new Parser();
List<String> strings = Files.readLines(new File("c:\\users\\alexv\\Desktop\\task_titles.tsv"), Charset.defaultCharset());

// Candidate filter vocabularies. NOTE(review): none of these sets is applied below - every
// leading match is written unfiltered. They are kept because they record the separators,
// words and syntax-tree shapes collected so far; wire them in or delete them once the
// filtering rules are settled.
final Set<Character> separatorChars = newHashSet(' ', '@', ':', '-');
final Set<String> goodLastWords = newHashSet("on", "@", "by", "due", "date", "untill", "til", "till", "at", "for", "before", "after", "in");
final Set<String> textFirsts = newHashSet("today", "tomorrow", "this", "next");
final Set<String> syntaxTreeBlackList = newHashSet(
        "(DATE_TIME_ALTERNATIVE (DATE_TIME (EXPLICIT_TIME (HOURS_OF_DAY ) (MINUTES_OF_HOUR ))))",
        "tomorrow", "this", "next");
// The entries appear to be written in the normalized form produced below (digits stripped,
// "am" rewritten to "pm"), which is why no separate "am" variants are listed.
final Set<String> syntaxTreeWhiteList = newHashSet(
        "(DATE_TIME_ALTERNATIVE (DATE_TIME (RELATIVE_DATE (SEEK > by_day (DAY_OF_WEEK )))))",
        "(DATE_TIME_ALTERNATIVE (DATE_TIME (EXPLICIT_DATE (MONTH_OF_YEAR ) (DAY_OF_MONTH ))))",
        "(DATE_TIME_ALTERNATIVE (DATE_TIME (RELATIVE_DATE (SEEK > by_day day))))",
        "(DATE_TIME_ALTERNATIVE (DATE_TIME (EXPLICIT_TIME (HOURS_OF_DAY ) (MINUTES_OF_HOUR ) pm)))",
        "(DATE_TIME_ALTERNATIVE (DATE_TIME (RELATIVE_DATE (SEEK > by_day (MONTH_OF_YEAR )))))",
        "(DATE_TIME_ALTERNATIVE (DATE_TIME (EXPLICIT_TIME (HOURS_OF_DAY ) (MINUTES_OF_HOUR ) (SECONDS_OF_MINUTE ) pm)))",
        "(DATE_TIME_ALTERNATIVE (DATE_TIME (EXPLICIT_DATE (MONTH_OF_YEAR ) (DAY_OF_MONTH ) (YEAR_OF ))))",
        "(DATE_TIME_ALTERNATIVE (DATE_TIME (RELATIVE_DATE (SEEK > by_day (DAY_OF_WEEK ))) (EXPLICIT_TIME (HOURS_OF_DAY ) (MINUTES_OF_HOUR ))))",
        "(DATE_TIME_ALTERNATIVE (DATE_TIME (EXPLICIT_DATE (MONTH_OF_YEAR ) (DAY_OF_MONTH )) (EXPLICIT_TIME (HOURS_OF_DAY ) (MINUTES_OF_HOUR ))))");

// try/finally guarantees both writers are flushed and closed even if processing aborts.
final BufferedWriter bw = new BufferedWriter(new FileWriter("c:\\users\\alexv\\Desktop\\task_dates_at_the_beginning.tsv"));
try {
    final BufferedWriter bw2 = new BufferedWriter(new FileWriter("c:\\users\\alexv\\Desktop\\out.txt"));
    try {
        int cnt = 0;
        for (String string : strings) {
            // Tabs inside a title would shift the tab-separated columns written below.
            String line = string.trim().replace("\t", " ");
            cnt++;
            if (cnt % 10000 == 0) {
                System.out.println(cnt); // progress indicator
            }
            bw2.write(line + "\n");
            try {
                for (DateGroup dateGroup : p.parse(line)) {
                    if (dateGroup.getPosition() != 0) {
                        continue; // only matches that open the title are dumped
                    }
                    String text = dateGroup.getText();
                    // split() returns an empty array for whitespace-only text, hence the guards.
                    String[] textWords = text.split("\\s+");
                    String textFirstWord = textWords.length > 0 ? textWords[0] : "";
                    String textLastWord = textWords.length > 0 ? textWords[textWords.length - 1] : "";

                    // Assumes the matched text is a prefix of the title (position 0) - the
                    // remainder of the title then starts at index text.length().
                    boolean hasRemainder = text.length() < line.length();
                    // '\0' marks "the match runs to the end of the title".
                    char nextChar = hasRemainder ? line.charAt(text.length()) : '\0';
                    // Drop the separator whitespace first; otherwise split() yields a leading
                    // empty string and the "first word" column is always blank.
                    String remainder = hasRemainder ? line.substring(text.length()).trim() : "";
                    String[] words = remainder.split("\\s+");
                    String firstWord = words.length > 0 ? words[0] : "";

                    String syntaxTree = dateGroup.getSyntaxTree().toStringTree();
                    // Normalized shape: concrete numbers removed and am/pm unified, so trees
                    // that differ only in their values compare equal.
                    String syntaxTreeClean = syntaxTree.replaceAll("\\d+", "").replace("am", "pm").trim();

                    bw.write(text + "\t" + line + "\t" + firstWord + "\t" + textFirstWord + "\t" + textLastWord
                            + "\t" + nextChar + "\t" + syntaxTree + "\t" + syntaxTreeClean + "\n");
                }
            } catch (Exception e) {
                // Best effort: one bad title must not stop the whole run, but the failure is
                // reported instead of being silently dropped.
                System.err.println("Skipping line " + cnt + " (" + e + "): " + line);
            }
        }
    } finally {
        bw2.close();
    }
} finally {
    bw.close();
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment