Created
June 5, 2017 18:25
-
-
Save joshuabambrick/08e3095a166904ebeb16ed88a90cd424 to your computer and use it in GitHub Desktop.
Date Extraction Pattern Construction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package utils;

import com.google.common.collect.ImmutableList;
import options.GlobalOptionsManager;
import org.jetbrains.annotations.NotNull;

import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class DatePartPatterns { | |
private static final String SOME_NON_WORD_CHARS = "\\W+"; | |
private static final String EMPTY_MARKER = ""; | |
public static void printAllPatterns() { | |
for (String pattern : getPatterns()) | |
System.out.println(pattern); | |
} | |
public enum DatePart { | |
DAY("day"), | |
MONTH("month"), | |
YEAR("year"), | |
; | |
private final String captureGroupName; | |
DatePart(@NotNull String captureGroupName) { | |
this.captureGroupName = captureGroupName; | |
} | |
@NotNull | |
public String getCaptureGroupName() { | |
return captureGroupName; | |
} | |
} | |
private enum Month { | |
January("January", "Jan", "Jan\\."), | |
February("February", "Feb", "Feb\\."), | |
March("March", "Mar", "Mar\\."), | |
April("April", "Apr", "Apr\\."), | |
May("May", "May", "May\\."), | |
June("June", "Jun", "Jun\\."), | |
July("July", "Jul", "Jul\\."), | |
August("August", "Aug", "Aug\\."), | |
September("September", "Sep", "Sep\\.", "Sept", "Sept\\."), | |
October("October", "Oct", "Oct\\."), | |
November("November", "Nov", "Nov\\."), | |
December("December", "Dec", "Dec\\."), | |
; | |
private final Set<String> names = new HashSet<>(); | |
Month(@NotNull String... names) { | |
this.names.addAll(Arrays.asList(names)); | |
} | |
@NotNull | |
public Set<String> getNames() { | |
return Collections.unmodifiableSet(names); | |
} | |
public boolean matches(@NotNull String str) { | |
for (String name : names) { | |
if (str.equalsIgnoreCase(name)) { | |
return true; | |
} | |
} | |
return false; | |
} | |
} | |
private DatePartPatterns() {} | |
public static Integer matchMonthString(@NotNull String str) { | |
for (Month month : Month.values()) { | |
if (month.matches(str)) { | |
return month.ordinal() + 1; | |
} | |
} | |
return null; | |
} | |
private static String createNamedRegexCapturingGroup(@NotNull String pattern, @NotNull DatePart datePart) { | |
return String.format("(?<%s>%s)", datePart.getCaptureGroupName(), pattern); | |
} | |
private static final String DAY_SUFFIX_PATTERN = "\\s*((st)|(nd)|(rd)|(th))?(\\s*( day)? of)?"; | |
private static final List<String> DAY_PATTERNS = ImmutableList.<String>builder() | |
.add(createNamedRegexCapturingGroup("[0-3][0-9]", DatePart.DAY) + DAY_SUFFIX_PATTERN) | |
.add(createNamedRegexCapturingGroup("[1-9]", DatePart.DAY) + DAY_SUFFIX_PATTERN) | |
.build(); | |
private static final List<String> MONTH_PATTERNS; | |
private static final List<String> YEAR_PATTERNS = ImmutableList.<String>builder() | |
.add("(19|20)\\d{2}") | |
.build() | |
.stream() | |
.map(str -> createNamedRegexCapturingGroup(str, DatePart.YEAR)) | |
.collect(Collectors.toList()); | |
private static final List<String> DATE_PATTERNS; | |
@NotNull | |
public static List<String> getPatterns() { | |
return DATE_PATTERNS; | |
} | |
static { | |
MONTH_PATTERNS = determineInitialMonthPatterns(); | |
DATE_PATTERNS = determineInitialDatePatterns(); | |
} | |
private static List<String> determineInitialDatePatterns() { | |
List<String> datePatterns = new ArrayList<>(); | |
String termDelimiter = SOME_NON_WORD_CHARS; | |
String startMarker = getEnforceComplete() ? "^" : EMPTY_MARKER; | |
String endMarker = getEnforceComplete() ? "$" : EMPTY_MARKER; | |
datePatterns.addAll(PatternJoiner. | |
joinPatterns(startMarker, endMarker, termDelimiter, DAY_PATTERNS, MONTH_PATTERNS, YEAR_PATTERNS)); | |
datePatterns.addAll(PatternJoiner. | |
joinPatterns(startMarker, endMarker, termDelimiter, MONTH_PATTERNS, DAY_PATTERNS, YEAR_PATTERNS)); | |
datePatterns.addAll(PatternJoiner. | |
joinPatterns(startMarker, endMarker, termDelimiter, MONTH_PATTERNS, YEAR_PATTERNS)); | |
datePatterns.addAll(PatternJoiner. | |
joinPatterns(startMarker, endMarker, termDelimiter, DAY_PATTERNS, MONTH_PATTERNS)); | |
datePatterns.addAll(PatternJoiner. | |
joinPatterns(startMarker, endMarker, termDelimiter, MONTH_PATTERNS, DAY_PATTERNS)); | |
datePatterns.addAll(PatternJoiner. | |
joinPatterns(startMarker, endMarker, termDelimiter, YEAR_PATTERNS)); | |
return Collections.unmodifiableList(datePatterns); | |
} | |
@NotNull | |
private static List<String> determineInitialMonthPatterns() { | |
List<String> monthPatterns = new ArrayList<>(); | |
String orDelimiter = "|"; | |
monthPatterns.add(Arrays.stream(Month.values()) | |
.flatMap(month -> month.getNames().stream()) | |
.collect(Collectors.joining(orDelimiter))); | |
monthPatterns.add("0?[1-9]|1[0-2]"); | |
return monthPatterns.stream() | |
.map(str -> createNamedRegexCapturingGroup(str, DatePart.MONTH)) | |
.collect(Collectors.toList()); | |
} | |
private static boolean getEnforceComplete() { | |
return GlobalOptionsManager.get().getDatePatternEnforceCompleteMatch().isEnabled(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment