Created
January 22, 2019 17:53
-
-
Save timmc/de4ab6b5b12c10a3b4c86f89e4f662c5 to your computer and use it in GitHub Desktop.
Custom query term splitter, as state machine (demonstration only; a regex with possessive quantifiers slightly out-performs this)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public static List<String> splitTermsStateMachine(String query) { | |
// Horizontal or vertical whitespace | |
Pattern isWhitespace = Pattern.compile("[\\h\\v]"); | |
//== Manually managed state bits ==// | |
// If non-null, we're in a term, and this contains the term so far | |
StringBuilder currentTerm = null; | |
// True iff inside a quoted run | |
boolean inQuotes = false; | |
//== Less interesting state bits ==// | |
// Previous character, or null at beginning; changes every iteration, | |
// unconditionally. | |
Character previous = null; | |
// When each currentTerm completes, it is dumped into here. | |
List<String> terms = new ArrayList<>(); | |
for (char c: query.toCharArray()) { | |
if (currentTerm == null) { | |
// Not in a term. Can we start one? | |
boolean curWhitespace = isWhitespace.matcher("" + c).find(); // TODO awful, let's use a character Set instead | |
if (!curWhitespace) { | |
// Start a term, we're no longer in whitespace | |
currentTerm = new StringBuilder(); | |
currentTerm.append(c); | |
if (c == '"') { | |
inQuotes = true; | |
} | |
} | |
} else { | |
// Currently in a term. Continue, or finish it? | |
if (previous == '\\') { | |
// Complete the escape sequence, no matter what character | |
currentTerm.append(c); | |
} else { | |
// Only unquoted whitespace will naturally end a term | |
boolean curWhitespace = isWhitespace.matcher("" + c).find(); | |
if (curWhitespace && !inQuotes) { | |
// End the term, we found unquoted whitespace | |
terms.add(currentTerm.toString()); | |
currentTerm = null; | |
} else { | |
currentTerm.append(c); | |
} | |
// When we encounter a double-quote, we flip between being | |
// in/out of quoting—unless the previous was a backslash, | |
// but that's already taken care of. | |
if (c == '"') { | |
inQuotes = !inQuotes; | |
} | |
} | |
} | |
previous = c; | |
} | |
// If we still had an unfinished term, add it to the list anyway | |
if (currentTerm != null) { | |
terms.add(currentTerm.toString()); | |
} | |
return terms; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment