Skip to content

Instantly share code, notes, and snippets.

@timmc
Created January 22, 2019 17:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save timmc/de4ab6b5b12c10a3b4c86f89e4f662c5 to your computer and use it in GitHub Desktop.
Save timmc/de4ab6b5b12c10a3b4c86f89e4f662c5 to your computer and use it in GitHub Desktop.
Custom query term splitter, written as a state machine (demonstration only; a regex with possessive quantifiers slightly outperforms this)
/**
 * Splits a query string into terms using a hand-written state machine.
 *
 * <p>Rules:
 * <ul>
 *   <li>Terms are separated by runs of unquoted, unescaped whitespace
 *       (the same characters the regex classes {@code \h} and {@code \v}
 *       match).</li>
 *   <li>A double-quote toggles a quoted run; whitespace inside a quoted run
 *       does not end the term.</li>
 *   <li>A backslash escapes the following character, whatever it is. The
 *       escaped character is never treated as a separator, a quote, or the
 *       start of another escape, so two backslashes in a row form one
 *       complete escape pair.</li>
 *   <li>Quotes and backslashes are kept verbatim in the returned terms.</li>
 *   <li>An unterminated quoted run or a trailing backslash is tolerated: the
 *       unfinished term is returned as-is.</li>
 * </ul>
 *
 * @param query the raw query text; must not be null
 * @return the terms in order of appearance; empty if the query contains only
 *         whitespace
 * @throws NullPointerException if {@code query} is null
 */
public static List<String> splitTermsStateMachine(String query) {
    //== State bits ==//
    // If non-null, we're in a term, and this contains the term so far
    StringBuilder currentTerm = null;
    // True iff inside a quoted run
    boolean inQuotes = false;
    // True iff the previous character was a backslash that itself was not
    // escaped, i.e. the current character completes an escape sequence.
    // Tracking this explicitly (rather than peeking at the previous
    // character) keeps an escaped backslash from escaping what follows it.
    boolean escaped = false;

    // When each currentTerm completes, it is dumped into here.
    List<String> terms = new ArrayList<>();

    for (int i = 0; i < query.length(); i++) {
        char c = query.charAt(i);

        if (currentTerm == null) {
            // Not in a term. Skip separators; anything else starts a term.
            if (isQueryWhitespace(c)) {
                continue;
            }
            currentTerm = new StringBuilder().append(c);
            inQuotes = (c == '"');
            escaped = (c == '\\');
        } else if (escaped) {
            // Complete the escape sequence, no matter what character
            currentTerm.append(c);
            escaped = false;
        } else if (c == '\\') {
            // Begin an escape sequence; the next character is taken literally
            currentTerm.append(c);
            escaped = true;
        } else if (c == '"') {
            // Unescaped double-quote flips between being in/out of quoting
            currentTerm.append(c);
            inQuotes = !inQuotes;
        } else if (!inQuotes && isQueryWhitespace(c)) {
            // Only unquoted, unescaped whitespace naturally ends a term
            terms.add(currentTerm.toString());
            currentTerm = null;
        } else {
            currentTerm.append(c);
        }
    }

    // If we still had an unfinished term, add it to the list anyway
    if (currentTerm != null) {
        terms.add(currentTerm.toString());
    }
    return terms;
}

/**
 * Reports whether {@code c} is horizontal or vertical whitespace, using the
 * same character set as the regex class {@code [\h\v]}, without the cost of
 * building a String and a Matcher per character.
 */
private static boolean isQueryWhitespace(char c) {
    switch (c) {
        // Horizontal whitespace (regex \h)
        case ' ':
        case '\t':
        case '\u00A0':
        case '\u1680':
        case '\u180E':
        case '\u202F':
        case '\u205F':
        case '\u3000':
        // Vertical whitespace (regex \v)
        case '\n':
        case '\u000B':
        case '\f':
        case '\r':
        case '\u0085':
        case '\u2028':
        case '\u2029':
            return true;
        default:
            // Remaining horizontal whitespace: U+2000 through U+200A
            return c >= '\u2000' && c <= '\u200A';
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment