Skip to content

Instantly share code, notes, and snippets.

@khill
Created July 26, 2013 02:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save khill/6085509 to your computer and use it in GitHub Desktop.
Save khill/6085509 to your computer and use it in GitHub Desktop.
Lucene TokenFilter subclass which splits text into tokens where numbers are next to letters.
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
import java.util.LinkedList;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
 * Filter which splits a token wherever an ASCII digit is immediately followed by an ASCII letter.
 *
 * <p>This is designed to make sure that numbers and units of measure work with and without
 * spaces - for example, "10ml" is emitted as the two tokens "10" and "ml", exactly as the
 * already-separated input "10 ml" would be.
 *
 * <p>Only the digit-to-letter direction is split ("10ml"); a letter followed by a digit
 * ("ml10") is passed through unchanged. Tokens without such a boundary are passed through
 * untouched.
 *
 * <p>Position increments: the first part of a split token keeps the position increment of the
 * incoming token; every following part gets an increment of 1, so the parts occupy consecutive
 * positions. Only the term text and the position increment are modified by this filter; all
 * other attributes are left as the upstream stream set them for the unsplit token.
 *
 * @author khill
 */
public class AlphaNumericSplittingFilter extends TokenFilter {

    /**
     * Zero-width boundary between an ASCII digit and a following ASCII letter. Being zero-width,
     * splitting on it never consumes characters and never yields empty parts.
     */
    private static final Pattern DIGIT_TO_LETTER = Pattern.compile("(?<=[0-9])(?=[a-zA-Z])");

    private final CharTermAttribute termAttribute;
    private final PositionIncrementAttribute positionIncrementAttribute;

    /** Parts of the most recently split token that have not been emitted yet, in order. */
    private final Deque<String> pendingParts = new ArrayDeque<String>();

    /**
     * Creates a filter that reads tokens from the given stream.
     *
     * @param in the upstream token stream whose tokens are to be split
     */
    public AlphaNumericSplittingFilter(TokenStream in) {
        super(in);
        termAttribute = addAttribute(CharTermAttribute.class);
        positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
    }

    /**
     * Advances to the next token: either the next pending part of a previously split token, or
     * the next upstream token (replaced by its first part if it needs splitting).
     *
     * @return {@code true} if a token was produced, {@code false} once the upstream is exhausted
     * @throws IOException if the upstream stream fails
     * @see org.apache.lucene.analysis.TokenStream#incrementToken()
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!pendingParts.isEmpty()) {
            // Remaining part of an earlier split: it follows the previous part directly.
            termAttribute.setEmpty().append(pendingParts.removeFirst());
            positionIncrementAttribute.setPositionIncrement(1);
            return true;
        }

        if (!input.incrementToken()) {
            return false;
        }

        String[] parts = DIGIT_TO_LETTER.split(termAttribute.toString());
        if (parts.length > 1) {
            // Emit the first part now, deliberately leaving the upstream position increment
            // untouched so the split token stays at the position of the original token.
            termAttribute.setEmpty().append(parts[0]);
            for (int i = 1; i < parts.length; i++) {
                pendingParts.addLast(parts[i]);
            }
        }
        return true;
    }

    /**
     * Resets the upstream stream and discards any parts still buffered from a previous,
     * possibly abandoned, pass so they cannot leak into the next use of this filter.
     *
     * @throws IOException if resetting the upstream stream fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        pendingParts.clear();
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment