khill/AlphaNumericSplittingFilter.java

## AlphaNumericSplittingFilter.java
import java.io.IOException;
import java.util.Arrays;
import java.util.LinkedList;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;


/**
 * Token filter that splits each term at every point where an ASCII digit is
 * immediately followed by an ASCII letter.
 *
 * This is designed to make sure that numbers and units of measure work with and without
 * spaces - for example, "10ml" is emitted as the two tokens "10" and "ml", as if it had
 * been written "10 ml". Letter-to-digit boundaries (as in "ml10") are not split, and
 * terms without a digit-to-letter boundary pass through unchanged.
 *
 * The first part of a split term keeps the position increment of the input token;
 * every following part is emitted with a position increment of 1, so the parts occupy
 * consecutive positions. No other attribute is modified by this filter.
 *
 * @author khill
 *
 */
public class AlphaNumericSplittingFilter extends TokenFilter {

    private final CharTermAttribute termAttribute;
    private final PositionIncrementAttribute positionIncrementAttribute;

    /** Parts of the most recent input term that still have to be emitted, in order. */
    private final LinkedList<String> terms = new LinkedList<String>();

    /**
     * Creates a filter that reads its tokens from the given stream.
     *
     * @param in the token stream whose terms are to be split
     */
    public AlphaNumericSplittingFilter(TokenStream in) {
        super(in);
        termAttribute = this.addAttribute(CharTermAttribute.class);
        positionIncrementAttribute = this.addAttribute(PositionIncrementAttribute.class);
    }

    /**
     * Advances to the next token: either the next pending part of a term that was
     * split on an earlier call, or the next token of the wrapped stream (replaced by
     * its first part when it contains a digit-to-letter boundary).
     *
     * @return {@code false} when the wrapped stream is exhausted and no part is pending
     * @throws IOException if the wrapped stream fails
     * @see org.apache.lucene.analysis.TokenStream#incrementToken()
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!terms.isEmpty()) {
            // A later part of a split term sits one position after the previous part.
            this.setTermBufferFromList();
            positionIncrementAttribute.setPositionIncrement(1);
            return true;
        }
        if (!input.incrementToken()) {
            return false;
        }
        this.splitTerm();
        if (!terms.isEmpty()) {
            // First part: only the term text changes. The position increment supplied by
            // the input is kept, so gaps left by upstream filters survive and the first
            // token of a stream never ends up with an increment of 0.
            this.setTermBufferFromList();
        }
        return true;
    }

    /**
     * Resets the wrapped stream and drops any parts still pending from a previous,
     * possibly abandoned, pass so they cannot leak into the next one.
     *
     * @throws IOException if the wrapped stream fails to reset
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        terms.clear();
    }

    /** Replaces the current term text with the next pending part. */
    private void setTermBufferFromList() {
        termAttribute.setEmpty().append(terms.removeFirst());
    }

    /**
     * Scans the current term and, if it contains at least one digit-to-letter
     * boundary, queues all of its parts in {@link #terms}. Leaves the queue empty
     * when the term has no such boundary.
     */
    private void splitTerm() {
        char[] buffer = termAttribute.buffer();
        int length = termAttribute.length();
        int partStart = 0;
        for (int i = 1; i < length; i++) {
            if (isAsciiDigit(buffer[i - 1]) && isAsciiLetter(buffer[i])) {
                terms.add(new String(buffer, partStart, i - partStart));
                partStart = i;
            }
        }
        // partStart only moves when a boundary was found; queue the tail in that case.
        if (partStart > 0) {
            terms.add(new String(buffer, partStart, length - partStart));
        }
    }

    /** Returns whether {@code c} is one of the characters '0' to '9'. */
    private static boolean isAsciiDigit(char c) {
        return c >= '0' && c <= '9';
    }

    /** Returns whether {@code c} is one of the characters 'a' to 'z' or 'A' to 'Z'. */
    private static boolean isAsciiLetter(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }
}
	import java.io.IOException;
	import java.util.Arrays;
	import java.util.LinkedList;

	import org.apache.lucene.analysis.TokenFilter;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
	import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;


	/**
	 * Token filter that splits each term at every point where an ASCII digit is
	 * immediately followed by an ASCII letter.
	 *
	 * This is designed to make sure that numbers and units of measure work with and without
	 * spaces - for example, "10ml" is emitted as the two tokens "10" and "ml", as if it had
	 * been written "10 ml". Letter-to-digit boundaries (as in "ml10") are not split, and
	 * terms without a digit-to-letter boundary pass through unchanged.
	 *
	 * The first part of a split term keeps the position increment of the input token;
	 * every following part is emitted with a position increment of 1, so the parts occupy
	 * consecutive positions. No other attribute is modified by this filter.
	 *
	 * @author khill
	 *
	 */
	public class AlphaNumericSplittingFilter extends TokenFilter {

	    private final CharTermAttribute termAttribute;
	    private final PositionIncrementAttribute positionIncrementAttribute;

	    /** Parts of the most recent input term that still have to be emitted, in order. */
	    private final LinkedList<String> terms = new LinkedList<String>();

	    /**
	     * Creates a filter that reads its tokens from the given stream.
	     *
	     * @param in the token stream whose terms are to be split
	     */
	    public AlphaNumericSplittingFilter(TokenStream in) {
	        super(in);
	        termAttribute = this.addAttribute(CharTermAttribute.class);
	        positionIncrementAttribute = this.addAttribute(PositionIncrementAttribute.class);
	    }

	    /**
	     * Advances to the next token: either the next pending part of a term that was
	     * split on an earlier call, or the next token of the wrapped stream (replaced by
	     * its first part when it contains a digit-to-letter boundary).
	     *
	     * @return {@code false} when the wrapped stream is exhausted and no part is pending
	     * @throws IOException if the wrapped stream fails
	     * @see org.apache.lucene.analysis.TokenStream#incrementToken()
	     */
	    @Override
	    public final boolean incrementToken() throws IOException {
	        if (!terms.isEmpty()) {
	            // A later part of a split term sits one position after the previous part.
	            this.setTermBufferFromList();
	            positionIncrementAttribute.setPositionIncrement(1);
	            return true;
	        }
	        if (!input.incrementToken()) {
	            return false;
	        }
	        this.splitTerm();
	        if (!terms.isEmpty()) {
	            // First part: only the term text changes. The position increment supplied by
	            // the input is kept, so gaps left by upstream filters survive and the first
	            // token of a stream never ends up with an increment of 0.
	            this.setTermBufferFromList();
	        }
	        return true;
	    }

	    /**
	     * Resets the wrapped stream and drops any parts still pending from a previous,
	     * possibly abandoned, pass so they cannot leak into the next one.
	     *
	     * @throws IOException if the wrapped stream fails to reset
	     */
	    @Override
	    public void reset() throws IOException {
	        super.reset();
	        terms.clear();
	    }

	    /** Replaces the current term text with the next pending part. */
	    private void setTermBufferFromList() {
	        termAttribute.setEmpty().append(terms.removeFirst());
	    }

	    /**
	     * Scans the current term and, if it contains at least one digit-to-letter
	     * boundary, queues all of its parts in {@link #terms}. Leaves the queue empty
	     * when the term has no such boundary.
	     */
	    private void splitTerm() {
	        char[] buffer = termAttribute.buffer();
	        int length = termAttribute.length();
	        int partStart = 0;
	        for (int i = 1; i < length; i++) {
	            if (isAsciiDigit(buffer[i - 1]) && isAsciiLetter(buffer[i])) {
	                terms.add(new String(buffer, partStart, i - partStart));
	                partStart = i;
	            }
	        }
	        // partStart only moves when a boundary was found; queue the tail in that case.
	        if (partStart > 0) {
	            terms.add(new String(buffer, partStart, length - partStart));
	        }
	    }

	    /** Returns whether {@code c} is one of the characters '0' to '9'. */
	    private static boolean isAsciiDigit(char c) {
	        return c >= '0' && c <= '9';
	    }

	    /** Returns whether {@code c} is one of the characters 'a' to 'z' or 'A' to 'Z'. */
	    private static boolean isAsciiLetter(char c) {
	        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
	    }
	}