Created
February 3, 2010 07:14
-
-
Save kijun/293436 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.apache.lucene.analysis; | |
import java.io.IOException; | |
import org.apache.lucene.analysis.tokenattributes.TermAttribute; | |
/** | |
* Removes words that are too long or too short from the stream. | |
*/ | |
public final class LengthFilter extends TokenFilter { | |
final int min; | |
final int max; | |
private TermAttribute termAtt; | |
/** | |
* Build a filter that removes words that are too long or too | |
* short from the text. | |
*/ | |
public LengthFilter(TokenStream in, int min, int max) | |
{ | |
super(in); | |
this.min = min; | |
this.max = max; | |
// input.incrementToken()을 하면 termAtt도 그에 따라 변한다. | |
termAtt = addAttribute(TermAttribute.class); | |
} | |
/** | |
* Returns the next input Token whose term() is the right len | |
*/ | |
@Override | |
public final boolean incrementToken() throws IOException { | |
while (input.incrementToken()) { | |
int len = termAtt.termLength(); | |
// term 길이가 적합할 때 까지 incrementToken()을 실행한다. | |
if (len >= min && len <= max) { | |
return true; | |
} | |
// note: else we ignore it but should we index each part of it? | |
} | |
// reached EOS -- return false | |
return false; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment