Created
August 18, 2012 08:36
-
-
Save pcdinh/3385340 to your computer and use it in GitHub Desktop.
Custom Lucene analyzer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.*; | |
import org.apache.lucene.analysis.*; | |
import org.apache.lucene.util.Version; | |
public class MyAnalyzer extends Analyzer { | |
public TokenStream tokenStream(String fieldName, Reader reader) { | |
return | |
new StopFilter( | |
true, | |
new StandardTokenizer(Version.LUCENE_30, reader), | |
StopAnalyzer.ENGLISH_STOP_WORDS_SET | |
); | |
} | |
private static void printTokens(String string) throws IOException { | |
TokenStream ts = new MyAnalyzer().tokenStream("default", new | |
StringReader(string)); | |
TermAttribute termAtt = ts.getAttribute(TermAttribute.class); | |
while(ts.incrementToken()) { | |
System.out.print(termAtt.term()); | |
System.out.print(" "); | |
} | |
System.out.println(); | |
} | |
public static void main(String[] args) throws IOException { | |
printTokens("one_two_three"); // prints "one two three" | |
printTokens("four4_five5_six6"); // prints "four4_five5_six6" | |
printTokens("seven7_eight_nine"); // prints "seven7_eight nine" | |
printTokens("ten_eleven11_twelve"); // prints "ten_eleven11_twelve" | |
} | |
} | |
public class MyAnalyzer extends Analyzer { | |
public TokenStream tokenStream(String fieldName, Reader reader) { | |
StandardTokenizer tokenizer = new StandardTokenizer( | |
Version.LUCENE_30, reader); | |
TokenStream tokenStream = new StandardFilter(tokenizer); | |
tokenStream = new MyTokenFilter(tokenStream); | |
tokenStream = new StopFilter(true, tokenStream, | |
StopAnalyzer.ENGLISH_STOP_WORDS_SET); | |
return tokenStream; | |
} | |
} | |
public class MyTokenFilter extends TokenFilter { | |
private final TermAttribute termAttr; | |
private String[] terms; | |
private int pos; | |
public MyTokenFilter(TokenStream tokenStream) { | |
super(tokenStream); | |
this.termAttr = input.addAttribute(TermAttribute.class); | |
} | |
public boolean incrementToken() throws IOException { | |
if (terms == null) { | |
if (!input.incrementToken()) { | |
return false; | |
} | |
terms = termAttr.term().split("_"); | |
} | |
termAttr.setTermBuffer(terms[pos++]); | |
if (pos == terms.length) { | |
terms = null; | |
pos = 0; | |
} | |
return true; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment