Skip to content

Instantly share code, notes, and snippets.

@renaud
Created September 24, 2012 14:11
Show Gist options
  • Save renaud/3776145 to your computer and use it in GitHub Desktop.
Save renaud/3776145 to your computer and use it in GitHub Desktop.
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.icu.ICUFoldingFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
/**
 * Demonstrates a custom Lucene analysis chain and a helper that turns a
 * string into the list of terms produced by an {@link Analyzer}.
 *
 * <p>The ICU folding filter used here lives in a separate Lucene module:
 *
 * <pre>{@code
 * <dependency>
 *   <groupId>org.apache.lucene</groupId>
 *   <artifactId>lucene-icu</artifactId>
 *   <version>3.6.1</version>
 * </dependency>
 * }</pre>
 */
public class LuceneTest {

    /**
     * Runs {@code keywords} through {@code analyzer} and collects every
     * emitted term, in order.
     *
     * <p>The stream is consumed following the TokenStream consumer workflow
     * (reset, incrementToken until exhausted, end, close), and is always
     * closed, even when analysis fails.
     *
     * @param analyzer the analyzer that builds the token stream
     * @param field    the field name handed to the analyzer
     * @param keywords the text to analyze; must not be {@code null}
     * @return the terms produced by the analyzer, possibly empty
     * @throws IllegalStateException if the token stream reports an
     *         {@link IOException}; this is not expected for in-memory input,
     *         but is surfaced instead of returning a silently truncated list
     */
    public static List<String> parseKeywords(Analyzer analyzer, String field,
            String keywords) {
        List<String> result = new ArrayList<String>();
        TokenStream stream = analyzer.tokenStream(field, new StringReader(
                keywords));
        try {
            try {
                // Look the attribute up once; the same instance is updated
                // in place on every incrementToken() call.
                CharTermAttribute term = stream
                        .addAttribute(CharTermAttribute.class);
                stream.reset();
                while (stream.incrementToken()) {
                    result.add(term.toString());
                }
                stream.end();
            } finally {
                stream.close();
            }
        } catch (IOException e) {
            throw new IllegalStateException("Failed to analyze text for field '"
                    + field + "'", e);
        }
        return result;
    }

    /**
     * Analyzer chaining standard tokenization, the standard filter,
     * lower-casing, stop-word removal and ICU folding, in that order.
     *
     * <p>NOTE(review): the single-argument {@link StopwordAnalyzerBase}
     * constructor appears to install an empty stop-word set in Lucene 3.6,
     * in which case the {@link StopFilter} below removes nothing. Confirm,
     * and pass an explicit stop set to the superclass if stop-word removal
     * is wanted.
     */
    public static class MyAnalyzer extends StopwordAnalyzerBase {

        /**
         * @param version the Lucene compatibility version passed to the
         *                superclass and to each version-aware component
         */
        protected MyAnalyzer(Version version) {
            super(version);
        }

        /**
         * Builds the tokenizer and filter chain for one field.
         *
         * @param fieldName the field being analyzed (not used by this chain)
         * @param reader    the source of the text to tokenize
         * @return the tokenizer together with the outermost filter
         */
        @Override
        protected TokenStreamComponents createComponents(
                final String fieldName, final Reader reader) {
            final StandardTokenizer src = new StandardTokenizer(matchVersion,
                    reader);
            TokenStream tok = new StandardFilter(matchVersion, src);
            // Lower-case before the stop filter so that it sees
            // lower-cased terms.
            tok = new LowerCaseFilter(matchVersion, tok);
            tok = new StopFilter(matchVersion, tok, stopwords);
            tok = new ICUFoldingFilter(tok);
            return new TokenStreamComponents(src, tok);
        }
    }

    /**
     * Analyzes a sample sentence with {@link MyAnalyzer} and prints each
     * resulting term on its own line to standard error.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        List<String> list = parseKeywords(new MyAnalyzer(Version.LUCENE_36),
                "a", "the blue moons élongations non-triviales décès rapide!");
        for (String l : list) {
            System.err.println(l);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment