// Demo: tokenize a Chinese phrase using Lucene's CJKAnalyzer (4.7 compat version).
dumpTokenized("中文标题", new CJKAnalyzer(Version.LUCENE_47));
Text to tokenize [中文标题] via LimitTokenCountAnalyzer
[中] [文] [标] [题]
private void dumpTokenized(String text, Analyzer analyzer) throws IOException { | |
List<String> tokens = tokenize(text, analyzer); | |
System.out.printf("Text to tokenize [%s] via %s %n", text, analyzer.getClass().getSimpleName()); | |
for (String t : tokens){ | |
System.out.printf("[%s] ", t); | |
} | |
System.out.println(); | |
} | |
private List<String> tokenize(String text, Analyzer analyzer) throws IOException { | |
List<String> tokens = new ArrayList<String>(); | |
TokenStream stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text)); | |
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); | |
OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class); | |
stream.reset(); | |
while (stream.incrementToken()) { | |
String term = termAtt.toString(); | |
tokens.add(term); | |
} | |
stream.close(); | |
return tokens; | |
} |