Skip to content

Instantly share code, notes, and snippets.

@hideaki-t
Created March 5, 2012 17:44
Show Gist options
  • Save hideaki-t/1979749 to your computer and use it in GitHub Desktop.
Save hideaki-t/1979749 to your computer and use it in GitHub Desktop.
Kuromoji (to be included in Lucene 3.6) test with Java 7
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.StringReader;
/**
 * Small demo that runs a fixed Japanese sentence through {@link JapaneseAnalyzer}
 * and prints, for every token, the term, base-form, part-of-speech, reading and
 * inflection attribute objects (one per line, in that order).
 */
public class Test {
    /**
     * Analyzes the hard-coded sample sentence and writes the attributes of each
     * token to standard output. An {@link IOException} raised while analyzing is
     * reported via its stack trace instead of being propagated.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Both the analyzer and the token stream are closed by try-with-resources.
        try (JapaneseAnalyzer analyzer = new JapaneseAnalyzer(Version.LUCENE_36);
             TokenStream ts = analyzer.reusableTokenStream(
                     "txt", new StringReader("今日はカレーを食べました"))) {
            // Look the attributes up once: per the TokenStream consumer workflow the
            // same attribute instances are updated in place by incrementToken(), so
            // repeating the lookup for every token is unnecessary work.
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            BaseFormAttribute base = ts.getAttribute(BaseFormAttribute.class);
            PartOfSpeechAttribute pos = ts.getAttribute(PartOfSpeechAttribute.class);
            ReadingAttribute read = ts.getAttribute(ReadingAttribute.class);
            InflectionAttribute inflect = ts.getAttribute(InflectionAttribute.class);

            // The TokenStream contract requires reset() before the first
            // incrementToken() call.
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term);
                System.out.println(base);
                System.out.println(pos);
                System.out.println(read);
                System.out.println(inflect);
            }
            // ... and end() once the stream is exhausted, before it is closed.
            ts.end();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }
}
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.IOException;
import java.io.StringReader;
/**
 * Small demo that feeds a fixed Japanese sentence directly into a
 * {@link JapaneseTokenizer} (no analyzer filter chain) and prints, for every
 * token, the term, base-form, part-of-speech, reading and inflection attribute
 * objects (one per line, in that order).
 */
public class TestTokenizer {
    /**
     * Tokenizes the hard-coded sample sentence and writes the attributes of each
     * token to standard output. An {@link IOException} raised while tokenizing is
     * reported via its stack trace instead of being propagated.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Constructor arguments as in the original: the sample text reader, a null
        // second argument, false for the third, and Mode.NORMAL.
        // The tokenizer is closed by try-with-resources.
        try (JapaneseTokenizer tokenizer = new JapaneseTokenizer(
                new StringReader("今日はカレーを食べました"),
                null, false, JapaneseTokenizer.Mode.NORMAL)) {
            // Look the attributes up once: per the TokenStream consumer workflow the
            // same attribute instances are updated in place by incrementToken(), so
            // repeating the lookup for every token is unnecessary work.
            CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
            BaseFormAttribute base = tokenizer.getAttribute(BaseFormAttribute.class);
            PartOfSpeechAttribute pos = tokenizer.getAttribute(PartOfSpeechAttribute.class);
            ReadingAttribute read = tokenizer.getAttribute(ReadingAttribute.class);
            InflectionAttribute inflect = tokenizer.getAttribute(InflectionAttribute.class);

            // The TokenStream contract requires reset() before the first
            // incrementToken() call.
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                System.out.println(term);
                System.out.println(base);
                System.out.println(pos);
                System.out.println(read);
                System.out.println(inflect);
            }
            // ... and end() once the stream is exhausted, before it is closed.
            tokenizer.end();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment