Skip to content

Instantly share code, notes, and snippets.

@siahr
Created March 25, 2015 09:13
Show Gist options
  • Save siahr/f55453a7977ea621c91e to your computer and use it in GitHub Desktop.
Japanese lexer using bi-gram (experimental).
package org.basex.util.ft;
import static org.basex.util.Token.*;
import static org.basex.util.ft.FTFlag.*;
import java.io.*;
import java.lang.reflect.*;
import java.util.*;
import org.basex.util.*;
/**
 * Japanese full-text lexer based on Igo (http://igo.sourceforge.jp/),
 * extended with experimental bi-gram splitting of nouns.
 *
 * @author BaseX Team 2005-14, BSD License
 * @author Toshio HIRAI
 */
public class JapaneseTokenizer extends Tokenizer {
  /** Indicates whether the Igo library and its dictionary were found. */
  private static boolean available = true;
  /** Fully-qualified name of the Igo tagger class (loaded via reflection). */
  private static final String PATTERN = "net.reduls.igo.Tagger";
  /** Name of the Japanese dictionary directory. */
  private static final String LANG = "ja";
  /** POS tag: noun (MEISHI). */
  private static final String MEISHI = "\u540D\u8A5E";
  /** POS tag: pre-noun adjectival (RENTAISHI). */
  private static final String RENTAISHI = "\u9023\u4F53\u8A5E";
  /** POS tag: adverb (HUKUSHI). */
  private static final String HUKUSHI = "\u526F\u8A5E";
  /** POS tag: verb (DOUSHI). */
  private static final String DOUSHI = "\u52D5\u8A5E";
  /** POS tag: conjunction (SETSUZOKUSHI). */
  private static final String SETSUZOKUSHI = "\u63A5\u7D9A\u8A5E";
  /** POS tag: prefix (SETTOUSHI). */
  private static final String SETTOUSHI = "\u63A5\u982D\u8A5E";
  /** POS tag: modal (auxiliary) verb (JYODOUSHI). */
  private static final String JYODOUSHI = "\u52A9\u52D5\u8A5E";
  /** POS tag: postpositional particle (JYOSHI). */
  private static final String JYOSHI = "\u52A9\u8A5E";
  /** POS tag: adjective (KEIYOUSHI). */
  private static final String KEIYOUSHI = "\u5F62\u5BB9\u8A5E";
  /** POS tag: mark/symbol (KIGOU). */
  private static final String KIGOU = "\u8A18\u53F7";
  /** POS tag: interjection (KANDOUSHI). */
  private static final String KANDOUSHI = "\u611F\u52D5\u8A5E";
  /** POS tag: filler. */
  private static final String FILLER = "\u30D5\u30A3\u30E9\u30FC";
  /** Feature template for marks (all detail fields wildcarded). */
  private static final String KIGOU_FEATURE = "\u8A18\u53F7,*,*,*,*,*,*,*,*";
  /** Feature template for nouns (all detail fields wildcarded). */
  private static final String MEISHI_FEATURE = "\u540D\u8A5E,*,*,*,*,*,*,*,*";
  /** Igo tagger instance (created reflectively; null if unavailable). */
  private static Object tagger;
  /** Reflective handle of Tagger.parse(CharSequence). */
  private static Method parse;
  /** Reflective handle of the Igo Morpheme "surface" field. */
  private static Field surface;
  /** Reflective handle of the Igo Morpheme "feature" field. */
  private static Field feature;
  /** Reflective handle of the Igo Morpheme "start" field. */
  private static Field start;
  /** Token iterator over the current token list. */
  private Iterator<Morpheme> tokens;
  /** Token list produced by {@link #init(byte[])}. */
  private ArrayList<Morpheme> tokenList = new ArrayList<Morpheme>();
  /** Current position in the token list (wildcard scan only). */
  private int cpos;
  /** Current token. */
  private Morpheme currToken;
  /** Case option. */
  private final FTCase cs;
  /** Diacritics flag. */
  private final boolean dc;
  /** Wildcard flag. */
  private final boolean wc;
  /** Stemming flag. */
  private final boolean st;
  /** Token position. */
  private int pos = -1;
  /** Flag indicating a special character. */
  private boolean sc;
  static {
    File dic = null;
    if(Reflect.available(PATTERN)) {
      // look for the dictionary in the working directory first,
      // then in the "etc" directory below the BaseX home directory
      dic = new File(LANG);
      if(!dic.exists()) {
        dic = new File(Prop.HOME, "etc/" + LANG);
        if(!dic.exists()) {
          available = false;
        }
      }
    } else {
      available = false;
    }
    if(available) {
      Class<?> clz = Reflect.find(PATTERN);
      if(clz == null) {
        Util.debug("Could not initialize Igo Japanese lexer.");
      } else {
        // instantiate the tagger via its String (dictionary path) constructor
        final Constructor<?> tgr = Reflect.find(clz, String.class);
        tagger = Reflect.get(tgr, dic.toString());
        if(tagger == null) {
          available = false;
          Util.debug("Could not initialize Igo Japanese lexer.");
        } else {
          parse = Reflect.method(clz, "parse", CharSequence.class);
          if(parse == null) {
            Util.debug("Could not initialize Igo lexer method.");
          }
          // cache reflective accessors of the Igo morpheme class
          clz = Reflect.find("net.reduls.igo.Morpheme");
          surface = Reflect.field(clz, "surface");
          feature = Reflect.field(clz, "feature");
          start = Reflect.field(clz, "start");
        }
      }
    }
  }
  /**
   * Checks if the Igo library and its dictionary are available.
   * @return result of check
   */
  static boolean available() {
    return available;
  }
/**
* Constructor.
* @param fto (optional) full-text options
*/
public JapaneseTokenizer(final FTOpt fto) {
cs = fto != null && fto.cs != null ? fto.cs : FTCase.INSENSITIVE;
wc = fto != null && fto.is(WC);
dc = fto != null && fto.is(DC);
st = fto != null && fto.is(ST);
}
@Override
Tokenizer get(final FTOpt f) {
return new JapaneseTokenizer(f);
}
  @Override
  public JapaneseTokenizer init(final byte[] txt) {
    String source = string(txt);
    if(wc) { // convert ideographic (wide) spaces to ASCII spaces
      source = source.replace('\u3000', '\u0020');
    }
    // run the Igo morphological analyzer via the cached reflective handles
    final ArrayList<?> morpheme =
    (ArrayList<?>) Reflect.invoke(parse, tagger, source);
    final ArrayList<Morpheme> list = new ArrayList<Morpheme>();
    try {
      int prev = 0;
      final int size = morpheme.size();
      // buffer that merges adjacent nouns into one compound noun
      StringBuffer meishi = new StringBuffer();
      for(int i = 0; i < size; i++) {
        final Object m = morpheme.get(i);
        Object n = null;
        if (i != (size - 1)) {
          n = morpheme.get(i + 1);
        }
        String srfc = surface.get(m).toString();
        String ftr = feature.get(m).toString();
        String nftr = null;
        if (n != null) {
          nftr = feature.get(n).toString();
          // NOTE(review): substring(0, 2) assumes the POS field begins with a
          // two-character tag; MEISHI is two characters, so this matches nouns
          if(ftr.substring(0, 2).equals(MEISHI)) {
            if (nftr.substring(0, 2).equals(MEISHI)) {
              // current and next token are both nouns: merge and continue
              meishi.append(srfc);
              continue;
            }
          }
        }
        if (meishi.length() != 0) {
          // flush the collected compound noun and split it into bi-grams
          meishi.append(srfc);
          srfc = meishi.toString();
          ftr = MEISHI_FEATURE;
          meishi = new StringBuffer();
          List<Morpheme> words = biGram(srfc, ftr);
          Iterator<Morpheme> w = words.iterator();
          while(w.hasNext()) {
            list.add(w.next());
          }
          prev += srfc.length();
          continue;
        }
        final int s = start.getInt(m);
        if(i != 0) {
          // characters skipped by the analyzer are re-added as marks
          final int l = s - prev;
          if(l != 0) {
            // NOTE(review): substring(s - 1, s + l - 1) looks off by one — the
            // skipped range would be [prev, s), i.e. substring(s - l, s);
            // confirm against Igo's "start" semantics before changing
            list.add(new Morpheme(
            source.substring(s - 1, s + l - 1), KIGOU_FEATURE)
            );
          }
        }
        prev = srfc.length() + s;
        // separates continuous mark (ASCII)
        boolean cont = true;
        final ArrayList<Morpheme> marks = new ArrayList<Morpheme>();
        for(int j = 0; j < srfc.length(); j++) {
          final String c = String.valueOf(srfc.charAt(j));
          final byte[] t = token(c);
          if(t.length == 1)
            if(letter(t[0]) || digit(t[0])) cont = false;
            else marks.add(new Morpheme(c, KIGOU_FEATURE));
          else cont = false;
        }
        if(cont) list.addAll(marks);
        else {
          Morpheme mor = new Morpheme(srfc, ftr);
          if (mor.getHinshi() == Morpheme.HINSHI_MEISHI) {
            // single nouns are also split into bi-grams
            List<Morpheme> words = biGram(srfc, ftr);
            Iterator<Morpheme> w = words.iterator();
            while(w.hasNext()) {
              list.add(w.next());
            }
          } else {
            list.add(mor);
          }
        }
      }
    } catch(final Exception ex) {
      // NOTE(review): leftover debug output — consider removing or routing
      // through Util.debug
      System.out.println(source);
      Util.errln(Util.className(this) + ": " + ex);
    }
    tokenList = list;
    tokens = list.iterator();
    return this;
  }
/**
* divides MEISHI word.
* @param srfc original word
* @param ftr original feature
* @return words divided by the bi-gram
*/
private static List<Morpheme> biGram(final String srfc, final String ftr) {
List<Morpheme> words = new ArrayList<Morpheme>();
final byte[] c = srfc.getBytes();
if (ascii(c)) {
words.add(new Morpheme(srfc, setIntoBaseForm(ftr, srfc)));
return words;
}
final int l = srfc.length();
if (l < 3) {
words.add(new Morpheme(srfc, setIntoBaseForm(ftr, srfc)));
return words;
}
boolean even = false;
if (l % 2 == 0) even = true;
for(int i = 0; i < l; i++) {
boolean boobyToken = false;
boolean lastToken = false;
if (i == (l - 2)) {
boobyToken = true;
}
if (i == (l - 1)) {
lastToken = true;
}
StringBuilder token = new StringBuilder();
if (!lastToken) {
token.append(srfc.substring(i, i + 1));
token.append(srfc.substring(i + 1, i + 2));
String f = ftr;
if (i % 2 == 0) {
f = setIntoBaseForm(ftr, token.toString());
} else {
if (!even && boobyToken) {
f = setIntoBaseForm(ftr, srfc.substring(l - 1));
} else {
f = setIntoBaseForm(ftr, "");
}
}
words.add(new Morpheme(token.toString(), f));
}
}
return words;
}
/**
* set string to base form.
*
* @param ftr future
* @param s replace string
* @return replaced future
*/
private static String setIntoBaseForm(final String ftr, final String s) {
String[] parts = ftr.split(",");
parts[6] = s;
StringBuilder sb = new StringBuilder();
if (parts.length > 0) {
sb.append(parts[0]);
for(int j = 1; j < parts.length; j++) {
sb.append(",");
sb.append(parts[j]);
}
}
return sb.toString();
}
/**
* Returns whether the special character.
* @param s string
* @return result of check
*/
private static boolean isFtChar(final String s) {
return ".".equals(s) || "?".equals(s) || "*".equals(s) || "+".equals(s) ||
"\\".equals(s) || "{".equals(s) || "}".equals(s);
}
  /**
   * Returns whether the following token exists (using wildcards).
   * Adjacent tokens are merged into one wildcard word until a delimiter is
   * reached; the result is stored in {@code currToken}.
   * @return result of check
   */
  private boolean moreWC() {
    final StringBuilder word = new StringBuilder();
    final int size = tokenList.size();
    // period: a "." was seen; bs: a backslash was seen; more: token produced
    boolean period = false, bs = false, more = false;
    for(; cpos < size; cpos++) {
      String cSrfc = tokenList.get(cpos).getSurface();
      final boolean cMark = tokenList.get(cpos).isMark();
      String nSrfc = null;
      boolean nMark = false;
      // peek at the following token, if any
      if(cpos < size - 1) {
        nSrfc = tokenList.get(cpos + 1).getSurface();
        nMark = tokenList.get(cpos + 1).isMark();
      }
      if(nSrfc != null) {
        if("\\".equals(cSrfc)) bs = true;
        // delimiter
        if(cMark && !isFtChar(cSrfc) || "\\".equals(cSrfc) && nMark) {
          period = false;
          bs = false;
          if(word.length() != 0) {
            more = true;
            break;
          }
          // an escaped mark is consumed together with its backslash
          if("\\".equals(cSrfc) && nMark) cpos++;
          continue;
        }
        word.append(cSrfc);
        if(bs || "\\".equals(nSrfc)) {
          more = true;
          continue;
        }
        if(".".equals(cSrfc) || ".".equals(nSrfc)) {
          period = true;
          continue;
        }
        if(period) {
          // ".{m,n}" quantifier: copy everything up to the closing brace
          if("{".equals(cSrfc)) {
            cpos++;
            for(; cpos < size; cpos++) {
              cSrfc = tokenList.get(cpos).getSurface();
              word.append(cSrfc);
              if("}".equals(cSrfc)) {
                more = true;
                break;
              }
            }
            cpos++;
            break;
          }
          continue;
        }
      } else {
        // last token.
        if(cMark) {
          if("\\".equals(cSrfc)) continue;
          if(word.length() != 0) {
            word.append(cSrfc);
          }
          more = true;
          continue;
        }
      }
      if(period) {
        word.append(cSrfc);
      } else {
        // NOTE(review): dangling else — the "else" below binds to the INNER
        // if, i.e. the buffer is only cleared when bs is set and cSrfc is a
        // special character; the original indentation suggested the outer if
        // may have been intended. Behavior kept as compiled.
        if(bs)
          if(!isFtChar(cSrfc)) word.append(cSrfc);
          else word.setLength(0);
      }
      more = true;
      cpos++;
      break;
    }
    if(more) {
      // an empty buffer means the preceding single token is the result
      currToken = word.length() == 0 ? tokenList.get(cpos - 1) :
      new Morpheme(word.toString(), MEISHI_FEATURE);
    }
    return more;
  }
/**
* Returns whether the following token exists.
* @return result
*/
private boolean more() {
if(special) {
return tokens.hasNext();
}
while(tokens.hasNext()) {
currToken = tokens.next();
if(!currToken.isMark() && !currToken.isAttachedWord()) return true;
}
return false;
}
@Override
public boolean hasNext() {
return wc ? moreWC() : more();
}
@Override
public FTSpan next() {
return new FTSpan(nextToken(), pos, sc);
}
/**
* Returns the effective token.
* @return token
*/
private byte[] get() {
pos++;
String n = currToken.getSurface();
final int hinshi = currToken.getHinshi();
if(st &&
(hinshi == Morpheme.HINSHI_DOUSHI ||
hinshi == Morpheme.HINSHI_KEIYOUSHI)) {
n = currToken.getBaseForm();
}
byte[] token = token(n);
final boolean a = ascii(token);
if(!a && !dc) token = WesternTokenizer.dia(token);
if(cs == FTCase.UPPER) token = WesternTokenizer.upper(token, a);
else if(cs != FTCase.SENSITIVE) token = WesternTokenizer.lower(token, a);
return toHankaku(token);
}
/**
* Returns the token which contains special character.
* @return token
*/
private byte[] getSC() {
final Morpheme m = tokens.next();
String n = m.getSurface();
if (m.getHinshi() == Morpheme.HINSHI_MEISHI) {
n = m.getBaseForm();
}
if(m.isMark() || m.isAttachedWord()) sc = true;
else {
pos++;
sc = false;
}
return token(n);
}
@Override
public byte[] nextToken() {
return special ? getSC() : get();
}
  /**
   * Returns the precedence of this tokenizer.
   * NOTE(review): presumably higher values win when several tokenizers
   * support a language — confirm against the Tokenizer base class.
   * @return precedence
   */
  @Override
  protected byte prec() {
    return 20;
  }
  /**
   * Returns the languages supported by this tokenizer (Japanese only).
   * @return language collection
   */
  @Override
  Collection<Language> languages() {
    return collection(LANG);
  }
/**
* Converts to HANKAKU characters.
* @param s Japanese text
* @return result of conversion(->HANKAKU)
*/
private static byte[] toHankaku(final byte[] s) {
if(ascii(s)) return s;
final TokenBuilder tb = new TokenBuilder(s.length);
for(int p = 0; p < s.length; p += cl(s, p)) {
final int c = cp(s, p);
if(c >= 0xFF10 && c <= 0xFF19 || c >= 0xFF21 && c <= 0xFF3A
|| c >= 0xFF41 && c <= 0xFF5A) {
tb.add(c - 0xFEE0);
} else if(c == 0x3000) { // IDEOGRAPHIC SPACE
tb.add(0x0020);
} else if(c == 0xFF01) { // !
tb.add(0x0021);
} else if(c == 0xFF02) { // " FULLWIDTH QUOTATION MARK
tb.add(0x0022);
} else if(c == 0x201C) { // " LEFT DOUBLE QUOTATION MARK
tb.add(0x0022);
} else if(c == 0x201D) { // " RIGHT DOUBLE QUOTATION MARK
tb.add(0x0022);
} else if(c == 0xFF03) { // #
tb.add(0x0023);
} else if(c == 0xFF04) { // $
tb.add(0x0024);
} else if(c == 0xFF05) { // %
tb.add(0x0025);
} else if(c == 0xFF06) { // &
tb.add(0x0026);
} else if(c == 0xFF07) { // ' FULLWIDTH APOSTROPHE
tb.add(0x0027);
} else if(c == 0x2018) { // ' LEFT SINGLE QUOTATION MARK
tb.add(0x0027);
} else if(c == 0x2019) { // ' RIGHT SINGLE QUOTATION MARK
tb.add(0x0027);
} else if(c == 0xFF08) { // (
tb.add(0x0028);
} else if(c == 0xFF09) { // )
tb.add(0x0029);
} else if(c == 0xFF0A) { // *
tb.add(0x002A);
} else if(c == 0xFF0B) { // +
tb.add(0x002B);
} else if(c == 0xFF0C) { // ,
tb.add(0x002C);
} else if(c == 0xFF0D) { // -
tb.add(0x002D);
} else if(c == 0xFF0E) { // .
tb.add(0x002E);
} else if(c == 0xFF0F) { // /
tb.add(0x002F);
} else if(c == 0xFF1A) { // :
tb.add(0x003A);
} else if(c == 0xFF1B) { // ;
tb.add(0x003B);
} else if(c == 0xFF1C) { // <
tb.add(0x003C);
} else if(c == 0xFF1D) { // =
tb.add(0x003D);
} else if(c == 0xFF1E) { // >
tb.add(0x003E);
} else if(c == 0xFF1F) { // ?
tb.add(0x003F);
} else if(c == 0xFF20) { // @
tb.add(0x0040);
} else if(c == 0xFF3B) { // [
tb.add(0x005B);
} else if(c == 0xFFE5) { // \
tb.add(0x005C);
} else if(c == 0xFF3D) { // ]
tb.add(0x005D);
} else if(c == 0xFF3E) { // ^
tb.add(0x005E);
} else if(c == 0xFF3F) { // _
tb.add(0x005F);
} else if(c == 0xFF40) { // `
tb.add(0x0060);
} else if(c == 0xFF5B) { // {
tb.add(0x007B);
} else if(c == 0xFF5C) { // |
tb.add(0x007C);
} else if(c == 0xFF5D) { // }
tb.add(0x007D);
} else if(c == 0xFF5E) { // ~
tb.add(0x007E);
} else {
tb.add(c);
}
}
return tb.finish();
}
  /** Morpheme: a single token together with its Igo feature string. */
  static class Morpheme {
    /** A part of speech in the context, MEISHI(Noun). */
    static final int HINSHI_MEISHI = 1;
    /** A part of speech in the context, RENTAISHI(Pre-noun Adjectival). */
    static final int HINSHI_RENTAISHI = 2;
    /** A part of speech in the context, HUKUSHI(Adverb). */
    static final int HINSHI_HUKUSHI = 3;
    /** A part of speech in the context, DOUSHI(Verb). */
    static final int HINSHI_DOUSHI = 4;
    /** A part of speech in the context, SETSUZOKUSHI(Conjunction). */
    static final int HINSHI_SETSUZOKUSHI = 5;
    /** A part of speech in the context, JYODOUSHI(Modal verbs). */
    static final int HINSHI_JYODOUSHI = 6;
    /** A part of speech in the context, JYOSHI(Postpositional particle). */
    static final int HINSHI_JYOSHI = 7;
    /** A part of speech in the context, KEIYOUSHI(Adjective). */
    static final int HINSHI_KEIYOUSHI = 8;
    /** A part of speech in the context, KIGOU(Mark). */
    static final int HINSHI_KIGOU = 9;
    /** A part of speech in the context, KANDOUSHI(Interjection). */
    static final int HINSHI_KANDOUSHI = 10;
    /** A part of speech in the context, FILLER(Filler). */
    static final int HINSHI_FILLER = 11;
    /** A part of speech in the context, SETTOUSHI(Prefix). */
    static final int HINSHI_SETTOUSHI = 12;
    /** A part of speech in the context, Others. */
    static final int HINSHI_SONOTA = 0;
    /** Surface (literal text) of the morpheme. */
    private final String mSurface;
    /** Comma-separated feature string of the morpheme. */
    private final String mFeature;
    /**
     * Constructor.
     * @param srfc surface
     * @param ftr feature
     */
    Morpheme(final String srfc, final String ftr) {
      mSurface = srfc;
      mFeature = ftr;
    }
    /**
     * Returns the surface.
     * @return surface
     */
    public String getSurface() {
      return mSurface;
    }
    /**
     * Returns whether this morpheme is to be skipped (mark or filler).
     * @return result
     */
    public boolean isMark() {
      final int hinshi = getHinshi();
      return hinshi == HINSHI_KIGOU || hinshi == HINSHI_FILLER;
    }
    /**
     * Tests for an attached word (FUZOKU-GO): modal verb or particle.
     * @return result
     */
    public boolean isAttachedWord() {
      final int hinshi = getHinshi();
      return hinshi == HINSHI_JYODOUSHI || hinshi == HINSHI_JYOSHI;
    }
    /**
     * Returns the part of speech, mapped to one of the HINSHI_* constants.
     * @return part of speech
     */
    public int getHinshi() {
      final int hinshi;
      // morphological analyzer certainly returns
      // the single ascii char as a "noun"; force such chars to "mark" instead
      final byte[] s = token(mSurface);
      if(s.length == 1 && !letter(s[0]) && !digit(s[0])) {
        hinshi = HINSHI_KIGOU;
      } else {
        final String h = getPos();
        if(h.equals(MEISHI)) {
          hinshi = HINSHI_MEISHI;
        } else if(h.equals(RENTAISHI)) {
          hinshi = HINSHI_RENTAISHI;
        } else if(h.equals(HUKUSHI)) {
          hinshi = HINSHI_HUKUSHI;
        } else if(h.equals(DOUSHI)) {
          hinshi = HINSHI_DOUSHI;
        } else if(h.equals(SETSUZOKUSHI)) {
          hinshi = HINSHI_SETSUZOKUSHI;
        } else if(h.equals(SETTOUSHI)) {
          hinshi = HINSHI_SETTOUSHI;
        } else if(h.equals(JYODOUSHI)) {
          hinshi = HINSHI_JYODOUSHI;
        } else if(h.equals(JYOSHI)) {
          hinshi = HINSHI_JYOSHI;
        } else if(h.equals(KEIYOUSHI)) {
          hinshi = HINSHI_KEIYOUSHI;
        } else if(h.equals(KIGOU)) {
          hinshi = HINSHI_KIGOU;
        } else if(h.equals(KANDOUSHI)) {
          hinshi = HINSHI_KANDOUSHI;
        } else if(h.equals(FILLER)) {
          hinshi = HINSHI_FILLER;
        } else {
          hinshi = HINSHI_SONOTA;
        }
      }
      return hinshi;
    }
    /**
     * Retrieves the base form (7th field) from the feature string.
     * @return base form, or an empty string if the feature has none
     */
    public String getBaseForm() {
      final String[] parts = mFeature.split(",");
      if (parts.length < 7) {
        return "";
      }
      return parts[6];
    }
    /**
     * Retrieves the part of speech (first feature field, in Japanese).
     * @return parts of speech(coding in Japanese)
     */
    private String getPos() {
      final String[] parts = mFeature.split(",");
      return parts[0];
    }
    @Override
    public String toString() {
      return mSurface;
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment