Skip to content

Instantly share code, notes, and snippets.

@shun91
Last active August 29, 2015 14:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shun91/2c64d0c81355b920ca56 to your computer and use it in GitHub Desktop.
Save shun91/2c64d0c81355b920ca56 to your computer and use it in GitHub Desktop.
テキストからn-gramを生成するNgramCreator.java
import java.util.HashMap;
import java.util.Map;
/**
 * Utility for building word-level n-grams from a piece of text.
 */
public class NgramCreator {

    /**
     * Builds the word-level n-grams of {@code text} and counts how often each one occurs.
     *
     * <p>The text is split on single ASCII space characters and every resulting piece is
     * treated as one word, so punctuation stays attached to its word and consecutive
     * spaces produce empty-string words. The words of one n-gram are joined with a single
     * space to form the map key. If the text contains fewer than {@code n} words, the
     * returned map is empty.
     *
     * @param text
     *            the text to build n-grams from; must not be {@code null}
     * @param n
     *            the number of words per n-gram; must be at least 1
     * @return a new mutable map from each n-gram to its number of occurrences
     * @throws NullPointerException
     *             if {@code text} is {@code null}
     * @throws IllegalArgumentException
     *             if {@code n} is less than 1
     */
    public static final Map<String, Integer> createNgram(final String text, final int n) {
        if (text == null) {
            throw new NullPointerException("text must not be null");
        }
        // Without this guard an n of 0 or less would build an empty n-gram and then fail
        // with an unrelated StringIndexOutOfBoundsException further down.
        if (n < 1) {
            throw new IllegalArgumentException("n must be at least 1, but was " + n);
        }

        final String[] words = text.split(" ", 0);
        // Number of n-grams that fit into the word list (also the outer loop count);
        // zero or negative when the text has fewer than n words.
        final int numberOfNgram = words.length - n + 1;
        final Map<String, Integer> ngramMap = new HashMap<String, Integer>();
        final StringBuilder ngramSb = new StringBuilder();

        for (int i = 0; i < numberOfNgram; i++) {
            // Build one n-gram from words[i] .. words[i + n - 1], reusing the same builder.
            ngramSb.setLength(0);
            for (int j = i; j < i + n; j++) {
                if (j > i) {
                    ngramSb.append(' ');
                }
                ngramSb.append(words[j]);
            }
            final String ngramStr = ngramSb.toString();

            // Count the n-gram: first occurrence starts at 1, later ones increment.
            final Integer count = ngramMap.get(ngramStr);
            ngramMap.put(ngramStr, count == null ? 1 : count + 1);
        }
        return ngramMap;
    }

    /**
     * Demo entry point: prints every 3-gram of a sample text together with its count.
     *
     * @param args
     *            command-line arguments (unused)
     */
    public static void main(String[] args) {
        String text = "In the fields of computational linguistics and probability, an n-gram is a contiguous sequence of n items from a given sequence of text or speech. The items can be phonemes, syllables, letters, words or base pairs according to the application. The n-grams typically are collected from a text or speech corpus.";
        int n = 3;
        Map<String, Integer> map = createNgram(text, n);
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
            System.out.println(entry.getKey() + " " + entry.getValue());
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment