Last active
August 29, 2015 14:13
-
-
Save shun91/2c64d0c81355b920ca56 to your computer and use it in GitHub Desktop.
テキストからn-gramを生成するNgramCreator.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.HashMap; | |
import java.util.Map; | |
/**
 * Utility for building word-level n-grams from a text and counting how often
 * each one occurs.
 */
public class NgramCreator {

    /**
     * Builds word n-grams from {@code text} and counts their occurrences.
     *
     * <p>The text is split on the half-width space character; every non-empty
     * token counts as one word. Empty tokens (produced by leading or repeated
     * spaces, or by an empty text) are ignored, so they never become part of
     * an n-gram. The words of each n-gram are joined with a single space to
     * form the map key.
     *
     * @param text
     *            the text to generate n-grams from; must not be {@code null}
     * @param n
     *            the number of words per n-gram; must be at least 1
     * @return a new mutable map from each n-gram to the number of times it
     *         occurs in {@code text}; empty when the text contains fewer than
     *         {@code n} words
     * @throws NullPointerException
     *             if {@code text} is {@code null}
     * @throws IllegalArgumentException
     *             if {@code n} is zero or negative
     */
    public static final Map<String, Integer> createNgram(final String text, final int n) {
        if (text == null) {
            throw new NullPointerException("text must not be null");
        }
        if (n <= 0) {
            // Previously this surfaced as a StringIndexOutOfBoundsException
            // from trimming an empty builder.
            throw new IllegalArgumentException("n must be at least 1, but was " + n);
        }

        // split(" ", 0) drops trailing empty strings but keeps leading and
        // interior ones; compact the array in place so only real words remain.
        final String[] words = text.split(" ", 0);
        int wordCount = 0;
        for (int i = 0; i < words.length; i++) {
            if (words[i].length() > 0) {
                words[wordCount++] = words[i];
            }
        }

        // Number of n-grams that can be produced (also the loop count);
        // zero or negative when there are fewer than n words.
        final int numberOfNgram = wordCount - n + 1;

        final Map<String, Integer> ngramMap = new HashMap<String, Integer>();
        final StringBuilder ngramSb = new StringBuilder();

        for (int start = 0; start < numberOfNgram; start++) {
            // Build one n-gram: words[start] .. words[start + n - 1], space separated.
            ngramSb.setLength(0);
            for (int j = start; j < start + n; j++) {
                if (j > start) {
                    ngramSb.append(' ');
                }
                ngramSb.append(words[j]);
            }
            final String ngramStr = ngramSb.toString();

            // Count the n-gram.
            final Integer previous = ngramMap.get(ngramStr);
            ngramMap.put(ngramStr, previous == null ? 1 : previous + 1);
        }
        return ngramMap;
    }

    /**
     * Demo entry point: prints every 3-gram of a sample sentence together
     * with its count, one per line.
     *
     * @param args
     *            unused
     */
    public static void main(String[] args) {
        String text = "In the fields of computational linguistics and probability, an n-gram is a contiguous sequence of n items from a given sequence of text or speech. The items can be phonemes, syllables, letters, words or base pairs according to the application. The n-grams typically are collected from a text or speech corpus.";
        int n = 3;
        Map<String, Integer> map = createNgram(text, n);
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
            System.out.println(entry.getKey() + " " + entry.getValue());
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment