Skip to content

Instantly share code, notes, and snippets.

@seikichi
Created May 9, 2011 03:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save seikichi/962015 to your computer and use it in GitHub Desktop.
Save seikichi/962015 to your computer and use it in GitHub Desktop.
サークルの講座用(2011/5/9)
/*
サークルの講座用に書いた何か.
`もう/もう 何/なに も/も 怖/コワ く/く な/な い/い 。/。`
という形式のコーパスを読み込んで仮名漢字変換っぽい何かをします.
未知語とか何も考えてない上にスムージングのパラメータも適当でこれはひどい.
「EUCだから1文字2バイトだろ」とか決め打ちでこれはひどい.
全体的に富豪気味でこれはひどい.
using namespace std; もぐもぐ!
gcc 4.4で動作確認.
% echo "もうなにもこわくない" | nkf -e | ./a.out corpus/all | nkf -w
=> もう何も怖くない
*/
#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <sstream>
#include <tr1/unordered_map>
#include <algorithm>
#include <queue>
#include <cmath>
using namespace std;
// Generic string -> double table; the building block for the models below.
typedef tr1::unordered_map<string, double> StringDoubleMap;
// word -> value. make_model() first stores raw counts here, then divides
// each entry by the total so the values sum to 1.
typedef tr1::unordered_map<string, double> Unigram;
// previous word -> (word -> value). make_model() normalises each inner table
// by its own total.
typedef tr1::unordered_map<string, StringDoubleMap > Bigram;
// reading -> (word -> value). get_yomi() fills it with
// count(word, reading) / count(word).
typedef tr1::unordered_map<string, StringDoubleMap > YomiProb;
// Language model holding a unigram table, a bigram table, and one
// interpolation weight for each of them.
struct LM {
  // Weight applied to the unigram value in prob().
  double lambda1;
  // Weight applied to the bigram value in prob().
  double lambda2;
  // word -> value table.
  Unigram unigram;
  // previous word -> (word -> value) table.
  Bigram bigram;

  // Stores the two weights; both tables start out empty.
  LM(double unigram_weight, double bigram_weight)
      : lambda1(unigram_weight), lambda2(bigram_weight) {}

  // Interpolated score of `w` following `prev` (defined out of line).
  double prob(const string& prev, const string& w) const;
};
// Reads a "word/reading" corpus from `in` and fills `*yomi` (indexed by reading).
void get_yomi(istream& in, YomiProb* yomi);
// Reads the same corpus format from `in` and fills the unigram and bigram tables.
void make_model(istream& in, Unigram* uni, Bigram* bg);
// Converts the input string `hira` using `model` and `yomi`; returns the
// converted text, or an "!!ERROR" message string when no conversion is found.
string henkan(const LM& model, const YomiProb& yomi, const string& hira);
// Entry point: `kakan corpus`.
//
// Builds the reading table and the language model from the corpus file named
// by argv[1], then converts every whitespace-separated token read from
// standard input and prints each result prefixed with "=> ".
//
// Returns -1 on a usage error or when the corpus cannot be opened, 0 otherwise.
int main(int argc, char **argv) {
  if (argc != 2) {
    cerr << "Usage: kakan corpus" << endl;
    return -1;
  }
  LM model(0.3, 0.7);
  YomiProb yomi;
  const char *filename = argv[1];
  // The corpus is read twice: once for the reading table, once for the
  // n-gram counts. Each pass uses its own stream so it starts at the top.
  {
    ifstream in(filename);
    if (!in) {
      // Without this check an unreadable corpus would silently yield an
      // empty model and every conversion would fail.
      cerr << "Error: cannot open corpus file: " << filename << endl;
      return -1;
    }
    get_yomi(in, &yomi);
  }
  {
    ifstream in(filename);
    if (!in) {
      cerr << "Error: cannot open corpus file: " << filename << endl;
      return -1;
    }
    make_model(in, &model.unigram, &model.bigram);
  }
  string raw;
  while (cin >> raw) {
    cout << "=> " << henkan(model, yomi, raw) << endl;
  }
  return 0;
}
double LM::prob(const string& prev, const string& w) const {
double p1 = 0.0, p2 = 0.0;
Unigram::const_iterator ui = unigram.find(w);
if (ui != unigram.end()) { p1 = ui->second; }
Bigram::const_iterator bi = bigram.find(prev);
if (bi != bigram.end()) {
Unigram::const_iterator it = bi->second.find(w);
if (it != bi->second.end()) { p2 = it->second; }
}
return lambda1*p1 + lambda2*p2;
}
// Splits a "word/reading" token at its first '/'.
// Returns (text before the slash, text after the slash). When the token has
// no '/', find() yields npos, so the first half is the whole token and,
// because npos + 1 wraps around to 0, so is the second half.
pair<string, string> split(const string& s) {
  const size_t slash = s.find('/');
  const string head = s.substr(0, slash);
  const string tail = s.substr(slash + 1);
  pair<string, string> halves(head, tail);
  return halves;
}
void get_yomi(istream& in, YomiProb* yomi) {
YomiProb tmp;
string line, wordyomi;
while (getline(in, line)) {
istringstream iss(line);
while (iss >> wordyomi) {
string w, y;
pair<string, string> p = split(wordyomi);
w = p.first;
y = p.second;
tmp[w][y] += 1.0;
}
}
for (YomiProb::iterator it = tmp.begin();
it != tmp.end();
++it) {
double sum = 0.0;
for (StringDoubleMap::iterator jt = it->second.begin();
jt != it->second.end();
++jt) {
sum += jt->second;
}
for (StringDoubleMap::iterator jt = it->second.begin();
jt != it->second.end();
++jt) {
(*yomi)[jt->first][it->first] = jt->second / sum;
}
}
(*yomi)["BT"]["BT"] = 1.0;
}
void make_model(istream& in, Unigram* uni, Bigram* bg) {
string line, wordyomi;
while (getline(in, line)) {
string prevw = "BT", w, y;
istringstream iss(line + " BT/BT");
while (iss >> wordyomi) {
pair<string, string> p = split(wordyomi);
w = p.first;
y = p.second;
(*uni)[w] += 1.0;
(*bg)[prevw][w] += 1.0;
prevw = w;
}
}
double usum = 0.0;
for (Unigram::iterator it = uni->begin();
it != uni->end();
++it) {
usum += it->second;
}
for (Unigram::iterator it = uni->begin();
it != uni->end();
++it) {
it->second /= usum;
}
for (Bigram::iterator it = bg->begin();
it != bg->end();
++it) {
double sum = 0.0;
for (Unigram::iterator jt = it->second.begin();
jt != it->second.end();
++jt) {
sum += jt->second;
}
for (Unigram::iterator jt = it->second.begin();
jt != it->second.end();
++jt) {
jt->second /= sum;
}
}
}
// One search state for henkan().
struct Node {
  // Byte offset into the input that has been consumed so far.
  unsigned int index;
  // Accumulated score of this partial result.
  double prob;
  // The most recently chosen word.
  string prev;
  // Concatenation of every word chosen so far.
  string all;

  Node(int index_, double prob_,
       const string& prev_, const string& all_)
      : index(index_), prob(prob_), prev(prev_), all(all_) {}

  // Orders nodes by score only, so a priority_queue<Node> yields the
  // highest-scoring node from top().
  bool operator<(const Node& rhs) const {
    return rhs.prob > prob;
  }
};
string henkan(const LM& model,
const YomiProb& yomi,
const string& hira) {
string ret = "!!ERROR: couldn't translate `" + hira + "` !!!";
const string raw = hira + "BT";
priority_queue<Node> Q;
Q.push(Node(0, 0.0, "BT", ""));
tr1::unordered_map<int, tr1::unordered_map<string, double> > memo;
while (!Q.empty()) {
Node n = Q.top(); Q.pop();
memo[n.index][n.prev] = n.prob;
if (n.index > 0 && n.prev == "BT") {
ret = n.all.substr(0, n.all.length()-2);
break;
}
for (int i=2; n.index+i<=raw.length(); i+=2) {
string rword = raw.substr(n.index, i);
YomiProb::const_iterator f = yomi.find(rword);
if (f != yomi.end()) {
StringDoubleMap::const_iterator it;;
for (it = f->second.begin();
it != f->second.end();
++it) {
int index = n.index + i;
string kanji = it->first;
double kanji_prob = it->second;
double model_prob = model.prob(n.prev, kanji);
double prob = log2(kanji_prob)
+ log2(model_prob)
+ n.prob;
if ((memo[index].find(kanji) != memo[index].end())
&& (prob < memo[index][kanji])) {
continue;
}
Q.push(Node(index, prob, kanji, n.all+kanji));
}
}
}
}
return ret;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment