Created
August 15, 2011 08:43
-
-
Save tralamazza/1145911 to your computer and use it in GitHub Desktop.
breathalizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* breathalizer.cpp | |
* | |
* Created on: Aug 13, 2011 | |
* Author: Daniel Tralamazza <daniel@tralamazza.com> | |
* URL: http://www.facebook.com/careers/puzzles.php?puzzle_id=17 | |
*/ | |
#define NORVIG | |
#define FACEBOOK | |
#include <string> | |
#include <iostream> | |
#include <map> | |
#include <vector> | |
#include <algorithm> | |
#include <sstream> | |
#include <fstream> | |
#ifdef NORVIG | |
#include <queue> | |
#include <utility> | |
#include <set> | |
#endif | |
using namespace std; | |
typedef map<string, int> map_str_int_type; | |
typedef map<int, vector<string> > map_int_vstr; | |
static map_str_int_type uwords; | |
static map_int_vstr idict; | |
#ifdef NORVIG | |
typedef set<string> set_str; | |
static set_str dict; | |
#endif | |
#ifdef FACEBOOK | |
static void read_input(const char* fn) { | |
ifstream is(fn, ifstream::in); | |
stringstream ss; | |
string line, w; | |
while (getline(is, line)) { | |
ss.str(line); | |
while (ss >> w) { | |
transform(w.begin(), w.end(), w.begin(), ::toupper); // upper case | |
uwords[w] = uwords[w] + 1; // frequency | |
} | |
} | |
} | |
static void read_dict() { | |
ifstream is("/var/tmp/twl06.txt", ifstream::in); | |
stringstream ss; | |
string line; | |
while (getline(is, line)) { | |
ss.str(line); | |
ss >> line; | |
idict[line.size()].push_back(line); | |
#ifdef NORVIG | |
dict.insert(line); | |
#endif | |
} | |
} | |
#else // FACEBOOK | |
static void read_input_dict(const char* fn) { | |
ifstream is(fn, ifstream::in); | |
stringstream ss; | |
string line, w; | |
bool read_phrase = true; | |
while (getline(is, line)) { | |
if (line.empty()) | |
read_phrase = false; | |
ss.str(line); | |
if (read_phrase) { | |
while (ss >> w) { | |
transform(w.begin(), w.end(), w.begin(), ::toupper); | |
uwords[w] = uwords[w] + 1; | |
} | |
} else { | |
ss >> line; | |
idict[line.size()].push_back(line); | |
#ifdef NORVIG | |
dict.insert(line); | |
#endif | |
} | |
} | |
} | |
#endif // FACEBOOK | |
typedef pair<string, int> word_k; | |
#ifdef NORVIG | |
static inline int push_if_not(set<string>& visited, queue<word_k>& q, const string& w, const int k) { | |
if (w.empty()) return 0; | |
if (visited.count(w) == 0) { | |
q.push(make_pair(w, k)); | |
visited.insert(w); | |
} | |
return dict.count(w); | |
} | |
static int shuffle_word(const string &w) { | |
set<string> visited; | |
queue<word_k> bfs_q; | |
if (push_if_not(visited, bfs_q, w, 0)) | |
return 0; | |
while (!bfs_q.empty()) { | |
const string wk = bfs_q.front().first; | |
const unsigned int& k = bfs_q.front().second + 1; | |
const string::size_type wlen = wk.size(); | |
bfs_q.pop(); | |
for (string::size_type i = 0; i <= wlen; ++i) { | |
const string& sub_0i = (i < wlen) ? wk.substr(0, i) : wk; | |
const string& sub_i1n = (i + 1 < wlen) ? wk.substr(i + 1) : ""; | |
const string& sub_in = (i < wlen) ? wk.substr(i) : ""; | |
if ( push_if_not(visited, bfs_q, sub_0i + sub_i1n, k) ) return k; | |
for (char c = 'A'; c <= 'Z'; ++c) { | |
if ( push_if_not(visited, bfs_q, sub_0i + c + sub_i1n, k) ) return k; | |
if ( push_if_not(visited, bfs_q, sub_0i + c + sub_in, k) ) return k; | |
} | |
} | |
} | |
return -1; | |
} | |
#endif | |
static int levenshtein(const string& s1, const string& s2, const int k) { | |
int s1n = s1.size(); | |
int s2n = s2.size(); | |
const char* s1_c = s1.c_str(); | |
const char* s2_c = s2.c_str(); | |
while (s2n > 0 && s1n > 0 && *s1_c == *s2_c) { | |
s2n--; | |
s1n--; | |
s1_c++; | |
s2_c++; | |
} | |
while (s2n > 0 && s1n > 0 && s1_c[s2n-1] == s2_c[s1n-1]) { | |
s2n--; | |
s1n--; | |
} | |
if (s2n == 0) return s1n; | |
if (s1n == 0) return s2n; | |
int d[s1n + 1][s2n + 1]; | |
for(int j = 0; j <= s2n; ++j) | |
d[0][j] = j; | |
for (int i = 1; i <= s1n; ++i) { | |
d[i][0] = i; | |
const char si = s1_c[i - 1]; | |
for (int j = 1; j <= s2n; ++j) { | |
const int cost = (si == s2_c[j - 1]) ? 0 : 1; | |
int min = d[i - 1][j - 1] + cost; | |
const int a = d[i - 1][j] + 1; | |
const int b = d[i][j - 1] + 1; | |
if (a < min) min = a; | |
if (b < min) min = b; | |
d[i][j] = min; | |
} | |
} | |
return d[s1n][s2n]; | |
} | |
static int scan_dict(const int dict, const string& w, int min) { | |
const vector<string> &vd = idict[dict]; | |
if (vd.empty()) | |
return min; // no dictionary for this size | |
int dist = 0; | |
string min_w; | |
for (vector<string>::const_iterator id = vd.begin(); id != vd.end(); id++) { | |
dist = levenshtein(w, (*id), min); | |
if (dist < min) { | |
min = dist; | |
min_w = (*id); | |
} | |
if (min == 0) | |
break; // found perfect match | |
} | |
return min; | |
} | |
int main(int argc, char** args) { | |
#ifdef FACEBOOK | |
read_input(args[1]); | |
read_dict(); | |
#else | |
read_input_dict(args[1]); | |
#endif | |
int sum = 0; | |
for (map_str_int_type::const_iterator uw = uwords.begin(); uw != uwords.end(); ++uw) { | |
const string w = (*uw).first; | |
int min; | |
#ifdef NORVIG | |
if (w.size() > 4) { | |
#endif | |
const int len = w.size(); | |
min = scan_dict(len, w, len); | |
for (int k = 1; ; ++k) { | |
if (k > min || min == 0) break; | |
min = scan_dict(len + k, w, min); | |
if (k > min || min == 0) break; | |
min = scan_dict(len - k, w, min); | |
} | |
#ifdef NORVIG | |
} else | |
min = shuffle_word(w); | |
#endif | |
sum += min * (*uw).second; | |
} | |
cout << sum << endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Norvig's Spell-correct port. Good for small words.