tralamazza/breathalizer.cpp

## breathalizer.cpp
/*
 * breathalizer.cpp
 *
 *  Created on: Aug 13, 2011
 *      Author: Daniel Tralamazza <daniel@tralamazza.com>
 *      URL: http://www.facebook.com/careers/puzzles.php?puzzle_id=17
 */

#define NORVIG
#define FACEBOOK

#include <string>
#include <iostream>
#include <map>
#include <vector>
#include <algorithm>
#include <sstream>
#include <fstream>
#ifdef NORVIG
#include <queue>
#include <utility>
#include <set>
#endif

using namespace std;

typedef map<string, int> map_str_int_type;
typedef map<int, vector<string> > map_int_vstr;

static map_str_int_type uwords;
static map_int_vstr idict;

#ifdef NORVIG
typedef set<string> set_str;
static set_str dict;
#endif

#ifdef FACEBOOK
static void read_input(const char* fn) {
	ifstream is(fn, ifstream::in);
	stringstream ss;
	string line, w;
	while (getline(is, line)) {
		ss.str(line);
		while (ss >> w) {
			transform(w.begin(), w.end(), w.begin(), ::toupper); // upper case
			uwords[w] = uwords[w] + 1; // frequency
		}
	}
}

static void read_dict() {
	ifstream is("/var/tmp/twl06.txt", ifstream::in);
	stringstream ss;
	string line;
	while (getline(is, line)) {
		ss.str(line);
		ss >> line;
		idict[line.size()].push_back(line);
#ifdef NORVIG
		dict.insert(line);
#endif
	}
}
#else // FACEBOOK
static void read_input_dict(const char* fn) {
	ifstream is(fn, ifstream::in);
	stringstream ss;
	string line, w;
	bool read_phrase = true;
	while (getline(is, line)) {
		if (line.empty())
			read_phrase = false;
		ss.str(line);
		if (read_phrase) {
			while (ss >> w) {
				transform(w.begin(), w.end(), w.begin(), ::toupper);
				uwords[w] = uwords[w] + 1;
			}
		} else {
			ss >> line;
			idict[line.size()].push_back(line);
#ifdef NORVIG
			dict.insert(line);
#endif
		}
	}
}
#endif // FACEBOOK
typedef pair<string, int> word_k;

#ifdef NORVIG
static inline int push_if_not(set<string>& visited, queue<word_k>& q, const string& w, const int k) {
	if (w.empty()) return 0;
	if (visited.count(w) == 0) {
		q.push(make_pair(w, k));
		visited.insert(w);
	}
	return dict.count(w);
}

static int shuffle_word(const string &w) {
	set<string> visited;
	queue<word_k> bfs_q;
	if (push_if_not(visited, bfs_q, w, 0))
		return 0;
	while (!bfs_q.empty()) {
		const string wk = bfs_q.front().first;
		const unsigned int& k = bfs_q.front().second + 1;
		const string::size_type wlen = wk.size();
		bfs_q.pop();

		for (string::size_type i = 0; i <= wlen; ++i) {
			const string& sub_0i = (i < wlen) ? wk.substr(0, i) : wk;
			const string& sub_i1n = (i + 1 < wlen) ? wk.substr(i + 1) : "";
			const string& sub_in = (i < wlen) ? wk.substr(i) : "";

			if ( push_if_not(visited, bfs_q, sub_0i + sub_i1n, k) ) return k;
			for (char c = 'A'; c <= 'Z'; ++c) {
				if ( push_if_not(visited, bfs_q, sub_0i + c + sub_i1n, k) ) return k;
				if ( push_if_not(visited, bfs_q, sub_0i + c + sub_in, k) ) return k;
			}
		}
	}
	return -1;
}
#endif

static int levenshtein(const string& s1, const string& s2, const int k) {
	int s1n = s1.size();
	int s2n = s2.size();

	const char* s1_c = s1.c_str();
	const char* s2_c = s2.c_str();

	while (s2n > 0 && s1n > 0 && *s1_c == *s2_c) {
		s2n--;
		s1n--;
		s1_c++;
		s2_c++;
	}

	while (s2n > 0 && s1n > 0 && s1_c[s2n-1] == s2_c[s1n-1]) {
		s2n--;
		s1n--;
	}

	if (s2n == 0) return s1n;
	if (s1n == 0) return s2n;

	int d[s1n + 1][s2n + 1];

	for(int j = 0; j <= s2n; ++j)
		d[0][j] = j;
	for (int i = 1; i <= s1n; ++i) {
		d[i][0] = i;
		const char si = s1_c[i - 1];
		for (int j = 1; j <= s2n; ++j) {
			const int cost = (si == s2_c[j - 1]) ? 0 : 1;
			int min = d[i - 1][j - 1] + cost;
			const int a = d[i - 1][j] + 1;
			const int b = d[i][j - 1] + 1;
			if (a < min) min = a;
			if (b < min) min = b;
			d[i][j] = min;
		}
	}

	return d[s1n][s2n];
}

static int scan_dict(const int dict, const string& w, int min) {
	const vector<string> &vd = idict[dict];
	if (vd.empty())
		return min; // no dictionary for this size
	int dist = 0;

	string min_w;
	for (vector<string>::const_iterator id = vd.begin(); id != vd.end(); id++) {
		dist = levenshtein(w, (*id), min);
		if (dist < min) {
			min = dist;
			min_w = (*id);
		}
		if (min == 0)
			break; // found perfect match
	}
	return min;
}

int main(int argc, char** args) {
#ifdef FACEBOOK
	read_input(args[1]);
	read_dict();
#else
	read_input_dict(args[1]);
#endif

	int sum = 0;
	for (map_str_int_type::const_iterator uw = uwords.begin(); uw != uwords.end(); ++uw) {
		const string w = (*uw).first;

		int min;
#ifdef NORVIG
		if (w.size() > 4) {
#endif
			const int len = w.size();
			min = scan_dict(len, w, len);
			for (int k = 1; ; ++k) {
				if (k > min || min == 0) break;
				min = scan_dict(len + k, w, min);
				if (k > min || min == 0) break;
				min = scan_dict(len - k, w, min);
			}
#ifdef NORVIG
		} else
			min = shuffle_word(w);
#endif
		sum += min * (*uw).second;
	}

	cout << sum << endl;
	return 0;
}
	/*
	* breathalizer.cpp
	*
	* Created on: Aug 13, 2011
	* Author: Daniel Tralamazza <daniel@tralamazza.com>
	* URL: http://www.facebook.com/careers/puzzles.php?puzzle_id=17
	*/

	#define NORVIG
	#define FACEBOOK

	#include <string>
	#include <iostream>
	#include <map>
	#include <vector>
	#include <algorithm>
	#include <sstream>
	#include <fstream>
	#ifdef NORVIG
	#include <queue>
	#include <utility>
	#include <set>
	#endif

	using namespace std;

	typedef map<string, int> map_str_int_type;
	typedef map<int, vector<string> > map_int_vstr;

	static map_str_int_type uwords;
	static map_int_vstr idict;

	#ifdef NORVIG
	typedef set<string> set_str;
	static set_str dict;
	#endif

	#ifdef FACEBOOK
	static void read_input(const char* fn) {
	ifstream is(fn, ifstream::in);
	stringstream ss;
	string line, w;
	while (getline(is, line)) {
	ss.str(line);
	while (ss >> w) {
	transform(w.begin(), w.end(), w.begin(), ::toupper); // upper case
	uwords[w] = uwords[w] + 1; // frequency
	}
	}
	}

	static void read_dict() {
	ifstream is("/var/tmp/twl06.txt", ifstream::in);
	stringstream ss;
	string line;
	while (getline(is, line)) {
	ss.str(line);
	ss >> line;
	idict[line.size()].push_back(line);
	#ifdef NORVIG
	dict.insert(line);
	#endif
	}
	}
	#else // FACEBOOK
	static void read_input_dict(const char* fn) {
	ifstream is(fn, ifstream::in);
	stringstream ss;
	string line, w;
	bool read_phrase = true;
	while (getline(is, line)) {
	if (line.empty())
	read_phrase = false;
	ss.str(line);
	if (read_phrase) {
	while (ss >> w) {
	transform(w.begin(), w.end(), w.begin(), ::toupper);
	uwords[w] = uwords[w] + 1;
	}
	} else {
	ss >> line;
	idict[line.size()].push_back(line);
	#ifdef NORVIG
	dict.insert(line);
	#endif
	}
	}
	}
	#endif // FACEBOOK
	typedef pair<string, int> word_k;

	#ifdef NORVIG
	static inline int push_if_not(set<string>& visited, queue<word_k>& q, const string& w, const int k) {
	if (w.empty()) return 0;
	if (visited.count(w) == 0) {
	q.push(make_pair(w, k));
	visited.insert(w);
	}
	return dict.count(w);
	}

	static int shuffle_word(const string &w) {
	set<string> visited;
	queue<word_k> bfs_q;
	if (push_if_not(visited, bfs_q, w, 0))
	return 0;
	while (!bfs_q.empty()) {
	const string wk = bfs_q.front().first;
	const unsigned int& k = bfs_q.front().second + 1;
	const string::size_type wlen = wk.size();
	bfs_q.pop();

	for (string::size_type i = 0; i <= wlen; ++i) {
	const string& sub_0i = (i < wlen) ? wk.substr(0, i) : wk;
	const string& sub_i1n = (i + 1 < wlen) ? wk.substr(i + 1) : "";
	const string& sub_in = (i < wlen) ? wk.substr(i) : "";

	if ( push_if_not(visited, bfs_q, sub_0i + sub_i1n, k) ) return k;
	for (char c = 'A'; c <= 'Z'; ++c) {
	if ( push_if_not(visited, bfs_q, sub_0i + c + sub_i1n, k) ) return k;
	if ( push_if_not(visited, bfs_q, sub_0i + c + sub_in, k) ) return k;
	}
	}
	}
	return -1;
	}
	#endif

	static int levenshtein(const string& s1, const string& s2, const int k) {
	int s1n = s1.size();
	int s2n = s2.size();

	const char* s1_c = s1.c_str();
	const char* s2_c = s2.c_str();

	while (s2n > 0 && s1n > 0 && s1_c == s2_c) {
	s2n--;
	s1n--;
	s1_c++;
	s2_c++;
	}

	while (s2n > 0 && s1n > 0 && s1_c[s2n-1] == s2_c[s1n-1]) {
	s2n--;
	s1n--;
	}

	if (s2n == 0) return s1n;
	if (s1n == 0) return s2n;

	int d[s1n + 1][s2n + 1];

	for(int j = 0; j <= s2n; ++j)
	d[0][j] = j;
	for (int i = 1; i <= s1n; ++i) {
	d[i][0] = i;
	const char si = s1_c[i - 1];
	for (int j = 1; j <= s2n; ++j) {
	const int cost = (si == s2_c[j - 1]) ? 0 : 1;
	int min = d[i - 1][j - 1] + cost;
	const int a = d[i - 1][j] + 1;
	const int b = d[i][j - 1] + 1;
	if (a < min) min = a;
	if (b < min) min = b;
	d[i][j] = min;
	}
	}

	return d[s1n][s2n];
	}

	static int scan_dict(const int dict, const string& w, int min) {
	const vector<string> &vd = idict[dict];
	if (vd.empty())
	return min; // no dictionary for this size
	int dist = 0;

	string min_w;
	for (vector<string>::const_iterator id = vd.begin(); id != vd.end(); id++) {
	dist = levenshtein(w, (*id), min);
	if (dist < min) {
	min = dist;
	min_w = (*id);
	}
	if (min == 0)
	break; // found perfect match
	}
	return min;
	}

	int main(int argc, char** args) {
	#ifdef FACEBOOK
	read_input(args[1]);
	read_dict();
	#else
	read_input_dict(args[1]);
	#endif

	int sum = 0;
	for (map_str_int_type::const_iterator uw = uwords.begin(); uw != uwords.end(); ++uw) {
	const string w = (*uw).first;

	int min;
	#ifdef NORVIG
	if (w.size() > 4) {
	#endif
	const int len = w.size();
	min = scan_dict(len, w, len);
	for (int k = 1; ; ++k) {
	if (k > min \|\| min == 0) break;
	min = scan_dict(len + k, w, min);
	if (k > min \|\| min == 0) break;
	min = scan_dict(len - k, w, min);
	}
	#ifdef NORVIG
	} else
	min = shuffle_word(w);
	#endif
	sum += min * (*uw).second;
	}

	cout << sum << endl;
	return 0;
	}