Created
May 14, 2012 18:41
-
-
Save foota/2695610 to your computer and use it in GitHub Desktop.
Edit distance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <string> | |
#include <bitset> | |
#include <ctime> | |
using namespace std; | |
// dynamic programming | |
const int SIZE = 100; | |
int edit_distance_dp(const string& str1, const string& str2) | |
{ | |
static int d[SIZE][SIZE]; | |
for (int i = 0; i < str1.size() + 1; i++) d[i][0] = i; | |
for (int i = 0; i < str2.size() + 1; i++) d[0][i] = i; | |
for (int i = 1; i < str1.size() + 1; i++) | |
for (int j = 1; j < str2.size() + 1; j++) | |
d[i][j] = min(min(d[i-1][j], d[i][j-1]) + 1, d[i-1][j-1] + (str1[i-1] == str2[j-1] ? 0 : 1)); | |
return d[str1.size()][str2.size()]; | |
} | |
// O(ND) algorithm | |
int edit_distance_ond(const string& str1, const string& str2) | |
{ | |
static int V[SIZE]; | |
int x, y; | |
int offset = str1.size(); | |
V[offset + 1] = 0; | |
for (int D = 0; D <= str1.size() + str2.size(); D++) { | |
for (int k = -D; k <= D; k += 2) { | |
if (k == -D || k != D && V[k-1+offset] < V[k+1+offset]) x = V[k+1+offset]; | |
else x = V[k-1+offset] + 1; | |
y = x - k; | |
while (x < str1.size() && y < str2.size() && str1[x] == str2[y]) { | |
x++; | |
y++; | |
} | |
V[k+offset] = x; | |
if (x >= str1.size() && y >= str2.size()) return D; | |
} | |
} | |
return -1; | |
} | |
// O(NP) algorithm | |
int snake(int k, int y, const string& str1, const string& str2) | |
{ | |
int x = y - k; | |
while (x < str1.size() && y < str2.size() && str1[x] == str2[y]) { | |
x++; | |
y++; | |
} | |
return y; | |
} | |
int edit_distance_onp(const string& str1, const string& str2) | |
{ | |
// required: s1->size() <= s2->size() | |
const string* const s1 = str1.size() > str2.size() ? &str2 : &str1; | |
const string* const s2 = str1.size() > str2.size() ? &str1 : &str2; | |
static int fp[SIZE]; | |
int x, y, k, p; | |
int offset = s1->size() + 1; | |
int delta = s2->size() - s1->size(); | |
for (int i = 0; i < SIZE; i++) fp[i] = -1; | |
for (p = 0; fp[delta + offset] != s2->size(); p++) { | |
for(k = -p; k < delta; k++) | |
fp[k + offset] = snake(k, max(fp[k-1+offset] + 1, fp[k+1+offset]), *s1, *s2); | |
for(k = delta + p; k > delta; k--) | |
fp[k + offset] = snake(k, max(fp[k-1+offset] + 1, fp[k+1+offset]), *s1, *s2); | |
fp[delta + offset] = snake(delta, max(fp[delta-1+offset] + 1, fp[delta+1+offset]), *s1, *s2); | |
} | |
return delta + (p - 1) * 2; | |
} | |
// bit-parallel | |
template<typename T> | |
int edit_distance_bit(const string& str1, const string& str2) | |
{ | |
char s1[sizeof(T) * 8] = { ' ' }; | |
char s2[sizeof(T) * 8] = { ' ' }; | |
char *p1, *p2; | |
// required: str1.size() >= str2.size() | |
if (str1.size() >= str2.size()) { p1 = s1; p2 = s2; } | |
else { p1 = s2; p2 = s1; } | |
copy(str1.begin(), str1.end(), p1 + 1); | |
copy(str2.begin(), str2.end(), p2 + 1); | |
int m = strlen(s1); | |
int n = strlen(s2); | |
const T ONE = 1; | |
T Peq[256] = { 0 }; | |
T Pv, Eq, Xv, Xh, Ph, Mh; | |
T Mv = 0; | |
int Score = m; | |
for (int i = 0; i < m; i++) { | |
Peq[s1[i]] |= ONE << i; | |
Pv |= (ONE << i); | |
} | |
for (int j = 0; j < n; j++) { | |
Eq = Peq[s2[j]]; | |
Xv = Eq | Mv; | |
Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq; | |
Ph = Mv | ~(Xh | Pv); | |
Mh = Pv & Xh; | |
if (Ph & (ONE << (m - 1))) Score++; | |
else if (Mh & (ONE << (m - 1))) Score--; | |
Ph <<= ONE; | |
Pv = (Mh << ONE) | ~(Xv | Ph); | |
Mv = Ph & Xv; | |
} | |
return Score; | |
} | |
// bit-parallel (bitset) | |
template<size_t N> | |
const bitset<N> operator+(const bitset<N>& lhs, const bitset<N>& rhs) | |
{ | |
bitset<N> a(lhs), b(rhs), ret(lhs ^ rhs); | |
for (b = (a & b) << 1, a = ret; b.any(); b = (a & b) << 1, a = ret) ret ^= b; | |
return ret; | |
} | |
template<size_t N> | |
int edit_distance_bitset(const string& str1, const string& str2) | |
{ | |
char s1[N] = { ' ' }; | |
char s2[N] = { ' ' }; | |
char *p1, *p2; | |
// required: str1.size() >= str2.size() | |
if (str1.size() >= str2.size()) { p1 = s1; p2 = s2; } | |
else { p1 = s2; p2 = s1; } | |
copy(str1.begin(), str1.end(), p1 + 1); | |
copy(str2.begin(), str2.end(), p2 + 1); | |
int m = strlen(s1); | |
int n = strlen(s2); | |
const bitset<N> ONE(1); | |
bitset<N> Peq[256]; | |
bitset<N> Pv, Mv, Eq, Xv, Xh, Ph, Mh; | |
int Score = m; | |
for (int i = 0; i < m; i++) { | |
Peq[s1[i]] |= ONE << i; | |
Pv |= (ONE << i); | |
} | |
for (int j = 0; j < n; j++) { | |
Eq = Peq[s2[j]]; | |
Xv = Eq | Mv; | |
Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq; | |
Ph = Mv | ~(Xh | Pv); | |
Mh = Pv & Xh; | |
if ((Ph & (ONE << (m - 1))).any()) Score++; | |
else if ((Mh & (ONE << (m - 1))).any()) Score--; | |
Ph <<= 1; | |
Pv = (Mh << 1) | ~(Xv | Ph); | |
Mv = Ph & Xv; | |
} | |
return Score; | |
} | |
// for test | |
void run(int (*func)(const string&, const string&), const string& arg1, const string& arg2, int num, const string& msg) | |
{ | |
clock_t start, finish; | |
start = clock(); | |
for (int i = 0; i < num - 1; i++) (*func)(arg1, arg2); | |
cout << msg << " : " << (*func)(arg1, arg2) << endl; | |
finish = clock(); | |
cout << "Time: " << (double)(finish - start) / CLOCKS_PER_SEC << "s (" << num << " times)" << endl; | |
cout << endl; | |
} | |
int main() | |
{ | |
string str1 = "agtcaaaagtcagtcagtcagtcagtcacagtcagaaggcatccaaccga"; | |
string str2 = "ccgttagtcagaaacagtcagtcagtcagtcagtccagtcttaggcccgga"; | |
cout << str1 << endl; | |
cout << str2 << endl; | |
run(edit_distance_dp, str1, str2, 100000, "dp "); | |
run(edit_distance_ond, str1, str2, 100000, "ond "); | |
run(edit_distance_onp, str1, str2, 100000, "onp "); | |
run(edit_distance_bit<unsigned long long>, str1, str2, 100000, "bit "); | |
run(edit_distance_bitset<60>, str1, str2, 100000, "bitset"); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
http://handasse.blogspot.com/2009/04/c_29.html