sorrge/decipher.cpp

## decipher.cpp
#include <iostream>
#include <fstream>
#include <codecvt>
#include <string>
#include <sstream>
#include <unordered_set>
#include <unordered_map>
#include <set>
#include <vector>
#include <algorithm>

using namespace std;

wstring read_file(const string& fn) {
    wifstream file(fn);
    std::wstringstream wss;
    wss << file.rdbuf();
    return wss.str();
}


typedef vector<int> str;

class alpha {
    unordered_map<wchar_t, int> charmap;
    wstring alphabet;
public:
    alpha(wstring& alphabet) : alphabet(alphabet) {
        for(int i = 0; i < (int)alphabet.size(); ++i)
            charmap[alphabet[i]] = i;
    }

    str to_num(wstring& s) {
        str res(s.size());
        for(int i = 0; i < (int)s.size(); ++i)
            res[i] = charmap[s[i]];

        return res;
    }

    wstring to_str(str& v) {
        wstring res(v.size(), ' ');
        for(int i = 0; i < (int)v.size(); ++i)
            res[i] = alphabet[v[i]];

        return res;
    }

    int size() {
        return (int)alphabet.size();
    }
};

class bigram_model {
    vector<str> counts;

public:
    bigram_model(str& corpus, int size) : counts(size) {
        for(int i = 0; i < size; ++i)
            counts[i].resize(size, 0);

        for(int i = 0; i < (int)corpus.size() - 1; ++i)
            ++counts[corpus[i]][corpus[i + 1]];
    }

    int score(str& s) {
        int c = 0;
        for(int i = 0; i < (int)s.size() - 1; ++i)
            c += counts[s[i]][s[i + 1]];

        return c;
    }
};


class trigram_model {
    vector<vector<str>> counts;

public:
    trigram_model(str& corpus, int size) : counts(size) {
        for(int i = 0; i < size; ++i) {
            counts[i].resize(size);
            for(int j = 0; j < size; ++j)
                counts[i][j].resize(size, 0);
        }

        for(int i = 0; i < (int)corpus.size() - 2; ++i)
            ++counts[corpus[i]][corpus[i + 1]][corpus[i + 2]];
    }

    int score(str& s) {
        int c = 0;
        for(int i = 0; i < (int)s.size() - 2; ++i)
            c += counts[s[i]][s[i + 1]][s[i + 2]];

        return c;
    }
};


void decode_shifts(str& s, int shift, int size) {
    for(int i = 0; i < (int)s.size(); ++i)
        s[i] = ((s[i] - shift * i) % size + size) % size;
}


void substitute(str& s, str& subs) {
    for(int& c : s)
        c = subs[c];
}


void decode_1(str& s, str& subs, int shift, int size) {
    substitute(s, subs);
    decode_shifts(s, shift, size);
}


void decode_2(str& s, pair<str, str>& subs2, int shift, int size) {
    substitute(s, subs2.first);
    decode_shifts(s, shift, size);
    substitute(s, subs2.second);
}


void mutate_subs(str& subs) {
    int n = rand() % 2 + 1;
    for(int i = 0; i < n; ++i) {
        int idx1 = rand() % subs.size();
        int idx2 = (idx1 + 1 + rand() % (subs.size() - 1)) % subs.size();
        swap(subs[idx1], subs[idx2]);
    }
}


str used;
void crossover_subs(str& child, str& parent1, str parent2) {
    int size = (int)parent1.size();
    used.resize(size);
    fill_n(used.begin(), size, 0);
    for(int i = 0; i < size; ++i) {
        int chosen = rand() % 2 == 0 ? parent1[i] : parent2[i];
        ++used[chosen];
        child[i] = chosen;
    }

    auto unused = find(used.begin(), used.end(), 0);
    for(int i = 0; i < size && unused != used.end(); ++i)
        if (used[child[i]] > 1) {
            --used[child[i]];
            int c = (int)distance(used.begin(), unused);
            child[i] = c;
            ++used[c];
            unused = find(unused + 1, used.end(), 0);
        }
}


int main(int, char *[]) {
    std::locale::global (std::locale (""));
    wstring cipher = read_file("../cipher");
    wcout << cipher << endl;
    wcout << "cipher size: " << cipher.size() << endl;

    set<wchar_t> used_chars(cipher.begin(), cipher.end());
    wstring alphabet(used_chars.begin(), used_chars.end());
    wcout << "alphabet: " << alphabet << "\t" << alphabet.size() << " chars" << endl;

    wstring wp = read_file("../wp_pp.txt");
    wcout << "corpus: " << wp.size() << endl;

    alpha al(alphabet);
    auto ciph = al.to_num(cipher);
    auto corp = al.to_num(wp);
    bigram_model bg(corp, al.size());
    trigram_model tg(corp, al.size());
    int best_score = bg.score(ciph);
    wcout << "Init score: " << best_score << endl;

    int pop_size = 10000, elite = 500, select = 2000;

    typedef pair<str, str> org;

    org seed = make_pair(al.to_num(alphabet), al.to_num(alphabet));

    vector<org> population(pop_size, seed), new_population(population);
    str scores(pop_size);
    str order(pop_size);
    for(int i = 0; i < pop_size; ++i) {
        mutate_subs(population[i].first);
        mutate_subs(population[i].second);
        order[i] = i;
    }

    int key2 = 8 * 3;

    for(int gen = 0;; ++gen) {
#pragma omp parallel for
        for(int i = 0; i < pop_size; ++i) {
            str cc = ciph;
            decode_2(cc, population[i], key2, al.size());
            scores[i] = bg.score(cc);
        }

        sort(order.begin(), order.end(), [&scores](int i1, int i2) { return scores[i1] > scores[i2]; });
        if (scores[order[0]] > best_score) {
            best_score = scores[order[0]];
            wcout << endl;
            wcout << "New best: " << best_score << endl;
            str cc = ciph;
            decode_2(cc, population[order[0]], key2, al.size());
            wcout << al.to_str(cc) << endl << endl << endl;
        }

        for(int i = 0; i < elite; ++i)
            new_population[i] = population[order[i]];

        for(int i = elite; i < pop_size / 2; ++i) {
            new_population[i] = population[order[rand() % select]];
            mutate_subs(new_population[i].first);
            mutate_subs(new_population[i].second);
        }

        for(int i = pop_size / 2; i < pop_size; ++i) {
            int p1 = order[rand() % select], p2 = order[rand() % select];
            crossover_subs(new_population[i].first, population[p1].first, population[p2].first);
            crossover_subs(new_population[i].second, population[p1].second, population[p2].second);
        }

        population.swap(new_population);
        wcout << ".";
        if (gen % 100 == 0)
            wcout << gen;

        wcout.flush();
    }

    return 0;
}
	#include <iostream>
	#include <fstream>
	#include <codecvt>
	#include <string>
	#include <sstream>
	#include <unordered_set>
	#include <unordered_map>
	#include <set>
	#include <vector>
	#include <algorithm>

	using namespace std;

	wstring read_file(const string& fn) {
	wifstream file(fn);
	std::wstringstream wss;
	wss << file.rdbuf();
	return wss.str();
	}


	typedef vector<int> str;

	class alpha {
	unordered_map<wchar_t, int> charmap;
	wstring alphabet;
	public:
	alpha(wstring& alphabet) : alphabet(alphabet) {
	for(int i = 0; i < (int)alphabet.size(); ++i)
	charmap[alphabet[i]] = i;
	}

	str to_num(wstring& s) {
	str res(s.size());
	for(int i = 0; i < (int)s.size(); ++i)
	res[i] = charmap[s[i]];

	return res;
	}

	wstring to_str(str& v) {
	wstring res(v.size(), ' ');
	for(int i = 0; i < (int)v.size(); ++i)
	res[i] = alphabet[v[i]];

	return res;
	}

	int size() {
	return (int)alphabet.size();
	}
	};

	class bigram_model {
	vector<str> counts;

	public:
	bigram_model(str& corpus, int size) : counts(size) {
	for(int i = 0; i < size; ++i)
	counts[i].resize(size, 0);

	for(int i = 0; i < (int)corpus.size() - 1; ++i)
	++counts[corpus[i]][corpus[i + 1]];
	}

	int score(str& s) {
	int c = 0;
	for(int i = 0; i < (int)s.size() - 1; ++i)
	c += counts[s[i]][s[i + 1]];

	return c;
	}
	};


	class trigram_model {
	vector<vector<str>> counts;

	public:
	trigram_model(str& corpus, int size) : counts(size) {
	for(int i = 0; i < size; ++i) {
	counts[i].resize(size);
	for(int j = 0; j < size; ++j)
	counts[i][j].resize(size, 0);
	}

	for(int i = 0; i < (int)corpus.size() - 2; ++i)
	++counts[corpus[i]][corpus[i + 1]][corpus[i + 2]];
	}

	int score(str& s) {
	int c = 0;
	for(int i = 0; i < (int)s.size() - 2; ++i)
	c += counts[s[i]][s[i + 1]][s[i + 2]];

	return c;
	}
	};


	void decode_shifts(str& s, int shift, int size) {
	for(int i = 0; i < (int)s.size(); ++i)
	s[i] = ((s[i] - shift * i) % size + size) % size;
	}


	void substitute(str& s, str& subs) {
	for(int& c : s)
	c = subs[c];
	}


	void decode_1(str& s, str& subs, int shift, int size) {
	substitute(s, subs);
	decode_shifts(s, shift, size);
	}


	void decode_2(str& s, pair<str, str>& subs2, int shift, int size) {
	substitute(s, subs2.first);
	decode_shifts(s, shift, size);
	substitute(s, subs2.second);
	}


	void mutate_subs(str& subs) {
	int n = rand() % 2 + 1;
	for(int i = 0; i < n; ++i) {
	int idx1 = rand() % subs.size();
	int idx2 = (idx1 + 1 + rand() % (subs.size() - 1)) % subs.size();
	swap(subs[idx1], subs[idx2]);
	}
	}


	str used;
	void crossover_subs(str& child, str& parent1, str parent2) {
	int size = (int)parent1.size();
	used.resize(size);
	fill_n(used.begin(), size, 0);
	for(int i = 0; i < size; ++i) {
	int chosen = rand() % 2 == 0 ? parent1[i] : parent2[i];
	++used[chosen];
	child[i] = chosen;
	}

	auto unused = find(used.begin(), used.end(), 0);
	for(int i = 0; i < size && unused != used.end(); ++i)
	if (used[child[i]] > 1) {
	--used[child[i]];
	int c = (int)distance(used.begin(), unused);
	child[i] = c;
	++used[c];
	unused = find(unused + 1, used.end(), 0);
	}
	}


	int main(int, char *[]) {
	std::locale::global (std::locale (""));
	wstring cipher = read_file("../cipher");
	wcout << cipher << endl;
	wcout << "cipher size: " << cipher.size() << endl;

	set<wchar_t> used_chars(cipher.begin(), cipher.end());
	wstring alphabet(used_chars.begin(), used_chars.end());
	wcout << "alphabet: " << alphabet << "\t" << alphabet.size() << " chars" << endl;

	wstring wp = read_file("../wp_pp.txt");
	wcout << "corpus: " << wp.size() << endl;

	alpha al(alphabet);
	auto ciph = al.to_num(cipher);
	auto corp = al.to_num(wp);
	bigram_model bg(corp, al.size());
	trigram_model tg(corp, al.size());
	int best_score = bg.score(ciph);
	wcout << "Init score: " << best_score << endl;

	int pop_size = 10000, elite = 500, select = 2000;

	typedef pair<str, str> org;

	org seed = make_pair(al.to_num(alphabet), al.to_num(alphabet));

	vector<org> population(pop_size, seed), new_population(population);
	str scores(pop_size);
	str order(pop_size);
	for(int i = 0; i < pop_size; ++i) {
	mutate_subs(population[i].first);
	mutate_subs(population[i].second);
	order[i] = i;
	}

	int key2 = 8 * 3;

	for(int gen = 0;; ++gen) {
	#pragma omp parallel for
	for(int i = 0; i < pop_size; ++i) {
	str cc = ciph;
	decode_2(cc, population[i], key2, al.size());
	scores[i] = bg.score(cc);
	}

	sort(order.begin(), order.end(), [&scores](int i1, int i2) { return scores[i1] > scores[i2]; });
	if (scores[order[0]] > best_score) {
	best_score = scores[order[0]];
	wcout << endl;
	wcout << "New best: " << best_score << endl;
	str cc = ciph;
	decode_2(cc, population[order[0]], key2, al.size());
	wcout << al.to_str(cc) << endl << endl << endl;
	}

	for(int i = 0; i < elite; ++i)
	new_population[i] = population[order[i]];

	for(int i = elite; i < pop_size / 2; ++i) {
	new_population[i] = population[order[rand() % select]];
	mutate_subs(new_population[i].first);
	mutate_subs(new_population[i].second);
	}

	for(int i = pop_size / 2; i < pop_size; ++i) {
	int p1 = order[rand() % select], p2 = order[rand() % select];
	crossover_subs(new_population[i].first, population[p1].first, population[p2].first);
	crossover_subs(new_population[i].second, population[p1].second, population[p2].second);
	}

	population.swap(new_population);
	wcout << ".";
	if (gen % 100 == 0)
	wcout << gen;

	wcout.flush();
	}

	return 0;
	}