/*
words.h - random fake word generator
Coded by Tibor Djurica Potpara <tibor.djurica@ojdip.net>, 2012
http://www.ojdip.net
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <functional>
#include <iostream>
#include <iterator>
#include <map>
#include <random>
#include <set>
#include <stdexcept>
#include <string>

#include <utf8.h>
// Utility functions to convert between UTF-8 and UTF-16 | |
const std::wstring utf8_c(const std::string& input) | |
{ | |
std::wstring result; | |
utf8::utf8to16(input.begin(), input.end(), std::back_inserter(result)); | |
return result; | |
} | |
const std::string utf8_c(const std::wstring& input) | |
{ | |
std::string result; | |
utf8::utf16to8(input.begin(), input.end(), std::back_inserter(result)); | |
return result; | |
} | |
// A random number generator singleton - to ensure it is seeded only once
class rng
{
private:
    std::mt19937 rng_engine;

    // Seeds the engine from the wall clock. std::time has one-second
    // resolution, so two runs started within the same second will produce
    // the same sequence.
    rng()
    {
        rng_engine.seed(static_cast<unsigned long>(std::time(nullptr)));
    }

    // Returns the single shared instance, constructing (and seeding) it on
    // first use.
    static rng& instance()
    {
        static rng instance_;
        return instance_;
    }

public:
    // The singleton must never be copied or assigned.
    rng(rng const&) = delete;
    void operator=(rng const&) = delete;

    // Returns a uniformly distributed integer in the closed range
    // [low, high].
    //
    // Throws std::invalid_argument when low > high, because
    // std::uniform_int_distribution has undefined behaviour for such a range.
    static long random(long low, long high)
    {
        if (low > high)
            throw std::invalid_argument("rng::random: low must not exceed high");
        return std::uniform_int_distribution<long>(low, high)(instance().rng_engine);
    }
};
// Function that returns a next character randomly, weighted by a frequency map | |
wchar_t next_character(std::map<wchar_t, long> &old_map) | |
{ | |
int count = 0; | |
for (auto &a : old_map) | |
count += a.second; | |
int choice = rng::random(0, count - 1); | |
int left = 0; | |
for (auto &a : old_map) | |
{ | |
left += a.second; | |
if (left > choice) | |
return a.first; | |
} | |
return L''; | |
} | |
int main(int argc, char** argv) | |
{ | |
std::string infile, outfile; | |
try | |
{ | |
if (argc != 3) | |
throw std::runtime_error("Invalid commandline parameters!"); | |
// We generate a frequency map for strings of n words. | |
int n_chars = std::atoi(argv[1]); | |
std::ifstream dictionary(argv[2]); | |
if (!dictionary) | |
throw std::runtime_error("Cannot open file!"); | |
std::map<std::wstring, | |
std::map<wchar_t, long>> freq; | |
std::set<std::wstring> originals; | |
// Dictionary must be a newline-delimited wordlist with no additional data and in UTF-8 encoding | |
while (!dictionary.eof()) | |
{ | |
std::string line_utf8; | |
std::getline(dictionary, line_utf8); | |
std::wstring line(utf8_c(line_utf8)); | |
std::wstring last = L""; | |
auto add = [&] (wchar_t cur) { | |
auto iter = freq[last].find(cur); | |
if (iter == freq[last].end()) | |
freq[last][cur] = 1; | |
else | |
iter->second ++; | |
if (last.size() == 5) | |
last.erase(last.begin()); | |
last.push_back(cur); | |
}; | |
for (auto& r : line) | |
add(r); | |
add(L''); | |
originals.insert(line); | |
} | |
std::set<std::wstring> results; | |
// We will generate at most 1,000,000 words | |
for (int i=0; i<1000000; i++) | |
{ | |
std::wstring last = L""; | |
std::wstring result; | |
for (;;) | |
{ | |
auto new_string = next_character(freq[last]); | |
if (new_string == L'') break; | |
if (last.size() == 5) | |
last.erase(last.begin()); | |
last.push_back(new_string); | |
result += new_string; | |
} | |
// Algorithm tends to produce words of excessive length, so we drop anything longer then 12 letters | |
if (result.length() < 12) | |
// We want to remove words actually present in the real language | |
if (originals.find(result) == originals.end()) | |
results.insert(result); | |
} | |
// Let's print them out... | |
for (auto &l : results) | |
{ | |
std::cout << utf8_c(l) << std::endl; | |
} | |
} catch (std::exception& e) | |
{ | |
std::cerr << e.what(); | |
return 1; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment