Created
April 6, 2020 12:50
-
-
Save tibordp/b993f3c85f8bd5a74cafc1b0c0e9f589 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
words.h - random fake word generator
Coded by Tibor Djurica Potpara <tibor.djurica@ojdip.net>, 2012
http://www.ojdip.net
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <iostream> | |
#include <fstream> | |
#include <string> | |
#include <map> | |
#include <random> | |
#include <stdexcept> | |
#include <functional> | |
#include <cstdint> | |
#include <ctime> | |
#include <set> | |
#include <utf8.h> | |
// Utility functions to convert between UTF-8 and UTF-16
const std::wstring utf8_c(const std::string& input) | |
{ | |
std::wstring result; | |
utf8::utf8to16(input.begin(), input.end(), std::back_inserter(result)); | |
return result; | |
} | |
const std::string utf8_c(const std::wstring& input) | |
{ | |
std::string result; | |
utf8::utf16to8(input.begin(), input.end(), std::back_inserter(result)); | |
return result; | |
} | |
// A random number generator singleton - to ensure it is seeded only once.
//
// The single instance lives in a function-local static inside instance(), so
// the Mersenne Twister engine is constructed and seeded (from the current
// time) on first use only. All access goes through the static random().
class rng
{
private:
    std::mt19937 rng_engine;

    // Returns the one shared instance, creating and seeding it on first call.
    static rng& instance()
    {
        static rng instance_;
        return instance_;
    }

    // Seeds the engine from the wall clock.
    rng()
    {
        rng_engine.seed(static_cast<unsigned long>(std::time(nullptr)));
    }

public:
    // The singleton must never be copied.
    rng(rng const&) = delete;
    void operator=(rng const&) = delete;

    // Returns a uniformly distributed integer in the closed range [low, high].
    // Throws std::invalid_argument when low > high, because
    // std::uniform_int_distribution has undefined behaviour for such a range.
    static long random(long low, long high)
    {
        if (low > high)
            throw std::invalid_argument("rng::random: low must not exceed high");
        return std::uniform_int_distribution<long>(low, high)(instance().rng_engine);
    }
};
// Function that returns a next character randomly, weighted by a frequency map | |
wchar_t next_character(std::map<wchar_t, long> &old_map) | |
{ | |
int count = 0; | |
for (auto &a : old_map) | |
count += a.second; | |
int choice = rng::random(0, count - 1); | |
int left = 0; | |
for (auto &a : old_map) | |
{ | |
left += a.second; | |
if (left > choice) | |
return a.first; | |
} | |
return L''; | |
} | |
int main(int argc, char** argv) | |
{ | |
std::string infile, outfile; | |
try | |
{ | |
if (argc != 3) | |
throw std::runtime_error("Invalid commandline parameters!"); | |
// We generate a frequency map for strings of n words. | |
int n_chars = std::atoi(argv[1]); | |
std::ifstream dictionary(argv[2]); | |
if (!dictionary) | |
throw std::runtime_error("Cannot open file!"); | |
std::map<std::wstring, | |
std::map<wchar_t, long>> freq; | |
std::set<std::wstring> originals; | |
// Dictionary must be a newline-delimited wordlist with no additional data and in UTF-8 encoding | |
while (!dictionary.eof()) | |
{ | |
std::string line_utf8; | |
std::getline(dictionary, line_utf8); | |
std::wstring line(utf8_c(line_utf8)); | |
std::wstring last = L""; | |
auto add = [&] (wchar_t cur) { | |
auto iter = freq[last].find(cur); | |
if (iter == freq[last].end()) | |
freq[last][cur] = 1; | |
else | |
iter->second ++; | |
if (last.size() == 5) | |
last.erase(last.begin()); | |
last.push_back(cur); | |
}; | |
for (auto& r : line) | |
add(r); | |
add(L''); | |
originals.insert(line); | |
} | |
std::set<std::wstring> results; | |
// We will generate at most 1,000,000 words | |
for (int i=0; i<1000000; i++) | |
{ | |
std::wstring last = L""; | |
std::wstring result; | |
for (;;) | |
{ | |
auto new_string = next_character(freq[last]); | |
if (new_string == L'') break; | |
if (last.size() == 5) | |
last.erase(last.begin()); | |
last.push_back(new_string); | |
result += new_string; | |
} | |
// Algorithm tends to produce words of excessive length, so we drop anything longer then 12 letters | |
if (result.length() < 12) | |
// We want to remove words actually present in the real language | |
if (originals.find(result) == originals.end()) | |
results.insert(result); | |
} | |
// Let's print them out... | |
for (auto &l : results) | |
{ | |
std::cout << utf8_c(l) << std::endl; | |
} | |
} catch (std::exception& e) | |
{ | |
std::cerr << e.what(); | |
return 1; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment