Skip to content

Instantly share code, notes, and snippets.

@tibordp

tibordp/words.cpp

Created Apr 6, 2020
Embed
What would you like to do?
/*
words.cpp - random fake word generator
Coded by Tibor Djurica Potpara <tibor.djurica@ojdip.net>, 2012
http://www.ojdip.net
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <random>
#include <stdexcept>
#include <functional>
#include <cstdint>
#include <ctime>
#include <set>
#include <utf8.h>
// Utility functions to convert between UTF-8 and UTF-16
// Converts a UTF-8 encoded byte string into a wide string by feeding it
// through utf8::utf8to16 (from utf8.h). Each UTF-16 code unit produced is
// appended to the result as one wchar_t. Behaviour on malformed input is
// whatever utf8::utf8to16 does.
const std::wstring utf8_c(const std::string& input)
{
    std::wstring wide;
    auto sink = std::back_inserter(wide);
    utf8::utf8to16(std::begin(input), std::end(input), sink);
    return wide;
}
// Converts a wide string, treated as a sequence of UTF-16 code units, into a
// UTF-8 encoded byte string by feeding it through utf8::utf16to8 (from
// utf8.h). Behaviour on malformed input is whatever utf8::utf16to8 does.
const std::string utf8_c(const std::wstring& input)
{
    std::string narrow;
    auto sink = std::back_inserter(narrow);
    utf8::utf16to8(std::begin(input), std::end(input), sink);
    return narrow;
}
// A random number generator singleton - to ensure it is seeded only once.
//
// The single std::mt19937 engine lives in a function-local static and is
// seeded from the wall-clock time (one-second resolution) the first time
// random() is called. The class cannot be instantiated or copied from
// outside; use the static random() member.
//
// NOTE: random() mutates the shared engine without any locking.
class rng
{
private:
    std::mt19937 rng_engine;

    // Lazily constructs the one shared instance on first use.
    static rng& instance()
    {
        static rng instance_;
        return instance_;
    }

    // Seeds the engine once, from the current time.
    rng() : rng_engine(static_cast<unsigned long>(std::time(nullptr))) {}

public:
    // Non-copyable: there must only ever be one engine.
    rng(rng const&) = delete;
    void operator=(rng const&) = delete;

    // Returns a uniformly distributed integer in the closed range [low, high].
    //
    // Throws std::invalid_argument if low > high, because
    // std::uniform_int_distribution has undefined behaviour for such a range.
    static long random(long low, long high)
    {
        if (low > high)
            throw std::invalid_argument("rng::random: low must not exceed high");
        return std::uniform_int_distribution<long>(low, high)(instance().rng_engine);
    }
};
// Returns a randomly chosen key of old_map, where each key's chance of being
// picked is proportional to its mapped count.
//
// old_map - frequency table: character -> number of occurrences.
//
// Returns L'\0' when the table is empty or its counts do not sum to a
// positive number, since there is nothing to choose from in that case.
wchar_t next_character(std::map<wchar_t, long> &old_map)
{
    // Counts are stored as long, so accumulate in long to avoid narrowing.
    long total = 0;
    for (const auto &entry : old_map)
        total += entry.second;

    // Guard: asking the generator for a value in [0, -1] would be invalid.
    if (total <= 0)
        return L'\0';

    const long choice = rng::random(0, total - 1);

    // Walk the cumulative distribution until it passes the chosen point.
    long running = 0;
    for (const auto &entry : old_map)
    {
        running += entry.second;
        if (running > choice)
            return entry.first;
    }
    return L'\0';
}
// Program entry point.
//
// Usage: <program> <n_chars> <dictionary>
//   n_chars    - how many preceding characters are used as context when
//                choosing the next character (must be at least 1)
//   dictionary - newline-delimited word list with no additional data, in
//                UTF-8 encoding
//
// For every context of up to n_chars characters seen in the dictionary, the
// program counts which characters follow it. It then makes 1,000,000 attempts
// at generating a random word from those counts and prints, one per line in
// UTF-8, the distinct words that are shorter than 12 characters and do not
// appear in the dictionary.
//
// Returns 0 on success; on any std::exception prints its message to stderr
// and returns 1.
int main(int argc, char** argv)
{
    // Marks "the word ends here" in the frequency tables.
    const wchar_t end_of_word = L'\0';
    // The algorithm tends to produce words of excessive length, so anything
    // of this length or longer is dropped.
    const std::wstring::size_type max_length = 12;
    // We will make at most 1,000,000 generation attempts.
    const int attempts = 1000000;

    try
    {
        if (argc != 3)
            throw std::runtime_error("Invalid commandline parameters!");

        // Parse the context length; anything that is not a positive integer
        // is rejected with the same message as a wrong argument count.
        int n_chars = 0;
        try
        {
            n_chars = std::stoi(argv[1]);
        }
        catch (const std::logic_error&)
        {
            n_chars = 0;
        }
        if (n_chars < 1)
            throw std::runtime_error("Invalid commandline parameters!");
        const std::wstring::size_type context_length =
            static_cast<std::wstring::size_type>(n_chars);

        std::ifstream dictionary(argv[2]);
        if (!dictionary)
            throw std::runtime_error("Cannot open file!");

        // context -> (following character -> number of occurrences)
        std::map<std::wstring, std::map<wchar_t, long>> freq;
        std::set<std::wstring> originals;

        // Slides the context window forward by one character, discarding the
        // oldest character once the window is full.
        auto advance = [context_length](std::wstring& context, wchar_t next) {
            if (context.size() == context_length)
                context.erase(context.begin());
            context.push_back(next);
        };

        // Reading with getline as the loop condition stops cleanly at end of
        // file or on a stream error, without processing a phantom last line.
        std::string line_utf8;
        while (std::getline(dictionary, line_utf8))
        {
            // Tolerate CRLF line endings.
            if (!line_utf8.empty() && line_utf8.back() == '\r')
                line_utf8.pop_back();
            if (line_utf8.empty())
                continue;

            const std::wstring line(utf8_c(line_utf8));
            std::wstring context;
            for (wchar_t cur : line)
            {
                // operator[] value-initialises a missing count to 0.
                ++freq[context][cur];
                advance(context, cur);
            }
            ++freq[context][end_of_word];
            originals.insert(line);
        }

        // No words were read, so there is nothing to generate from.
        if (freq.empty())
            return 0;

        std::set<std::wstring> results;
        for (int i = 0; i < attempts; ++i)
        {
            std::wstring context;
            std::wstring result;
            // Stop as soon as the word is too long to be kept; this also
            // bounds the loop when the tables contain cycles.
            while (result.size() < max_length)
            {
                // Every context reachable here was recorded while reading the
                // dictionary, so at() finds it and never inserts.
                const wchar_t next = next_character(freq.at(context));
                if (next == end_of_word)
                    break;
                advance(context, next);
                result += next;
            }
            // We want to remove words actually present in the real language.
            if (result.size() < max_length && originals.find(result) == originals.end())
                results.insert(result);
        }

        // Let's print them out. '\n' instead of std::endl avoids flushing the
        // stream after every single word.
        for (const auto& word : results)
            std::cout << utf8_c(word) << '\n';
    }
    catch (const std::exception& e)
    {
        std::cerr << e.what() << '\n';
        return 1;
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.