Skip to content

Instantly share code, notes, and snippets.

@johannestaas
Last active July 18, 2017 15:59
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save johannestaas/aca467f3b56dc93814f2d2549ba99866 to your computer and use it in GitHub Desktop.
Save johannestaas/aca467f3b56dc93814f2d2549ba99866 to your computer and use it in GitHub Desktop.
Namerator - Name generator PoC based on text patterns
'''
namerator
=========
Name generator that builds names one character at a time, based on the frequencies of letter patterns in a list of example names.
Usage:
$ python namerator.py -n 5 elf_names.txt
egil-gonamilin
taedhorilie
lalorahinduil
cirduiel
amrothrie
License: GPLv3+
See https://www.gnu.org/licenses/gpl-3.0.en.html
'''
from math import sqrt
from random import gauss, seed, choice
from collections import Counter
def load(path, encoding='utf-8'):
    '''
    Read the training names from a text file, one name per line.

    Each line is stripped of surrounding whitespace and blank lines are
    dropped. The file is decoded with ``encoding`` (UTF-8 by default) rather
    than the platform default, so names containing accented letters are read
    the same way on every system.

    Returns a list of the non-blank, stripped lines in file order.
    '''
    with open(path, encoding=encoding) as f:
        # Strip each line once, then filter on the stripped value.
        stripped = (line.strip() for line in f)
        return [name for name in stripped if name]
def chop_name(name):
    '''
    Turn a name into the padded character sequence used for training.

    Only the first whitespace-separated word of the lower-cased name is kept.
    The result is four empty-string pads, then the word's characters, then
    the end marker '$'.
    '''
    first_word = name.lower().split()[0]
    padding = [''] * 4
    return padding + list(first_word) + ['$']
def inc_frequencies(chop, freqs):
    '''
    Add one chopped name to the frequency tables in ``freqs`` (in place).

    ``chop`` is a list of single-character strings preceded by four ''
    pads and terminated by '$'. After the call:

    * ``freqs[0]`` is a Counter of every character seen.
    * ``freqs[k]`` for k in 1..4 maps the k preceding characters (joined
      into one string) to a Counter of the character that followed them.

    Missing tables are created on demand. Returns ``freqs``.
    '''
    freqs.setdefault(0, Counter())
    for order in range(1, 5):
        freqs.setdefault(order, {})
    # Start after the four pads. Starting one position earlier would make
    # chop[i - 4] wrap around to the trailing '$' and would count the pad
    # itself ('') as if it were a real character of the name.
    for i in range(4, len(chop)):
        char = chop[i]
        freqs[0][char] += 1
        for order in range(1, 5):
            context = ''.join(chop[i - order:i])
            freqs[order].setdefault(context, Counter())[char] += 1
    return freqs
def calc_frequencies(names):
    '''
    Build the complete frequency model from an iterable of names.

    The returned dict holds a Counter of name lengths under the key 'len',
    plus the character tables that inc_frequencies adds for each name.
    '''
    length_counts = Counter()
    freqs = {'len': length_counts}
    for entry in names:
        length_counts[len(entry)] += 1
        inc_frequencies(chop_name(entry), freqs)
    return freqs
def combined_frequencies(freqs, last1, last2, last3, last4):
    '''
    Merge the next-character counts of the four context lengths.

    The counts for the 1-character context are taken as they are; those for
    the 2-, 3- and 4-character contexts are weighted by 5, 10 and 25, so
    longer matches dominate. A context that is missing from the 2/3/4 tables
    contributes nothing, while ``last1`` must be present in ``freqs[1]``.

    Returns a list of (character, weighted count) pairs, highest count first.
    '''
    merged = Counter()
    merged.update(freqs[1][last1])
    weighted_lookups = (
        (5, freqs[2], last2),
        (10, freqs[3], last3),
        (25, freqs[4], last4),
    )
    for weight, table, context in weighted_lookups:
        for char, count in table.get(context, {}).items():
            merged[char] += count * weight
    return merged.most_common()
def choose(common, over=0, letter_freqs=None):
    '''
    Pick the next character at random, weighted by its count.

    ``common`` is a sequence of (character, count) pairs. ``over`` is how far
    the name already is past its target length:

    * while ``over`` is negative the end marker '$' is never offered;
    * once ``over`` is zero or more, extra '$' entries are added, scaled by
      ``over + 1``, so ending becomes more likely the longer the name gets.

    If nothing can be offered, the pick is retried with ``letter_freqs``
    in place of ``common``.
    '''
    may_end = over >= 0
    pool = []
    for symbol, weight in common:
        if symbol == '$' and not may_end:
            continue
        pool.extend([symbol] * weight)
    if may_end:
        end_weight = dict(common).get('$', 1) * (over + 1)
        pool.extend(['$'] * end_weight)
    if pool:
        return choice(pool)
    return choose(letter_freqs, over=over)
def calc_gauss(lens):
    '''
    Estimate the mean and sample standard deviation of the name lengths.

    ``lens`` is a sequence of (length, count) pairs. Returns a tuple
    ``(mean, sigma)`` where sigma uses the n - 1 (sample) denominator.

    With a single sample there is no spread to estimate, so sigma is 0.0
    instead of dividing by zero. Raises ValueError when there are no samples
    at all.
    '''
    expanded = [length for length, num in lens for _ in range(num)]
    if not expanded:
        raise ValueError('cannot estimate name lengths without any names')
    count = len(expanded)
    mean = sum(expanded) / count
    if count == 1:
        # n - 1 would be zero; a lone sample has no measurable deviation.
        return mean, 0.0
    variance = sum((mean - x) ** 2 for x in expanded) / (count - 1)
    return mean, sqrt(variance)
def generate(freqs):
    '''
    Generate one name from the frequency model ``freqs``.

    A target length is drawn from a gaussian fitted to the training name
    lengths and is never shorter than the shortest training name. Characters
    are then picked one at a time, each conditioned on the up-to-four
    characters picked before it, until the end marker '$' is chosen.
    '''
    length_counts = freqs['len']
    shortest = min(length_counts)
    mean, sigma = calc_gauss(length_counts.most_common())
    target_len = int(max(gauss(mean, sigma), shortest))
    fallback = freqs[0].most_common()

    name = ''
    ctx1 = ctx2 = ctx3 = ctx4 = ''
    while True:
        candidates = combined_frequencies(freqs, ctx1, ctx2, ctx3, ctx4)
        picked = choose(candidates, over=len(name) - target_len,
                        letter_freqs=fallback)
        if picked == '$':
            return name
        name += picked
        # Shift every context window by one pick; the right-hand side is
        # evaluated in full before any of the contexts is rebound.
        ctx4, ctx3, ctx2, ctx1 = ctx3 + picked, ctx2 + picked, ctx1 + picked, picked
def main():
    '''
    Command-line entry point.

    Loads the names file given as ``path``, builds the frequency model,
    optionally dumps the model as JSON (``--freq-output``/``-f``) and prints
    ``--num``/``-n`` generated names (10 by default) that do not already
    appear in the input.
    '''
    import argparse
    import json

    parser = argparse.ArgumentParser()
    parser.add_argument('path')
    parser.add_argument('--num', '-n', type=int, default=10)
    parser.add_argument('--freq-output', '-f')
    args = parser.parse_args()

    names = load(args.path)
    freqs = calc_frequencies(names)

    if args.freq_output:
        dump = {}
        dump['len'] = freqs['len'].most_common()
        dump['frequency'] = freqs[0].most_common()
        # One JSON section per context length (1 to 4 preceding characters).
        for order, label in enumerate(('first', 'second', 'third', 'fourth'), 1):
            dump[label] = {k: v.most_common() for k, v in freqs[order].items()}
        with open(args.freq_output, 'w') as f:
            json.dump(dump, f, indent=4)
        print('Dumped frequencies to {args.freq_output}'.format(args=args))

    # Generated names are always the lower-cased first word of what the
    # model was trained on, so normalise the input the same way; otherwise
    # capitalised or multi-word input lines would never be recognised as
    # duplicates. load() already dropped blank lines, so split()[0] is safe.
    known = {name.lower().split()[0] for name in names}
    ct = 0
    while ct < args.num:
        name = generate(freqs)
        if name in known:
            # Re-seed the random generator before retrying.
            seed()
            continue
        print(name)
        ct += 1


if __name__ == '__main__':
    main()
azaghâl
balin
bifur
bofur
bombur
borin
dáin
dís
dori
durin
dwalin
farin
fíli
flói
frár
frerin
frór
fundin
gamil
gimli
glóin
gróin
grór
ibûn
khîm
kíli
lóni
mîm
náin
náli
nár
narvi
nori
óin
ori
telchar
thorin
thráin
thrór
aegnor
amarië
amdír
amras
amrod
amroth
anairë
angrod
annael
aranwë
aredhel
argon
arminas
beleg
caranthir
celeborn
celebrían
celebrimbor
celegorm
círdan
curufin
daeron
denethor
eärwen
ecthelion
edrahil
egalmoth
eldalótë
elemmakil
elemmírë
elenwë
elmo
enel
enerdhil
eöl
erestor
fëanor
finarfin
findis
finduilas
fingolfin
fingon
finrod
finwë
galadhon
galadriel
galathil
galdor
galion
gelmir
gelmir
gildor
gil-galad
glorfindel
glorfindel
guilin
gwindor
haldir
idril
imin
indis
ingwë
ingwion
irimë
legolas
lenwë
lúthien
mablung
maedhros
maeglin
maglor
mahtan
míriel
mithrellas
nellas
nerdanel
nimloth
olwë
orodreth
oropher
orophin
pengolodh
rúmil
rúmil
tata
thingol
thranduil
turgon
voronwë
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment