Last active
August 14, 2017 20:17
-
-
Save dloscutoff/d4d79df2ab71411d99d6d646e8f45ee6 to your computer and use it in GitHub Desktop.
A Markov-based country name generator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
import re, random | |
# A regex that matches a syllable, with three groups for the three
# segments of the syllable: onset (initial consonants), nucleus (vowels),
# and coda (final consonants).
# The regex also matches if there is just an onset (even an empty
# onset); this case corresponds to the final partial syllable of the
# stem, which is usually the consonant before a vowel ending (for
# example, the d in "ca-na-d a").
# "y" is treated as a vowel, except that a lone "y" at the start of the
# syllable is taken as the onset (so "ya" splits into onset "y" and
# nucleus "a", while "ly" splits into onset "l" and nucleus "y").
syllableRgx = re.compile(r"(y|[^aeiouy]*)([aeiouy]+|$)([^aeiouy]*)")

# Name of the data file the training names are read from
nameFile = "names.txt"

# Dictionary that holds the frequency of each syllable count (note that these
# are the syllables *before* the ending, so "al-ba-n ia" only counts two)
syllableCounts = {}

# List of four dictionaries (for onsets, nuclei, codas, and endings):
# Each dictionary's key/value pairs are prevSegment:segmentDict, where
# segmentDict is a frequency dictionary of various onsets, nuclei, codas,
# or endings, and prevSegment is a segment that can be the last nonempty
# segment preceding them. A prevSegment of None marks segments at the
# beginnings of names.
segmentData = [{}, {}, {}, {}]

# Indices into segmentData; the first three also index the groups of a
# syllableRgx match
ONSET = 0
NUCLEUS = 1
CODA = 2
ENDING = 3
# Read names from file and generate the segmentData structure.
# Each data line has the form "<hyphenated stem> <ending>", for example
# "ca-na-d a/ian"; whatever follows the last hyphen of the stem is the
# (possibly empty) onset that directly precedes the ending.
with open(nameFile) as f:
    for line in f:
        # Strip whitespace, ignore blank lines and comments
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        # A line that does not consist of exactly two whitespace-separated
        # fields raises ValueError here
        stem, ending = line.split()
        # Endings should be of the format noun/adj
        if "/" not in ending:
            # The noun ending is given; the adjective ending can be
            # derived by appending -n
            ending = "{}/{}n".format(ending, ending)
        # Syllable count is the number of hyphens
        syllableCount = stem.count("-")
        syllableCounts[syllableCount] = (
            syllableCounts.get(syllableCount, 0) + 1)
        # Add the segments in this name to segmentData
        prevSegment = None
        for syllable in stem.split("-"):
            # The regex can match the empty string, so match() never
            # returns None here
            segments = syllableRgx.match(syllable).groups()
            if segments[NUCLEUS] == segments[CODA] == "":
                # A syllable with empty nucleus and coda comes right before
                # the ending, so we only process the onset
                segments = (segments[ONSET],)
            for segType, segment in enumerate(segments):
                segFrequencies = segmentData[segType].setdefault(
                    prevSegment, {})
                segFrequencies[segment] = segFrequencies.get(segment, 0) + 1
                # Empty segments are counted but never become prevSegment
                if segment:
                    prevSegment = segment
        # Add the ending to segmentData
        endFrequencies = segmentData[ENDING].setdefault(prevSegment, {})
        endFrequencies[ending] = endFrequencies.get(ending, 0) + 1
def randFromFrequencies(dictionary):
    """Return a random key of dictionary, weighted by its values.

    The values are treated as non-negative integer frequencies: a key
    whose value is k is k times as likely to be returned as a key whose
    value is 1, and a key whose value is 0 is never returned.

    Raises ValueError if the dictionary is empty or its frequencies do
    not sum to a positive number.
    """
    total = sum(dictionary.values())
    if total <= 0:
        raise ValueError("randFromFrequencies needs at least one key with "
                         "a positive frequency")
    # Pick a position in [0, total) and walk the keys, consuming each key's
    # share of the range until the position falls inside one of them
    index = random.randrange(total)
    for key, freq in dictionary.items():
        if index < freq:
            # Select this one
            return key
        index -= freq
    # Weird, should have returned something
    raise ValueError("randFromFrequencies didn't pick a value "
                     "(index remainder is {})".format(index))
def markovName(syllableCount):
    """Generate a country name using a Markov-chain-like process.

    syllableCount is the number of full syllables (onset, nucleus, coda)
    to generate before the final onset and the ending.

    Returns a tuple (stem, endings), where stem is the lowercase stem and
    endings is a two-item list [nounEnding, adjectiveEnding]. Returns None
    if the chain reaches a segment with no recorded successor or if the
    resulting name is rejected by one of the filters below; the caller is
    expected to try again.
    """
    prevSegment = None
    stem = ""
    for _ in range(syllableCount):
        for segType in (ONSET, NUCLEUS, CODA):
            try:
                segFrequencies = segmentData[segType][prevSegment]
            except KeyError:
                # In the unusual situation that the chain fails to find an
                # appropriate next segment, it's too complicated to try to
                # roll back and pick a better prevSegment; so instead,
                # return None and let the caller generate a new name
                return None
            segment = randFromFrequencies(segFrequencies)
            stem += segment
            if segment:
                prevSegment = segment
    # The onset before the ending is drawn from the same table as any other
    # onset, so the lookup can fail in the same way as in the loop above
    try:
        onsetFrequencies = segmentData[ONSET][prevSegment]
    except KeyError:
        return None
    # Try different onsets for the last syllable till we find one that's
    # legal before an ending; we also allow empty onsets. Because it's
    # possible we won't find one, we also limit the number of retries
    # allowed (the last onset drawn is kept in that case).
    endingOnset = ""
    for _ in range(10):
        endingOnset = randFromFrequencies(onsetFrequencies)
        if endingOnset == "" or endingOnset in segmentData[ENDING]:
            break
    stem += endingOnset
    if endingOnset:
        prevSegment = endingOnset
    if prevSegment in segmentData[ENDING]:
        # Pick an ending that goes with the prevSegment
        endings = randFromFrequencies(segmentData[ENDING][prevSegment])
    else:
        # It can happen, if we used an empty last-syllable onset (or ran
        # out of retries), that the previous segment does not appear
        # before any ending in the data set. In this case, we'll just use
        # -a(n) for the ending.
        endings = "a/an"
    endings = endings.split("/")
    nounForm = stem + endings[0]
    # Filter out names that are too short or too long
    if len(nounForm) < 3:
        # This would give two-letter names like Mo, which don't appeal
        # to me
        return None
    if len(nounForm) > 11:
        # This would give very long names like Imbadossorbia that are too
        # much of a mouthful
        return None
    # Filter out names with weird consonant clusters at the end
    if nounForm.endswith(("bl", "tn", "sr", "sn", "sm", "shm")):
        return None
    # Filter out names that sound like anatomical references (only the
    # stem is checked, not the ending)
    for bannedSubstring in ("vag", "coc", "cok", "kok", "peni"):
        if bannedSubstring in stem:
            return None
    if nounForm == "ass":
        # This isn't a problem if it's part of a larger name like Assyria,
        # so filter it out only if it's the entire name
        return None
    return stem, endings
def printCountryNames(count):
    """Print count generated names, one per line, as "Noun (Adjective)".

    For each name a syllable count is drawn from syllableCounts, and
    markovName is called with that count until it returns a name rather
    than None.
    """
    for _ in range(count):
        syllableCount = randFromFrequencies(syllableCounts)
        nameInfo = markovName(syllableCount)
        # markovName returns None for rejected names; keep the same
        # syllable count and try again
        while nameInfo is None:
            nameInfo = markovName(syllableCount)
        stem, endings = nameInfo
        stem = stem.capitalize()
        noun = stem + endings[0]
        adjective = stem + endings[1]
        print("{} ({})".format(noun, adjective))
# When run as a script, print a batch of 30 generated names
if __name__ == "__main__":
    printCountryNames(30)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A few names have been respelled or omitted to avoid awkward-looking outputs | |
# Latinate names ending in -a/-an, -ia/-ian, or -a/-ian | |
a-fri-c a | |
sar-di-n ia | |
cor-si-c a | |
i-be-r ia | |
an-dor-r a | |
dal-ma-t ia | |
da-c ia | |
mo-e-s ia | |
thra-c ia | |
i-o-n ia | |
il-ly-r ia | |
do-r ia | |
a-cha- ia | |
spar-t a | |
a-s ia | |
ly-d ia | |
smyr-n a/ian | |
thy-a-ti-r a | |
la-o-di-ce- a | |
phry-g ia | |
my-s ia | |
bi-thi-n ia | |
ga-la-t ia | |
pi-si-d ia | |
pam-phi-l ia | |
ci-li-c ia | |
cap-pa-do-c ia | |
se-leu-c ia | |
ju-de- a | |
scy-th ia | |
par-th ia | |
me-d ia | |
per-s ia | |
me-so-po-ta-m ia | |
chal-de- a | |
mau-ri-ta-n ia | |
lib-y a | |
al-ge-r ia | |
tu-ni-s ia | |
e-thi-o-p ia | |
so-ma-l ia | |
e-ri-tre- a | |
u-gan-d a | |
ken-y a | |
tan-za-n ia | |
ru-an-d a | |
rho-de-s ia | |
zam-b ia | |
an-go-l a | |
na-mi-b ia | |
ni-ge-r ia | |
gam-b ia | |
li-be-r ia | |
gu-i-ne- a | |
a-ra-b ia | |
sy-r ia | |
ar-me-n ia | |
al-ba-n ia | |
bos-n ia | |
her-ze-go-vi-n a/ian | |
cro-a-t ia | |
ser-b ia | |
yu-go-sla-v ia | |
bul-ga-r ia | |
ro-ma-n ia | |
mol-do-v a | |
aus-tr ia | |
slo-va-k ia | |
slo-ve-n ia | |
es-to-n ia | |
lat-v ia | |
li-thu-a-n ia | |
scan-di-na-v ia | |
rus-s ia | |
si-be-r ia | |
ge-or-g ia | |
ab-kha-z ia | |
os-se-t ia | |
mon-go-l ia | |
ko-re- a | |
in-d ia | |
go- a | |
cam-bo-d ia | |
kam-pu-che- a | |
ma-lay-s ia | |
in-do-ne-s ia | |
ja-v a | |
ja-kar-t a | |
kra-ka-to- a | |
pa-pu- a | |
aus-tra-l ia | |
taz-ma-n ia | |
po-ly-ne-s ia | |
o-ce-a-n ia | |
me-la-ne-s ia | |
mi-cro-ne-s ia | |
sa-mo- a | |
ton-g a | |
ca-na-d a/ian | |
al-ber-t a | |
ma-ni-to-b a | |
a-me-ri-c a | |
a-las-k a | |
ca-li-for-n ia | |
ne-va-d a | |
a-ri-zo-n a | |
mon-ta-n a | |
da-ko-t a | |
ne-bras-k a | |
min-ne-so-t a | |
i-o-w a | |
lou-i-si-a-n a | |
a-la-ba-m a | |
flo-ri-d a/ian | |
ca-ro-li-n a/ian | |
vir-gi-n ia | |
pen-syl-va-n ia | |
do-mi-ni-c a | |
his-pa-ni-o-l a | |
ja-mai-c a | |
an-ti-gu- a | |
bar-bu-d a | |
ber-mu-d a | |
an-guil-l a/ian | |
gre-na-d a/ian | |
a-ru-b a | |
gua-te-ma-l a | |
ni-ca-ra-gu- a | |
ri-c a | |
co-lom-b ia | |
ve-ne-zue-l a | |
ar-gen-ti-n a/ian | |
bo-li-v ia | |
nar-n ia | |
pe-re-lan-dr a/ian | |
zoo-to-p ia | |
sur-d a | |
el-les-me-r a | |
ve-g a | |
ta-zen-d a | |
# Names with modified Latinate endings (-as/-an, -o/-an, _/-an, _/-ian, etc.) | |
eu-ro-pe /an | |
his-pa-n ia/ic | |
bri-tan-n ia/ic | |
gal-l ia/ic | |
pa-ri-s /ian | |
ger-ma-n ia/ic | |
ro-m e/an | |
i-ta-l y/ian | |
si-ci-l y/ian | |
mo-na-c o/an | |
cre-t e/an | |
hun-ga-r y/ian | |
be-la-ru-s /ian | |
u-krai-n e/ian | |
ma-ce-do-n /ian | |
a-the-n s/ian | |
co-rin-th /ian | |
pe-lo-pon-ne-s e/ian | |
phi-lip-pi- /an | |
co-los-s e/ian | |
sar-di s/an | |
troy- /an | |
ty-r e/ian | |
si-do-n /ian | |
su-me-r /ian | |
ba-by-lo-n /ian | |
pa-les-ti-n e/ian | |
mo-roc-c o/an | |
e-gyp-t /ian | |
dji-bou-ti- /an | |
bu-run-di- /an | |
ma-la-wi- /an | |
zim-bab-we- /an | |
ca-me-roo-n /ian | |
ver-de- /an | |
gha-na- /ian | |
cha-d /ian | |
ma-li- /an | |
jor-da-n /ian | |
i-ra-n /ian | |
ti-be-t /an | |
la-o-s /ian | |
sin-ga-po-re- /an | |
bru-nei- /an | |
fi-ji- /an | |
tu-va-lu- /an | |
na-u-ru- /an | |
to-ke-lau- /an | |
ni-u-e- /an | |
pa-lau- /an | |
on-ta-ri- o/an | |
la-bra-do-r /ian | |
ha-wai-i- /an | |
o-re-go-n /ian | |
i-da-ho- /an | |
co-lo-ra-d o/an | |
kan-s as/an | |
mis-sou-ri- /an | |
ar-kan-s as/an | |
tex- as/an | |
mis-sis-sip-pi- /an | |
ten-nes-see- /an | |
o-hi-o- /an | |
ri-c o/an | |
hai-ti- /an | |
mex-i-c o/an | |
hon-du-r as/an | |
sal-va-do-r /ian | |
e-cua-do-r /ian | |
bra-zi-l /ian | |
pa-ra-guay- /an | |
u-ru-guay- /an | |
chi-le- /an | |
gon-do-r /ian | |
mor-por-k /ian | |
kal-ga-n /ian | |
a-tu-r /an | |
ca-di-no-r /ian | |
ha-mil-to-n /ian | |
# Names ending in _/-i | |
is-ra-e-l /i | |
mo-a-b /i | |
is-mai-l /i | |
sau-d /i | |
ye-me-n /i | |
o-ma-n /i | |
ka-ta-r /i | |
bah-rai-n /i | |
ku-wai-t /i | |
i-ra-q /i | |
a-zer-bai-ja-n /i | |
pa-kis-ta-n /i | |
kash-mi-r /i | |
pun-ja-b /i | |
ra-jas-tha-n /i | |
gu-ja-ra-t /i | |
kon-ka-n /i | |
ben-ga-l /i | |
ban-gla-de-sh /i | |
mun-kha-sh /i | |
# Names ending in -a/-ese, _/-ese, etc. | |
mal-t a/ese | |
ge-no- a/ese | |
mi-la-n /ese | |
to-ri-n o/ese | |
bo-log-n a/ese | |
ba-r i/ese | |
mes-si-n a/ese | |
ve-ro-n a/ese | |
a-ra-go-n /ese | |
vi-en-n a/ese | |
dub-li-n /ese | |
fa-ro- e/ese | |
su-da-n /ese | |
ga-bo-n /ese | |
be-ni-n /ese | |
se-ne-ga-l /ese | |
ja-pa-n /ese | |
chi-n a/ese | |
can-to-n /ese | |
tai-wa-n /ese | |
bhu-ta-n /ese | |
ne-pa-l /ese | |
as-sa-m /ese | |
sin-ha-l a/ese | |
si-a-m /ese | |
bur-m a/ese | |
vi-et-na-m /ese | |
gu-ya-n a/ese | |
su-ri-na-m e/ese | |
flo-ri-n /ese |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment