Skip to content

Instantly share code, notes, and snippets.

@dloscutoff
Last active August 14, 2017 20:17
Show Gist options
  • Save dloscutoff/d4d79df2ab71411d99d6d646e8f45ee6 to your computer and use it in GitHub Desktop.
Save dloscutoff/d4d79df2ab71411d99d6d646e8f45ee6 to your computer and use it in GitHub Desktop.
A Markov-based country name generator
#!/usr/bin/python3
import re, random
# Pattern for a single syllable of a stem. Its three capture groups are, in
# order, the onset (leading consonants), the nucleus (vowels), and the coda
# (trailing consonants).
# Because the nucleus group may instead match at the end of the string, a
# syllable made of an onset alone (even an empty onset) matches as well.
# That shape is the trailing partial syllable of a stem, usually the
# consonant sitting right before a vowel ending (the d in "ca-na-d a").
syllableRgx = re.compile(r"(y|[^aeiouy]*)([aeiouy]+|$)([^aeiouy]*)")

# File the training names are read from
nameFile = "names.txt"

# Frequency of each syllable count. Only syllables *before* the ending are
# counted, so "al-ba-n ia" contributes to the count for two.
syllableCounts = dict()

# Positions of the per-segment-type dictionaries inside segmentData
ONSET, NUCLEUS, CODA, ENDING = range(4)

# One dictionary per segment type (onsets, nuclei, codas, endings). Each maps
# prevSegment -> frequency dictionary of the segments of that type seen right
# after prevSegment, where prevSegment is the last nonempty segment preceding
# them. A prevSegment of None stands for the beginning of a name.
segmentData = [{} for _ in range(4)]
# Load the training names, filling in syllableCounts and segmentData
with open(nameFile) as f:
    for rawLine in f:
        entry = rawLine.strip()
        # Blank lines and "#" comment lines carry no name
        if not entry or entry[0] == "#":
            continue
        stem, ending = entry.split()
        # Endings are stored as noun/adj; when only the noun ending is
        # given, the adjective ending is the noun ending plus -n
        if "/" not in ending:
            ending = "{}/{}n".format(ending, ending)
        # The number of hyphens in the stem is its syllable count
        hyphens = stem.count("-")
        syllableCounts[hyphens] = syllableCounts.get(hyphens, 0) + 1
        # Tally every segment of the stem under the last nonempty segment
        # that came before it (None at the start of the name)
        lastSeen = None
        for syllable in stem.split("-"):
            onset, nucleus, coda = syllableRgx.match(syllable).groups()
            if nucleus == "" and coda == "":
                # A syllable with empty nucleus and coda comes right
                # before the ending, so only its onset is recorded
                parts = (onset,)
            else:
                parts = (onset, nucleus, coda)
            for kind, part in enumerate(parts):
                tally = segmentData[kind].setdefault(lastSeen, {})
                tally[part] = tally.get(part, 0) + 1
                if part:
                    lastSeen = part
        # Tally the ending under the last nonempty segment of the stem
        endTally = segmentData[ENDING].setdefault(lastSeen, {})
        endTally[ending] = endTally.get(ending, 0) + 1
def randFromFrequencies(dictionary):
    """Return a random key of dictionary, weighted by its values.

    The values are treated as integer frequencies: a key is chosen with
    probability proportional to its value.

    Raises ValueError if the frequencies do not add up to a positive
    total (for example, when the dictionary is empty).
    """
    total = sum(dictionary.values())
    if total <= 0:
        raise ValueError("randFromFrequencies needs a positive total "
                         "frequency (got {})".format(total))
    # Pick a position in [0, total), then walk the keys, subtracting each
    # key's frequency until the position falls inside that key's share
    index = random.randrange(total)
    for key, freq in dictionary.items():
        if index < freq:
            return key
        index -= freq
    # Should be unreachable because index starts below the total; kept as
    # a safeguard
    raise ValueError("randFromFrequencies didn't pick a value "
                     "(index remainder is {})".format(index))
def markovName(syllableCount):
    """Generate a country name using a Markov-chain-like process.

    syllableCount is the number of full syllables (onset, nucleus, coda)
    generated for the stem; one more onset is then drawn to sit right
    before the ending.

    Returns a (stem, endings) tuple, where stem is the generated stem and
    endings is the list obtained by splitting the chosen ending on "/"
    (noun ending first). Returns None when the chain cannot continue or
    when the name is rejected by one of the filters; the caller is
    expected to simply try again.
    """
    prevSegment = None
    pieces = []
    for _ in range(syllableCount):
        for segType in (ONSET, NUCLEUS, CODA):
            try:
                segFrequencies = segmentData[segType][prevSegment]
            except KeyError:
                # In the unusual situation that the chain fails to find an
                # appropriate next segment, it's too complicated to try to
                # roll back and pick a better prevSegment; so instead,
                # return None and let the caller generate a new name
                return None
            segment = randFromFrequencies(segFrequencies)
            pieces.append(segment)
            if segment:
                prevSegment = segment

    # The onset of the last (partial) syllable is drawn from the same
    # table on every retry, so look it up once. A missing entry is a dead
    # end like the one handled above, and gets the same treatment.
    try:
        onsetFrequencies = segmentData[ONSET][prevSegment]
    except KeyError:
        return None
    endingData = segmentData[ENDING]
    # Try different onsets for the last syllable till we find one that's
    # legal before an ending; we also allow empty onsets. Because it's
    # possible we won't find one, the number of attempts is limited.
    for _ in range(10):
        endingOnset = randFromFrequencies(onsetFrequencies)
        if endingOnset == "" or endingOnset in endingData:
            break
    pieces.append(endingOnset)
    if endingOnset != "":
        prevSegment = endingOnset
    stem = "".join(pieces)

    if prevSegment in endingData:
        # Pick an ending that goes with the prevSegment
        endings = randFromFrequencies(endingData[prevSegment])
    else:
        # It can happen (with an empty last-syllable onset, or when the
        # retries ran out) that the previous segment does not appear
        # before any ending in the data set. In this case, we'll just use
        # -a(n) for the ending.
        endings = "a/an"
    endings = endings.split("/")
    nounForm = stem + endings[0]

    # Filter out names that are too short (two-letter names like Mo) or
    # too long (mouthfuls like Imbadossorbia)
    if len(nounForm) < 3 or len(nounForm) > 11:
        return None
    # Filter out names with weird consonant clusters at the end
    if nounForm.endswith(("bl", "tn", "sr", "sn", "sm", "shm")):
        return None
    # Filter out names that sound like anatomical references
    if any(banned in stem for banned in ("vag", "coc", "cok", "kok", "peni")):
        return None
    if nounForm == "ass":
        # This isn't a problem if it's part of a larger name like Assyria,
        # so filter it out only if it's the entire name
        return None
    return stem, endings
def printCountryNames(count):
    """Print count generated country names, one per line.

    Each line shows the noun form followed by the adjective form in
    parentheses.
    """
    for _ in range(count):
        # Syllable counts are drawn with the same frequencies as in the
        # training names
        syllableCount = randFromFrequencies(syllableCounts)
        # markovName returns None for rejected names; keep asking, with
        # the same syllable count, until it produces one
        while True:
            nameInfo = markovName(syllableCount)
            if nameInfo is not None:
                break
        rawStem, endings = nameInfo
        capitalized = rawStem.capitalize()
        nounEnding, adjectiveEnding = endings[0], endings[1]
        print("{} ({})".format(capitalized + nounEnding,
                               capitalized + adjectiveEnding))
# When run as a script (rather than imported), print a batch of 30 names
if __name__ == "__main__":
    printCountryNames(30)
# A few names have been respelled or omitted to avoid awkward-looking outputs
# Latinate names ending in -a/-an, -ia/-ian, or -a/-ian
a-fri-c a
sar-di-n ia
cor-si-c a
i-be-r ia
an-dor-r a
dal-ma-t ia
da-c ia
mo-e-s ia
thra-c ia
i-o-n ia
il-ly-r ia
do-r ia
a-cha- ia
spar-t a
a-s ia
ly-d ia
smyr-n a/ian
thy-a-ti-r a
la-o-di-ce- a
phry-g ia
my-s ia
bi-thi-n ia
ga-la-t ia
pi-si-d ia
pam-phi-l ia
ci-li-c ia
cap-pa-do-c ia
se-leu-c ia
ju-de- a
scy-th ia
par-th ia
me-d ia
per-s ia
me-so-po-ta-m ia
chal-de- a
mau-ri-ta-n ia
lib-y a
al-ge-r ia
tu-ni-s ia
e-thi-o-p ia
so-ma-l ia
e-ri-tre- a
u-gan-d a
ken-y a
tan-za-n ia
ru-an-d a
rho-de-s ia
zam-b ia
an-go-l a
na-mi-b ia
ni-ge-r ia
gam-b ia
li-be-r ia
gu-i-ne- a
a-ra-b ia
sy-r ia
ar-me-n ia
al-ba-n ia
bos-n ia
her-ze-go-vi-n a/ian
cro-a-t ia
ser-b ia
yu-go-sla-v ia
bul-ga-r ia
ro-ma-n ia
mol-do-v a
aus-tr ia
slo-va-k ia
slo-ve-n ia
es-to-n ia
lat-v ia
li-thu-a-n ia
scan-di-na-v ia
rus-s ia
si-be-r ia
ge-or-g ia
ab-kha-z ia
os-se-t ia
mon-go-l ia
ko-re- a
in-d ia
go- a
cam-bo-d ia
kam-pu-che- a
ma-lay-s ia
in-do-ne-s ia
ja-v a
ja-kar-t a
kra-ka-to- a
pa-pu- a
aus-tra-l ia
taz-ma-n ia
po-ly-ne-s ia
o-ce-a-n ia
me-la-ne-s ia
mi-cro-ne-s ia
sa-mo- a
ton-g a
ca-na-d a/ian
al-ber-t a
ma-ni-to-b a
a-me-ri-c a
a-las-k a
ca-li-for-n ia
ne-va-d a
a-ri-zo-n a
mon-ta-n a
da-ko-t a
ne-bras-k a
min-ne-so-t a
i-o-w a
lou-i-si-a-n a
a-la-ba-m a
flo-ri-d a/ian
ca-ro-li-n a/ian
vir-gi-n ia
pen-syl-va-n ia
do-mi-ni-c a
his-pa-ni-o-l a
ja-mai-c a
an-ti-gu- a
bar-bu-d a
ber-mu-d a
an-guil-l a/ian
gre-na-d a/ian
a-ru-b a
gua-te-ma-l a
ni-ca-ra-gu- a
ri-c a
co-lom-b ia
ve-ne-zue-l a
ar-gen-ti-n a/ian
bo-li-v ia
nar-n ia
pe-re-lan-dr a/ian
zoo-to-p ia
sur-d a
el-les-me-r a
ve-g a
ta-zen-d a
# Names with modified Latinate endings (-as/-an, -o/-an, _/-an, _/-ian, etc.)
eu-ro-pe /an
his-pa-n ia/ic
bri-tan-n ia/ic
gal-l ia/ic
pa-ri-s /ian
ger-ma-n ia/ic
ro-m e/an
i-ta-l y/ian
si-ci-l y/ian
mo-na-c o/an
cre-t e/an
hun-ga-r y/ian
be-la-ru-s /ian
u-krai-n e/ian
ma-ce-do-n /ian
a-the-n s/ian
co-rin-th /ian
pe-lo-pon-ne-s e/ian
phi-lip-pi- /an
co-los-s e/ian
sar-di s/an
troy- /an
ty-r e/ian
si-do-n /ian
su-me-r /ian
ba-by-lo-n /ian
pa-les-ti-n e/ian
mo-roc-c o/an
e-gyp-t /ian
dji-bou-ti- /an
bu-run-di- /an
ma-la-wi- /an
zim-bab-we- /an
ca-me-roo-n /ian
ver-de- /an
gha-na- /ian
cha-d /ian
ma-li- /an
jor-da-n /ian
i-ra-n /ian
ti-be-t /an
la-o-s /ian
sin-ga-po-re- /an
bru-nei- /an
fi-ji- /an
tu-va-lu- /an
na-u-ru- /an
to-ke-lau- /an
ni-u-e- /an
pa-lau- /an
on-ta-ri- o/an
la-bra-do-r /ian
ha-wai-i- /an
o-re-go-n /ian
i-da-ho- /an
co-lo-ra-d o/an
kan-s as/an
mis-sou-ri- /an
ar-kan-s as/an
tex- as/an
mis-sis-sip-pi- /an
ten-nes-see- /an
o-hi-o- /an
ri-c o/an
hai-ti- /an
mex-i-c o/an
hon-du-r as/an
sal-va-do-r /ian
e-cua-do-r /ian
bra-zi-l /ian
pa-ra-guay- /an
u-ru-guay- /an
chi-le- /an
gon-do-r /ian
mor-por-k /ian
kal-ga-n /ian
a-tu-r /an
ca-di-no-r /ian
ha-mil-to-n /ian
# Names ending in _/-i
is-ra-e-l /i
mo-a-b /i
is-mai-l /i
sau-d /i
ye-me-n /i
o-ma-n /i
ka-ta-r /i
bah-rai-n /i
ku-wai-t /i
i-ra-q /i
a-zer-bai-ja-n /i
pa-kis-ta-n /i
kash-mi-r /i
pun-ja-b /i
ra-jas-tha-n /i
gu-ja-ra-t /i
kon-ka-n /i
ben-ga-l /i
ban-gla-de-sh /i
mun-kha-sh /i
# Names ending in -a/-ese, _/-ese, etc.
mal-t a/ese
ge-no- a/ese
mi-la-n /ese
to-ri-n o/ese
bo-log-n a/ese
ba-r i/ese
mes-si-n a/ese
ve-ro-n a/ese
a-ra-go-n /ese
vi-en-n a/ese
dub-li-n /ese
fa-ro- e/ese
su-da-n /ese
ga-bo-n /ese
be-ni-n /ese
se-ne-ga-l /ese
ja-pa-n /ese
chi-n a/ese
can-to-n /ese
tai-wa-n /ese
bhu-ta-n /ese
ne-pa-l /ese
as-sa-m /ese
sin-ha-l a/ese
si-a-m /ese
bur-m a/ese
vi-et-na-m /ese
gu-ya-n a/ese
su-ri-na-m e/ese
flo-ri-n /ese
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment