Skip to content

Instantly share code, notes, and snippets.

@cstorey
Last active January 14, 2018 15:29
Show Gist options
  • Save cstorey/cde296dc99870728c9b78fecfe030560 to your computer and use it in GitHub Desktop.
Save cstorey/cde296dc99870728c9b78fecfe030560 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import metaphone as mp\n",
"import collections as cs\n",
"import random"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('A', u'A'),\n",
" ('be', u'P'),\n",
" ('l', u'L'),\n",
" ('mo', u'M'),\n",
" ('schu', u'X'),\n",
" ('s', u'S')]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def split_to_phones(word):\n",
" word_mp = mp.doublemetaphone(word)[0]\n",
" prev_w_idx = 0\n",
" prev_mp_idx =0\n",
" for x in xrange(1, len(word)+1):\n",
" w0, w1 = (word[:x], word[x:])\n",
" m0, m1 = (mp.doublemetaphone(w0)[0], mp.doublemetaphone(w1)[0])\n",
" if (m0 + m1) == word_mp:\n",
" word_slice = w0[prev_w_idx:]\n",
" mp_slice = m0[prev_mp_idx:]\n",
" if mp_slice and word_slice:\n",
" yield (word_slice, mp_slice)\n",
" prev_w_idx = len(w0)\n",
" prev_mp_idx = len(m0)\n",
"list(split_to_phones('Abelmoschus'))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Load the system word list, keeping every non-blank line unchanged.\n",
"# The with-block closes the file handle as soon as the list is built.\n",
"with open('/usr/share/dict/words') as fh:\n",
"    words = [w for w in fh.read().split('\\n') if w.strip()]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"157011"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"try:\n",
"    # ipywidgets provides FloatProgress on current Jupyter installs.\n",
"    from ipywidgets import FloatProgress\n",
"except ImportError:\n",
"    # Fallback for old IPython releases that bundle the widgets themselves.\n",
"    from IPython.html.widgets import FloatProgress\n",
"from IPython.display import display\n",
"\n",
"# Each key of mp_markov is a tuple of up to window_size - 1 consecutive\n",
"# letter chunks; each value counts the chunks seen right after that key.\n",
"window_size = 4\n",
"mp_markov = cs.defaultdict(lambda: cs.defaultdict(int))\n",
"\n",
"total = len(words)\n",
"f = FloatProgress(min=0, max=total)\n",
"display(f)\n",
"\n",
"for n, word in enumerate(words):\n",
"    # Refresh the progress widget only once every 100 words.\n",
"    if (n % 100) == 0:\n",
"        f.value = n\n",
"        f.description = \"{}/{} ({:4f}%)\".format(n, total, 100*float(n)/total)\n",
"\n",
"    # Lower-cased letter chunks of the word, in order. The metaphone code\n",
"    # yielded alongside each chunk is not used for the model.\n",
"    chunks = [chunk.lower() for chunk, _code in split_to_phones(word)]\n",
"\n",
"    for idx, succ in enumerate(chunks):\n",
"        # The first chunks of a word get a shorter (possibly empty) context.\n",
"        preceding = tuple(chunks[max(0, idx - window_size + 1):idx])\n",
"        mp_markov[preceding][succ] += 1\n",
"\n",
"f.value = total; f.description = \"Done\"\n",
"len(mp_markov)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(('ly', 'phy', 'llou'), defaultdict(<type 'int'>, {'s': 2})),\n",
" (('a', 'thi'), defaultdict(<type 'int'>, {'r': 1, 'n': 1})),\n",
" (('s', 'ki', 'r'),\n",
" defaultdict(<type 'int'>, {'ni': 1, 'ty': 1, 'mi': 6, 'l': 2, 'li': 1, 'p': 1, 't': 9, 'ti': 2, 'te': 4})),\n",
" (('u', 'nou', 'two'), defaultdict(<type 'int'>, {'r': 1})),\n",
" (('s', 'pi', 'nge'), defaultdict(<type 'int'>, {'l': 1})),\n",
" (('vi', 'ce', 'n'), defaultdict(<type 'int'>, {'t': 3})),\n",
" (('s', 'mo', 'si'), defaultdict(<type 'int'>, {'s': 16, 'te': 1, 'c': 1})),\n",
" (('zzi', 'ni', 's'), defaultdict(<type 'int'>, {'t': 1})),\n",
" (('ga', 'rrya'), defaultdict(<type 'int'>, {'ceae': 1})),\n",
" (('a', 'm', 'phio'), defaultdict(<type 'int'>, {'ni': 1, 'n': 1}))]"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mp_markov.items()[:10]"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def markovify():\n",
"    \"\"\"Yield the letter chunks of one randomly generated word.\n",
"\n",
"    Starts from the empty context and repeatedly draws a successor from\n",
"    mp_markov with probability proportional to its recorded count. The\n",
"    generator finishes as soon as the current context has no recorded\n",
"    successors. Reads the module-level mp_markov and window_size.\n",
"    \"\"\"\n",
"    state = ()\n",
"    while True:\n",
"        # .get() instead of indexing: probing an unseen context must not\n",
"        # insert an empty entry into the defaultdict.\n",
"        options = mp_markov.get(state, {})\n",
"        if not options:\n",
"            return\n",
"        # Repeat each candidate once per observation, so a uniform pick\n",
"        # from the list is a count-weighted pick among the candidates.\n",
"        weighted = [o for o, cnt in options.items() for _ in range(cnt)]\n",
"        nxt = random.choice(weighted)\n",
"        yield nxt\n",
"        # Keep the last window_size - 1 chunks as the next context. Slicing\n",
"        # after appending avoids a [-0:] slice when window_size == 2, which\n",
"        # would keep the whole history instead of dropping it.\n",
"        state = (state + (nxt,))[-(window_size - 1):]"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"hippelatestringefulness\n",
"savingness\n",
"vary\n",
"pholidota\n",
"fiuman\n",
"doundake\n",
"bearwood\n",
"fourthly\n",
"overreligionistraddlebusternoscapethriftlessness\n",
"adtevac\n",
"fishhooks\n",
"sion\n",
"carmaniac\n",
"certy\n",
"frothiness\n",
"hantlessly\n",
"mizzly\n",
"mealless\n",
"futuristfulness\n",
"dulcigenously\n",
"snottiness\n",
"casuistrych\n",
"fastidiousness\n",
"overassertiversatility\n",
"viceversalgia\n",
"phrasistlessly\n",
"fourieristikon\n",
"anthropicringmanshipmentaneously\n",
"ammeline\n",
"apiose\n"
]
}
],
"source": [
"for _ in xrange(30):\n",
" print \"\".join(markovify())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.8"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment