tjvr/reverse speech.ipynb

## reverse speech.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#coding=utf8\n",
    "from __future__ import unicode_literals\n",
    "\n",
    "import itertools\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package cmudict to /Users/tim/nltk_data...\n",
      "[nltk_data]   Package cmudict is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "import nltk\n",
    "nltk.download('cmudict')\n",
    "from nltk.corpus import cmudict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "d = cmudict.dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def lookup(w):\n",
    "    \"\"\"Return the first pronunciation listed for word `w` in the dictionary `d`.\n",
    "\n",
    "    Raises KeyError if `w` is not a key of `d`.\n",
    "    \"\"\"\n",
    "    pronunciations = d[w]\n",
    "    return pronunciations[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is the CMU pronouncing dictionary. For each word, it has a list of phonemes:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[u'HH', u'AE1', u'T'], [u'F', u'EY1', u'L', u'Y', u'ER0']]"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "words = [lookup('hat'), lookup('failure')]\n",
    "words"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First, let's get rid of the numbers. I think they indicate stress. Whatever, we don't need 'em!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def strip_phonemes(phs):\n",
    "    return [s.rstrip('0123456789') for s in phs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[u'HH', u'AE', u'T'], [u'F', u'EY', u'L', u'Y', u'ER']]"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "words = map(strip_phonemes, words)\n",
    "words"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Some of these are diphthongs. Benjamin reckons we better split those up, so here goes. I looked at the docs for cmudict:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File Format: Each line consists of an uppercased word,\n",
      "a counter (for alternative pronunciations), and a transcription.\n",
      "Vowels are marked for stress (1=primary, 2=secondary, 0=no stress).\n",
      "E.g.: NATURAL 1 N AE1 CH ER0 AH0 L\n",
      "\n",
      "The dictionary contains 127069 entries.  Of these, 119400 words are assigned\n",
      "a unique pronunciation, 6830 words have two pronunciations, and 839 words have\n",
      "three or more pronunciations.  Many of these are fast-speech variants.\n",
      "\n",
      "Phonemes: There are 39 phonemes, as shown below:\n",
      "    \n",
      "    Phoneme Example Translation    Phoneme Example Translation\n",
      "    ------- ------- -----------    ------- ------- -----------\n",
      "    AA      odd     AA D           AE      at      AE T\n",
      "    AH      hut     HH AH T        AO      ought   AO T\n",
      "    AW      cow     K AW           AY      hide    HH AY D\n",
      "    B       be      B IY           CH      cheese  CH IY Z\n",
      "    D       dee     D IY           DH      thee    DH IY\n",
      "    EH      Ed      EH D           ER      hurt    HH ER T\n",
      "    EY      ate     EY T           F       fee     F IY\n",
      "    G       green   G R IY N       HH      he      HH IY\n",
      "    IH      it      IH T           IY      eat     IY T\n",
      "    JH      gee     JH IY          K       key     K IY\n",
      "    L       lee     L IY           M       me      M IY\n",
      "    N       knee    N IY           NG      ping    P IH NG\n",
      "    OW      oat     OW T           OY      toy     T OY\n",
      "    P       pee     P IY           R       read    R IY D\n",
      "    S       sea     S IY           SH      she     SH IY\n",
      "    T       tea     T IY           TH      theta   TH EY T AH\n",
      "    UH      hood    HH UH D        UW      two     T UW\n",
      "    V       vee     V IY           W       we      W IY\n",
      "    Y       yield   Y IY L D       Z       zee     Z IY\n",
      "    ZH      seizure S IY ZH ER\n",
      "\n",
      "(For NLTK, entries have been sorted so that, e.g. FIRE 1 and FIRE 2\n",
      "are contiguous, and not separated by FIRE'S 1.)\n"
     ]
    }
   ],
   "source": [
    "print(\"\\n\\n\".join(cmudict.readme().split(\"\\n\\n\")[3:-6]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Based on this, Benjamin came up with the following table:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "dipthongs_etc = {\n",
    "    'AW': ['AE', 'UW'],\n",
    "    'OW': ['AO', 'UW'],\n",
    "    'EY': ['EH', 'IY'],\n",
    "    'AY': ['AA', 'IY'],\n",
    "    'OY': ['AO', 'IY'],\n",
    "    'JH': ['D', 'ZH'],\n",
    "    'CH': ['T', 'SH'],\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def dipthify(words):\n",
    "    \"\"\"Split each compound phoneme into its parts.\n",
    "\n",
    "    `words` is one word's phoneme list (despite the name). Phonemes that are\n",
    "    keys of `dipthongs_etc` are replaced by their listed parts; all others\n",
    "    are kept as they are. Returns a new flat list.\n",
    "    \"\"\"\n",
    "    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies\n",
    "    # the accumulated list on every addition.\n",
    "    parts = (dipthongs_etc.get(s, [s]) for s in words)\n",
    "    return list(itertools.chain.from_iterable(parts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[u'HH', u'AE', u'T'], [u'F', u'EH', u'IY', u'L', u'Y', u'ER']]"
      ]
     },
     "execution_count": 143,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "words = map(dipthify, words)\n",
    "words"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We could use `reversed()` to get the result, but it's hard to read…"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[u'ER', u'Y', u'L', u'IY', u'EH', u'F'], [u'T', u'AE', u'HH']]"
      ]
     },
     "execution_count": 144,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(reversed([list(reversed(w)) for w in words]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## IPA output"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Benjamin can read IPA, so let's try that!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "ipa = {\n",
    "    'AA': 'ɑː',\n",
    "    'AH': 'ʌ',\n",
    "    'AW': 'ou',\n",
    "    'B':  'b',\n",
    "    'D':  'd',\n",
    "    'EH': 'e',\n",
    "    'EY': 'eɪ',\n",
    "    'G':  'g',\n",
    "    'IH': 'ɪ',\n",
    "    'JH': 'dʒ',\n",
    "    'L':  'l',\n",
    "    'N':  'n',\n",
    "    'OW': 'əʊ',\n",
    "    'P':  'p',\n",
    "    'S':  's',\n",
    "    'T':  't',\n",
    "    'UH': 'ʊ',\n",
    "    'V':  'v',\n",
    "    'Y':  'j',\n",
    "    'ZH': 'ʒ',\n",
    "    'AE': 'æ',\n",
    "    'AO': 'ɔː',\n",
    "    'AY': 'ʌɪ',\n",
    "    'CH': 'tʃ',\n",
    "    'DH': 'ð',\n",
    "    'ER': 'əː',\n",
    "    'F':  'f',\n",
    "    'HH': 'h',\n",
    "    'IY': 'iː',\n",
    "    'K':  'k',\n",
    "    'M':  'm',\n",
    "    'NG': 'ŋ',\n",
    "    'OY': 'ɔɪ',\n",
    "    'R':  'r',\n",
    "    'SH': 'ʃ',\n",
    "    'TH': 'θ',\n",
    "    'UW': 'uː',\n",
    "    'W':  'w',\n",
    "    'Z':  'z',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "əːjliːef tæh\n"
     ]
    }
   ],
   "source": [
    "def ipaify(word):\n",
    "    \"\"\"Concatenate the `ipa` table's symbol for each phoneme in `word`.\"\"\"\n",
    "    symbols = [ipa.get(phoneme) for phoneme in word]\n",
    "    return ''.join(symbols)\n",
    "\n",
    "result_ipa = ' '.join(reversed(map(ipaify, map(reversed, words))))\n",
    "print(result_ipa)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "I can't read that, but it seemed to sound ok when Benjamin tried it!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## English output"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It'd be nice if we could make a version which uses English words where possible…"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So let's reverse the cmudict to get a dictionary mapping phonemes to words."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Invert the dictionary: map each stress-stripped phoneme tuple to a spelling.\n",
    "# When several entries share a pronunciation, whichever is visited last wins.\n",
    "\n",
    "# Compiled in this cell so it does not depend on a pattern defined further down\n",
    "# the notebook. [A-Za-z] is used because [A-z] also matches ASCII punctuation\n",
    "# between 'Z' and 'a' (for example '_' and '^').\n",
    "key_word_pat = re.compile(u'([A-Za-z]+)')\n",
    "\n",
    "backwards = {}\n",
    "for word, pronounciations in d.items():\n",
    "    found = key_word_pat.search(word)\n",
    "    if found is None:\n",
    "        continue  # key has no letters, so there is no spelling to record\n",
    "    word = found.group(1)\n",
    "    for phonemes in pronounciations:\n",
    "        key = tuple(strip_phonemes(phonemes))\n",
    "        backwards[key] = word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "u'are'"
      ]
     },
     "execution_count": 167,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "backwards[tuple(reversed(strip_phonemes(lookup('ra'))))]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We need single-phonemes to fall back on…"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "english = {\n",
    "    'AA': 'o', # 'ah' not 'aw'\n",
    "    'AH': 'uh',\n",
    "    'AW': 'ow',\n",
    "    'B':  'b',\n",
    "    'D':  'd',\n",
    "    'EH': 'eh',\n",
    "    'EY': 'ay',\n",
    "    'G':  'g',\n",
    "    'IH': 'ih',\n",
    "    'JH': 'jh',\n",
    "    'L':  'l',\n",
    "    'N':  'n',\n",
    "    'OW': 'oah',\n",
    "    'P':  'p',\n",
    "    'S':  's',\n",
    "    'T':  't',\n",
    "    'UH': 'ooh',\n",
    "    'V':  'v',\n",
    "    'Y':  'y',\n",
    "    'ZH': 'zz',\n",
    "    'AE': 'aa',\n",
    "    'AO': 'aww',\n",
    "    'AY': 'eye',\n",
    "    'CH': 'ch',\n",
    "    'DH': 'th',\n",
    "    'ER': 'er',\n",
    "    'F':  'f',\n",
    "    'HH': 'h',\n",
    "    'IY': 'ee', # 'ee' would be better than 'e'\n",
    "    'K':  'k',\n",
    "    'M':  'm',\n",
    "    'NG': 'ng',\n",
    "    'OY': 'oy',\n",
    "    'R':  'r',\n",
    "    'SH': 'sh',\n",
    "    'TH': 'th',\n",
    "    'UW': 'ooo', # 'oo' would be better than 'ou'\n",
    "    'W':  'w',\n",
    "    'Z':  'z',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# engify() looks keys up as tuples of phonemes, so each single-phoneme\n",
    "# fallback is stored under a 1-tuple; a bare string key would never match.\n",
    "for phoneme, word in english.items():\n",
    "    backwards[(phoneme,)] = word"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then we can do a dumb greedy search."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "u'are-y-leigh-f t-aa-h'"
      ]
     },
     "execution_count": 169,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def engify(word):\n",
    "    \"\"\"Spell a phoneme sequence using entries of `backwards`, longest prefix first.\n",
    "\n",
    "    Repeatedly takes the longest leading run of phonemes that is a key of\n",
    "    `backwards`; if no prefix matches, the first phoneme is spelled via the\n",
    "    `english` table instead. The pieces are joined with '-'.\n",
    "    \"\"\"\n",
    "    remaining = list(word)\n",
    "    pieces = []\n",
    "    while remaining:\n",
    "        match_len = 0\n",
    "        for size in range(len(remaining), 0, -1):\n",
    "            if tuple(remaining[:size]) in backwards:\n",
    "                match_len = size\n",
    "                break\n",
    "        if match_len:\n",
    "            pieces.append(backwards[tuple(remaining[:match_len])])\n",
    "            remaining = remaining[match_len:]\n",
    "        else:\n",
    "            # No prefix is a known pronunciation: consume one phoneme.\n",
    "            pieces.append(english[remaining[0]])\n",
    "            remaining = remaining[1:]\n",
    "    return '-'.join(pieces)\n",
    "\n",
    "result_english = ' '.join(reversed(map(engify, map(reversed, words))))\n",
    "result_english"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Put it all together"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class UnknownWord(Exception):\n",
    "    \"\"\"Raised when a word of the input text has no entry in the dictionary.\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 177,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "stu-s t-aa-th v-uhh e-n-eh f-ih sioux-zz-d d-n-uhh z-g-eh clim v-aa-h e-awe\n",
      "stuːs tæð vʌ iːne fɪ suːʒd dnʌ zge klɪm væh iːɑː\n"
     ]
    }
   ],
   "source": [
    "# [A-Za-z] rather than [A-z]: the latter range also matches the ASCII\n",
    "# punctuation that sits between 'Z' and 'a' (for example '_' and '^').\n",
    "word_pat = re.compile(u'([A-Za-z]+)')\n",
    "\n",
    "text = unicode(\"I have milk, eggs and juice if any of that suits\")\n",
    "raw_words = word_pat.findall(text)\n",
    "\n",
    "words = [] # really phonemes\n",
    "for w in raw_words:\n",
    "    try:\n",
    "        # Dictionary keys are lowercase; take the first listed pronunciation.\n",
    "        words.append(d[w.lower()][0])\n",
    "    except KeyError:\n",
    "        raise UnknownWord(w)\n",
    "\n",
    "words = map(strip_phonemes, words)\n",
    "\n",
    "words = map(dipthify, words)\n",
    "# Reverse the phonemes inside each word, then reverse the order of the words.\n",
    "result_english = ' '.join(reversed(map(engify, map(reversed, words))))\n",
    "\n",
    "result_ipa = ' '.join(reversed(map(ipaify, map(reversed, words))))\n",
    "\n",
    "print(result_english)\n",
    "print(result_ipa)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```\n",
    "bugreport: ɑː is pronounced “ah”, not “aw”\n",
    "and “oo” would be better than “ou” for uː\n",
    "and for that matter, \n",
    "```"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"#coding=utf8\n",
	"from __future__ import unicode_literals\n",
	"\n",
	"import itertools\n",
	"import re"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[nltk_data] Downloading package cmudict to /Users/tim/nltk_data...\n",
	"[nltk_data] Package cmudict is already up-to-date!\n"
	]
	}
	],
	"source": [
	"import nltk\n",
	"nltk.download('cmudict')\n",
	"from nltk.corpus import cmudict"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"d = cmudict.dict()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 81,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def lookup(w):\n",
	"    \"\"\"Return the first pronunciation listed for word `w` in the dictionary `d`.\n",
	"\n",
	"    Raises KeyError if `w` is not a key of `d`.\n",
	"    \"\"\"\n",
	"    pronunciations = d[w]\n",
	"    return pronunciations[0]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"This is the CMU pronouncing dictionary. For each word, it has a list of phonemes:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 102,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[[u'HH', u'AE1', u'T'], [u'F', u'EY1', u'L', u'Y', u'ER0']]"
	]
	},
	"execution_count": 102,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"words = [lookup('hat'), lookup('failure')]\n",
	"words"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"First, let's get rid of the numbers. I think they indicate stress. Whatever, we don't need 'em!"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 103,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def strip_phonemes(phs):\n",
	" return [s.rstrip('0123456789') for s in phs]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 104,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[[u'HH', u'AE', u'T'], [u'F', u'EY', u'L', u'Y', u'ER']]"
	]
	},
	"execution_count": 104,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"words = map(strip_phonemes, words)\n",
	"words"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Some of these are diphthongs. Benjamin reckons we better split those up, so here goes. I looked at the docs for cmudict:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 140,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"File Format: Each line consists of an uppercased word,\n",
	"a counter (for alternative pronunciations), and a transcription.\n",
	"Vowels are marked for stress (1=primary, 2=secondary, 0=no stress).\n",
	"E.g.: NATURAL 1 N AE1 CH ER0 AH0 L\n",
	"\n",
	"The dictionary contains 127069 entries. Of these, 119400 words are assigned\n",
	"a unique pronunciation, 6830 words have two pronunciations, and 839 words have\n",
	"three or more pronunciations. Many of these are fast-speech variants.\n",
	"\n",
	"Phonemes: There are 39 phonemes, as shown below:\n",
	" \n",
	" Phoneme Example Translation Phoneme Example Translation\n",
	" ------- ------- ----------- ------- ------- -----------\n",
	" AA odd AA D AE at AE T\n",
	" AH hut HH AH T AO ought AO T\n",
	" AW cow K AW AY hide HH AY D\n",
	" B be B IY CH cheese CH IY Z\n",
	" D dee D IY DH thee DH IY\n",
	" EH Ed EH D ER hurt HH ER T\n",
	" EY ate EY T F fee F IY\n",
	" G green G R IY N HH he HH IY\n",
	" IH it IH T IY eat IY T\n",
	" JH gee JH IY K key K IY\n",
	" L lee L IY M me M IY\n",
	" N knee N IY NG ping P IH NG\n",
	" OW oat OW T OY toy T OY\n",
	" P pee P IY R read R IY D\n",
	" S sea S IY SH she SH IY\n",
	" T tea T IY TH theta TH EY T AH\n",
	" UH hood HH UH D UW two T UW\n",
	" V vee V IY W we W IY\n",
	" Y yield Y IY L D Z zee Z IY\n",
	" ZH seizure S IY ZH ER\n",
	"\n",
	"(For NLTK, entries have been sorted so that, e.g. FIRE 1 and FIRE 2\n",
	"are contiguous, and not separated by FIRE'S 1.)\n"
	]
	}
	],
	"source": [
	"print(\"\\n\\n\".join(cmudict.readme().split(\"\\n\\n\")[3:-6]))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Based on this, Benjamin came up with the following table:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 141,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"dipthongs_etc = {\n",
	" 'AW': ['AE', 'UW'],\n",
	" 'OW': ['AO', 'UW'],\n",
	" 'EY': ['EH', 'IY'],\n",
	" 'AY': ['AA', 'IY'],\n",
	" 'OY': ['AO', 'IY'],\n",
	" 'JH': ['D', 'ZH'],\n",
	" 'CH': ['T', 'SH'],\n",
	"}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 142,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def dipthify(words):\n",
	"    \"\"\"Split each compound phoneme into its parts.\n",
	"\n",
	"    `words` is one word's phoneme list (despite the name). Phonemes that are\n",
	"    keys of `dipthongs_etc` are replaced by their listed parts; all others\n",
	"    are kept as they are. Returns a new flat list.\n",
	"    \"\"\"\n",
	"    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies\n",
	"    # the accumulated list on every addition.\n",
	"    parts = (dipthongs_etc.get(s, [s]) for s in words)\n",
	"    return list(itertools.chain.from_iterable(parts))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 143,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[[u'HH', u'AE', u'T'], [u'F', u'EH', u'IY', u'L', u'Y', u'ER']]"
	]
	},
	"execution_count": 143,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"words = map(dipthify, words)\n",
	"words"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"We could use `reversed()` to get the result, but it's hard to read…"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 144,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[[u'ER', u'Y', u'L', u'IY', u'EH', u'F'], [u'T', u'AE', u'HH']]"
	]
	},
	"execution_count": 144,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"list(reversed([list(reversed(w)) for w in words]))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## IPA output"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Benjamin can read IPA, so let's try that!"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 112,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"ipa = {\n",
	" 'AA': 'ɑː',\n",
	" 'AH': 'ʌ',\n",
	" 'AW': 'ou',\n",
	" 'B': 'b',\n",
	" 'D': 'd',\n",
	" 'EH': 'e',\n",
	" 'EY': 'eɪ',\n",
	" 'G': 'g',\n",
	" 'IH': 'ɪ',\n",
	" 'JH': 'dʒ',\n",
	" 'L': 'l',\n",
	" 'N': 'n',\n",
	" 'OW': 'əʊ',\n",
	" 'P': 'p',\n",
	" 'S': 's',\n",
	" 'T': 't',\n",
	" 'UH': 'ʊ',\n",
	" 'V': 'v',\n",
	" 'Y': 'j',\n",
	" 'ZH': 'ʒ',\n",
	" 'AE': 'æ',\n",
	" 'AO': 'ɔː',\n",
	" 'AY': 'ʌɪ',\n",
	" 'CH': 'tʃ',\n",
	" 'DH': 'ð',\n",
	" 'ER': 'əː',\n",
	" 'F': 'f',\n",
	" 'HH': 'h',\n",
	" 'IY': 'iː',\n",
	" 'K': 'k',\n",
	" 'M': 'm',\n",
	" 'NG': 'ŋ',\n",
	" 'OY': 'ɔɪ',\n",
	" 'R': 'r',\n",
	" 'SH': 'ʃ',\n",
	" 'TH': 'θ',\n",
	" 'UW': 'uː',\n",
	" 'W': 'w',\n",
	" 'Z': 'z',\n",
	"}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 121,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"əːjliːef tæh\n"
	]
	}
	],
	"source": [
	"def ipaify(word):\n",
	"    \"\"\"Concatenate the `ipa` table's symbol for each phoneme in `word`.\"\"\"\n",
	"    symbols = [ipa.get(phoneme) for phoneme in word]\n",
	"    return ''.join(symbols)\n",
	"\n",
	"result_ipa = ' '.join(reversed(map(ipaify, map(reversed, words))))\n",
	"print(result_ipa)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"I can't read that, but it seemed to sound ok when Benjamin tried it!"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## English output"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"It'd be nice if we could make a version which uses English words where possible…"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"So let's reverse the cmudict to get a dictionary mapping phonemes to words."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 126,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Invert the dictionary: map each stress-stripped phoneme tuple to a spelling.\n",
	"# When several entries share a pronunciation, whichever is visited last wins.\n",
	"\n",
	"# Compiled in this cell so it does not depend on a pattern defined further down\n",
	"# the notebook. [A-Za-z] is used because [A-z] also matches ASCII punctuation\n",
	"# between 'Z' and 'a' (for example '_' and '^').\n",
	"key_word_pat = re.compile(u'([A-Za-z]+)')\n",
	"\n",
	"backwards = {}\n",
	"for word, pronounciations in d.items():\n",
	"    found = key_word_pat.search(word)\n",
	"    if found is None:\n",
	"        continue  # key has no letters, so there is no spelling to record\n",
	"    word = found.group(1)\n",
	"    for phonemes in pronounciations:\n",
	"        key = tuple(strip_phonemes(phonemes))\n",
	"        backwards[key] = word"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 167,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"u'are'"
	]
	},
	"execution_count": 167,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"backwards[tuple(reversed(strip_phonemes(lookup('ra'))))]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"We need single-phonemes to fall back on…"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 122,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"english = {\n",
	" 'AA': 'o', # 'ah' not 'aw'\n",
	" 'AH': 'uh',\n",
	" 'AW': 'ow',\n",
	" 'B': 'b',\n",
	" 'D': 'd',\n",
	" 'EH': 'eh',\n",
	" 'EY': 'ay',\n",
	" 'G': 'g',\n",
	" 'IH': 'ih',\n",
	" 'JH': 'jh',\n",
	" 'L': 'l',\n",
	" 'N': 'n',\n",
	" 'OW': 'oah',\n",
	" 'P': 'p',\n",
	" 'S': 's',\n",
	" 'T': 't',\n",
	" 'UH': 'ooh',\n",
	" 'V': 'v',\n",
	" 'Y': 'y',\n",
	" 'ZH': 'zz',\n",
	" 'AE': 'aa',\n",
	" 'AO': 'aww',\n",
	" 'AY': 'eye',\n",
	" 'CH': 'ch',\n",
	" 'DH': 'th',\n",
	" 'ER': 'er',\n",
	" 'F': 'f',\n",
	" 'HH': 'h',\n",
	" 'IY': 'ee', # 'ee' would be better than 'e'\n",
	" 'K': 'k',\n",
	" 'M': 'm',\n",
	" 'NG': 'ng',\n",
	" 'OY': 'oy',\n",
	" 'R': 'r',\n",
	" 'SH': 'sh',\n",
	" 'TH': 'th',\n",
	" 'UW': 'ooo', # 'oo' would be better than 'ou'\n",
	" 'W': 'w',\n",
	" 'Z': 'z',\n",
	"}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 168,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# engify() looks keys up as tuples of phonemes, so each single-phoneme\n",
	"# fallback is stored under a 1-tuple; a bare string key would never match.\n",
	"for phoneme, word in english.items():\n",
	"    backwards[(phoneme,)] = word"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Then we can do a dumb greedy search."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 169,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"u'are-y-leigh-f t-aa-h'"
	]
	},
	"execution_count": 169,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"def engify(word):\n",
	"    \"\"\"Spell a phoneme sequence using entries of `backwards`, longest prefix first.\n",
	"\n",
	"    Repeatedly takes the longest leading run of phonemes that is a key of\n",
	"    `backwards`; if no prefix matches, the first phoneme is spelled via the\n",
	"    `english` table instead. The pieces are joined with '-'.\n",
	"    \"\"\"\n",
	"    remaining = list(word)\n",
	"    pieces = []\n",
	"    while remaining:\n",
	"        match_len = 0\n",
	"        for size in range(len(remaining), 0, -1):\n",
	"            if tuple(remaining[:size]) in backwards:\n",
	"                match_len = size\n",
	"                break\n",
	"        if match_len:\n",
	"            pieces.append(backwards[tuple(remaining[:match_len])])\n",
	"            remaining = remaining[match_len:]\n",
	"        else:\n",
	"            # No prefix is a known pronunciation: consume one phoneme.\n",
	"            pieces.append(english[remaining[0]])\n",
	"            remaining = remaining[1:]\n",
	"    return '-'.join(pieces)\n",
	"\n",
	"result_english = ' '.join(reversed(map(engify, map(reversed, words))))\n",
	"result_english"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Put it all together"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"class UnknownWord(Exception):\n",
	"    \"\"\"Raised when a word of the input text has no entry in the dictionary.\"\"\""
	]
	},
	{
	"cell_type": "code",
	"execution_count": 177,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"stu-s t-aa-th v-uhh e-n-eh f-ih sioux-zz-d d-n-uhh z-g-eh clim v-aa-h e-awe\n",
	"stuːs tæð vʌ iːne fɪ suːʒd dnʌ zge klɪm væh iːɑː\n"
	]
	}
	],
	"source": [
	"# [A-Za-z] rather than [A-z]: the latter range also matches the ASCII\n",
	"# punctuation that sits between 'Z' and 'a' (for example '_' and '^').\n",
	"word_pat = re.compile(u'([A-Za-z]+)')\n",
	"\n",
	"text = unicode(\"I have milk, eggs and juice if any of that suits\")\n",
	"raw_words = word_pat.findall(text)\n",
	"\n",
	"words = [] # really phonemes\n",
	"for w in raw_words:\n",
	"    try:\n",
	"        # Dictionary keys are lowercase; take the first listed pronunciation.\n",
	"        words.append(d[w.lower()][0])\n",
	"    except KeyError:\n",
	"        raise UnknownWord(w)\n",
	"\n",
	"words = map(strip_phonemes, words)\n",
	"\n",
	"words = map(dipthify, words)\n",
	"# Reverse the phonemes inside each word, then reverse the order of the words.\n",
	"result_english = ' '.join(reversed(map(engify, map(reversed, words))))\n",
	"\n",
	"result_ipa = ' '.join(reversed(map(ipaify, map(reversed, words))))\n",
	"\n",
	"print(result_english)\n",
	"print(result_ipa)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"```\n",
	"bugreport: ɑː is pronounced “ah”, not “aw”\n",
	"and “oo” would be better than “ou” for uː\n",
	"and for that matter, \n",
	"```"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.9"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}