aflaxman/2018_10_14a_nlp_in_python_n-gram_language_model.ipynb

## 2018_10_14a_nlp_in_python_n-gram_language_model.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sun Oct 14 17:33:14 PDT 2018\r\n"
     ]
    }
   ],
   "source": [
    "import numpy as np, pandas as pd, matplotlib.pyplot as plt\n",
    "pd.set_option('display.max_rows', 8)\n",
    "%matplotlib inline\n",
    "!date"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# I would like to know more and do more with text data\n",
    "\n",
    "This is an ongoing effort to figure out how.  I am starting with an n-gram language model, as described in Chapter 3 of [Speech and Language Processing](https://web.stanford.edu/~jurafsky/slp3/)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>newid</th>\n",
       "      <th>module</th>\n",
       "      <th>site</th>\n",
       "      <th>gs_text34</th>\n",
       "      <th>sex</th>\n",
       "      <th>age_years</th>\n",
       "      <th>age_months</th>\n",
       "      <th>age_days</th>\n",
       "      <th>open_response</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Adult</td>\n",
       "      <td>Mexico</td>\n",
       "      <td>Cirrhosis</td>\n",
       "      <td>1.0</td>\n",
       "      <td>51.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>no comments.[PERSON] only told us what happene...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Adult</td>\n",
       "      <td>AP</td>\n",
       "      <td>Epilepsy</td>\n",
       "      <td>1.0</td>\n",
       "      <td>24.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[PATIENT] was suffering for the last two years...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Adult</td>\n",
       "      <td>AP</td>\n",
       "      <td>Pneumonia</td>\n",
       "      <td>2.0</td>\n",
       "      <td>62.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>she has stopped consuming tablets for b.p and ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>Adult</td>\n",
       "      <td>Mexico</td>\n",
       "      <td>COPD</td>\n",
       "      <td>2.0</td>\n",
       "      <td>80.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>my mother's condition was already very poor du...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11974</th>\n",
       "      <td>2621</td>\n",
       "      <td>Neonate</td>\n",
       "      <td>Dar</td>\n",
       "      <td>Stillbirth</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>respondent was satisfied with the service rece...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11975</th>\n",
       "      <td>2622</td>\n",
       "      <td>Neonate</td>\n",
       "      <td>UP</td>\n",
       "      <td>Stillbirth</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>lady had labour pain at 9 pm. in the morning s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11976</th>\n",
       "      <td>2623</td>\n",
       "      <td>Neonate</td>\n",
       "      <td>AP</td>\n",
       "      <td>Congenital malformation</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "      <td>since my baby was born, she had the difficulty...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11977</th>\n",
       "      <td>2625</td>\n",
       "      <td>Neonate</td>\n",
       "      <td>Dar</td>\n",
       "      <td>Pneumonia</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10.0</td>\n",
       "      <td>client had no additional point</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>11978 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       newid   module    site                gs_text34  sex  age_years  \\\n",
       "0          1    Adult  Mexico                Cirrhosis  1.0       51.0   \n",
       "1          2    Adult      AP                 Epilepsy  1.0       24.0   \n",
       "2          3    Adult      AP                Pneumonia  2.0       62.0   \n",
       "3          4    Adult  Mexico                     COPD  2.0       80.0   \n",
       "...      ...      ...     ...                      ...  ...        ...   \n",
       "11974   2621  Neonate     Dar               Stillbirth  1.0        NaN   \n",
       "11975   2622  Neonate      UP               Stillbirth  1.0        NaN   \n",
       "11976   2623  Neonate      AP  Congenital malformation  2.0        NaN   \n",
       "11977   2625  Neonate     Dar                Pneumonia  1.0        NaN   \n",
       "\n",
       "       age_months  age_days                                      open_response  \n",
       "0             NaN       NaN  no comments.[PERSON] only told us what happene...  \n",
       "1             NaN       NaN  [PATIENT] was suffering for the last two years...  \n",
       "2             NaN       NaN  she has stopped consuming tablets for b.p and ...  \n",
       "3             NaN       NaN  my mother's condition was already very poor du...  \n",
       "...           ...       ...                                                ...  \n",
       "11974         NaN       0.0  respondent was satisfied with the service rece...  \n",
       "11975         NaN       0.0  lady had labour pain at 9 pm. in the morning s...  \n",
       "11976         NaN       2.0  since my baby was born, she had the difficulty...  \n",
       "11977         NaN      10.0                     client had no additional point  \n",
       "\n",
       "[11978 rows x 9 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# start with a corpus of text, relevant to global health metrics\n",
    "# described here: https://gatesopenresearch.org/articles/2-18/v1\n",
    "\n",
    "df = pd.read_excel('https://osf.io/w87ym/download', sheet_name='data')\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It is not like this has not been done before.  But I want my own.  Here is the first page of Google results for [\"nlp in python n-grem language model\"](https://www.google.com/search?q=nlp+in+python+n-grem+language+model).  Should I fix the spelling?  Search engines are smart enough to handle that now, right?\n",
    "\n",
    "1. https://www.quora.com/Is-there-a-tutorial-on-how-to-train-an-n-gram-language-model-in-Python\n",
    "2. https://github.com/BigFav/n-grams\n",
    "3. https://cs.nyu.edu/courses/fall17/CSCI-UA.0480-006/lecture3-and-half-n-grams.pdf\n",
    "4. https://nlpforhackers.io/language-models/\n",
    "5. http://www.albertauyeung.com/post/generating-ngrams-python/\n",
    "6. https://pdfs.semanticscholar.org/3b46/9baa9bc5662f7702bfe4da0a72716acdbe4f.pdf\n",
    "7. https://www.cs.bgu.ac.il/~elhadad/nlp18/hw1.html\n",
    "8. https://stackoverflow.com/questions/13423919/computing-n-grams-using-python\n",
    "9. https://medium.com/@NicolasPapernot/natural-language-processing-bfa888e4e429\n",
    "\n",
    "A few of these do just what I want, and result 4 has some simple code snippets, so I'm going to start with that.  (It turns out that result 4 is terrible to copy code from, so perhaps this notebook will still be of some value.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'3.3'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import nltk  # seems like this has changed since the code I am following has been written\n",
    "nltk.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'1.15.2'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.__version__  # I don't expect this to make a difference, but might as well note it, too"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk import word_tokenize, trigrams\n",
    "from collections import defaultdict\n",
    "\n",
    "model = defaultdict(lambda: defaultdict(lambda: 0))\n",
    "for sentence in df[df.module == 'Adult'].open_response:\n",
    "    for w1, w2, w3 in trigrams(word_tokenize(str(sentence)), pad_left=True, pad_right=True):\n",
    "        model[w1,w2][w3] += 1\n",
    "        \n",
    "# transform the counts to probabilities\n",
    "for w1_w2 in model:\n",
    "    total_count = float(sum(model[w1_w2].values()))\n",
    "    for w3 in model[w1_w2]:\n",
    "        model[w1_w2][w3] /= total_count\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(12345) # set random seed for reproducibility"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "according to the [ HOSPITAL ] a month , we would like for not helping her to go see her son .\n"
     ]
    }
   ],
   "source": [
    "text = [None, None]\n",
    " \n",
    "sentence_finished = False\n",
    " \n",
    "while not sentence_finished and len(text) < 100:\n",
    "    r = np.random.rand()\n",
    "    accumulator = .0\n",
    " \n",
    "    for word in model[tuple(text[-2:])].keys():\n",
    "        accumulator += model[tuple(text[-2:])][word]\n",
    " \n",
    "        if accumulator >= r:\n",
    "            text.append(word)\n",
    "            break\n",
    " \n",
    "    if text[-2:] == [None, None]:\n",
    "        sentence_finished = True\n",
    " \n",
    "print(' '.join([t for t in text if t]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now what is the most reasonable way to use this in my data generation simulation?"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (vivarium)",
   "language": "python",
   "name": "vivarium"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Sun Oct 14 17:33:14 PDT 2018\r\n"
	]
	}
	],
	"source": [
	"import numpy as np, pandas as pd, matplotlib.pyplot as plt\n",
	"pd.set_option('display.max_rows', 8)\n",
	"%matplotlib inline\n",
	"!date"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"# I would like to know more and do more with text data\n",
	"\n",
	"This is an ongoing effort to figure out how. I am starting with an n-gram language model, as described in Chapter 3 of [Speech and Language Processing](https://web.stanford.edu/~jurafsky/slp3/)."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>newid</th>\n",
	" <th>module</th>\n",
	" <th>site</th>\n",
	" <th>gs_text34</th>\n",
	" <th>sex</th>\n",
	" <th>age_years</th>\n",
	" <th>age_months</th>\n",
	" <th>age_days</th>\n",
	" <th>open_response</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>1</td>\n",
	" <td>Adult</td>\n",
	" <td>Mexico</td>\n",
	" <td>Cirrhosis</td>\n",
	" <td>1.0</td>\n",
	" <td>51.0</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>no comments.[PERSON] only told us what happene...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>2</td>\n",
	" <td>Adult</td>\n",
	" <td>AP</td>\n",
	" <td>Epilepsy</td>\n",
	" <td>1.0</td>\n",
	" <td>24.0</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>[PATIENT] was suffering for the last two years...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>3</td>\n",
	" <td>Adult</td>\n",
	" <td>AP</td>\n",
	" <td>Pneumonia</td>\n",
	" <td>2.0</td>\n",
	" <td>62.0</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>she has stopped consuming tablets for b.p and ...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>4</td>\n",
	" <td>Adult</td>\n",
	" <td>Mexico</td>\n",
	" <td>COPD</td>\n",
	" <td>2.0</td>\n",
	" <td>80.0</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>my mother's condition was already very poor du...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>...</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>11974</th>\n",
	" <td>2621</td>\n",
	" <td>Neonate</td>\n",
	" <td>Dar</td>\n",
	" <td>Stillbirth</td>\n",
	" <td>1.0</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>0.0</td>\n",
	" <td>respondent was satisfied with the service rece...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>11975</th>\n",
	" <td>2622</td>\n",
	" <td>Neonate</td>\n",
	" <td>UP</td>\n",
	" <td>Stillbirth</td>\n",
	" <td>1.0</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>0.0</td>\n",
	" <td>lady had labour pain at 9 pm. in the morning s...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>11976</th>\n",
	" <td>2623</td>\n",
	" <td>Neonate</td>\n",
	" <td>AP</td>\n",
	" <td>Congenital malformation</td>\n",
	" <td>2.0</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>2.0</td>\n",
	" <td>since my baby was born, she had the difficulty...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>11977</th>\n",
	" <td>2625</td>\n",
	" <td>Neonate</td>\n",
	" <td>Dar</td>\n",
	" <td>Pneumonia</td>\n",
	" <td>1.0</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>10.0</td>\n",
	" <td>client had no additional point</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>11978 rows × 9 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" newid module site gs_text34 sex age_years \\\n",
	"0 1 Adult Mexico Cirrhosis 1.0 51.0 \n",
	"1 2 Adult AP Epilepsy 1.0 24.0 \n",
	"2 3 Adult AP Pneumonia 2.0 62.0 \n",
	"3 4 Adult Mexico COPD 2.0 80.0 \n",
	"... ... ... ... ... ... ... \n",
	"11974 2621 Neonate Dar Stillbirth 1.0 NaN \n",
	"11975 2622 Neonate UP Stillbirth 1.0 NaN \n",
	"11976 2623 Neonate AP Congenital malformation 2.0 NaN \n",
	"11977 2625 Neonate Dar Pneumonia 1.0 NaN \n",
	"\n",
	" age_months age_days open_response \n",
	"0 NaN NaN no comments.[PERSON] only told us what happene... \n",
	"1 NaN NaN [PATIENT] was suffering for the last two years... \n",
	"2 NaN NaN she has stopped consuming tablets for b.p and ... \n",
	"3 NaN NaN my mother's condition was already very poor du... \n",
	"... ... ... ... \n",
	"11974 NaN 0.0 respondent was satisfied with the service rece... \n",
	"11975 NaN 0.0 lady had labour pain at 9 pm. in the morning s... \n",
	"11976 NaN 2.0 since my baby was born, she had the difficulty... \n",
	"11977 NaN 10.0 client had no additional point \n",
	"\n",
	"[11978 rows x 9 columns]"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# start with a corpus of text, relevant to global health metrics\n",
	"# described here: https://gatesopenresearch.org/articles/2-18/v1\n",
	"\n",
	"df = pd.read_excel('https://osf.io/w87ym/download', sheet_name='data')\n",
	"df"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"It is not like this has not been done before. But I want my own. Here is the first page of Google results for [\"nlp in python n-grem language model\"](https://www.google.com/search?q=nlp+in+python+n-grem+language+model). Should I fix the spelling? Search engines are smart enough to handle that now, right?\n",
	"\n",
	"1. https://www.quora.com/Is-there-a-tutorial-on-how-to-train-an-n-gram-language-model-in-Python\n",
	"2. https://github.com/BigFav/n-grams\n",
	"3. https://cs.nyu.edu/courses/fall17/CSCI-UA.0480-006/lecture3-and-half-n-grams.pdf\n",
	"4. https://nlpforhackers.io/language-models/\n",
	"5. http://www.albertauyeung.com/post/generating-ngrams-python/\n",
	"6. https://pdfs.semanticscholar.org/3b46/9baa9bc5662f7702bfe4da0a72716acdbe4f.pdf\n",
	"7. https://www.cs.bgu.ac.il/~elhadad/nlp18/hw1.html\n",
	"8. https://stackoverflow.com/questions/13423919/computing-n-grams-using-python\n",
	"9. https://medium.com/@NicolasPapernot/natural-language-processing-bfa888e4e429\n",
	"\n",
	"A few of these do just what I want, and result 4 has some simple code snippets, so I'm going to start with that. (It turns out that result 4 is terrible to copy code from, so perhaps this notebook will still be of some value.)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'3.3'"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"import nltk # seems like this has changed since the code I am following has been written\n",
	"nltk.__version__"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'1.15.2'"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"np.__version__ # I don't expect this to make a difference, but might as well note it, too"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"from nltk import word_tokenize, trigrams\n",
	"from collections import defaultdict\n",
	"\n",
	"model = defaultdict(lambda: defaultdict(lambda: 0))\n",
	"for sentence in df[df.module == 'Adult'].open_response:\n",
	" for w1, w2, w3 in trigrams(word_tokenize(str(sentence)), pad_left=True, pad_right=True):\n",
	" model[w1,w2][w3] += 1\n",
	" \n",
	"# transform the counts to probabilities\n",
	"for w1_w2 in model:\n",
	" total_count = float(sum(model[w1_w2].values()))\n",
	" for w3 in model[w1_w2]:\n",
	" model[w1_w2][w3] /= total_count\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"np.random.seed(12345) # set random seed for reproducibility"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"according to the [ HOSPITAL ] a month , we would like for not helping her to go see her son .\n"
	]
	}
	],
	"source": [
	"text = [None, None]\n",
	" \n",
	"sentence_finished = False\n",
	" \n",
	"while not sentence_finished and len(text) < 100:\n",
	" r = np.random.rand()\n",
	" accumulator = .0\n",
	" \n",
	" for word in model[tuple(text[-2:])].keys():\n",
	" accumulator += model[tuple(text[-2:])][word]\n",
	" \n",
	" if accumulator >= r:\n",
	" text.append(word)\n",
	" break\n",
	" \n",
	" if text[-2:] == [None, None]:\n",
	" sentence_finished = True\n",
	" \n",
	"print(' '.join([t for t in text if t]))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Now what is the most reasonable way to use this in my data generation simulation?"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python (vivarium)",
	"language": "python",
	"name": "vivarium"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.6"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}