Skip to content

Instantly share code, notes, and snippets.

@Orbifold
Created January 27, 2018 06:43
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Orbifold/9fda1691dd4a6856c856c839a9b6d193 to your computer and use it in GitHub Desktop.
Save Orbifold/9fda1691dd4a6856c856c839a9b6d193 to your computer and use it in GitHub Desktop.
Mapping language to intents
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Language understanding, take one\n",
"\n",
"Part of [an article](http://www.orbifold.net/default/2017/01/10/language-understanding-1/) on Orbifold. \n",
"Purpose is to map multilingual input to intends."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
},
"outputs": [],
"source": [
"import keras\n",
"from keras.models import Sequential\n",
"from keras.layers.embeddings import Embedding\n",
"from keras.layers.recurrent import SimpleRNN\n",
"from keras.layers.core import Dense, Dropout, Flatten\n",
"from keras.layers.wrappers import TimeDistributed\n",
"from keras.layers import Convolution1D, LSTM\n",
"from keras.models import Sequential\n",
"from keras.layers import Dense, Dropout, Activation\n",
"from keras.utils import np_utils\n",
"from keras.preprocessing.text import Tokenizer\n",
"import itertools\n",
"import numpy as np\n",
"from keras.utils.np_utils import to_categorical\n",
"\n",
"train = [\"What would it cost to travel to the city on Monday?\",\n",
" \"Need to travel this afternoon\",\n",
" \"I want to buy a ticket\",\n",
" \"Can I order a trip?\", \n",
" \"I would like to buy a ticket to Brussels\", \n",
"\n",
" \"What will be the weather tomorrow?\",\n",
" \"Will it rain this afternoon?\",\n",
" \"The sunshine feels great\",\n",
" \"Can you predict rain?\",\n",
" \"Guess I should wear a jacket hey!\",\n",
"\n",
" \"Dit is geheel iets anders\",\n",
" \"Kan ik dit goed vinden\",\n",
" \"Wat is dit soms goed\",\n",
" \"Maar anders is soms goed\"]\n",
"\n",
"T = \"Buy a train ticket\"\n",
"W = \"Asking about the weather\"\n",
"F = \"Babble in 't Vlaamsch\"\n",
"labelsTrain = [T,\n",
" T,\n",
" T,\n",
" T,\n",
" T,\n",
"\n",
" W,\n",
" W,\n",
" W,\n",
" W,\n",
" W,\n",
"\n",
" F,\n",
" F,\n",
" F,\n",
" F]\n",
"\n",
"test = [\n",
" \"Do you think it will be sunny tomorrow?\",\n",
" \"What a wonderful feeling in the sun!\",\n",
" \"How can I travel to Leuven?\",\n",
" \"Can I buy it from you?\",\n",
" \"Anders is heel goed\"\n",
" ]\n",
"labelsTest = [W, W, T, T, F]\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/swa/conda/lib/python3.6/site-packages/ipykernel_launcher.py:20: UserWarning: The `dropout` argument is no longer support in `Embedding`. You can apply a `keras.layers.SpatialDropout1D` layer right after the `Embedding` layer to get the same behavior.\n",
"/Users/swa/conda/lib/python3.6/site-packages/keras/models.py:944: UserWarning: The `nb_epoch` argument in `fit` has been renamed `epochs`.\n",
" warnings.warn('The `nb_epoch` argument in `fit` '\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"acc: 80.00%\n"
]
}
],
"source": [
"tokenizer = Tokenizer()\n",
"all_texts = train + test\n",
"tokenizer.fit_on_texts(all_texts)\n",
"# print(tokenizer.word_index)\n",
"\n",
"X_train = tokenizer.texts_to_matrix(train)\n",
"X_test = tokenizer.texts_to_matrix(test)\n",
"\n",
"all_labels = labelsTest + labelsTrain\n",
"labels = set(all_labels)\n",
"idx2labels = list(labels)\n",
"label2idx = dict((v, i) for i, v in enumerate(labels))\n",
"\n",
"y_train = to_categorical([label2idx[w] for w in labelsTrain])\n",
"y_test = to_categorical([label2idx[w] for w in labelsTest])\n",
"\n",
"vocab_size = len(tokenizer.word_index) + 1\n",
"\n",
"model = Sequential()\n",
"model.add(Embedding(2, 45, input_length= X_train.shape[1], dropout=0.2 ))\n",
"model.add(Flatten())\n",
"model.add(Dense(50, name='middle'))\n",
"model.add(Dropout(0.2))\n",
"model.add(Dense(3, activation='softmax', name='output')) \n",
"\n",
"model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])\n",
"\n",
"model.fit(X_train, y=y_train, nb_epoch=1500, verbose=0, validation_split=0.2, shuffle=True)\n",
"\n",
"scores = model.evaluate(X_test, y_test, verbose=0)\n",
"print(\"%s: %.2f%%\" % (model.metrics_names[1], scores[1]*100))\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0., 1., 0.]], dtype=float32)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.predict(tokenizer.texts_to_matrix([\"Welke dag is het vandaag?\"])).round()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
},
"outputs": [
{
"data": {
"text/plain": [
"[\"Babble in 't Vlaamsch\", 'Asking about the weather', 'Buy a train ticket']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"idx2labels"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Glove embedding\n",
"\nYou can download various [pretrained models from the GloVe website](https://nlp.stanford.edu/projects/glove/)."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded 400000 word vectors.\n"
]
}
],
"source": [
"embeddings_index = {}\n",
"glove_data = '~/Desktop/LargeFiles/glove/glove.6B.50d.txt'\n",
"with open(glove_data, encoding='UTF-8') as f:\n",
" for line in f:\n",
" values = line.split()\n",
" word = values[0]\n",
" value = np.asarray(values[1:], dtype='float32')\n",
" embeddings_index[word] = value\n",
"print('Loaded %s word vectors.' % len(embeddings_index))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/swa/conda/lib/python3.6/site-packages/keras/models.py:944: UserWarning: The `nb_epoch` argument in `fit` has been renamed `epochs`.\n",
" warnings.warn('The `nb_epoch` argument in `fit` '\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"acc: 40.00%\n"
]
}
],
"source": [
"embedding_dimension = 50\n",
"word_index = tokenizer.word_index\n",
"embedding_matrix = np.zeros((len(word_index) + 1, embedding_dimension))\n",
"\n",
"for word, i in word_index.items():\n",
" embedding_vector = embeddings_index.get(word)\n",
" if embedding_vector is not None:\n",
" # words not found in embedding index will be all-zeros.\n",
" embedding_matrix[i] = embedding_vector[:embedding_dimension]\n",
"\n",
"embedding_layer = Embedding(embedding_matrix.shape[0],\n",
" embedding_matrix.shape[1],\n",
" weights=[embedding_matrix],\n",
" name='W2V_embedding',\n",
" input_length=len(word_index) + 1)\n",
"\n",
"from keras.preprocessing.sequence import pad_sequences\n",
"X_train = tokenizer.texts_to_sequences(train)\n",
"X_train = pad_sequences(X_train, maxlen=len(word_index) + 1)\n",
"\n",
"model = Sequential()\n",
"model.add(embedding_layer)\n",
"model.add(Flatten())\n",
"model.add(Dense(50, activation='sigmoid', name='middle_layer'))\n",
"model.layers[0].trainable=False # bug in Keras or Theano\n",
"model.add(Dropout(0.2))\n",
"model.add(Dense(3, activation='softmax', name='output')) \n",
"\n",
"model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])\n",
"\n",
"model.fit(X_train, y=y_train, nb_epoch=2500, verbose=0, validation_split=0.2, shuffle=True)\n",
"\n",
"scores = model.evaluate(X_test, y_test, verbose=0)\n",
"print(\"%s: %.2f%%\" % (model.metrics_names[1], scores[1]*100))\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"With LSTM you do not get better results."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/swa/conda/lib/python3.6/site-packages/ipykernel_launcher.py:3: UserWarning: Update your `LSTM` call to the Keras 2 API: `LSTM(128, dropout=0.2, recurrent_dropout=0.2)`\n",
" This is separate from the ipykernel package so we can avoid doing imports until\n",
"/Users/swa/conda/lib/python3.6/site-packages/keras/models.py:944: UserWarning: The `nb_epoch` argument in `fit` has been renamed `epochs`.\n",
" warnings.warn('The `nb_epoch` argument in `fit` '\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"acc: 40.00%\n"
]
}
],
"source": [
"model = Sequential()\n",
"model.add(embedding_layer)\n",
"model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2)) \n",
"model.add(Dense(3, activation='softmax', name='output')) \n",
"\n",
"model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
"\n",
"model.fit(X_train, y=y_train, nb_epoch=1000, verbose=0, validation_split=0.2, shuffle=True)\n",
"\n",
"scores = model.evaluate(X_test, y_test, verbose=0)\n",
"print(\"%s: %.2f%%\" % (model.metrics_names[1], scores[1]*100))\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"language": "python",
"display_name": "Python 3"
},
"kernel_info": {
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.6.3",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"nteract": {
"version": "0.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment