Skip to content

Instantly share code, notes, and snippets.

@zer0n
Created March 1, 2017 01:22
Show Gist options
  • Save zer0n/11944100389002874b20091b34cd2bc3 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# Text Classification using 1D Convolution"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Boiler-plate code"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using Theano backend.\n"
]
}
],
"source": [
"# Make print() behave as a function on the Python 2 kernel; without this,\n",
"# print('a', b) renders a tuple repr such as (25000, 'train sequences')\n",
"# (visible in the recorded outputs of the data-loading cell below).\n",
"from __future__ import print_function\n",
"\n",
"import numpy as np\n",
"\n",
"from keras.preprocessing import sequence\n",
"from keras.models import Sequential\n",
"from keras.layers import Dense, Dropout, Activation\n",
"from keras.layers import Embedding\n",
"from keras.layers import Convolution1D\n",
"from keras.layers.pooling import GlobalMaxPooling1D\n",
"from keras.datasets import imdb"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Hyper-parameters and data-shape constants used throughout the notebook.\n",
"max_features = 5000    # vocabulary size: keep only the most frequent words\n",
"maxlen = 400           # pad/truncate every review to this many tokens\n",
"batch_size = 32        # mini-batch size for training\n",
"embedding_dims = 50    # dimensionality of the word-embedding space\n",
"nb_filter = 250        # number of 1D convolution filters\n",
"filter_length = 3      # width (in words) of each convolution filter\n",
"hidden_dims = 250      # units in the fully-connected hidden layer\n",
"nb_epoch = 2           # number of training epochs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Data Prep"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading data...\n",
"25000 train sequences\n",
"25000 test sequences\n",
"Pad sequences (samples x time)\n",
"X_train shape: (25000, 400)\n",
"X_test shape: (25000, 400)\n"
]
}
],
"source": [
"# Load the IMDB sentiment data set (reviews pre-tokenised as word indices),\n",
"# restricted to the max_features most frequent words.\n",
"print('Loading data...')\n",
"(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)\n",
"# Pre-formatted single strings print correctly on both Python 2 and 3;\n",
"# the bare print(a, b) form rendered a tuple repr on the Python 2 kernel.\n",
"print('{0} train sequences'.format(len(X_train)))\n",
"print('{0} test sequences'.format(len(X_test)))\n",
"\n",
"# Pad (or truncate) every review to exactly maxlen tokens so each split\n",
"# becomes a dense (samples x time) matrix.\n",
"print('Pad sequences (samples x time)')\n",
"X_train = sequence.pad_sequences(X_train, maxlen=maxlen)\n",
"X_test = sequence.pad_sequences(X_test, maxlen=maxlen)\n",
"print('X_train shape: {0}'.format(X_train.shape))\n",
"print('X_test shape: {0}'.format(X_test.shape))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model Definition"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Keep this cell's import at the top rather than buried after the model code.\n",
"from keras.utils.visualize_util import plot\n",
"\n",
"model = Sequential()\n",
"\n",
"# Embedding layer: maps vocab indices (ints < max_features) into a dense\n",
"# embedding_dims-dimensional space; dropout regularises the embeddings.\n",
"model.add(Embedding(max_features,\n",
" embedding_dims,\n",
" input_length=maxlen,\n",
" dropout=0.2))\n",
"\n",
"# 1D convolution: learns nb_filter word-group detectors, each spanning\n",
"# filter_length consecutive words (no padding, stride 1).\n",
"model.add(Convolution1D(nb_filter=nb_filter,\n",
" filter_length=filter_length,\n",
" border_mode='valid',\n",
" activation='relu',\n",
" subsample_length=1))\n",
"\n",
"# Global max pooling keeps the strongest response of each filter,\n",
"# collapsing the time axis to a fixed-size vector.\n",
"model.add(GlobalMaxPooling1D())\n",
"\n",
"# A vanilla fully-connected hidden layer with dropout.\n",
"model.add(Dense(hidden_dims))\n",
"model.add(Dropout(0.2))\n",
"model.add(Activation('relu'))\n",
"\n",
"# Project onto a single output unit and squash with a sigmoid for\n",
"# binary sentiment classification.\n",
"model.add(Dense(1))\n",
"model.add(Activation('sigmoid'))\n",
"\n",
"# Save an architecture diagram (rendered in the next markdown cell).\n",
"plot(model, to_file='model_text_classification.png')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Visualize model architecture\n",
"<img src='model_text_classification.png'>\n",
"\n",
"## Train the model"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train on 25000 samples, validate on 25000 samples\n",
"Epoch 1/2\n",
"25000/25000 [==============================] - 136s - loss: 0.4330 - acc: 0.7868 - val_loss: 0.2953 - val_acc: 0.8807\n",
"Epoch 2/2\n",
"25000/25000 [==============================] - 142s - loss: 0.2911 - acc: 0.8769 - val_loss: 0.2820 - val_acc: 0.8813\n"
]
}
],
"source": [
"# Binary cross-entropy with Adam is the standard pairing for a\n",
"# single-sigmoid output; track accuracy during training.\n",
"model.compile(loss='binary_crossentropy',\n",
" optimizer='adam',\n",
" metrics=['accuracy'])\n",
"# Evaluate on the held-out test split after every epoch; the trailing\n",
"# semicolon suppresses the History object's repr.\n",
"model.fit(X_train, y_train,\n",
" batch_size=batch_size,\n",
" nb_epoch=nb_epoch,\n",
" validation_data=(X_test, y_test));"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment