huhuhang/IMDB + TFIDF + LogReg.ipynb

## IMDB + TFIDF + LogReg.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.metrics import classification_report\n",
    "from keras.datasets import imdb\n",
    "\n",
    "import json\n",
    "import numpy\n",
    "import nltk"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dictionaries\n",
    "\n",
    "Create an index -> word mapping for each word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# imdb.get_word_index() returns a word -> index dict. Invert it into a list\n",
    "# so that position k holds the word whose index is k; unused slots stay None.\n",
    "word_to_index = imdb.get_word_index()\n",
    "vocab_slots = max(word_to_index.values()) + 1\n",
    "index_to_word = [None] * vocab_slots\n",
    "for word, word_index in word_to_index.items():\n",
    "    index_to_word[word_index] = word"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Review reconstruction\n",
    "\n",
    "Translate the lists of word-indices to strings using the aforementioned dictionaries."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# imdb.load_data() does not emit raw word indices: 1 marks the start of a\n",
    "# review, 2 marks an out-of-vocabulary word, and every real word index is\n",
    "# shifted up by `index_from`. Undo that shift before the lookup, otherwise\n",
    "# each token decodes to the wrong word.\n",
    "INDEX_FROM = 3\n",
    "\n",
    "\n",
    "def decode_review(token_ids):\n",
    "    \"\"\"Return the space-joined words of one encoded review.\n",
    "\n",
    "    Reserved marker tokens and ids without a vocabulary entry are skipped.\n",
    "    \"\"\"\n",
    "    word_ids = (token - INDEX_FROM for token in token_ids)\n",
    "    return ' '.join(\n",
    "        index_to_word[word_id]\n",
    "        for word_id in word_ids\n",
    "        if 0 < word_id < len(index_to_word)\n",
    "    )\n",
    "\n",
    "\n",
    "(X_train, y_train), (X_test, y_test) = imdb.load_data(index_from=INDEX_FROM)\n",
    "X_train = [decode_review(review) for review in X_train]\n",
    "X_test = [decode_review(review) for review in X_test]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TFIDF + logistic regression pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       "        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
       "        lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
       "        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,\n",
       " ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,\n",
       "          verbose=0, warm_start=False))])"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Unigram + bigram TF-IDF features feeding a logistic-regression classifier.\n",
    "pipeline_steps = [\n",
    "    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),\n",
    "    ('log', LogisticRegression()),\n",
    "]\n",
    "model = Pipeline(steps=pipeline_steps)\n",
    "\n",
    "model.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.88      0.90      0.89     12280\n",
      "          1       0.90      0.88      0.89     12720\n",
      "\n",
      "avg / total       0.89      0.89      0.89     25000\n",
      "\n",
      "0.88904\n"
     ]
    }
   ],
   "source": [
    "y_pred = model.predict(X_test)\n",
    "# classification_report expects (y_true, y_pred). Passing them reversed\n",
    "# swaps precision with recall and reports support for the predictions\n",
    "# instead of the true labels.\n",
    "print(classification_report(y_test, y_pred))\n",
    "# Overall accuracy: fraction of test reviews whose label was predicted correctly.\n",
    "print((y_pred == y_test).mean())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "IPython (Python 3)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"from sklearn.linear_model import LogisticRegression\n",
	"from sklearn.feature_extraction.text import TfidfVectorizer\n",
	"from sklearn.pipeline import Pipeline\n",
	"from sklearn.metrics import classification_report\n",
	"from keras.datasets import imdb\n",
	"\n",
	"import json\n",
	"import numpy\n",
	"import nltk"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Dictionaries\n",
	"\n",
	"Create an index -> word mapping for each word"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# imdb.get_word_index() returns a word -> index dict. Invert it into a list\n",
	"# so that position k holds the word whose index is k; unused slots stay None.\n",
	"word_to_index = imdb.get_word_index()\n",
	"vocab_slots = max(word_to_index.values()) + 1\n",
	"index_to_word = [None] * vocab_slots\n",
	"for word, word_index in word_to_index.items():\n",
	"    index_to_word[word_index] = word"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Review reconstruction\n",
	"\n",
	"Translate the lists of word-indices to strings using the aforementioned dictionaries."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# imdb.load_data() does not emit raw word indices: 1 marks the start of a\n",
	"# review, 2 marks an out-of-vocabulary word, and every real word index is\n",
	"# shifted up by `index_from`. Undo that shift before the lookup, otherwise\n",
	"# each token decodes to the wrong word.\n",
	"INDEX_FROM = 3\n",
	"\n",
	"\n",
	"def decode_review(token_ids):\n",
	"    \"\"\"Return the space-joined words of one encoded review.\n",
	"\n",
	"    Reserved marker tokens and ids without a vocabulary entry are skipped.\n",
	"    \"\"\"\n",
	"    word_ids = (token - INDEX_FROM for token in token_ids)\n",
	"    return ' '.join(\n",
	"        index_to_word[word_id]\n",
	"        for word_id in word_ids\n",
	"        if 0 < word_id < len(index_to_word)\n",
	"    )\n",
	"\n",
	"\n",
	"(X_train, y_train), (X_test, y_test) = imdb.load_data(index_from=INDEX_FROM)\n",
	"X_train = [decode_review(review) for review in X_train]\n",
	"X_test = [decode_review(review) for review in X_test]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# TFIDF + logistic regression pipeline"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
	" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
	" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
	" ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,\n",
	" ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,\n",
	" verbose=0, warm_start=False))])"
	]
	},
	"execution_count": 15,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Unigram + bigram TF-IDF features feeding a logistic-regression classifier.\n",
	"pipeline_steps = [\n",
	"    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),\n",
	"    ('log', LogisticRegression()),\n",
	"]\n",
	"model = Pipeline(steps=pipeline_steps)\n",
	"\n",
	"model.fit(X_train, y_train)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Results"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" precision recall f1-score support\n",
	"\n",
	" 0 0.88 0.90 0.89 12280\n",
	" 1 0.90 0.88 0.89 12720\n",
	"\n",
	"avg / total 0.89 0.89 0.89 25000\n",
	"\n",
	"0.88904\n"
	]
	}
	],
	"source": [
	"y_pred = model.predict(X_test)\n",
	"# classification_report expects (y_true, y_pred). Passing them reversed\n",
	"# swaps precision with recall and reports support for the predictions\n",
	"# instead of the true labels.\n",
	"print(classification_report(y_test, y_pred))\n",
	"# Overall accuracy: fraction of test reviews whose label was predicted correctly.\n",
	"print((y_pred == y_test).mean())"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "IPython (Python 3)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 1
	}