zevisert/mlp.ipynb

## mlp.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import html\n",
    "import xmltodict\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "from sklearn.preprocessing import MaxAbsScaler\n",
    "from sklearn.metrics import classification_report, confusion_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Markers substituted for the <head> and </head> tags; set to None to leave the tags in place\n",
    "headTags = (\"000\", \"000\")\n",
    "\n",
    "# Write a copy of the corpus with HTML entities unescaped, wrapped in one <EnglishLS> root element\n",
    "with open(\"./EnglishLS.train\") as source, open(\"./EnglishLS.train.decoded\", \"w\") as sink:\n",
    "    sink.write(\"<EnglishLS>\")\n",
    "    for raw_line in source:\n",
    "        text = html.unescape(raw_line)\n",
    "        if headTags is not None:\n",
    "            text = text.replace(\"<head>\", headTags[0]).replace(\"</head>\", headTags[1])\n",
    "        sink.write(text)\n",
    "    sink.write(\"</EnglishLS>\")\n",
    "\n",
    "# Parse the decoded copy into nested dictionaries\n",
    "with open('./EnglishLS.train.decoded') as decoded_copy:\n",
    "    trainxml = xmltodict.parse(decoded_copy.read())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Select the lexelt whose item is \"appear.v\" (the last one, if several match)\n",
    "lexelts = trainxml['EnglishLS']['lexelt']\n",
    "appear_matches = [entry for entry in lexelts if entry['@item'] == \"appear.v\"]\n",
    "appear = appear_matches[-1]\n",
    "instances = appear['instance']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Tabulate the instances; later cells read the `context` and `answer` columns\n",
    "train = pd.DataFrame(data=instances)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Some rows hold a list of answers; keep only the first answer for those rows\n",
    "mask = train['answer'].apply(lambda answer: isinstance(answer, list))\n",
    "first_answers = train.loc[mask, 'answer'].apply(lambda answers: answers[0])\n",
    "train.loc[mask, 'answer'] = first_answers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Token counts over each instance's context, with English stop words removed\n",
    "X_text = train['context'].values\n",
    "vectorizer_options = dict(\n",
    "    input='content',\n",
    "    stop_words='english',\n",
    "    max_df=1.0,\n",
    "    min_df=1,\n",
    "    binary=False,\n",
    ")\n",
    "vectorizer = CountVectorizer(**vectorizer_options)\n",
    "\n",
    "X = vectorizer.fit_transform(X_text)\n",
    "# Labels: the sense id stored on each row's answer\n",
    "y = train['answer'].apply(lambda answer: answer['@senseid']).values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Fixed seed so the same rows land in the train and test splits on every run.\n",
    "# Saved outputs in later cells come from an earlier, unseeded run; re-run the notebook to refresh them.\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MaxAbsScaler(copy=True)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Learn the scaling from the training split only, so the test split does not influence it\n",
    "scaler = MaxAbsScaler().fit(X_train)\n",
    "# Display the fitted scaler as the cell result\n",
    "scaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Apply the scaling learned from the training split to both splits.\n",
    "# The names are rebound, so running this cell a second time would scale the data again.\n",
    "X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,\n",
       "       beta_2=0.999, early_stopping=False, epsilon=1e-08,\n",
       "       hidden_layer_sizes=(1000, 500, 30), learning_rate='constant',\n",
       "       learning_rate_init=0.001, max_iter=200, momentum=0.9,\n",
       "       nesterovs_momentum=True, power_t=0.5, random_state=None,\n",
       "       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,\n",
       "       verbose=False, warm_start=False)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# random_state pins the weight initialisation and batch shuffling so repeated runs train the same model.\n",
    "# The saved output below was produced before the seed was added; re-run the cell to refresh it.\n",
    "mlp = MLPClassifier(\n",
    "    hidden_layer_sizes=(1000, 500, 30),\n",
    "    random_state=0\n",
    ")\n",
    "\n",
    "mlp.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Predicted sense for every row of the held-out split\n",
    "predictions = mlp.predict(X=X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train set size: (198, 6263)\n",
      "Test set size: (67, 6263)\n"
     ]
    }
   ],
   "source": [
    "# Report the (rows, features) shape of each split\n",
    "size_report = \"Train set size: {}\\nTest set size: {}\"\n",
    "print(size_report.format(X_train.shape, X_test.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 7 15  2]\n",
      " [ 2 32  3]\n",
      " [ 0  3  3]]\n"
     ]
    }
   ],
   "source": [
    "# Rows are the true senses, columns the predicted senses\n",
    "matrix = confusion_matrix(y_true=y_test, y_pred=predictions)\n",
    "print(matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "     190901       0.78      0.29      0.42        24\n",
      "     190902       0.64      0.86      0.74        37\n",
      "     190903       0.38      0.50      0.43         6\n",
      "\n",
      "avg / total       0.67      0.63      0.60        67\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Per-sense precision, recall and F1 on the held-out split\n",
    "report = classification_report(y_true=y_test, y_pred=predictions)\n",
    "print(report)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import html\n",
	"import xmltodict\n",
	"import pandas as pd\n",
	"from sklearn.model_selection import train_test_split\n",
	"from sklearn.feature_extraction.text import CountVectorizer\n",
	"from sklearn.neural_network import MLPClassifier\n",
	"from sklearn.preprocessing import MaxAbsScaler\n",
	"from sklearn.metrics import classification_report, confusion_matrix"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Markers substituted for the <head> and </head> tags; set to None to leave the tags in place\n",
	"headTags = (\"000\", \"000\")\n",
	"\n",
	"# Write a copy of the corpus with HTML entities unescaped, wrapped in one <EnglishLS> root element\n",
	"with open(\"./EnglishLS.train\") as raw:\n",
	"    with open(\"./EnglishLS.train.decoded\", \"w\") as decoded:\n",
	"        decoded.write(\"<EnglishLS>\")\n",
	"        for line in raw:\n",
	"            line = html.unescape(line)\n",
	"            if headTags is not None:\n",
	"                line = line.replace(\"<head>\", headTags[0]).replace(\"</head>\", headTags[1])\n",
	"            decoded.write(line)\n",
	"        decoded.write(\"</EnglishLS>\")\n",
	"\n",
	"# Parse the decoded copy into nested dictionaries\n",
	"with open('./EnglishLS.train.decoded') as cleaned:\n",
	"    trainxml = xmltodict.parse(cleaned.read())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Select the lexelt whose item is \"appear.v\" (the last one, if several match)\n",
	"lexelts = trainxml['EnglishLS']['lexelt']\n",
	"appear_matches = [entry for entry in lexelts if entry['@item'] == \"appear.v\"]\n",
	"appear = appear_matches[-1]\n",
	"instances = appear['instance']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Tabulate the instances; later cells read the `context` and `answer` columns\n",
	"train = pd.DataFrame(data=instances)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Some rows hold a list of answers; keep only the first answer for those rows\n",
	"mask = train['answer'].apply(lambda answer: isinstance(answer, list))\n",
	"first_answers = train.loc[mask, 'answer'].apply(lambda answers: answers[0])\n",
	"train.loc[mask, 'answer'] = first_answers"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Token counts over each instance's context, with English stop words removed\n",
	"X_text = train['context'].values\n",
	"vectorizer_options = dict(\n",
	"    input='content',\n",
	"    stop_words='english',\n",
	"    max_df=1.0,\n",
	"    min_df=1,\n",
	"    binary=False,\n",
	")\n",
	"vectorizer = CountVectorizer(**vectorizer_options)\n",
	"\n",
	"X = vectorizer.fit_transform(X_text)\n",
	"# Labels: the sense id stored on each row's answer\n",
	"y = train['answer'].apply(lambda answer: answer['@senseid']).values"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Fixed seed so the same rows land in the train and test splits on every run.\n",
	"# Saved outputs in later cells come from an earlier, unseeded run; re-run the notebook to refresh them.\n",
	"X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"MaxAbsScaler(copy=True)"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Learn the scaling from the training split only, so the test split does not influence it\n",
	"scaler = MaxAbsScaler().fit(X_train)\n",
	"# Display the fitted scaler as the cell result\n",
	"scaler"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Apply the scaling learned from the training split to both splits.\n",
	"# The names are rebound, so running this cell a second time would scale the data again.\n",
	"X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,\n",
	" beta_2=0.999, early_stopping=False, epsilon=1e-08,\n",
	" hidden_layer_sizes=(1000, 500, 30), learning_rate='constant',\n",
	" learning_rate_init=0.001, max_iter=200, momentum=0.9,\n",
	" nesterovs_momentum=True, power_t=0.5, random_state=None,\n",
	" shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,\n",
	" verbose=False, warm_start=False)"
	]
	},
	"execution_count": 11,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# random_state pins the weight initialisation and batch shuffling so repeated runs train the same model.\n",
	"# The saved output below was produced before the seed was added; re-run the cell to refresh it.\n",
	"mlp = MLPClassifier(\n",
	"    hidden_layer_sizes=(1000, 500, 30),\n",
	"    random_state=0\n",
	")\n",
	"\n",
	"mlp.fit(X_train, y_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Predicted sense for every row of the held-out split\n",
	"predictions = mlp.predict(X=X_test)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Train set size: (198, 6263)\n",
	"Test set size: (67, 6263)\n"
	]
	}
	],
	"source": [
	"# Report the (rows, features) shape of each split\n",
	"size_report = \"Train set size: {}\\nTest set size: {}\"\n",
	"print(size_report.format(X_train.shape, X_test.shape))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[[ 7 15 2]\n",
	" [ 2 32 3]\n",
	" [ 0 3 3]]\n"
	]
	}
	],
	"source": [
	"# Rows are the true senses, columns the predicted senses\n",
	"matrix = confusion_matrix(y_true=y_test, y_pred=predictions)\n",
	"print(matrix)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" precision recall f1-score support\n",
	"\n",
	" 190901 0.78 0.29 0.42 24\n",
	" 190902 0.64 0.86 0.74 37\n",
	" 190903 0.38 0.50 0.43 6\n",
	"\n",
	"avg / total 0.67 0.63 0.60 67\n",
	"\n"
	]
	}
	],
	"source": [
	"# Per-sense precision, recall and F1 on the held-out split\n",
	"report = classification_report(y_true=y_test, y_pred=predictions)\n",
	"print(report)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}