Skip to content

Instantly share code, notes, and snippets.

@zevisert
Created August 1, 2017 03:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zevisert/53348d6a1606d320dcc7f1727bbcb849 to your computer and use it in GitHub Desktop.
Save zevisert/53348d6a1606d320dcc7f1727bbcb849 to your computer and use it in GitHub Desktop.
How I did NLP assignment 3 Q2 - may be useful for our project as we move into MLPs
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import html\n",
"import xmltodict\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.neural_network import MLPClassifier\n",
"from sklearn.preprocessing import MaxAbsScaler\n",
"from sklearn.metrics import classification_report, confusion_matrix"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"headTags = (\"000\", \"000\")\n",
"\n",
"with open(\"./EnglishLS.train\") as raw:\n",
" with open(\"./EnglishLS.train.decoded\", \"w\") as decoded:\n",
" decoded.write(\"<EnglishLS>\")\n",
" for line in raw:\n",
" line = html.unescape(line)\n",
" if headTags is not None:\n",
" line = line.replace(\"<head>\", headTags[0]).replace(\"</head>\", headTags[1])\n",
" decoded.write(line)\n",
" decoded.write(\"</EnglishLS>\")\n",
"\n",
"with open('./EnglishLS.train.decoded') as cleaned:\n",
" trainxml = xmltodict.parse(cleaned.read())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"appear = [lexelt for lexelt in trainxml['EnglishLS']['lexelt'] if lexelt['@item'] == \"appear.v\"].pop()\n",
"instances = appear['instance']"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train = pd.DataFrame(instances)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# When a row's answer is a list (more than one answer), keep only the first answer\n",
"mask = train.answer.apply(lambda row: isinstance(row, list))\n",
"train.loc[mask, 'answer'] = train.loc[mask, 'answer'].apply(lambda row: row[0])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X_text = train.context.values\n",
"vectorizer = CountVectorizer(\n",
" input='content',\n",
" stop_words='english',\n",
" max_df=1.0,\n",
" min_df=1,\n",
" binary=False\n",
")\n",
"\n",
"X = vectorizer.fit_transform(X_text)\n",
"y = train.answer.apply(lambda row: row['@senseid']).values"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"MaxAbsScaler(copy=True)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scaler = MaxAbsScaler()\n",
"# Fit the scaler on the training split only; both splits are then transformed with it\n",
"scaler.fit(X_train)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X_train = scaler.transform(X_train)\n",
"X_test = scaler.transform(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,\n",
" beta_2=0.999, early_stopping=False, epsilon=1e-08,\n",
" hidden_layer_sizes=(1000, 500, 30), learning_rate='constant',\n",
" learning_rate_init=0.001, max_iter=200, momentum=0.9,\n",
" nesterovs_momentum=True, power_t=0.5, random_state=None,\n",
" shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,\n",
" verbose=False, warm_start=False)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mlp = MLPClassifier(\n",
" hidden_layer_sizes=(1000, 500, 30)\n",
")\n",
"\n",
"mlp.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"predictions = mlp.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train set size: (198, 6263)\n",
"Test set size: (67, 6263)\n"
]
}
],
"source": [
"print(\"Train set size: {}\\nTest set size: {}\".format(X_train.shape, X_test.shape))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 7 15 2]\n",
" [ 2 32 3]\n",
" [ 0 3 3]]\n"
]
}
],
"source": [
"print(confusion_matrix(y_test, predictions))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 190901 0.78 0.29 0.42 24\n",
" 190902 0.64 0.86 0.74 37\n",
" 190903 0.38 0.50 0.43 6\n",
"\n",
"avg / total 0.67 0.63 0.60 67\n",
"\n"
]
}
],
"source": [
"print(classification_report(y_test, predictions))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment