Last active
June 14, 2016 05:40
-
-
Save jaskiratr/d677975fc4696b9494db2fe23c4d352b to your computer and use it in GitHub Desktop.
ML Spring 2016: Classification Task
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 98, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import matplotlib.pyplot as plt\n", | |
"from sklearn.preprocessing import Imputer\n", | |
"%matplotlib inline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 99, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(455000, 13)" | |
] | |
}, | |
"execution_count": 99, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data = pd.read_csv('../Data/Amazon.csv')\n", | |
"# data = data[0:1000]\n", | |
"data.shape\n", | |
"# data.head(5)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 100, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(455000,)" | |
] | |
}, | |
"execution_count": 100, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# define y\n", | |
"y = data.iloc[:, 12].values\n", | |
"y.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 101, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(455000, 131072)" | |
] | |
}, | |
"execution_count": 101, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#bag of words on text // Replace with hashingvectorizer\n", | |
"# vectorize Bag of Words from review text; as sparse matrix\n", | |
"from sklearn.feature_extraction.text import HashingVectorizer\n", | |
"hv = HashingVectorizer(n_features=2 ** 17, non_negative=True)\n", | |
"XText = hv.transform(data.Text)\n", | |
"XText.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 102, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(455000, 131072)" | |
] | |
}, | |
"execution_count": 102, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#bag of words on summary\n", | |
"# creating newSummary with \"no summary given\" where summary does not exist.\n", | |
"# data['SummaryFill'] = np.where(pd.isnull(data['Summary']) == True, 'no summary given', data['Summary'])\n", | |
"\n", | |
"data['Summary'].fillna('null', inplace=True)\n", | |
"XSummary = hv.transform(data.Summary)\n", | |
"XSummary.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 140, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Ngrams\n", | |
"from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
"# nVec = TfidfVectorizer(ngram_range=(3,3))\n", | |
"nVec = TfidfVectorizer(analyzer='word', ngram_range=(2, 3), min_df=1)\n", | |
"Xngram = nVec.fit_transform(data.Text)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 141, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"scipy.sparse.csr.csr_matrix" | |
] | |
}, | |
"execution_count": 141, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"type(Xngram)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 142, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# features from Amazon.csv to add to feature set\n", | |
"# XScore = data.iloc[:, 7].values.reshape(data.shape[0], 1)\n", | |
"\n", | |
"# data['reviewLen'] = data['Text'].str.len()\n", | |
"# data['summaryLen'] = data['Summary'].str.len()\n", | |
"# data['usernameWords'] = data ['ProfileName'].str.split().str.len()\n", | |
"# data['newLines'] = data['Text'].str.split('\\n').str.len()\n", | |
"\n", | |
"# data['hasSC'] = data['Text'].str.count(';')\n", | |
"# data['hasEX'] = data['Text'].str.count('!')\n", | |
"# data['hasQ'] = data['Text'].str.count('\\?')\n", | |
"# data['punctCount'] = data['Text'].str.count('[.,!;:()/\\?-]')\n", | |
"# data['punctToWords'] = data['punctCount'] / data['reviewLen']\n", | |
"# data['avWordLength'] = data['Text'].str.len() // (data['Text'].str.count(' ') + 1)\n", | |
"# data['sumLen'] = data['Summary'].str.count('\\S')\n", | |
"# data['avSumWordLen'] = data['Summary'].str.len() // (data['Summary'].str.count(' ') + 1)\n", | |
"# data['sumHasEX'] = data['Summary'].str.count('!')\n", | |
"# data['sumHasQ'] = data['Summary'].str.count('\\?')\n", | |
"\n", | |
"# XreviewLen = data.iloc[:, 13].values.reshape(data.shape[0], 1)\n", | |
"# XsummaryLen = data.iloc[:, 14].values.reshape(data.shape[0], 1)\n", | |
"# XusernameWords = data.iloc[:, 15].values.reshape(data.shape[0], 1)\n", | |
"# XexclamationNumber = data.iloc[:, 16].values.reshape(data.shape[0], 1)\n", | |
"# XnewLines = data.iloc[:, 17].values.reshape(data.shape[0], 1)\n", | |
"# XhasSC = data.iloc[:, 15].values.reshape(data.shape[0], 1)\n", | |
"# XhasEX = data.iloc[:, 16].values.reshape(data.shape[0], 1)\n", | |
"# XhasQ = data.iloc[:, 17].values.reshape(data.shape[0], 1)\n", | |
"# XpunctCount = data.iloc[:, 18].values.reshape(data.shape[0], 1)\n", | |
"# XpunctToWords = data.iloc[:, 19].values.reshape(data.shape[0], 1)\n", | |
"# XavWordLength = data.iloc[:, 20].values.reshape(data.shape[0], 1)\n", | |
"# XsumLen = data.iloc[:, 21].values.reshape(data.shape[0], 1)\n", | |
"# XavSumWordLen = data.iloc[:, 22].values.reshape(data.shape[0], 1)\n", | |
"# XsumHasEX = data.iloc[:, 23].values.reshape(data.shape[0], 1)\n", | |
"# XsumHasQ = data.iloc[:, 24].values.reshape(data.shape[0], 1)\n", | |
"\n", | |
"# Xtoadd = np.concatenate((XScore,XreviewLen,XsummaryLen,XusernameWords,XnewLines,XhasSC,XhasEX,XhasQ,XpunctCount,XpunctToWords,XsumLen,XavSumWordLen,XsumHasEX,XsumHasQ), axis=1)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 143, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# vectorize Bag of Words from Summary text; as sparse matrix\n", | |
"from sklearn.feature_extraction.text import HashingVectorizer\n", | |
"hv = HashingVectorizer(n_features=2 ** 17, non_negative=True)\n", | |
"XhashText = hv.transform(data.Text)\n", | |
"XhashSummary = hv.transform(data.Summary)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 144, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# convert to CSR\n", | |
"from scipy.sparse import csr_matrix, hstack\n", | |
"XtoaddSparse = csr_matrix(Xtoadd)\n", | |
"# Xfinal = hstack([ Xtoadd, Xngram, XtoaddSparse,XhashText,XhashSummary,XText,XSummary])\n", | |
"Xfinal = hstack([ Xngram, XtoaddSparse,XhashText,XhashSummary,XText,XSummary])\n", | |
"X = csr_matrix(Xfinal)\n", | |
"imp = Imputer(missing_values='NaN', strategy='mean', axis=0)\n", | |
"X = imp.fit_transform(X)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 145, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.cross_validation import train_test_split\n", | |
"\n", | |
"X_train, X_test, y_train, y_test = train_test_split(\n", | |
" X, y, test_size=0.3, random_state=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 146, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# report on training and test sets\n", | |
"def print_results():\n", | |
" print('Error rate on training set: ')\n", | |
" print((y_train != y_pred).sum() / X_train.shape[0])\n", | |
" print('Accuracy rate on training set: ')\n", | |
" print(1 - (y_train != y_pred).sum() / X_train.shape[0])\n", | |
" print('True positive rate on training set:')\n", | |
" print(((y_train==True) & (y_pred==True)).sum() / y_train.sum())\n", | |
" print('**************')\n", | |
" print('Error rate on test set: ')\n", | |
" print((y_test != y_pred_test).sum() / X_test.shape[0])\n", | |
" print('Accuracy rate on test set: ')\n", | |
" print(1 - (y_test != y_pred_test).sum() / X_test.shape[0])\n", | |
" print('True positive rate on test set')\n", | |
" print(((y_test==True) & (y_pred_test==True)).sum() / y_test.sum())\n", | |
" print('True negative rate on test set')\n", | |
" print(((y_test==False) & (y_pred_test==False)).sum() / (y_test.shape[0] - y_test.sum()))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 147, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# feature scaling\n", | |
"from sklearn.preprocessing import StandardScaler\n", | |
"sc = StandardScaler(with_mean=False)\n", | |
"sc.fit(X_train)\n", | |
"X_train_std = sc.transform(X_train)\n", | |
"X_test_std = sc.transform(X_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 148, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Error rate on training set: \n", | |
"0.000835164835165\n", | |
"Accuracy rate on training set: \n", | |
"0.999164835165\n", | |
"True positive rate on training set:\n", | |
"0.993858707322\n", | |
"**************\n", | |
"Error rate on test set: \n", | |
"0.134490842491\n", | |
"Accuracy rate on test set: \n", | |
"0.865509157509\n", | |
"True positive rate on test set\n", | |
"0.446834170854\n", | |
"True negative rate on test set\n", | |
"0.898427499012\n" | |
] | |
} | |
], | |
"source": [ | |
"# Perceptron\n", | |
"from sklearn import linear_model\n", | |
"clf = linear_model.SGDClassifier(loss='squared_hinge')\n", | |
"clf.fit(X_train_std, y_train)\n", | |
"y_pred = clf.fit(X_train_std, y_train).predict(X_train_std)\n", | |
"y_pred_test = clf.predict(X_test_std)\n", | |
"print_results()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 149, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# from sklearn.svm import SVC\n", | |
"# svm = SVC(kernel='linear', C=1.0, random_state=0)\n", | |
"# svm.fit(X_train, y_train)\n", | |
"# y_pred = svm.predict(X_train)\n", | |
"# y_pred_test = svm.predict(X_test)\n", | |
"# print_results()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment