ML Spring 2016: Classification Task
{
"cells": [
{
"cell_type": "code",
"execution_count": 98,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.preprocessing import Imputer\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(455000, 13)"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.read_csv('../Data/Amazon.csv')\n",
"# data = data[0:1000]\n",
"data.shape\n",
"# data.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(455000,)"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# define y\n",
"y = data.iloc[:, 12].values\n",
"y.shape"
]
},
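{
"cell_type": "markdown",
"metadata": {},
"source": [
"Added sanity check (a sketch, not part of the original run): the true-positive/true-negative rates reported further down suggest the classes are imbalanced, so it is worth inspecting the label distribution up front. This assumes column index 12 really is the binary label."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# inspect class balance of y (assumed: column index 12 is the binary label)\n",
"pd.Series(y).value_counts(normalize=True)"
]
},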
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(455000, 131072)"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#bag of words on text // Replace with hashingvectorizer\n",
"# vectorize Bag of Words from review text; as sparse matrix\n",
"from sklearn.feature_extraction.text import HashingVectorizer\n",
"hv = HashingVectorizer(n_features=2 ** 17, non_negative=True)\n",
"XText = hv.transform(data.Text)\n",
"XText.shape"
]
},
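{
"cell_type": "markdown",
"metadata": {},
"source": [
"Illustration (a minimal sketch, separate from the pipeline): the hashing trick maps each token into one of 2**17 columns without ever storing a vocabulary, so memory stays bounded on 455,000 reviews; collisions are possible but rare at this width. `non_negative=True` matches the setting used above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# toy demo of feature hashing on two short documents\n",
"demo = hv.transform(['great product, works well', 'terrible, broke in a week'])\n",
"demo.shape, demo.nnz  # fixed 2**17 columns, only a handful of nonzeros per row"
]
},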
{
"cell_type": "code",
"execution_count": 102,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(455000, 131072)"
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#bag of words on summary\n",
"# creating newSummary with \"no summary given\" where summary does not exist.\n",
"# data['SummaryFill'] = np.where(pd.isnull(data['Summary']) == True, 'no summary given', data['Summary'])\n",
"\n",
"data['Summary'].fillna('null', inplace=True)\n",
"XSummary = hv.transform(data.Summary)\n",
"XSummary.shape"
]
},
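{
"cell_type": "markdown",
"metadata": {},
"source": [
"Added check (a sketch): the `fillna` above must run before hashing, so this confirms no missing summaries remain."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# confirm the fill worked: count of remaining NaN summaries should be 0\n",
"data['Summary'].isnull().sum()"
]
},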
{
"cell_type": "code",
"execution_count": 140,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Ngrams\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"# nVec = TfidfVectorizer(ngram_range=(3,3))\n",
"nVec = TfidfVectorizer(analyzer='word', ngram_range=(2, 3), min_df=1)\n",
"Xngram = nVec.fit_transform(data.Text)\n"
]
},
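{
"cell_type": "markdown",
"metadata": {},
"source": [
"Illustration (a toy example on a made-up sentence, separate from the fitted `nVec` above) of what `ngram_range=(2, 3)` extracts: every contiguous run of two or three words becomes a feature."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# list the bigrams and trigrams extracted from a single toy sentence\n",
"toy = TfidfVectorizer(analyzer='word', ngram_range=(2, 3), min_df=1)\n",
"toy.fit(['this product works great'])\n",
"toy.get_feature_names()"
]
},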
{
"cell_type": "code",
"execution_count": 141,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"scipy.sparse.csr.csr_matrix"
]
},
"execution_count": 141,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(Xngram)"
]
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# features from Amazon.csv to add to feature set\n",
"# XScore = data.iloc[:, 7].values.reshape(data.shape[0], 1)\n",
"\n",
"# data['reviewLen'] = data['Text'].str.len()\n",
"# data['summaryLen'] = data['Summary'].str.len()\n",
"# data['usernameWords'] = data ['ProfileName'].str.split().str.len()\n",
"# data['newLines'] = data['Text'].str.split('\\n').str.len()\n",
"\n",
"# data['hasSC'] = data['Text'].str.count(';')\n",
"# data['hasEX'] = data['Text'].str.count('!')\n",
"# data['hasQ'] = data['Text'].str.count('\\?')\n",
"# data['punctCount'] = data['Text'].str.count('[.,!;:()/\\?-]')\n",
"# data['punctToWords'] = data['punctCount'] / data['reviewLen']\n",
"# data['avWordLength'] = data['Text'].str.len() // (data['Text'].str.count(' ') + 1)\n",
"# data['sumLen'] = data['Summary'].str.count('\\S')\n",
"# data['avSumWordLen'] = data['Summary'].str.len() // (data['Summary'].str.count(' ') + 1)\n",
"# data['sumHasEX'] = data['Summary'].str.count('!')\n",
"# data['sumHasQ'] = data['Summary'].str.count('\\?')\n",
"\n",
"# XreviewLen = data.iloc[:, 13].values.reshape(data.shape[0], 1)\n",
"# XsummaryLen = data.iloc[:, 14].values.reshape(data.shape[0], 1)\n",
"# XusernameWords = data.iloc[:, 15].values.reshape(data.shape[0], 1)\n",
"# XexclamationNumber = data.iloc[:, 16].values.reshape(data.shape[0], 1)\n",
"# XnewLines = data.iloc[:, 17].values.reshape(data.shape[0], 1)\n",
"# XhasSC = data.iloc[:, 15].values.reshape(data.shape[0], 1)\n",
"# XhasEX = data.iloc[:, 16].values.reshape(data.shape[0], 1)\n",
"# XhasQ = data.iloc[:, 17].values.reshape(data.shape[0], 1)\n",
"# XpunctCount = data.iloc[:, 18].values.reshape(data.shape[0], 1)\n",
"# XpunctToWords = data.iloc[:, 19].values.reshape(data.shape[0], 1)\n",
"# XavWordLength = data.iloc[:, 20].values.reshape(data.shape[0], 1)\n",
"# XsumLen = data.iloc[:, 21].values.reshape(data.shape[0], 1)\n",
"# XavSumWordLen = data.iloc[:, 22].values.reshape(data.shape[0], 1)\n",
"# XsumHasEX = data.iloc[:, 23].values.reshape(data.shape[0], 1)\n",
"# XsumHasQ = data.iloc[:, 24].values.reshape(data.shape[0], 1)\n",
"\n",
"# Xtoadd = np.concatenate((XScore,XreviewLen,XsummaryLen,XusernameWords,XnewLines,XhasSC,XhasEX,XhasQ,XpunctCount,XpunctToWords,XsumLen,XavSumWordLen,XsumHasEX,XsumHasQ), axis=1)\n"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# vectorize Bag of Words from Summary text; as sparse matrix\n",
"from sklearn.feature_extraction.text import HashingVectorizer\n",
"hv = HashingVectorizer(n_features=2 ** 17, non_negative=True)\n",
"XhashText = hv.transform(data.Text)\n",
"XhashSummary = hv.transform(data.Summary)"
]
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# convert to CSR\n",
"from scipy.sparse import csr_matrix, hstack\n",
"XtoaddSparse = csr_matrix(Xtoadd)\n",
"# Xfinal = hstack([ Xtoadd, Xngram, XtoaddSparse,XhashText,XhashSummary,XText,XSummary])\n",
"Xfinal = hstack([ Xngram, XtoaddSparse,XhashText,XhashSummary,XText,XSummary])\n",
"X = csr_matrix(Xfinal)\n",
"imp = Imputer(missing_values='NaN', strategy='mean', axis=0)\n",
"X = imp.fit_transform(X)"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.cross_validation import train_test_split\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" X, y, test_size=0.3, random_state=1)"
]
},
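{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional variant (a sketch, not what the notebook uses): with imbalanced labels, passing `stratify=y` keeps the class ratio identical in both splits. This assumes scikit-learn >= 0.17, where `train_test_split` gained the `stratify` argument."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# stratified variant of the split above (assumes scikit-learn >= 0.17)\n",
"X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(\n",
"    X, y, test_size=0.3, random_state=1, stratify=y)"
]
},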
{
"cell_type": "code",
"execution_count": 146,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# report on training and test sets\n",
"def print_results():\n",
" print('Error rate on training set: ')\n",
" print((y_train != y_pred).sum() / X_train.shape[0])\n",
" print('Accuracy rate on training set: ')\n",
" print(1 - (y_train != y_pred).sum() / X_train.shape[0])\n",
" print('True positive rate on training set:')\n",
" print(((y_train==True) & (y_pred==True)).sum() / y_train.sum())\n",
" print('**************')\n",
" print('Error rate on test set: ')\n",
" print((y_test != y_pred_test).sum() / X_test.shape[0])\n",
" print('Accuracy rate on test set: ')\n",
" print(1 - (y_test != y_pred_test).sum() / X_test.shape[0])\n",
" print('True positive rate on test set')\n",
" print(((y_test==True) & (y_pred_test==True)).sum() / y_test.sum())\n",
" print('True negative rate on test set')\n",
" print(((y_test==False) & (y_pred_test==False)).sum() / (y_test.shape[0] - y_test.sum()))"
]
},
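{
"cell_type": "markdown",
"metadata": {},
"source": [
"Alternative (a sketch): scikit-learn's built-in metrics report the same quantities as the hand-rolled function above. The calls are left commented because `y_pred_test` is only defined once a classifier cell below has run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# built-in equivalents of the manual error/TPR/TNR bookkeeping\n",
"from sklearn.metrics import classification_report, confusion_matrix\n",
"# print(confusion_matrix(y_test, y_pred_test))\n",
"# print(classification_report(y_test, y_pred_test))"
]
},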
{
"cell_type": "code",
"execution_count": 147,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# feature scaling\n",
"from sklearn.preprocessing import StandardScaler\n",
"sc = StandardScaler(with_mean=False)\n",
"sc.fit(X_train)\n",
"X_train_std = sc.transform(X_train)\n",
"X_test_std = sc.transform(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 148,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error rate on training set: \n",
"0.000835164835165\n",
"Accuracy rate on training set: \n",
"0.999164835165\n",
"True positive rate on training set:\n",
"0.993858707322\n",
"**************\n",
"Error rate on test set: \n",
"0.134490842491\n",
"Accuracy rate on test set: \n",
"0.865509157509\n",
"True positive rate on test set\n",
"0.446834170854\n",
"True negative rate on test set\n",
"0.898427499012\n"
]
}
],
"source": [
"# Perceptron\n",
"from sklearn import linear_model\n",
"clf = linear_model.SGDClassifier(loss='squared_hinge')\n",
"clf.fit(X_train_std, y_train)\n",
"y_pred = clf.fit(X_train_std, y_train).predict(X_train_std)\n",
"y_pred_test = clf.predict(X_test_std)\n",
"print_results()"
]
},
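{
"cell_type": "markdown",
"metadata": {},
"source": [
"The recorded output shows a large gap between the test true-positive rate (~0.45) and true-negative rate (~0.90), a classic sign of class imbalance. One common remedy (a sketch, not something the original notebook does) is to reweight classes in the loss; `class_weight='balanced'` assumes scikit-learn >= 0.17."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# same SGD classifier, with classes reweighted inversely to their frequency\n",
"clf_bal = linear_model.SGDClassifier(loss='squared_hinge', class_weight='balanced')\n",
"clf_bal.fit(X_train_std, y_train)\n",
"y_pred = clf_bal.predict(X_train_std)\n",
"y_pred_test = clf_bal.predict(X_test_std)\n",
"print_results()"
]
},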
{
"cell_type": "code",
"execution_count": 149,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# from sklearn.svm import SVC\n",
"# svm = SVC(kernel='linear', C=1.0, random_state=0)\n",
"# svm.fit(X_train, y_train)\n",
"# y_pred = svm.predict(X_train)\n",
"# y_pred_test = svm.predict(X_test)\n",
"# print_results()"
]
},
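{
"cell_type": "markdown",
"metadata": {},
"source": [
"The kernel `SVC` above is commented out, presumably because its training cost grows roughly quadratically with sample count and is impractical at 455,000 rows. `LinearSVC` (liblinear) is the usual scalable substitute for a linear kernel; the cell below is a sketch under that assumption."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# scalable linear SVM, roughly equivalent to SVC(kernel='linear') but far faster\n",
"from sklearn.svm import LinearSVC\n",
"svm = LinearSVC(C=1.0, random_state=0)\n",
"svm.fit(X_train_std, y_train)\n",
"y_pred = svm.predict(X_train_std)\n",
"y_pred_test = svm.predict(X_test_std)\n",
"print_results()"
]
},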
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}