Skip to content

Instantly share code, notes, and snippets.

@boates
Last active January 16, 2019 01:37
Show Gist options
  • Save boates/5127281 to your computer and use it in GitHub Desktop.
Save boates/5127281 to your computer and use it in GitHub Desktop.
Helpers for easily running sklearn classification algorithms on a pandas DataFrame, and for testing model accuracy.
def splitData(df, trainPerc=0.6, cvPerc=0.2, testPerc=0.2):
    """
    Randomly partition the rows of a dataframe into training,
    cross validation and test subsets.

    return: training, cv, test
            (as pandas dataframes)
    params:
            df: pandas dataframe
            trainPerc: float | fraction of data for training set (default=0.6)
            cvPerc: float | fraction of data for cross validation set (default=0.2)
            testPerc: float | fraction of data for test set (default=0.2)
            (trainPerc + cvPerc + testPerc must equal 1.0)
    raises:
            AssertionError if the three fractions do not sum to 1.0
    """
    # compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not
    # exactly 1.0 in floating point arithmetic
    assert abs(trainPerc + cvPerc + testPerc - 1.0) < 1e-9, \
        "trainPerc + cvPerc + testPerc must equal 1.0"

    # create a randomly ordered list of row positions
    # (list(...) is required because a Python 3 range cannot be shuffled)
    from random import shuffle
    N = len(df)
    positions = list(range(N))
    shuffle(positions)

    # get splitting sizes; lengths are truncated and the test set
    # receives whatever rows remain after training and cv
    trainLen = int(N * trainPerc)
    cvLen = int(N * cvPerc)

    # select rows by position (iloc) so the split does not depend on
    # the dataframe's index labels
    training = df.iloc[positions[:trainLen]]
    cv = df.iloc[positions[trainLen:trainLen + cvLen]]
    test = df.iloc[positions[trainLen + cvLen:]]

    return training, cv, test
def getScore(df, classifier, classTitle, trainPerc=0.80, testPerc=0.20):
    """
    Fit a classifier on a random training split of the dataframe and
    score it on the remaining test split.

    return: float | accuracy score for classification model (e[0,1])
    params:
            df: pandas dataframe
            classifier: sklearn classifier (its fit and score methods are called)
            classTitle: string | title of class column in df
            trainPerc: fraction of data to train on (default=0.80)
            testPerc: fraction of data to test on (default=0.20)
            (trainPerc + testPerc = 1.0)
    raises:
            AssertionError if trainPerc + testPerc does not sum to 1.0
    """
    # compare with a tolerance; exact float equality rejects valid
    # splits whose sum is off by a rounding error
    assert abs(trainPerc + testPerc - 1.0) < 1e-9, \
        "trainPerc + testPerc must equal 1.0"

    # split the dataset (no cross validation set is needed here)
    training, _, test = splitData(df, trainPerc=trainPerc, cvPerc=0.00, testPerc=testPerc)

    # every column except the class column is a feature
    featureNames = [col for col in df.columns if col != classTitle]

    # map each class to a number using the whole dataframe, so a class
    # that only shows up in the test split does not raise a KeyError
    classToNumber = {}
    for i, c in enumerate(sorted(set(df[classTitle].values))):
        classToNumber[c] = i

    # fit the model on the training split
    trainFeatures = training[featureNames].values
    trainClasses = [classToNumber[c] for c in training[classTitle].values]
    classifier.fit(trainFeatures, trainClasses)

    # compute the score on the test split
    testFeatures = test[featureNames].values
    testClasses = [classToNumber[c] for c in test[classTitle].values]
    return classifier.score(testFeatures, testClasses)
def testModel(df, classifier, classTitle, N=1, trainPerc=0.80, testPerc=0.20):
    """
    Score a classifier N times, each time through a separate call to
    getScore with the same arguments.

    return: list[float] | list of scores for model (e[0,1])
    params:
            df: pandas dataframe
            classifier: sklearn classifier (the same object is passed to every run)
            classTitle: string | title of class column in df
            N: int | number of tests to run (default=1)
            trainPerc: percentage of data to train on (default=0.80)
            testPerc: percentage of data to test on (default=0.20)
            (trainPerc + testPerc = 1.0)
    """
    # one getScore call per requested run, collected in order
    return [
        getScore(
            df=df,
            classifier=classifier,
            classTitle=classTitle,
            trainPerc=trainPerc,
            testPerc=testPerc,
        )
        for _ in range(N)
    ]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment