Skip to content

Instantly share code, notes, and snippets.

@AruniRC
Last active November 29, 2016 16:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AruniRC/e61a8c4d30051e040bc71e21e48b2b97 to your computer and use it in GitHub Desktop.
Save AruniRC/e61a8c4d30051e040bc71e21e48b2b97 to your computer and use it in GitHub Desktop.
CS 620 HW - preliminary script
#!/usr/bin/python
import pandas as pd
import numpy as np
import random
# Number of cross-validation folds and location of the input scores table.
N_FOLDS = 5
FILENAME = '../data/scores_artificial_vs_real.csv'

# Load the whole CSV into a Pandas DataFrame.
df = pd.read_csv(FILENAME)
# Debug aid: the result is discarded here, so the preview is only displayed
# when these statements are evaluated in an interactive session.
df.head()

# Disambiguate the "ochiai" formula.
# For the same Bug, "Formula=ochiai" appears once with "Family=mbfl" and once
# with "Family=sbfl", so "Formula" alone does not identify a technique.
# Hack: relabel the "Family=sbfl" occurrences as "Formula=ochiai_s".
ochiai_rows = df['Formula'] == 'ochiai'
sbfl_rows = df['Family'] == 'sbfl'
df.loc[ochiai_rows & sbfl_rows, 'Formula'] = 'ochiai_s'

# Distinct values of the "Formula" column (after the relabelling above).
formula_values = df['Formula'].tolist()
formulaSet = list(set(formula_values))
# Create cross-validation folds as an extra "Fold" column in the DataFrame.
# Within each technique ("Formula"), the bugs (rows) are split into N_FOLDS
# folds whose sizes differ by at most one row.
df['Fold'] = np.nan
for fs in formulaSet:
    # Boolean mask selecting only the rows that belong to this formula.
    in_formula = df['Formula'] == fs
    # Size the labels by the number of *selected* rows. The mask itself is
    # always as long as the whole DataFrame, so using its length would hand
    # the assignment below a value list of the wrong length.
    n_rows = int(in_formula.sum())
    if n_rows == 0:
        # Nothing to label (e.g. a missing value in "Formula" never
        # compares equal to itself, so its mask is empty).
        continue
    # Deal the labels 1..N_FOLDS out round-robin so every fold is filled as
    # evenly as possible. Repeating each label in a contiguous run and then
    # truncating would starve, or entirely drop, the highest-numbered folds.
    fold_labels = [i % N_FOLDS + 1 for i in range(n_rows)]
    # Re-seed on every iteration: groups of equal size receive the same,
    # reproducible permutation of fold labels.
    random.seed(24)
    random.shuffle(fold_labels)
    # Assign the fold labels to all rows with "Formula == fs".
    df.loc[in_formula, 'Fold'] = fold_labels
# Overall mean of "ScoreWRTLoadedClasses" for each "Formula".
rows_by_formula = df.groupby('Formula')
dfMeanScore = rows_by_formula['ScoreWRTLoadedClasses'].mean()
print(dfMeanScore)

# Mean of "ScoreWRTLoadedClasses" for each ("Formula", "Fold") pair.
rows_by_formula_and_fold = df.groupby(['Formula', 'Fold'])
cvMeanScore = rows_by_formula_and_fold['ScoreWRTLoadedClasses'].mean()
# TODO - average and std dev over the folds.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment