Test cases for pytest-based testing
{
"cells": [
{
"cell_type": "code",
"execution_count": 174,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# A total of 10 test cases are added below"
]
},
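{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Note: pytest collects test_* functions from .py modules rather than from a\n",
"# notebook, so to actually run the cases below one would export them to a file.\n",
"# A minimal sketch, assuming the tests have been saved to the hypothetical\n",
"# module test_stem_pipeline.py:\n",
"#\n",
"# !pytest -q test_stem_pipeline.py"
]
},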
{
"cell_type": "code",
"execution_count": 291,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"label=preprocessing.LabelEncoder()"
]
},
{
"cell_type": "code",
"execution_count": 357,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import random as rnd\n",
"import numbers # used by describe_more() for the numeric-type check\n",
"import xgboost as xgb\n",
"from xgboost.sklearn import XGBClassifier\n",
"from sklearn import preprocessing\n",
"from sklearn.model_selection import train_test_split # sklearn.cross_validation was removed in newer versions\n",
"from sklearn import metrics\n",
"from sklearn import feature_selection\n",
"import mifs\n",
"import seaborn as sns\n",
"import matplotlib as mpl\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.pylab as pylab\n",
"\n",
"mpl.style.use('ggplot')\n",
"sns.set_style('white')\n",
"pylab.rcParams['figure.figsize']=12,8\n",
"\n",
"#len(data)\n",
"#data.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 363,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data=pd.read_excel(\"2010 Federal STEM Education Inventory Data Set.xls\")\n"
]
},
{
"cell_type": "code",
"execution_count": 364,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def columns(data):\n",
"    column=pd.Series(data.loc['Index Number'].values)\n",
"    for j, i in column.iteritems(): # forward-fill the merged header cells\n",
"        if pd.isna(i):\n",
"            column[j]=column[j-1]\n",
"\n",
"    column[3:]=column[3:].map(lambda x: x.split(')')[1][1:]) # strip the '(x) ' index prefix\n",
"    data.columns=column.values\n",
"    data=data[1:-1]\n",
"    column_names=data.columns.values\n",
"    return data,column_names\n",
"\n",
"data,column_names=columns(data)\n"
]
},
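{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# The manual forward-fill loop in columns() is what pandas' built-in ffill does.\n",
"# A minimal sketch on a toy header row (the values are made up for illustration):\n",
"s=pd.Series(['(A) Name', None, None, '(B) Funding', None])\n",
"print(s.ffill().tolist())\n",
"# ['(A) Name', '(A) Name', '(A) Name', '(B) Funding', '(B) Funding']"
]
},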
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#TestCase1\n",
"def test_columns():\n",
"    Data,Columns=columns(data)\n",
"    assert isinstance(Data,pd.DataFrame)\n",
"    assert Data.shape==(252,255)\n",
"    assert Columns[0]=='Investment Name'\n",
"    assert isinstance(Columns[0],str)\n",
"    #assert Columns == the expected list of column names (left as a placeholder)\n"
]
},
{
"cell_type": "code",
"execution_count": 176,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def describe_more( df ):\n",
"    var = [] ; l = [] ; t = [] ; numerical=[]; categorical=[]\n",
"    for n,x in enumerate(df):\n",
"        yup=pd.Series(df.iloc[:,n])\n",
"        var.append( x )\n",
"        l.append( len( yup.value_counts() ))\n",
"        t.append( yup.dtypes )\n",
"        if len(yup.dropna())>0:\n",
"            # this branch was changed after testing: the Funding columns were being\n",
"            # left out because their dtype is object, so the first non-null value is\n",
"            # checked against the numeric tower instead\n",
"            if isinstance(yup.dropna().iloc[0],numbers.Complex):\n",
"                numerical.append(x)\n",
"            else:\n",
"                categorical.append(x)\n",
"        else: categorical.append(x)\n",
"    Info = pd.DataFrame( { 'Variable' : var , 'Levels' : l , 'Datatype' : t } )\n",
"    Info.sort_values( by = 'Levels' , inplace = True )\n",
"    return Info, numerical,categorical,l\n",
"\n",
"#Info,numerical,categorical,levels=describe_more(data)\n"
]
},
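{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Why numbers.Complex works as a catch-all numeric check: Python's numeric tower\n",
"# covers int, float and complex (and numpy scalars), while strings fall outside it.\n",
"# A quick illustration:\n",
"print(isinstance(1, numbers.Complex))               # True\n",
"print(isinstance(np.float64(1.5), numbers.Complex)) # True\n",
"print(isinstance('abc', numbers.Complex))           # False"
]
},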
{
"cell_type": "code",
"execution_count": 177,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#TestCase2\n",
"def test_describe_more():\n",
"    Info,numerical,categorical,levels=describe_more(data)\n",
"    assert isinstance(Info,pd.DataFrame)\n",
"    assert Info.shape==(255,3)\n",
"    assert isinstance(data[numerical[0]][0],numbers.Complex)\n",
"    assert not isinstance(data[categorical[0]][0],numbers.Complex)\n",
"    assert isinstance(levels[0],int)\n",
"    # A loop could check every numerical/categorical column, but the data\n",
"    # deliberately contains duplicate column names, so working out how many\n",
"    # columns fall under one name would be a long procedure. With unique\n",
"    # column names, a loop over all of them would be straightforward.\n"
]
},
{
"cell_type": "code",
"execution_count": 231,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def feature(data):\n",
"    unidentical_features=list(range(0,len(data.columns))) # one slot per column\n",
"    identical_features=dict()\n",
"    stop=0\n",
"    for j,i in enumerate(column_names):\n",
"        if j<len(column_names)-1:\n",
"            if column_names[j]==column_names[j+1]: # run of identically named columns\n",
"                col=len(data.iloc[:,j].unique())\n",
"                try:\n",
"                    identical_features[stop].append([j,col])\n",
"                except KeyError: # first member of a new run opens the group\n",
"                    start=j\n",
"                    stop=start\n",
"                    identical_features[start]=[[j,col]]\n",
"                if j==len(column_names)-2:\n",
"                    identical_features[stop].append([j+1,len(data.iloc[:,j+1].unique())])\n",
"                elif column_names[j]!=column_names[j+2]: # the run ends at the next column\n",
"                    identical_features[stop].append([j+1,len(data.iloc[:,j+1].unique())])\n",
"            else:\n",
"                stop=stop+1\n",
"    for key,val in identical_features.items(): # whatever is not grouped stays unidentical\n",
"        for i in val:\n",
"            if i[0] in unidentical_features:\n",
"                unidentical_features.remove(i[0])\n",
"    return identical_features, unidentical_features\n",
"\n",
"identical_features, unidentical_features=feature(data)"
]
},
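{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# The grouping idea in feature() on a toy column index (made-up names): runs of\n",
"# duplicate names become 'identical' groups, singletons stay 'unidentical'.\n",
"cols=pd.Index(['A','B','B','C','C','C','D'])\n",
"groups={name:np.flatnonzero(cols==name).tolist() for name in cols.unique()}\n",
"print({k:v for k,v in groups.items() if len(v)>1}) # {'B': [1, 2], 'C': [3, 4, 5]}\n",
"print([v[0] for v in groups.values() if len(v)==1]) # [0, 6]"
]
},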
{
"cell_type": "code",
"execution_count": 232,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#TestCase3\n",
"def test_feature(): # a loop could repeat this with fresh draws, but one random draw is adequate\n",
"    identical_features, unidentical_features=feature(data)\n",
"    col_name=data.columns\n",
"    key=rnd.choice(list(identical_features.keys()))\n",
"    val=identical_features[key]\n",
"    identical_features.pop(key) # pop so the second draw can't return the same key\n",
"    key1=rnd.choice(list(identical_features.keys()))\n",
"    val1=identical_features[key1]\n",
"    assert col_name[key]!=col_name[key1]\n",
"    assert col_name[rnd.choice(val)[0]]==col_name[rnd.choice(val)[0]] # any two members of a group share a name\n",
"    assert col_name[rnd.choice(val1)[0]]==col_name[rnd.choice(val1)[0]]\n",
"    uni1=rnd.choice(unidentical_features)\n",
"    unidentical_features.remove(uni1)\n",
"    uni2=rnd.choice(unidentical_features)\n",
"    assert col_name[uni1]!=col_name[uni2]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def univariate_distribution(data):\n",
"    print(\"Non-Funding variables - two plots per variable: first a count plot with the\")\n",
"    print(\"y-limit set to the number of samples, to give an idea of the NaN count\")\n",
"    print(\"Second plot - label encoded\")\n",
"    plot_unidentical(unidentical_features)\n",
"    plot_identical(identical_features)\n",
"    print(\"Plotting histograms ALSO separately for numerical features like Year / Joint Funding\")\n",
"    plot_numeric()\n",
"\n",
"#univariate_distribution(data)\n",
"\n",
"def plot_numeric():\n",
"    data=pd.read_excel('output.xlsx')\n",
"    numerical=describe_more(data)[1]\n",
"    for i in numerical:\n",
"        t=data[i].dropna()\n",
"        fig,ax=plt.subplots()\n",
"        try:\n",
"            plt.hist(t)\n",
"            ax.set_xlabel(i)\n",
"            ax.set_ylabel('count')\n",
"            plt.show()\n",
"        except (TypeError, ValueError):\n",
"            print('Cannot be plotted')\n",
"\n",
"def plot_unidentical(feature):\n",
"    for i in feature:\n",
"        t=data.iloc[:,i]\n",
"        fig,ax=plt.subplots()\n",
"        sns.countplot(t,ax=ax)\n",
"        x=[0,50]\n",
"        y=[255,255]\n",
"        ax.plot(x,y) # horizontal reference line near the total sample count\n",
"        ax.set_xlabel(column_names[i])\n",
"        plt.show()\n",
"        t=t.dropna()\n",
"        try:\n",
"            label.fit(t)\n",
"            uni_dist=label.transform(t)\n",
"            fig,ax=plt.subplots()\n",
"            sns.countplot(uni_dist,ax=ax)\n",
"            ax.set_xlabel(column_names[i])\n",
"            plt.show()\n",
"        except (TypeError, ValueError): # mixed types that LabelEncoder cannot sort\n",
"            t=t.astype('category')\n",
"            fig,ax=plt.subplots()\n",
"            sns.countplot(t,ax=ax)\n",
"            ax.set_xlabel(column_names[i])\n",
"            plt.show()\n",
"\n",
"#plot_unidentical(unidentical_features)\n",
"\n",
"def plot_identical(feature):\n",
"    for key in feature:\n",
"        features=data[column_names[key]]\n",
"        features_col=features.columns.values\n",
"        for j,i in enumerate(features_col): # make the duplicate names unique\n",
"            features_col[j]=i+str(j)\n",
"        features.columns=features_col\n",
"        t=pd.Series()\n",
"        for j,i in enumerate(features): # stack the group's columns into one Series\n",
"            t=t.append(features[i])\n",
"        fig,ax=plt.subplots()\n",
"        sns.countplot(t,ax=ax)\n",
"        x=[0,50]\n",
"        y=[255,255]\n",
"        ax.plot(x,y) # horizontal reference line near the total sample count\n",
"        ax.set_xlabel(column_names[key])\n",
"        plt.show()\n",
"        t=t.dropna()\n",
"        try:\n",
"            label.fit(t)\n",
"            uni_dist=label.transform(t)\n",
"            fig,ax=plt.subplots()\n",
"            sns.countplot(uni_dist,ax=ax)\n",
"            ax.set_xlabel(column_names[key])\n",
"            plt.show()\n",
"        except (TypeError, ValueError):\n",
"            t=t.astype('category')\n",
"            fig,ax=plt.subplots()\n",
"            sns.countplot(t,ax=ax)\n",
"            ax.set_xlabel(column_names[key])\n",
"            plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# No test cases for the plot functions: they return None, and the graphs\n",
"# themselves need visual inspection."
]
},
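{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# That said, a smoke test is still possible: assert that the function draws at\n",
"# least one axes without raising. A minimal sketch, assuming a non-interactive\n",
"# backend; monkeypatch is pytest's built-in fixture, used here to swallow show():\n",
"def test_plot_numeric_smoke(monkeypatch):\n",
"    monkeypatch.setattr(plt, 'show', lambda *a, **k: None)\n",
"    plot_numeric()\n",
"    assert plt.gcf().get_axes() # at least one axes object was drawn\n",
"    plt.close('all')"
]
},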
{
"cell_type": "code",
"execution_count": 234,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"funding_variables=pd.DataFrame(data[['Funding FY2008','Funding FY2009','Funding FY2010']])\n",
"data=data.drop(labels=['Funding FY2008','Funding FY2009','Funding FY2010'], axis=1 )\n",
"\n",
"# Shortcut for de-duplicating column names: on re-read, pandas mangles\n",
"# duplicate headers to 'Name.1', 'Name.2', ...\n",
"writer = pd.ExcelWriter('output.xlsx')\n",
"data.to_excel(writer,'Sheet1')\n",
"writer.save()\n",
"data=pd.read_excel('output.xlsx')\n",
"Y=pd.DataFrame(funding_variables)\n"
]
},
{
"cell_type": "code",
"execution_count": 235,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"PRG_0056 NaN\n",
"PRG_0060 NaN\n",
"PRG_0061 NaN\n",
"PRG_0066 NaN\n",
"PRG_0079 NaN\n",
"PRG_0080 NaN\n",
"PRG_0083 NaN\n",
"PRG_0086 NaN\n",
"PRG_0092 NaN\n",
"PRG_0098 NaN\n",
"PRG_0099 NaN\n",
"PRG_0101 NaN\n",
"PRG_0105 NaN\n",
"PRG_0120 NaN\n",
"PRG_0142 NaN\n",
"PRG_0153 NaN\n",
"PRG_0154 NaN\n",
"PRG_0161 NaN\n",
"PRG_0184 NaN\n",
"PRG_0287 NaN\n",
"PRG_0313 NaN\n",
"PRG_0326 NaN\n",
"Name: Funding FY2008, dtype: object"
]
},
"execution_count": 235,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"m=Y.iloc[:,0].isnull()\n",
"n=Y.iloc[:,1].isnull()\n",
"Z=Y[np.logical_or(m,n)]\n",
"Y.loc[n,'Funding FY2009']=Y.loc[n,'Funding FY2010'] # fill missing FY2009 from FY2010\n",
"Y.loc[Y.iloc[:,0].isnull(),'Funding FY2008'] # programs still missing FY2008"
]
},
{
"cell_type": "code",
"execution_count": 286,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# The imputation code (predicting the missing funding values and saving the\n",
"# files used below) is not included here.\n",
"def targetvariable(inputfile):\n",
"    Y=pd.read_excel(inputfile)\n",
"    diff=Y['Funding FY2009']-Y['Funding FY2008']\n",
"    target=pd.DataFrame({'Target': diff.map(lambda q: 1 if q>0 else 0)}) # 1 = funding grew\n",
"    return target\n"
]
},
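{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sanity illustration of the target definition on toy numbers (not the real\n",
"# funding data): growth from FY2008 to FY2009 maps to 1, otherwise 0.\n",
"toy=pd.DataFrame({'Funding FY2008':[10.0,20.0,5.0],'Funding FY2009':[15.0,18.0,5.0]})\n",
"print((toy['Funding FY2009']-toy['Funding FY2008']).map(lambda q: 1 if q>0 else 0).tolist()) # [1, 0, 0]"
]
},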
{
"cell_type": "code",
"execution_count": 287,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#TestCase4\n",
"def test_targetvariable():\n",
"    target=targetvariable('funding2.xlsx')\n",
"    assert target.shape==(252,1)\n",
"    val=np.unique(target.values)\n",
"    assert len(val)==2\n",
"    assert set(val)=={0,1} # np.unique returns sorted values, so this covers both orders\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 465,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Use this file: across the imputation iterations, this is the model whose\n",
"# predicted values turned out best.\n",
"target=targetvariable('funding2.xlsx')"
]
},
{
"cell_type": "code",
"execution_count": 461,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data=pd.read_excel('output.xlsx')\n",
"\n",
"# this data should be used for the test cases below, as unique column names\n",
"# have now been set"
]
},
{
"cell_type": "code",
"execution_count": 346,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def Mutual_info(X,y,NaN): # NaN = 'yes', 'no' or 'random fill'\n",
"    if NaN=='yes':\n",
"        print('Yes with NaNs')\n",
"        encode_exclude_nans(X)\n",
"    elif NaN=='random fill':\n",
"        print('Yeah with Random fill of NaNs..good choice')\n",
"        print('Please wait... Filling with Random generator so as to get accurate Mutual info score')\n",
"        encode_exclude_nans(X)\n",
"        fillnans_unique(X)\n",
"    elif NaN=='no':\n",
"        print('With NaNs of a particular column label encoded as single category ')\n",
"        encode_include_nans(X)\n",
"    NF_variable=[]\n",
"    mi=[]\n",
"    for i in X:\n",
"        x=X[i].ravel()\n",
"        score=metrics.mutual_info_score(x,y)\n",
"        mi.append(score)\n",
"        NF_variable.append(i)\n",
"    Mutual_info=pd.DataFrame({'Non Funding Variables':NF_variable,'MI Score':mi})\n",
"    return Mutual_info,X\n",
"\n",
"\n",
"def encode_include_nans(data): # NaNs become just another label-encoded category\n",
"    for i in data:\n",
"        try:\n",
"            data[i]=label.fit_transform(data[i])\n",
"        except (TypeError, ValueError): # mixed types: encode as strings\n",
"            data[i]=label.fit_transform(data[i].astype('str'))\n",
"    return data\n",
"\n",
"def encode_exclude_nans(data): # label-encode only the non-null entries\n",
"    for i in data:\n",
"        n_null=data.loc[:,i].notnull()\n",
"        try:\n",
"            data.loc[n_null,i]=label.fit_transform(data.loc[n_null,i])\n",
"        except (TypeError, ValueError):\n",
"            data.loc[n_null,i]=label.fit_transform(data.loc[n_null,i].astype('str'))\n",
"\n",
"    return data\n",
"\n",
"def fillnans_unique(data):\n",
"    # each NaN gets an independent random integer in [0, 1000); collisions are\n",
"    # possible but rare\n",
"    for i in data:\n",
"        data.loc[data[i].isna(),i]=data.loc[data[i].isna(),i].apply(lambda z: int(rnd.random() * 1000))\n",
"    return data\n"
]
},
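{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# The random fill above can collide (two NaNs drawing the same integer). A\n",
"# deterministic variant is sketched here purely as an alternative - itertools.count\n",
"# guarantees every fill value is distinct (the start value is an assumption,\n",
"# chosen to sit far above any label-encoded value):\n",
"import itertools\n",
"\n",
"def fillnans_unique_deterministic(df, start=100000):\n",
"    counter=itertools.count(start)\n",
"    for col in df:\n",
"        mask=df[col].isna()\n",
"        df.loc[mask,col]=[next(counter) for _ in range(mask.sum())]\n",
"    return df"
]
},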
{
"cell_type": "code",
"execution_count": 462,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#TestCase5\n",
"def test_encode_include_nans():\n",
"    S=data.shape\n",
"    Data=encode_include_nans(data)\n",
"    assert Data.isnull().sum().sum()==0\n",
"    assert Data.shape==S\n",
"# The same approach would work for fillnans_unique, but checking that every NaN\n",
"# is filled with a unique value hardly seems required, since the code itself uses\n",
"# a random function. That test is written below anyway (TestCase7).\n"
]
},
{
"cell_type": "code",
"execution_count": 370,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#TestCase6\n",
"def test_encode_exclude_nans():\n",
"    S=data.shape\n",
"    data1=data.copy() # running the previous test mutates data, so reload it before this one, and so on\n",
"    Data=encode_exclude_nans(data)\n",
"    assert Data.isnull().sum().sum()==data1.isnull().sum().sum()\n",
"    assert Data.shape==S"
]
},
{
"cell_type": "code",
"execution_count": 459,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#TestCase7\n",
"def test_fillnans_unique():\n",
"    S=data.shape\n",
"    Tot=data.isnull().sum().sum()\n",
"    N=np.where(data.isnull())\n",
"    Data=fillnans_unique(data)\n",
"    m=N[0]\n",
"    n=N[1]\n",
"    i=0\n",
"    val=[]\n",
"    while i<Tot:\n",
"        val.append(Data.iloc[m[i],n[i]])\n",
"        i=i+1\n",
"    Val=np.array(val)\n",
"    assert len(np.unique(Val))>900 # fills are drawn from range(1000), so at most 1000 unique values; >900 allows a few collisions\n",
"    assert len(val)==Tot\n",
"    assert Data.isnull().values.sum()==0\n",
"    assert Data.shape==S"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#TestCase8\n",
"def test_Mutual_info(): # an isinstance(..., pd.DataFrame) check could also be included\n",
"    Mutualinfo,_=Mutual_info(X,Target,NaN='random fill')\n",
"    assert (Mutualinfo['MI Score'].values>=0).all() # MI is non-negative by definition\n",
"    # An upper bound could be min(H(X), H(Y)); sklearn's MI uses the natural log,\n",
"    # so 1 bit = 0.693 nats. That much calculation is unnecessary here - checking\n",
"    # non-negativity is enough. Another option is normalized MI, which lies in [0, 1].\n"
]
},
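{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Following up on the normalized-MI comment: sklearn ships\n",
"# normalized_mutual_info_score, bounded in [0, 1], so a test can pin both ends.\n",
"# A minimal sketch on toy labels:\n",
"from sklearn.metrics import normalized_mutual_info_score\n",
"a=[0,0,1,1]\n",
"assert abs(normalized_mutual_info_score(a,a)-1.0)<1e-9 # identical labelings score 1\n",
"assert 0.0<normalized_mutual_info_score(a,[0,0,1,2])<1.0 # partial overlap lands strictly inside (0, 1)"
]
},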
{
"cell_type": "code",
"execution_count": 471,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 471,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(Mutualinfo['MI Score'].values>=0).all()"
]
},
{
"cell_type": "code",
"execution_count": 466,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Yeah with Random fill of NaNs..good choice\n",
"Please wait... Filling with Random generator so as to get accurate Mutual info score\n"
]
}
],
"source": [
"X=pd.read_excel('output.xlsx')\n",
"Target=np.array(target).ravel()\n",
"Mutualinfo,X=Mutual_info(X,Target,NaN='random fill')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#FEATURE SELECTION\n",
"def feature_selector(X,y,Method,n): # Method = 'MI_Chi', 'MI_JMIM' or 'Combine'\n",
"    if Method=='MI_Chi':\n",
"        print('Selection using Chi Test ')\n",
"        Mutualinfo,X=Mutual_info(X,y,NaN='random fill')\n",
"        Predictors=fselect_Chi(X,y,Mutualinfo)\n",
"\n",
"    elif Method=='MI_JMIM':\n",
"        print('Using MI-JMIM')\n",
"        Mutualinfo,X=Mutual_info(X,y,NaN='random fill')\n",
"        Predictors=fselect_MI(X,y,n)\n",
"\n",
"    elif Method=='Combine':\n",
"        print('Good Choice= First selecting significance from Chi Square Test then using JointMI_Maxim')\n",
"        Mutualinfo,X=Mutual_info(X,y,NaN='random fill')\n",
"        Predictors=fselect_Chi(X,y,Mutualinfo)\n",
"        X=X[Predictors]\n",
"        Predictors=fselect_MI(X,y,n)\n",
"    return Predictors\n",
"\n",
"def fselect_Chi(X,y,Mutualinfo): # use X with NaNs\n",
"    N=len(X)\n",
"    S=2*N # G-test: the statistic 2*N*MI follows a chi-square distribution,\n",
"          # so MI is compared against (critical value)/(2*N)\n",
"    predictors=[]\n",
"    levels=describe_more(X)[3]\n",
"    stop=len(Mutualinfo)\n",
"    i=0\n",
"    while i<stop:\n",
"        v=Mutualinfo.loc[i,'MI Score']; m=Mutualinfo.loc[i,'Non Funding Variables']; l=levels[i]\n",
"        if (l-1)<101: # critical value from the table, with df = levels - 1\n",
"            chi2=int(chivalue.loc[chivalue['df']==l-1,'value'].iloc[0])\n",
"            if v > chi2/S:\n",
"                predictors.append(m)\n",
"        else: # beyond the table: rough approximation of the critical value for large df\n",
"            chi2=l+30\n",
"            if v > chi2/S:\n",
"                predictors.append(m)\n",
"        i=i+1\n",
"    return predictors\n",
"\n",
"def fselect_MI(X,y,n): # use X without NaNs\n",
"    fselect=mifs.MutualInformationFeatureSelector(method='JMIM',verbose=2,n_features=n)\n",
"    fselect.fit(X,y)\n",
"    predictors=list(X.columns.values[fselect.support_])\n",
"    return predictors\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Nothing concrete to test in the functions above: they only return the selected\n",
"# predictors, so checking that the returned predictors appear in the column-name\n",
"# list is close to tautological. Still:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def test_common():\n",
"    assert len(Predictors)>0 # at least one predictor was selected\n",
"    assert set(Predictors)<=set(X.columns) # and each one is a real column\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# chi-square critical values at the 0.05 significance level, indexed by df\n",
"chivalue=pd.read_excel('chivalue0.05.xlsx')"
]
},
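{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# If scipy is available in the environment, the Excel lookup can be reproduced\n",
"# (and extended past df=100): chi2.ppf(0.95, df) gives the same 0.05-level\n",
"# critical values. A sketch:\n",
"from scipy.stats import chi2 as chi2_dist\n",
"chivalue_scipy=pd.DataFrame({'df':list(range(1,101)),\n",
"                             'value':[chi2_dist.ppf(0.95,d) for d in range(1,101)]})\n",
"print(chivalue_scipy.head(3)) # df=1 -> 3.841, df=2 -> 5.991, df=3 -> 7.815"
]
},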
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Good Choice= First selecting significance from Chi Square Test then using JointMI_Maxim\n",
"Yeah with Random fill of NaNs..good choice\n",
"Please wait... Filling with Random generator so as to get accurate Mutual info score\n",
"The values of X seem to be discrete. MI_FS will treat themas continuous.\n",
"Auto selected feature #1 : 155, JMIM : 0.03464492755578519\n",
"Auto selected feature #2 : 227, JMIM : 0.13394089720159696\n",
"Auto selected feature #3 : 36, JMIM : 0.0937736445613373\n",
"Auto selected feature #4 : 136, JMIM : 0.08837118163397051\n",
"Auto selected feature #5 : 121, JMIM : 0.07086600509042462\n",
"Auto selected feature #6 : 32, JMIM : 0.06361626454367242\n",
"Auto selected feature #7 : 211, JMIM : 0.05793786191883221\n",
"Auto selected feature #8 : 112, JMIM : 0.05290299718183045\n",
"Auto selected feature #9 : 6, JMIM : 0.05095195899821814\n",
"Auto selected feature #10 : 75, JMIM : 0.04988986868153056\n",
"Auto selected feature #11 : 30, JMIM : 0.04724294437541943\n",
"Auto selected feature #12 : 233, JMIM : 0.046240099893933184\n",
"Auto selected feature #13 : 76, JMIM : 0.04545460837845994\n",
"Auto selected feature #14 : 198, JMIM : 0.04121550410249908\n",
"Auto selected feature #15 : 66, JMIM : 0.03782311915604497\n",
"Auto selected feature #16 : 10, JMIM : 0.036012851315351924\n",
"Auto selected feature #17 : 9, JMIM : 0.03491219062214723\n",
"Auto selected feature #18 : 63, JMIM : 0.034557795989806905\n",
"Auto selected feature #19 : 2, JMIM : 0.03384765236413401\n",
"Auto selected feature #20 : 228, JMIM : 0.033076790276745704\n"
]
}
],
"source": [
"X=pd.read_excel('output.xlsx') # n = integer number of features required, or 'auto'\n",
"Target=np.array(target1).ravel() # target1 comes from the imputation experiments whose code is not included above\n",
"Predictors=feature_selector(X,Target,'Combine',n='auto')"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Selection using Chi Test \n",
"Yeah with Random fill of NaNs..good choice\n",
"Please wait... Filling with Random generator so as to get accurate Mutual info score\n"
]
}
],
"source": [
"X1=pd.read_excel('output.xlsx')\n",
"Target=np.array(target1).ravel()\n",
"Predictors1=feature_selector(X1,Target,'MI_Chi','auto')"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using MI-JMIM\n",
"Yeah with Random fill of NaNs..good choice\n",
"Please wait... Filling with Random generator so as to get accurate Mutual info score\n",
"The values of X seem to be discrete. MI_FS will treat themas continuous.\n",
"Auto selected feature #1 : 177, JMIM : 0.024703054814981584\n",
"Auto selected feature #2 : 201, JMIM : 0.1141356747785518\n",
"Auto selected feature #3 : 160, JMIM : 0.08063671995978172\n",
"Auto selected feature #4 : 246, JMIM : 0.07350125169901389\n",
"Auto selected feature #5 : 7, JMIM : 0.062003144548199884\n",
"Auto selected feature #6 : 147, JMIM : 0.059702424099167484\n",
"Auto selected feature #7 : 52, JMIM : 0.05930836194909306\n",
"Auto selected feature #8 : 211, JMIM : 0.05825950492165477\n",
"Auto selected feature #9 : 30, JMIM : 0.05556296114308612\n",
"Auto selected feature #10 : 207, JMIM : 0.05440233194913846\n",
"Auto selected feature #11 : 145, JMIM : 0.05191320524091614\n",
"Auto selected feature #12 : 200, JMIM : 0.050828749337887835\n",
"Auto selected feature #13 : 140, JMIM : 0.04937374021895735\n",
"Auto selected feature #14 : 11, JMIM : 0.04849695154590439\n",
"Auto selected feature #15 : 104, JMIM : 0.0461945742928509\n",
"Auto selected feature #16 : 156, JMIM : 0.045081608948996266\n",
"Auto selected feature #17 : 5, JMIM : 0.04317567207282069\n",
"Auto selected feature #18 : 1, JMIM : 0.043170961001380626\n",
"Auto selected feature #19 : 237, JMIM : 0.04105064924576718\n",
"Auto selected feature #20 : 206, JMIM : 0.040145750095715194\n",
"Auto selected feature #21 : 89, JMIM : 0.03993258944193023\n"
]
}
],
"source": [
"X2=pd.read_excel('output.xlsx')\n",
"Target=np.array(target1).ravel()\n",
"Predictors2=feature_selector(X2,Target,'MI_JMIM','auto')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#One-hot encoding with a threshold of 2.5% of the total sample count\n",
"def get_columns(data,predictors): # pass data with NaNs - returns the columns requiring one-hot encoding\n",
"    col_onehot=[]\n",
"    data=data[predictors]\n",
"    threshold=int(0.025*len(data))\n",
"    for x in data:\n",
"        yum=data[x]\n",
"        if not np.issubdtype(yum.dtype,np.number): # only non-numeric columns are candidates\n",
"            count=yum.value_counts()\n",
"            j=0\n",
"            for i in count:\n",
"                if int(i)>threshold:\n",
"                    j=j+1\n",
"                if j>1: # at least two levels clear the threshold\n",
"                    col_onehot.append(x)\n",
"                    break\n",
"    return col_onehot\n",
"\n",
"def split_columns(data,X,predictors): # one-hot encoding with a threshold of 2.5% of N\n",
"    col_onehot=get_columns(data,predictors)\n",
"    threshold = int(0.025*len(data))\n",
"    X=X[predictors]\n",
"    for i in col_onehot:\n",
"        counts = X[i].value_counts()\n",
"        repl = counts[counts <= threshold].index # rare levels are collapsed into 'uncommon'\n",
"        X=pd.concat([pd.get_dummies(X[i].replace(repl,'uncommon'),prefix=i),X.drop(i,axis=1)],axis=1)\n",
"    return X\n"
]
},
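{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# A toy illustration of the rare-level collapse used in split_columns (made-up\n",
"# categories; a threshold of 2 stands in for the 2.5% cut):\n",
"s=pd.Series(['US','US','US','UK','UK','FR'])\n",
"counts=s.value_counts()\n",
"rare=counts[counts<=2].index # UK and FR fall at or below the cut\n",
"print(pd.get_dummies(s.replace(rare,'uncommon'),prefix='country').columns.tolist())\n",
"# ['country_US', 'country_uncommon']"
]
},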
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# get_columns just returns the column names that need one-hot encoding, so it is\n",
"# the same situation as with the predictors. Still...\n",
"#TestCase9,10\n",
"def test_get_columns():\n",
"    col_onehot=get_columns(data,Predictors)\n",
"    assert isinstance(col_onehot,list)\n",
"\n",
"def test_split_columns(): # get_dummies is not a user-built function, so there is no need to test\n",
"    X2=split_columns(data,X,Predictors) # that the number of columns per split category == its number of levels\n",
"    assert X2.shape[1]>data[Predictors].shape[1] # assuming col_onehot is non-empty, else >= instead of >"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
} |