Test cases for pytest-based testing
{
"cells": [
{
"cell_type": "code",
"execution_count": 174,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# A total of 10 test cases are added below"
]
},
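{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Note: pytest collects test_* functions from .py modules rather than from a\n",
"# notebook, so to actually run the cases below one would export them to a file.\n",
"# A minimal sketch, assuming the tests have been saved to the hypothetical\n",
"# module test_stem_pipeline.py:\n",
"#\n",
"# !pytest -q test_stem_pipeline.py"
]
},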
{
"cell_type": "code",
"execution_count": 291,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"label=preprocessing.LabelEncoder()"
]
},
{
"cell_type": "code",
"execution_count": 357,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import random as rnd\n",
"import numbers # used by describe_more() for the numeric-type check\n",
"import xgboost as xgb\n",
"from xgboost.sklearn import XGBClassifier\n",
"from sklearn import preprocessing\n",
"from sklearn.model_selection import train_test_split # sklearn.cross_validation was removed in newer versions\n",
"from sklearn import metrics\n",
"from sklearn import feature_selection\n",
"import mifs\n",
"import seaborn as sns\n",
"import matplotlib as mpl\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.pylab as pylab\n",
"\n",
"mpl.style.use('ggplot')\n",
"sns.set_style('white')\n",
"pylab.rcParams['figure.figsize']=12,8\n",
"\n",
"#len(data)\n",
"#data.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 363,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data=pd.read_excel(\"2010 Federal STEM Education Inventory Data Set.xls\")\n"
]
},
{
"cell_type": "code",
"execution_count": 364,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def columns(data):\n",
"    column=pd.Series(data.loc['Index Number'].values)\n",
"    for j, i in column.iteritems(): # forward-fill the merged header cells\n",
"        if pd.isna(i):\n",
"            column[j]=column[j-1]\n",
"\n",
"    column[3:]=column[3:].map(lambda x: x.split(')')[1][1:]) # strip the '(x) ' index prefix\n",
"    data.columns=column.values\n",
"    data=data[1:-1]\n",
"    column_names=data.columns.values\n",
"    return data,column_names\n",
"\n",
"data,column_names=columns(data)\n"
]
},
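{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# The manual forward-fill loop in columns() is what pandas' built-in ffill does.\n",
"# A minimal sketch on a toy header row (the values are made up for illustration):\n",
"s=pd.Series(['(A) Name', None, None, '(B) Funding', None])\n",
"print(s.ffill().tolist())\n",
"# ['(A) Name', '(A) Name', '(A) Name', '(B) Funding', '(B) Funding']"
]
},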
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#TestCase1\n",
"def test_columns():\n",
"    Data,Columns=columns(data)\n",
"    assert isinstance(Data,pd.DataFrame)\n",
"    assert Data.shape==(252,255)\n",
"    assert Columns[0]=='Investment Name'\n",
"    assert isinstance(Columns[0],str)\n",
"    #assert Columns == the expected list of column names (left as a placeholder)\n"
]
},
{
"cell_type": "code",
"execution_count": 176,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def describe_more( df ):\n",
"    var = [] ; l = [] ; t = [] ; numerical=[]; categorical=[]\n",
"    for n,x in enumerate(df):\n",
"        yup=pd.Series(df.iloc[:,n])\n",
"        var.append( x )\n",
"        l.append( len( yup.value_counts() ))\n",
"        t.append( yup.dtypes )\n",
"        if len(yup.dropna())>0:\n",
"            # this branch was changed after testing: the Funding columns were being\n",
"            # left out because their dtype is object, so the first non-null value is\n",
"            # checked against the numeric tower instead\n",
"            if isinstance(yup.dropna().iloc[0],numbers.Complex):\n",
"                numerical.append(x)\n",
"            else:\n",
"                categorical.append(x)\n",
"        else: categorical.append(x)\n",
"    Info = pd.DataFrame( { 'Variable' : var , 'Levels' : l , 'Datatype' : t } )\n",
"    Info.sort_values( by = 'Levels' , inplace = True )\n",
"    return Info, numerical,categorical,l\n",
"\n",
"#Info,numerical,categorical,levels=describe_more(data)\n"
]
},
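{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Why numbers.Complex works as a catch-all numeric check: Python's numeric tower\n",
"# covers int, float and complex (and numpy scalars), while strings fall outside it.\n",
"# A quick illustration:\n",
"print(isinstance(1, numbers.Complex))               # True\n",
"print(isinstance(np.float64(1.5), numbers.Complex)) # True\n",
"print(isinstance('abc', numbers.Complex))           # False"
]
},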
{
"cell_type": "code",
"execution_count": 177,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#TestCase2\n",
"def test_describe_more():\n",
"    Info,numerical,categorical,levels=describe_more(data)\n",
"    assert isinstance(Info,pd.DataFrame)\n",
"    assert Info.shape==(255,3)\n",
"    assert isinstance(data[numerical[0]][0],numbers.Complex)\n",
"    assert not isinstance(data[categorical[0]][0],numbers.Complex)\n",
"    assert isinstance(levels[0],int)\n",
"    # A loop could check every numerical/categorical column, but the data\n",
"    # deliberately contains duplicate column names, so working out how many\n",
"    # columns fall under one name would be a long procedure. With unique\n",
"    # column names, a loop over all of them would be straightforward.\n"
]
},
{
"cell_type": "code",
"execution_count": 231,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def feature(data):\n",
"    unidentical_features=list(range(0,len(data.columns))) # one slot per column\n",
"    identical_features=dict()\n",
"    stop=0\n",
"    for j,i in enumerate(column_names):\n",
"        if j<len(column_names)-1:\n",
"            if column_names[j]==column_names[j+1]: # run of identically named columns\n",
"                col=len(data.iloc[:,j].unique())\n",
"                try:\n",
"                    identical_features[stop].append([j,col])\n",
"                except KeyError: # first member of a new run opens the group\n",
"                    start=j\n",
"                    stop=start\n",
"                    identical_features[start]=[[j,col]]\n",
"                if j==len(column_names)-2:\n",
"                    identical_features[stop].append([j+1,len(data.iloc[:,j+1].unique())])\n",
"                elif column_names[j]!=column_names[j+2]: # the run ends at the next column\n",
"                    identical_features[stop].append([j+1,len(data.iloc[:,j+1].unique())])\n",
"            else:\n",
"                stop=stop+1\n",
"    for key,val in identical_features.items(): # whatever is not grouped stays unidentical\n",
"        for i in val:\n",
"            if i[0] in unidentical_features:\n",
"                unidentical_features.remove(i[0])\n",
"    return identical_features, unidentical_features\n",
"\n",
"identical_features, unidentical_features=feature(data)"
]
},
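{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# The grouping idea in feature() on a toy column index (made-up names): runs of\n",
"# duplicate names become 'identical' groups, singletons stay 'unidentical'.\n",
"cols=pd.Index(['A','B','B','C','C','C','D'])\n",
"groups={name:np.flatnonzero(cols==name).tolist() for name in cols.unique()}\n",
"print({k:v for k,v in groups.items() if len(v)>1}) # {'B': [1, 2], 'C': [3, 4, 5]}\n",
"print([v[0] for v in groups.values() if len(v)==1]) # [0, 6]"
]
},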
{
"cell_type": "code",
"execution_count": 232,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#TestCase3\n",
"def test_feature(): # a loop could repeat this with fresh draws, but one random draw is adequate\n",
"    identical_features, unidentical_features=feature(data)\n",
"    col_name=data.columns\n",
"    key=rnd.choice(list(identical_features.keys()))\n",
"    val=identical_features[key]\n",
"    identical_features.pop(key) # pop so the second draw can't return the same key\n",
"    key1=rnd.choice(list(identical_features.keys()))\n",
"    val1=identical_features[key1]\n",
"    assert col_name[key]!=col_name[key1]\n",
"    assert col_name[rnd.choice(val)[0]]==col_name[rnd.choice(val)[0]] # any two members of a group share a name\n",
"    assert col_name[rnd.choice(val1)[0]]==col_name[rnd.choice(val1)[0]]\n",
"    uni1=rnd.choice(unidentical_features)\n",
"    unidentical_features.remove(uni1)\n",
"    uni2=rnd.choice(unidentical_features)\n",
"    assert col_name[uni1]!=col_name[uni2]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def univariate_distribution(data):\n",
"    print(\"Non-Funding variables - two plots per variable: first a count plot with the\")\n",
"    print(\"y-limit set to the number of samples, to give an idea of the NaN count\")\n",
"    print(\"Second plot - label encoded\")\n",
"    plot_unidentical(unidentical_features)\n",
"    plot_identical(identical_features)\n",
"    print(\"Plotting histograms ALSO separately for numerical features like Year / Joint Funding\")\n",
"    plot_numeric()\n",
"\n",
"#univariate_distribution(data)\n",
"\n",
"def plot_numeric():\n",
"    data=pd.read_excel('output.xlsx')\n",
"    numerical=describe_more(data)[1]\n",
"    for i in numerical:\n",
"        t=data[i].dropna()\n",
"        fig,ax=plt.subplots()\n",
"        try:\n",
"            plt.hist(t)\n",
"            ax.set_xlabel(i)\n",
"            ax.set_ylabel('count')\n",
"            plt.show()\n",
"        except (TypeError, ValueError):\n",
"            print('Cannot be plotted')\n",
"\n",
"def plot_unidentical(feature):\n",
"    for i in feature:\n",
"        t=data.iloc[:,i]\n",
"        fig,ax=plt.subplots()\n",
"        sns.countplot(t,ax=ax)\n",
"        x=[0,50]\n",
"        y=[255,255]\n",
"        ax.plot(x,y) # horizontal reference line near the total sample count\n",
"        ax.set_xlabel(column_names[i])\n",
"        plt.show()\n",
"        t=t.dropna()\n",
"        try:\n",
"            label.fit(t)\n",
"            uni_dist=label.transform(t)\n",
"            fig,ax=plt.subplots()\n",
"            sns.countplot(uni_dist,ax=ax)\n",
"            ax.set_xlabel(column_names[i])\n",
"            plt.show()\n",
"        except (TypeError, ValueError): # mixed types that LabelEncoder cannot sort\n",
"            t=t.astype('category')\n",
"            fig,ax=plt.subplots()\n",
"            sns.countplot(t,ax=ax)\n",
"            ax.set_xlabel(column_names[i])\n",
"            plt.show()\n",
"\n",
"#plot_unidentical(unidentical_features)\n",
"\n",
"def plot_identical(feature):\n",
"    for key in feature:\n",
"        features=data[column_names[key]]\n",
"        features_col=features.columns.values\n",
"        for j,i in enumerate(features_col): # make the duplicate names unique\n",
"            features_col[j]=i+str(j)\n",
"        features.columns=features_col\n",
"        t=pd.Series()\n",
"        for j,i in enumerate(features): # stack the group's columns into one Series\n",
"            t=t.append(features[i])\n",
"        fig,ax=plt.subplots()\n",
"        sns.countplot(t,ax=ax)\n",
"        x=[0,50]\n",
"        y=[255,255]\n",
"        ax.plot(x,y) # horizontal reference line near the total sample count\n",
"        ax.set_xlabel(column_names[key])\n",
"        plt.show()\n",
"        t=t.dropna()\n",
"        try:\n",
"            label.fit(t)\n",
"            uni_dist=label.transform(t)\n",
"            fig,ax=plt.subplots()\n",
"            sns.countplot(uni_dist,ax=ax)\n",
"            ax.set_xlabel(column_names[key])\n",
"            plt.show()\n",
"        except (TypeError, ValueError):\n",
"            t=t.astype('category')\n",
"            fig,ax=plt.subplots()\n",
"            sns.countplot(t,ax=ax)\n",
"            ax.set_xlabel(column_names[key])\n",
"            plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# No test cases for the plot functions: they return None, and the graphs\n",
"# themselves need visual inspection."
]
},
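{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# That said, a smoke test is still possible: assert that the function draws at\n",
"# least one axes without raising. A minimal sketch, assuming a non-interactive\n",
"# backend; monkeypatch is pytest's built-in fixture, used here to swallow show():\n",
"def test_plot_numeric_smoke(monkeypatch):\n",
"    monkeypatch.setattr(plt, 'show', lambda *a, **k: None)\n",
"    plot_numeric()\n",
"    assert plt.gcf().get_axes() # at least one axes object was drawn\n",
"    plt.close('all')"
]
},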
{
"cell_type": "code",
"execution_count": 234,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"funding_variables=pd.DataFrame(data[['Funding FY2008','Funding FY2009','Funding FY2010']])\n",
"data=data.drop(labels=['Funding FY2008','Funding FY2009','Funding FY2010'], axis=1 )\n",
"\n",
"# Shortcut for de-duplicating column names: on re-read, pandas mangles\n",
"# duplicate headers to 'Name.1', 'Name.2', ...\n",
"writer = pd.ExcelWriter('output.xlsx')\n",
"data.to_excel(writer,'Sheet1')\n",
"writer.save()\n",
"data=pd.read_excel('output.xlsx')\n",
"Y=pd.DataFrame(funding_variables)\n"
]
},
{
"cell_type": "code",
"execution_count": 235,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"PRG_0056 NaN\n",
"PRG_0060 NaN\n",
"PRG_0061 NaN\n",
"PRG_0066 NaN\n",
"PRG_0079 NaN\n",
"PRG_0080 NaN\n",
"PRG_0083 NaN\n",
"PRG_0086 NaN\n",
"PRG_0092 NaN\n",
"PRG_0098 NaN\n",
"PRG_0099 NaN\n",
"PRG_0101 NaN\n",
"PRG_0105 NaN\n",
"PRG_0120 NaN\n",
"PRG_0142 NaN\n",
"PRG_0153 NaN\n",
"PRG_0154 NaN\n",
"PRG_0161 NaN\n",
"PRG_0184 NaN\n",
"PRG_0287 NaN\n",
"PRG_0313 NaN\n",
"PRG_0326 NaN\n",
"Name: Funding FY2008, dtype: object"
]
},
"execution_count": 235,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"m=Y.iloc[:,0].isnull()\n",
"n=Y.iloc[:,1].isnull()\n",
"Z=Y[np.logical_or(m,n)]\n",
"Y.loc[n,'Funding FY2009']=Y.loc[n,'Funding FY2010'] # fill missing FY2009 from FY2010\n",
"Y.loc[Y.iloc[:,0].isnull(),'Funding FY2008'] # programs still missing FY2008"
]
},
{
"cell_type": "code",
"execution_count": 286,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# The imputation code (predicting the missing funding values and saving the\n",
"# files used below) is not included here.\n",
"def targetvariable(inputfile):\n",
"    Y=pd.read_excel(inputfile)\n",
"    diff=Y['Funding FY2009']-Y['Funding FY2008']\n",
"    target=pd.DataFrame({'Target': diff.map(lambda q: 1 if q>0 else 0)}) # 1 = funding grew\n",
"    return target\n"
]
},
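{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sanity illustration of the target definition on toy numbers (not the real\n",
"# funding data): growth from FY2008 to FY2009 maps to 1, otherwise 0.\n",
"toy=pd.DataFrame({'Funding FY2008':[10.0,20.0,5.0],'Funding FY2009':[15.0,18.0,5.0]})\n",
"print((toy['Funding FY2009']-toy['Funding FY2008']).map(lambda q: 1 if q>0 else 0).tolist()) # [1, 0, 0]"
]
},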
{
"cell_type": "code",
"execution_count": 287,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#TestCase4\n",
"def test_targetvariable():\n",
"    target=targetvariable('funding2.xlsx')\n",
"    assert target.shape==(252,1)\n",
"    val=np.unique(target.values)\n",
"    assert len(val)==2\n",
"    assert set(val)=={0,1} # np.unique returns sorted values, so this covers both orders\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 465,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Use this file: across the imputation iterations, this is the model whose\n",
"# predicted values turned out best.\n",
"target=targetvariable('funding2.xlsx')"
]
},
{
"cell_type": "code",
"execution_count": 461,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data=pd.read_excel('output.xlsx')\n",
"\n",
"# this data should be used for the test cases below, as unique column names\n",
"# have now been set"
]
},
{
"cell_type": "code",
"execution_count": 346,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def Mutual_info(X,y,NaN): # NaN = 'yes', 'no' or 'random fill'\n",
"    if NaN=='yes':\n",
"        print('Yes with NaNs')\n",
"        encode_exclude_nans(X)\n",
"    elif NaN=='random fill':\n",
"        print('Yeah with Random fill of NaNs..good choice')\n",
"        print('Please wait... Filling with Random generator so as to get accurate Mutual info score')\n",
"        encode_exclude_nans(X)\n",
"        fillnans_unique(X)\n",
"    elif NaN=='no':\n",
"        print('With NaNs of a particular column label encoded as single category ')\n",
"        encode_include_nans(X)\n",
"    NF_variable=[]\n",
"    mi=[]\n",
"    for i in X:\n",
"        x=X[i].ravel()\n",
"        score=metrics.mutual_info_score(x,y)\n",
"        mi.append(score)\n",
"        NF_variable.append(i)\n",
"    Mutual_info=pd.DataFrame({'Non Funding Variables':NF_variable,'MI Score':mi})\n",
"    return Mutual_info,X\n",
"\n",
"\n",
"def encode_include_nans(data): # NaNs become just another label-encoded category\n",
"    for i in data:\n",
"        try:\n",
"            data[i]=label.fit_transform(data[i])\n",
"        except (TypeError, ValueError): # mixed types: encode as strings\n",
"            data[i]=label.fit_transform(data[i].astype('str'))\n",
"    return data\n",
"\n",
"def encode_exclude_nans(data): # label-encode only the non-null entries\n",
"    for i in data:\n",
"        n_null=data.loc[:,i].notnull()\n",
"        try:\n",
"            data.loc[n_null,i]=label.fit_transform(data.loc[n_null,i])\n",
"        except (TypeError, ValueError):\n",
"            data.loc[n_null,i]=label.fit_transform(data.loc[n_null,i].astype('str'))\n",
"\n",
"    return data\n",
"\n",
"def fillnans_unique(data):\n",
"    # each NaN gets an independent random integer in [0, 1000); collisions are\n",
"    # possible but rare\n",
"    for i in data:\n",
"        data.loc[data[i].isna(),i]=data.loc[data[i].isna(),i].apply(lambda z: int(rnd.random() * 1000))\n",
"    return data\n"
]
},
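{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# The random fill above can collide (two NaNs drawing the same integer). A\n",
"# deterministic variant is sketched here purely as an alternative - itertools.count\n",
"# guarantees every fill value is distinct (the start value is an assumption,\n",
"# chosen to sit far above any label-encoded value):\n",
"import itertools\n",
"\n",
"def fillnans_unique_deterministic(df, start=100000):\n",
"    counter=itertools.count(start)\n",
"    for col in df:\n",
"        mask=df[col].isna()\n",
"        df.loc[mask,col]=[next(counter) for _ in range(mask.sum())]\n",
"    return df"
]
},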
{
"cell_type": "code",
"execution_count": 462,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#TestCase5\n",
"def test_encode_include_nans():\n",
"    S=data.shape\n",
"    Data=encode_include_nans(data)\n",
"    assert Data.isnull().sum().sum()==0\n",
"    assert Data.shape==S\n",
"# The same approach would work for fillnans_unique, but checking that every NaN\n",
"# is filled with a unique value hardly seems required, since the code itself uses\n",
"# a random function. That test is written below anyway (TestCase7).\n"
]
},
{
"cell_type": "code",
"execution_count": 370,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#TestCase6\n",
"def test_encode_exclude_nans():\n",
"    S=data.shape\n",
"    data1=data.copy() # running the previous test mutates data, so reload it before this one, and so on\n",
"    Data=encode_exclude_nans(data)\n",
"    assert Data.isnull().sum().sum()==data1.isnull().sum().sum()\n",
"    assert Data.shape==S"
]
},
{
"cell_type": "code",
"execution_count": 459,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#TestCase7\n",
"def test_fillnans_unique():\n",
"    S=data.shape\n",
"    Tot=data.isnull().sum().sum()\n",
"    N=np.where(data.isnull())\n",
"    Data=fillnans_unique(data)\n",
"    m=N[0]\n",
"    n=N[1]\n",
"    i=0\n",
"    val=[]\n",
"    while i<Tot:\n",
"        val.append(Data.iloc[m[i],n[i]])\n",
"        i=i+1\n",
"    Val=np.array(val)\n",
"    assert len(np.unique(Val))>900 # fills are drawn from range(1000), so at most 1000 unique values; >900 allows a few collisions\n",
"    assert len(val)==Tot\n",
"    assert Data.isnull().values.sum()==0\n",
"    assert Data.shape==S"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#TestCase8\n",
"def test_Mutual_info(): # an isinstance(..., pd.DataFrame) check could also be included\n",
"    Mutualinfo,_=Mutual_info(X,Target,NaN='random fill')\n",
"    assert (Mutualinfo['MI Score'].values>=0).all() # MI is non-negative by definition\n",
"    # An upper bound could be min(H(X), H(Y)); sklearn's MI uses the natural log,\n",
"    # so 1 bit = 0.693 nats. That much calculation is unnecessary here - checking\n",
"    # non-negativity is enough. Another option is normalized MI, which lies in [0, 1].\n"
]
},
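{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Following up on the normalized-MI comment: sklearn ships\n",
"# normalized_mutual_info_score, bounded in [0, 1], so a test can pin both ends.\n",
"# A minimal sketch on toy labels:\n",
"from sklearn.metrics import normalized_mutual_info_score\n",
"a=[0,0,1,1]\n",
"assert abs(normalized_mutual_info_score(a,a)-1.0)<1e-9 # identical labelings score 1\n",
"assert 0.0<normalized_mutual_info_score(a,[0,0,1,2])<1.0 # partial overlap lands strictly inside (0, 1)"
]
},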
{
"cell_type": "code",
"execution_count": 471,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 471,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(Mutualinfo['MI Score'].values>=0).all()"
]
},
{
"cell_type": "code",
"execution_count": 466,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Yeah with Random fill of NaNs..good choice\n",
"Please wait... Filling with Random generator so as to get accurate Mutual info score\n"
]
}
],
"source": [
"X=pd.read_excel('output.xlsx')\n",
"Target=np.array(target).ravel()\n",
"Mutualinfo,X=Mutual_info(X,Target,NaN='random fill')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#FEATURE SELECTION\n",
"def feature_selector(X,y,Method,n): # Method = 'MI_Chi', 'MI_JMIM' or 'Combine'\n",
"    if Method=='MI_Chi':\n",
"        print('Selection using Chi Test ')\n",
"        Mutualinfo,X=Mutual_info(X,y,NaN='random fill')\n",
"        Predictors=fselect_Chi(X,y,Mutualinfo)\n",
"\n",
"    elif Method=='MI_JMIM':\n",
"        print('Using MI-JMIM')\n",
"        Mutualinfo,X=Mutual_info(X,y,NaN='random fill')\n",
"        Predictors=fselect_MI(X,y,n)\n",
"\n",
"    elif Method=='Combine':\n",
"        print('Good Choice= First selecting significance from Chi Square Test then using JointMI_Maxim')\n",
"        Mutualinfo,X=Mutual_info(X,y,NaN='random fill')\n",
"        Predictors=fselect_Chi(X,y,Mutualinfo)\n",
"        X=X[Predictors]\n",
"        Predictors=fselect_MI(X,y,n)\n",
"    return Predictors\n",
"\n",
"def fselect_Chi(X,y,Mutualinfo): # use X with NaNs\n",
"    N=len(X)\n",
"    S=2*N # G-test: the statistic 2*N*MI follows a chi-square distribution,\n",
"          # so MI is compared against (critical value)/(2*N)\n",
"    predictors=[]\n",
"    levels=describe_more(X)[3]\n",
"    stop=len(Mutualinfo)\n",
"    i=0\n",
"    while i<stop:\n",
"        v=Mutualinfo.loc[i,'MI Score']; m=Mutualinfo.loc[i,'Non Funding Variables']; l=levels[i]\n",
"        if (l-1)<101: # critical value from the table, with df = levels - 1\n",
"            chi2=int(chivalue.loc[chivalue['df']==l-1,'value'].iloc[0])\n",
"            if v > chi2/S:\n",
"                predictors.append(m)\n",
"        else: # beyond the table: rough approximation of the critical value for large df\n",
"            chi2=l+30\n",
"            if v > chi2/S:\n",
"                predictors.append(m)\n",
"        i=i+1\n",
"    return predictors\n",
"\n",
"def fselect_MI(X,y,n): # use X without NaNs\n",
"    fselect=mifs.MutualInformationFeatureSelector(method='JMIM',verbose=2,n_features=n)\n",
"    fselect.fit(X,y)\n",
"    predictors=list(X.columns.values[fselect.support_])\n",
"    return predictors\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Nothing concrete to test in the functions above: they only return the selected\n",
"# predictors, so checking that the returned predictors appear in the column-name\n",
"# list is close to tautological. Still:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def test_common():\n",
"    assert len(Predictors)>0 # at least one predictor was selected\n",
"    assert set(Predictors)<=set(X.columns) # and each one is a real column\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# chi-square critical values at the 0.05 significance level, indexed by df\n",
"chivalue=pd.read_excel('chivalue0.05.xlsx')"
]
},
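{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# If scipy is available in the environment, the Excel lookup can be reproduced\n",
"# (and extended past df=100): chi2.ppf(0.95, df) gives the same 0.05-level\n",
"# critical values. A sketch:\n",
"from scipy.stats import chi2 as chi2_dist\n",
"chivalue_scipy=pd.DataFrame({'df':list(range(1,101)),\n",
"                             'value':[chi2_dist.ppf(0.95,d) for d in range(1,101)]})\n",
"print(chivalue_scipy.head(3)) # df=1 -> 3.841, df=2 -> 5.991, df=3 -> 7.815"
]
},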
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Good Choice= First selecting significance from Chi Square Test then using JointMI_Maxim\n",
"Yeah with Random fill of NaNs..good choice\n",
"Please wait... Filling with Random generator so as to get accurate Mutual info score\n",
"The values of X seem to be discrete. MI_FS will treat themas continuous.\n",
"Auto selected feature #1 : 155, JMIM : 0.03464492755578519\n",
"Auto selected feature #2 : 227, JMIM : 0.13394089720159696\n",
"Auto selected feature #3 : 36, JMIM : 0.0937736445613373\n",
"Auto selected feature #4 : 136, JMIM : 0.08837118163397051\n",
"Auto selected feature #5 : 121, JMIM : 0.07086600509042462\n",
"Auto selected feature #6 : 32, JMIM : 0.06361626454367242\n",
"Auto selected feature #7 : 211, JMIM : 0.05793786191883221\n",
"Auto selected feature #8 : 112, JMIM : 0.05290299718183045\n",
"Auto selected feature #9 : 6, JMIM : 0.05095195899821814\n",
"Auto selected feature #10 : 75, JMIM : 0.04988986868153056\n",
"Auto selected feature #11 : 30, JMIM : 0.04724294437541943\n",
"Auto selected feature #12 : 233, JMIM : 0.046240099893933184\n",
"Auto selected feature #13 : 76, JMIM : 0.04545460837845994\n",
"Auto selected feature #14 : 198, JMIM : 0.04121550410249908\n",
"Auto selected feature #15 : 66, JMIM : 0.03782311915604497\n",
"Auto selected feature #16 : 10, JMIM : 0.036012851315351924\n",
"Auto selected feature #17 : 9, JMIM : 0.03491219062214723\n",
"Auto selected feature #18 : 63, JMIM : 0.034557795989806905\n",
"Auto selected feature #19 : 2, JMIM : 0.03384765236413401\n",
"Auto selected feature #20 : 228, JMIM : 0.033076790276745704\n"
]
}
],
"source": [
"X=pd.read_excel('output.xlsx') # n = integer number of features required, or 'auto'\n",
"Target=np.array(target1).ravel() # target1 comes from the imputation experiments whose code is not included above\n",
"Predictors=feature_selector(X,Target,'Combine',n='auto')"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Selection using Chi Test \n",
"Yeah with Random fill of NaNs..good choice\n",
"Please wait... Filling with Random generator so as to get accurate Mutual info score\n"
]
}
],
"source": [
"X1=pd.read_excel('output.xlsx')\n",
"Target=np.array(target1).ravel()\n",
"Predictors1=feature_selector(X1,Target,'MI_Chi','auto')"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using MI-JMIM\n",
"Yeah with Random fill of NaNs..good choice\n",
"Please wait... Filling with Random generator so as to get accurate Mutual info score\n",
"The values of X seem to be discrete. MI_FS will treat themas continuous.\n",
"Auto selected feature #1 : 177, JMIM : 0.024703054814981584\n",
"Auto selected feature #2 : 201, JMIM : 0.1141356747785518\n",
"Auto selected feature #3 : 160, JMIM : 0.08063671995978172\n",
"Auto selected feature #4 : 246, JMIM : 0.07350125169901389\n",
"Auto selected feature #5 : 7, JMIM : 0.062003144548199884\n",
"Auto selected feature #6 : 147, JMIM : 0.059702424099167484\n",
"Auto selected feature #7 : 52, JMIM : 0.05930836194909306\n",
"Auto selected feature #8 : 211, JMIM : 0.05825950492165477\n",
"Auto selected feature #9 : 30, JMIM : 0.05556296114308612\n",
"Auto selected feature #10 : 207, JMIM : 0.05440233194913846\n",
"Auto selected feature #11 : 145, JMIM : 0.05191320524091614\n",
"Auto selected feature #12 : 200, JMIM : 0.050828749337887835\n",
"Auto selected feature #13 : 140, JMIM : 0.04937374021895735\n",
"Auto selected feature #14 : 11, JMIM : 0.04849695154590439\n",
"Auto selected feature #15 : 104, JMIM : 0.0461945742928509\n",
"Auto selected feature #16 : 156, JMIM : 0.045081608948996266\n",
"Auto selected feature #17 : 5, JMIM : 0.04317567207282069\n",
"Auto selected feature #18 : 1, JMIM : 0.043170961001380626\n",
"Auto selected feature #19 : 237, JMIM : 0.04105064924576718\n",
"Auto selected feature #20 : 206, JMIM : 0.040145750095715194\n",
"Auto selected feature #21 : 89, JMIM : 0.03993258944193023\n"
]
}
],
"source": [
"X2=pd.read_excel('output.xlsx')\n",
"Target=np.array(target1).ravel()\n",
"Predictors2=feature_selector(X2,Target,'MI_JMIM','auto')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#One-hot encoding with a threshold of 2.5% of the total sample count\n",
"def get_columns(data,predictors): # pass data with NaNs - returns the columns requiring one-hot encoding\n",
"    col_onehot=[]\n",
"    data=data[predictors]\n",
"    threshold=int(0.025*len(data))\n",
"    for x in data:\n",
"        yum=data[x]\n",
"        if not np.issubdtype(yum.dtype,np.number): # only non-numeric columns are candidates\n",
"            count=yum.value_counts()\n",
"            j=0\n",
"            for i in count:\n",
"                if int(i)>threshold:\n",
"                    j=j+1\n",
"                if j>1: # at least two levels clear the threshold\n",
"                    col_onehot.append(x)\n",
"                    break\n",
"    return col_onehot\n",
"\n",
"def split_columns(data,X,predictors): # one-hot encoding with a threshold of 2.5% of N\n",
"    col_onehot=get_columns(data,predictors)\n",
"    threshold = int(0.025*len(data))\n",
"    X=X[predictors]\n",
"    for i in col_onehot:\n",
"        counts = X[i].value_counts()\n",
"        repl = counts[counts <= threshold].index # rare levels are collapsed into 'uncommon'\n",
"        X=pd.concat([pd.get_dummies(X[i].replace(repl,'uncommon'),prefix=i),X.drop(i,axis=1)],axis=1)\n",
"    return X\n"
]
},
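{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# A toy illustration of the rare-level collapse used in split_columns (made-up\n",
"# categories; a threshold of 2 stands in for the 2.5% cut):\n",
"s=pd.Series(['US','US','US','UK','UK','FR'])\n",
"counts=s.value_counts()\n",
"rare=counts[counts<=2].index # UK and FR fall at or below the cut\n",
"print(pd.get_dummies(s.replace(rare,'uncommon'),prefix='country').columns.tolist())\n",
"# ['country_US', 'country_uncommon']"
]
},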
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# get_columns just returns the column names that need one-hot encoding, so it is\n",
"# the same situation as with the predictors. Still...\n",
"#TestCase9,10\n",
"def test_get_columns():\n",
"    col_onehot=get_columns(data,Predictors)\n",
"    assert isinstance(col_onehot,list)\n",
"\n",
"def test_split_columns(): # get_dummies is not a user-built function, so there is no need to test\n",
"    X2=split_columns(data,X,Predictors) # that the number of columns per split category == its number of levels\n",
"    assert X2.shape[1]>data[Predictors].shape[1] # assuming col_onehot is non-empty, else >= instead of >"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
} |