gsampath127/chi.py

## chi.py
#!/usr/bin/env python
# coding: utf-8

# ## Perform a Chi-Square test for bank churn prediction (finding patterns in which customers leave the bank). Only a few columns are considered here to keep things clear.

# ### Import libraries

# In[2]:


import numpy as numpy
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder


# ### Get the data

# In[6]:


# Read the bank churn dataset from the working directory.
churn_df = pd.read_csv('bank.csv')


# In[7]:


# Notebook-style preview of the first rows (a bare expression: it only
# displays output in an interactive session).
churn_df.head()


# ### The data has 4 categorical predictors and one categorical response.
# ### Exited, the response column, indicates whether the customer left the bank.

# ## Before performing the Chi-Square test, make sure the data is label encoded.

# In[9]:


# One encoder is reused and re-fitted per column, so after the loop it only
# remembers the classes of the last column ('Gender').
label_encoder = LabelEncoder()
for column in ('Geography', 'Gender'):
    churn_df[column] = label_encoder.fit_transform(churn_df[column])


# In[11]:


# Preview again to check the encoded columns.
churn_df.head()


# ## Chi-Square test

# In[13]:


from sklearn.feature_selection import chi2


# In[14]:


# Separate the response column from the predictors.
y = churn_df['Exited']
X = churn_df.drop(columns='Exited')


# In[15]:


chi_scores = chi2(X, y)


# In[16]:


# Notebook-style display of the raw result.
chi_scores


# ### The first array holds the chi-square statistics and the second array holds the p-values.

# In[17]:


# Label each p-value with its feature name and order from largest to smallest.
p_values = pd.Series(chi_scores[1], index=X.columns).sort_values(ascending=False)


# In[19]:


p_values.plot.bar()


# ### Since HasCrCard has the highest p-value, the test indicates that this variable is independent of the response, so it can be left out of model training.

# In[ ]:


## gistfile1.txt
#!/usr/bin/env python
# coding: utf-8

# ## Perform a Chi-Square test for bank churn prediction (finding patterns in which customers leave the bank). Only a few columns are considered here to keep things clear.

# ### Import libraries

# In[2]:


import numpy as numpy
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder


# ### Get the data

# In[6]:


# Read the bank churn dataset from the working directory.
churn_df = pd.read_csv('bank.csv')


# In[7]:


# Notebook-style preview of the first rows (a bare expression: it only
# displays output in an interactive session).
churn_df.head()


# ### The data has 4 categorical predictors and one categorical response.
# ### Exited, the response column, indicates whether the customer left the bank.

# ## Before performing the Chi-Square test, make sure the data is label encoded.

# In[9]:


# One encoder is reused and re-fitted per column, so after the loop it only
# remembers the classes of the last column ('Gender').
label_encoder = LabelEncoder()
for column in ('Geography', 'Gender'):
    churn_df[column] = label_encoder.fit_transform(churn_df[column])


# In[11]:


# Preview again to check the encoded columns.
churn_df.head()


# ## Chi-Square test

# In[13]:


from sklearn.feature_selection import chi2


# In[14]:


# Separate the response column from the predictors.
y = churn_df['Exited']
X = churn_df.drop(columns='Exited')


# In[15]:


chi_scores = chi2(X, y)


# In[16]:


# Notebook-style display of the raw result.
chi_scores


# ### The first array holds the chi-square statistics and the second array holds the p-values.

# In[17]:


# Label each p-value with its feature name and order from largest to smallest.
p_values = pd.Series(chi_scores[1], index=X.columns).sort_values(ascending=False)


# In[19]:


p_values.plot.bar()


# ### Since HasCrCard has the highest p-value, the test indicates that this variable is independent of the response, so it can be left out of model training.

# In[ ]:
	#!/usr/bin/env python
	# coding: utf-8

	# ## Perform a Chi-Square test for bank churn prediction (finding patterns in which customers leave the bank). Only a few columns are considered here to keep things clear.

	# ### Import libraries

	# In[2]:


	import numpy as numpy
	import pandas as pd
	import seaborn as sns
	from sklearn.preprocessing import LabelEncoder


	# ### Get the data

	# In[6]:


	# Read the bank churn dataset from the working directory.
	churn_df = pd.read_csv('bank.csv')


	# In[7]:


	# Notebook-style preview of the first rows (a bare expression: it only
	# displays output in an interactive session).
	churn_df.head()


	# ### The data has 4 categorical predictors and one categorical response.
	# ### Exited, the response column, indicates whether the customer left the bank.

	# ## Before performing the Chi-Square test, make sure the data is label encoded.

	# In[9]:


	# One encoder is reused and re-fitted per column, so after the loop it only
	# remembers the classes of the last column ('Gender').
	label_encoder = LabelEncoder()
	for column in ('Geography', 'Gender'):
		churn_df[column] = label_encoder.fit_transform(churn_df[column])


	# In[11]:


	# Preview again to check the encoded columns.
	churn_df.head()


	# ## Chi-Square test

	# In[13]:


	from sklearn.feature_selection import chi2


	# In[14]:


	# Separate the response column from the predictors.
	y = churn_df['Exited']
	X = churn_df.drop(columns='Exited')


	# In[15]:


	chi_scores = chi2(X, y)


	# In[16]:


	# Notebook-style display of the raw result.
	chi_scores


	# ### The first array holds the chi-square statistics and the second array holds the p-values.

	# In[17]:


	# Label each p-value with its feature name and order from largest to smallest.
	p_values = pd.Series(chi_scores[1], index=X.columns).sort_values(ascending=False)


	# In[19]:


	p_values.plot.bar()


	# ### Since HasCrCard has the highest p-value, the test indicates that this variable is independent of the response, so it can be left out of model training.

	# In[ ]: