Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save parvathysarat/2296210724987f3874779d382f0261c2 to your computer and use it in GitHub Desktop.
Save parvathysarat/2296210724987f3874779d382f0261c2 to your computer and use it in GitHub Desktop.
Analytics Vidhya workshop problem - Classify income as <50K or >=50K - accuracy 0.8051716725016891
# --- Load the data and take a quick structural overview ---
import pandas as pd
import numpy as np  # was missing in the original; np.percentile is used below

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Column dtypes: object columns are categorical, the rest continuous.
train.dtypes
# Summary statistics for the continuous variables
train.describe()
# Index of the categorical (object-dtype) column names
categorical = train.dtypes.loc[train.dtypes == "object"].index
categorical
# Number of distinct levels per categorical column (NaN counts as a level here)
train[categorical].apply(lambda x: len(x.unique()))
# Inter-quartile range of Age: 75th minus 25th percentile
np.percentile(train["Age"], 75) - np.percentile(train["Age"], 25)
#Univariate analysis
# Absolute class counts of Race, then the same as a fraction of all rows
train["Race"].value_counts()
train["Race"].value_counts()/train.shape[0]
# Same two views for Native.Country — most mass sits in one category
train["Native.Country"].value_counts()
train["Native.Country"].value_counts()/train.shape[0]
#Multivariate analysis
#BOTH Categorical variables = crosstab or confusion matrix
# margins=True appends an "All" row/column with the totals
ct=pd.crosstab(train["Sex"],train["Income.Group"],margins=True)
ct
# IPython magic (this is a notebook dump) — render plots inline
%matplotlib inline
# Drop the "All" margins (last row/col) before plotting the stacked bars
ct.iloc[:-1,:-1].plot(kind="bar",stacked=True,color=["red","blue"],grid=False)
#Convert to percent
def percConvert(ser):
    """Scale a crosstab row to proportions of its row total.

    `ser` is one row of a crosstab built with margins=True, so its last
    element is the "All" total. Returns the row divided by that total.
    Uses .iloc[-1] instead of the original ser[-1], whose positional
    fallback on a labeled Series is deprecated/removed in modern pandas.
    """
    return ser / float(ser.iloc[-1])
# Row-wise percentages of the Sex x Income.Group crosstab
ct2=ct.apply(percConvert,axis=1)
#BOTH categorical - stacked bar chart
# Again drop the "All" margins before plotting
ct2.iloc[:-1,:-1].plot(kind="bar",stacked=True,color=["red","black"],grid=False)
#Both continuous
#Plot - scatter
train.plot("Age","Hours.Per.Week",kind="scatter")
#Categorical - Continuous = BOX PLOTS
# One box of Hours.Per.Week per Sex category
train.boxplot(column="Hours.Per.Week",by="Sex")
# --- Missing-value imputation ---
# Count missing values per column before imputation
train.apply(lambda x: sum(x.isnull()))
test.apply(lambda x: sum(x.isnull()))

from scipy.stats import mode  # imported in the original; not used below

# Sanity-check the value we are about to impute for Workclass
train["Workclass"].mode().values[0]

# Fill NaNs in these categorical columns with the TRAIN mode; test is
# filled from train statistics so both splits use the same value.
# (Loop body indentation was lost in the original paste — restored here.)
var_to_impute = ["Workclass", "Occupation", "Native.Country"]
for var in var_to_impute:
    train[var].fillna(train[var].mode().values[0], inplace=True)
    test[var].fillna(train[var].mode().values[0], inplace=True)

# Verify no missing values remain in either split
train.apply(lambda x: sum(x.isnull()))
test.apply(lambda x: sum(x.isnull()))
#OUTLIER TREATMENT
%matplotlib inline
train.plot("ID","Age",kind="scatter")
#no outliers
#Variable Trasnformation
train.dtypes
categorical_variables=list(train.dtypes.loc[train.dtypes=="object"].index)
categorical_variables
train[categorical_variables].apply(lambda x: len(x.unique()))
#Run loop over these values, combine less frequency values wala ones into Others
for column in categorical_variables:
#Determining categories to combine:
freq=train[column].value_counts()/train.shape[0]
categories_to_combine=freq.loc[freq.values<0.05].index
#loop over all categories and combine them as others
for cat in categories_to_combine:
train[column].replace({cat: "Others"},inplace=True)
test[column].replace({cat: "Others"},inplace=True)
#checking new categories
train["Workclass"].value_counts()/train.shape[0]
test["Workclass"].value_counts()/test.shape[0]
#Data preprocessing
from sklearn.preprocessing import LabelEncoder

categorical_variables = train.dtypes.loc[train.dtypes == "object"].index
le = LabelEncoder()
for var in categorical_variables:
    # Fit the mapping on train and REUSE it for test. The original called
    # fit_transform on test as well, which can assign different integer
    # codes to the same category in the two splits.
    train[var] = le.fit_transform(var_series := train[var])
    if var in test.columns:
        # The target (Income.Group) is typically absent from test, so guard.
        # NOTE(review): transform raises on labels unseen in train; rare
        # categories were already collapsed into "Others" above, which
        # should make this safe — confirm against the actual data.
        test[var] = le.transform(test[var])
# After the loop `le` remains fitted on the LAST categorical column;
# later code relies on that for inverse_transform of Income.Group.
train.dtypes
# --- Fit the decision tree and write the submission file ---
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score  # was missing in the original

dependent_variable = "Income.Group"
# Every column except the row ID and the target is a predictor
independent_variable = [x for x in train.columns if x not in ["ID", dependent_variable]]

# Initialize algorithm; depth / leaf-size limits regularize the tree
model = DecisionTreeClassifier(max_depth=10, min_samples_leaf=100, max_features="sqrt")
model.fit(train[independent_variable], train[dependent_variable])

# Training-set accuracy
predictions_train = model.predict(train[independent_variable])
acc1 = accuracy_score(train[dependent_variable], predictions_train)

# Sanity check: decode the training target back to its string labels
le.inverse_transform(train["Income.Group"])

# Predict on the test set. The original used predictions_test before ever
# defining it — the model.predict call below was missing entirely.
predictions_test = model.predict(test[independent_variable])
# inverse transform - get income group back as "<=50K" / ">50K" strings.
# NOTE(review): this assumes `le` is still fitted on Income.Group (the last
# column encoded) — fragile; confirm the column order holds.
predictions_test = le.inverse_transform(predictions_test)

submission = pd.DataFrame({"ID": test["ID"], "Income.Group": predictions_test})
# save submission file; index=False keeps only the two expected columns
submission.to_csv("submission.csv", index=False)
acc1
# 0.8080525782377691
#accuracy of test prediction 0.8051716725016891
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment