Created
September 18, 2017 08:51
-
-
Save parvathysarat/2296210724987f3874779d382f0261c2 to your computer and use it in GitHub Desktop.
Analytics Vidhya workshop problem — classify income as &lt;50K or &gt;=50K (test accuracy 0.8052)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
train=pd.read_csv("train.csv") | |
test=pd.read_csv("test.csv") | |
train.dtypes | |
#continuous variables | |
train.describe() | |
#categorical variables | |
categorical=train.dtypes.loc[train.dtypes=="object"].index | |
categorical | |
train[categorical].apply(lambda x: len(x.unique())) | |
#Inter Quartile Range of Age | |
np.percentile(train["Age"],75)-np.percentile(train["Age"],25) | |
#Univariate analysis | |
train["Race"].value_counts() | |
train["Race"].value_counts()/train.shape[0] | |
train["Native.Country"].value_counts() | |
train["Native.Country"].value_counts()/train.shape[0] | |
#Multivariate analysis | |
#BOTH Categorical variables = crosstab or confusion matrix | |
ct=pd.crosstab(train["Sex"],train["Income.Group"],margins=True) | |
ct | |
%matplotlib inline | |
ct.iloc[:-1,:-1].plot(kind="bar",stacked=True,color=["red","blue"],grid=False) | |
# Convert each crosstab row to fractions of its row total.
def percConvert(ser):
    """Return *ser* divided by its last entry (the row total).

    With ``pd.crosstab(..., margins=True)``, the last entry of each row is
    the "All" margin, i.e. the row total, so this yields row percentages.

    FIX: the original used positional ``ser[-1]``, which relies on the
    integer-fallback indexing pandas has deprecated and removed for
    label-indexed Series; ``.iloc[-1]`` is the explicit positional access.
    """
    return ser / float(ser.iloc[-1])
# Express the Sex x Income.Group crosstab as row-wise percentages.
ct_frac = ct.apply(percConvert, axis=1)

# Categorical vs categorical: stacked bar chart of the percentage table
# (the margin row/column from margins=True is dropped before plotting).
ct_frac.iloc[:-1, :-1].plot(kind="bar", stacked=True, color=["red", "black"], grid=False)

# Continuous vs continuous: scatter plot
train.plot("Age", "Hours.Per.Week", kind="scatter")

# Categorical vs continuous: box plots of hours worked, split by sex
train.boxplot(column="Hours.Per.Week", by="Sex")
# --- Missing-value treatment ---
# Count nulls per column in both datasets.
train.apply(lambda x: sum(x.isnull()))
test.apply(lambda x: sum(x.isnull()))

from scipy.stats import mode
# Most frequent Workclass value (pandas .mode() is what the loop below uses)
train["Workclass"].mode().values[0]

# Impute missing categoricals with the TRAIN mode; test is filled with the
# train statistic too, so the two datasets stay consistent.
var_to_impute = ["Workclass", "Occupation", "Native.Country"]
for var in var_to_impute:
    fill_value = train[var].mode().values[0]
    # FIX: assign instead of chained `inplace=True` on a column selection,
    # which is unreliable under pandas copy-on-write and is deprecated.
    train[var] = train[var].fillna(fill_value)
    test[var] = test[var].fillna(fill_value)

# Verify no missing values remain
train.apply(lambda x: sum(x.isnull()))
test.apply(lambda x: sum(x.isnull()))
#OUTLIER TREATMENT | |
%matplotlib inline | |
train.plot("ID","Age",kind="scatter") | |
#no outliers | |
#Variable Trasnformation | |
train.dtypes | |
categorical_variables=list(train.dtypes.loc[train.dtypes=="object"].index) | |
categorical_variables | |
train[categorical_variables].apply(lambda x: len(x.unique())) | |
#Run loop over these values, combine less frequency values wala ones into Others | |
for column in categorical_variables: | |
#Determining categories to combine: | |
freq=train[column].value_counts()/train.shape[0] | |
categories_to_combine=freq.loc[freq.values<0.05].index | |
#loop over all categories and combine them as others | |
for cat in categories_to_combine: | |
train[column].replace({cat: "Others"},inplace=True) | |
test[column].replace({cat: "Others"},inplace=True) | |
#checking new categories | |
train["Workclass"].value_counts()/train.shape[0] | |
test["Workclass"].value_counts()/test.shape[0] | |
# --- Preprocessing: encode categorical columns as integers ---
from sklearn.preprocessing import LabelEncoder

categorical_variables = train.dtypes.loc[train.dtypes == "object"].index
le = LabelEncoder()
for var in categorical_variables:
    # FIX: fit the encoder on train only and reuse the fitted mapping for
    # test.  The original called fit_transform on both, which can assign
    # different integer codes to the same category in train vs test.
    train[var] = le.fit_transform(train[var])
    # The target column (Income.Group) is absent from test; skip it there.
    # NOTE(review): transform() assumes test has no categories unseen in
    # train -- plausible after the rare-category "Others" bucketing above,
    # but worth confirming.
    if var in test.columns:
        test[var] = le.transform(test[var])
train.dtypes
# --- Model fitting ---
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score  # FIX: used below but never imported

dependent_variable = "Income.Group"
independent_variable = [x for x in train.columns if x not in ["ID", dependent_variable]]

# Initialize and fit the algorithm
model = DecisionTreeClassifier(max_depth=10, min_samples_leaf=100, max_features="sqrt")
model.fit(train[independent_variable], train[dependent_variable])

# Training accuracy
predictions_train = model.predict(train[independent_variable])
acc1 = accuracy_score(train[dependent_variable], predictions_train)

# FIX: predictions_test was referenced below without ever being computed.
predictions_test = model.predict(test[independent_variable])

# Inverse-transform back to the "<=50K" / ">50K" string labels.
# NOTE(review): this assumes `le` was last fitted on Income.Group in the
# encoding loop above -- true only if it is the final object column; a safer
# design would keep one encoder per column.
le.inverse_transform(train["Income.Group"])
predictions_test = le.inverse_transform(predictions_test)

submission = {"ID": test["ID"], "Income.Group": predictions_test}
submission = pd.DataFrame(submission)
# Save the submission file.
# FIX: index=False so the CSV does not gain a spurious row-index column.
submission.to_csv("submission.csv", index=False)

acc1
# 0.8080525782377691
# accuracy of test prediction 0.8051716725016891
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment