Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save parvathysarat/2296210724987f3874779d382f0261c2 to your computer and use it in GitHub Desktop.
Save parvathysarat/2296210724987f3874779d382f0261c2 to your computer and use it in GitHub Desktop.
Analytics Vidhya workshop problem - Classify income as <50K or >=50K - accuracy 0.8051716725016891
# --- Load the data and take a quick structural overview ---
import pandas as pd
import numpy as np  # was missing in the original; np.percentile is used below

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Column dtypes: object columns are categorical, the rest continuous.
train.dtypes
# Summary statistics for the continuous variables
train.describe()
# Index of the categorical (object-dtype) column names
categorical = train.dtypes.loc[train.dtypes == "object"].index
categorical
# Number of distinct levels per categorical column (NaN counts as a level here)
train[categorical].apply(lambda x: len(x.unique()))
# Inter-quartile range of Age: 75th minus 25th percentile
np.percentile(train["Age"], 75) - np.percentile(train["Age"], 25)
#Univariate analysis
# Absolute class counts of Race, then the same as a fraction of all rows
train["Race"].value_counts()
train["Race"].value_counts()/train.shape[0]
# Same two views for Native.Country — most mass sits in one category
train["Native.Country"].value_counts()
train["Native.Country"].value_counts()/train.shape[0]
#Multivariate analysis
#BOTH Categorical variables = crosstab or confusion matrix
# margins=True appends an "All" row/column with the totals
ct=pd.crosstab(train["Sex"],train["Income.Group"],margins=True)
ct
# IPython magic (this is a notebook dump) — render plots inline
%matplotlib inline
# Drop the "All" margins (last row/col) before plotting the stacked bars
ct.iloc[:-1,:-1].plot(kind="bar",stacked=True,color=["red","blue"],grid=False)
#Convert to percent
def percConvert(ser):
    """Scale a crosstab row to proportions of its row total.

    `ser` is one row of a crosstab built with margins=True, so its last
    element is the "All" total. Returns the row divided by that total.
    Uses .iloc[-1] instead of the original ser[-1], whose positional
    fallback on a labeled Series is deprecated/removed in modern pandas.
    """
    return ser / float(ser.iloc[-1])
# Row-wise percentages of the Sex x Income.Group crosstab
ct2=ct.apply(percConvert,axis=1)
#BOTH categorical - stacked bar chart
# Again drop the "All" margins before plotting
ct2.iloc[:-1,:-1].plot(kind="bar",stacked=True,color=["red","black"],grid=False)
#Both continuous
#Plot - scatter
train.plot("Age","Hours.Per.Week",kind="scatter")
#Categorical - Continuous = BOX PLOTS
# One box of Hours.Per.Week per Sex category
train.boxplot(column="Hours.Per.Week",by="Sex")
# --- Missing-value imputation ---
# Count missing values per column before imputation
train.apply(lambda x: sum(x.isnull()))
test.apply(lambda x: sum(x.isnull()))

from scipy.stats import mode  # imported in the original; not used below

# Sanity-check the value we are about to impute for Workclass
train["Workclass"].mode().values[0]

# Fill NaNs in these categorical columns with the TRAIN mode; test is
# filled from train statistics so both splits use the same value.
# (Loop body indentation was lost in the original paste — restored here.)
var_to_impute = ["Workclass", "Occupation", "Native.Country"]
for var in var_to_impute:
    train[var].fillna(train[var].mode().values[0], inplace=True)
    test[var].fillna(train[var].mode().values[0], inplace=True)

# Verify no missing values remain in either split
train.apply(lambda x: sum(x.isnull()))
test.apply(lambda x: sum(x.isnull()))
#OUTLIER TREATMENT
%matplotlib inline
train.plot("ID","Age",kind="scatter")
#no outliers
#Variable Trasnformation
train.dtypes
categorical_variables=list(train.dtypes.loc[train.dtypes=="object"].index)
categorical_variables
train[categorical_variables].apply(lambda x: len(x.unique()))
#Run loop over these values, combine less frequency values wala ones into Others
for column in categorical_variables:
#Determining categories to combine:
freq=train[column].value_counts()/train.shape[0]
categories_to_combine=freq.loc[freq.values<0.05].index
#loop over all categories and combine them as others
for cat in categories_to_combine:
train[column].replace({cat: "Others"},inplace=True)
test[column].replace({cat: "Others"},inplace=True)
#checking new categories
train["Workclass"].value_counts()/train.shape[0]
test["Workclass"].value_counts()/test.shape[0]
#Data preprocessing
from sklearn.preprocessing import LabelEncoder

categorical_variables = train.dtypes.loc[train.dtypes == "object"].index
le = LabelEncoder()
for var in categorical_variables:
    # Fit the mapping on train and REUSE it for test. The original called
    # fit_transform on test as well, which can assign different integer
    # codes to the same category in the two splits.
    train[var] = le.fit_transform(var_series := train[var])
    if var in test.columns:
        # The target (Income.Group) is typically absent from test, so guard.
        # NOTE(review): transform raises on labels unseen in train; rare
        # categories were already collapsed into "Others" above, which
        # should make this safe — confirm against the actual data.
        test[var] = le.transform(test[var])
# After the loop `le` remains fitted on the LAST categorical column;
# later code relies on that for inverse_transform of Income.Group.
train.dtypes
# --- Fit the decision tree and write the submission file ---
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score  # was missing in the original

dependent_variable = "Income.Group"
# Every column except the row ID and the target is a predictor
independent_variable = [x for x in train.columns if x not in ["ID", dependent_variable]]

# Initialize algorithm; depth / leaf-size limits regularize the tree
model = DecisionTreeClassifier(max_depth=10, min_samples_leaf=100, max_features="sqrt")
model.fit(train[independent_variable], train[dependent_variable])

# Training-set accuracy
predictions_train = model.predict(train[independent_variable])
acc1 = accuracy_score(train[dependent_variable], predictions_train)

# Sanity check: decode the training target back to its string labels
le.inverse_transform(train["Income.Group"])

# Predict on the test set. The original used predictions_test before ever
# defining it — the model.predict call below was missing entirely.
predictions_test = model.predict(test[independent_variable])
# inverse transform - get income group back as "<=50K" / ">50K" strings.
# NOTE(review): this assumes `le` is still fitted on Income.Group (the last
# column encoded) — fragile; confirm the column order holds.
predictions_test = le.inverse_transform(predictions_test)

submission = pd.DataFrame({"ID": test["ID"], "Income.Group": predictions_test})
# save submission file; index=False keeps only the two expected columns
submission.to_csv("submission.csv", index=False)
acc1
# 0.8080525782377691
#accuracy of test prediction 0.8051716725016891
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment