Last active
January 31, 2017 08:17
-
-
Save HackerEarthBlog/5e5f407d1257d74a92f6a9cb64f980de to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Working directory holding the adult data files.
# NOTE(review): setwd() ties the script to this machine-specific path; it is
# kept because the relative file names below resolve against it.
path <- "~/mydata/deeplearning"
setwd(path)

# load libraries
library(data.table)
library(mlr)

# set variable names
# read.table() passes these through make.names() (check.names = TRUE is its
# default), so hyphenated names end up syntactic, e.g. "education.num".
setcol <- c(
  "age",
  "workclass",
  "fnlwgt",
  "education",
  "education-num",
  "marital-status",
  "occupation",
  "relationship",
  "race",
  "sex",
  "capital-gain",
  "capital-loss",
  "hours-per-week",
  "native-country",
  "target"
)

# load data
# The literal string " ?" (leading space included) is read as NA; character
# columns are kept as character for now.
train <- read.table(
  "adultdata.txt",
  header = FALSE,
  sep = ",",
  col.names = setcol,
  na.strings = c(" ?"),
  stringsAsFactors = FALSE
)

# skip = 1 drops the first line of the test file (presumably a non-data
# header line -- confirm against the file).
test <- read.table(
  "adulttest.txt",
  header = FALSE,
  sep = ",",
  col.names = setcol,
  skip = 1,
  na.strings = c(" ?"),
  stringsAsFactors = FALSE
)

# convert both data frames to data.tables by reference
setDT(train)
setDT(test)
# Data Sanity
dim(train) # 32561 X 15
dim(test)  # 16281 X 15
str(train)
str(test)

# check missing values
# Overall count of missing vs. non-missing cells, then the percentage of
# missing values per column. vapply() pins the result to one numeric per
# column, unlike sapply() whose return type depends on the input.
table(is.na(train))
vapply(train, function(x) sum(is.na(x)) / length(x), numeric(1)) * 100
table(is.na(test))
vapply(test, function(x) sum(is.na(x)) / length(x), numeric(1)) * 100

# check target variable
# binary in nature; check if data is imbalanced by looking at the share of
# rows in each target class
train[, .N / nrow(train), by = target]
test[, .N / nrow(test), by = target]
# remove extra characters
# Drop the trailing "." from the test labels. The pattern is anchored, so
# only a final period is removed and re-running the line does not keep
# truncating the labels.
test[, target := sub("\\.$", "", target)]

# remove leading whitespace
# Both tables are trimmed; otherwise test would keep values such as
# " Private" that never match the trimmed values in train.
library(stringr)
char_col <- colnames(train)[vapply(train, is.character, logical(1))]
for (i in char_col) {
  set(train, j = i, value = str_trim(train[[i]], side = "left"))
  set(test, j = i, value = str_trim(test[[i]], side = "left"))
}

# set all character variables as factor
# The column list is taken from train and applied to both tables.
fact_col <- colnames(train)[vapply(train, is.character, logical(1))]
for (i in fact_col) {
  set(train, j = i, value = factor(train[[i]]))
  set(test, j = i, value = factor(test[[i]]))
}

# impute missing values
# Integer columns get the median, factor columns the mode; the target column
# is excluded from imputation. Each table is imputed from its own values.
imp1 <- impute(
  data = train,
  target = "target",
  classes = list(integer = imputeMedian(), factor = imputeMode())
)
imp2 <- impute(
  data = test,
  target = "target",
  classes = list(integer = imputeMedian(), factor = imputeMode())
)

# keep the imputed data, converted back to data.table
train <- setDT(imp1$data)
test <- setDT(imp2$data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment