Skip to content

Instantly share code, notes, and snippets.

@HackerEarthBlog
Last active January 31, 2017 08:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save HackerEarthBlog/5e5f407d1257d74a92f6a9cb64f980de to your computer and use it in GitHub Desktop.
Save HackerEarthBlog/5e5f407d1257d74a92f6a9cb64f980de to your computer and use it in GitHub Desktop.
# Working directory for this script; relative file names used further down
# are resolved against it.
path <- "~/mydata/deeplearning"
setwd(path)
#load libraries
library(data.table)
library(mlr)
#set variable names
# The 15 column names handed to read.table() via col.names, in column order.
# Several contain hyphens, which are not syntactic in R identifiers.
setcol <- c(
  "age", "workclass", "fnlwgt", "education", "education-num",
  "marital-status", "occupation", "relationship", "race", "sex",
  "capital-gain", "capital-loss", "hours-per-week", "native-country",
  "target"
)
#load data
# Read one headerless, comma-separated file and return it as a data.table.
#   file: name of the file to read.
#   skip: number of leading lines to ignore before reading data.
# Fields that are exactly " ?" (leading space included) are read as NA, and
# strings stay character rather than being converted to factors on read.
read_adult <- function(file, skip = 0L) {
  dt <- read.table(
    file,
    # TRUE/FALSE are spelled out: T and F are ordinary variables that can be
    # reassigned, which would silently change how the file is parsed.
    header = FALSE,
    sep = ",",
    col.names = setcol,
    skip = skip,
    na.strings = " ?",
    stringsAsFactors = FALSE
  )
  setDT(dt) # convert to data.table by reference
  dt
}

train <- read_adult("adultdata.txt")
# The first line of the test file is skipped.
test <- read_adult("adulttest.txt", skip = 1L)
#Data Sanity
# Dimensions and column structure of both tables.
dim(train) #32561 X 15
dim(test) #16281 X 15
str(train)
str(test)

#check missing values
# Overall count of NA / non-NA cells, then the percentage of NA per column.
table(is.na(train))
vapply(train, function(col) sum(is.na(col)) / length(col), numeric(1)) * 100
table(is.na(test))
vapply(test, function(col) sum(is.na(col)) / length(col), numeric(1)) * 100

#check target variable
#binary in nature check if data is imbalanced
# Share of rows falling into each target class.
train[, .N / nrow(train), by = target]
test[, .N / nrow(test), by = target]
#remove extra characters
# Drop the last character of every test label.
# NOTE(review): presumably the test file's labels carry one trailing character
# that the train labels lack -- confirm against the raw file.
test[, target := substr(target, start = 1, stop = nchar(target) - 1)]

#remove leading whitespace
library(stringr)
# Trim both tables, not just train, so the same category is spelled
# identically in each; otherwise the factor levels built below (and the target
# labels) differ between train and test by a leading space.
char_col <- colnames(train)[vapply(train, is.character, logical(1))]
for (i in char_col) {
  set(train, j = i, value = str_trim(train[[i]], side = "left"))
  # Guard so a non-character test column is never coerced to character here.
  if (is.character(test[[i]])) {
    set(test, j = i, value = str_trim(test[[i]], side = "left"))
  }
}

#set all character variables as factor
# Each table gets its own factor levels, derived from the values it contains.
fact_col <- colnames(train)[vapply(train, is.character, logical(1))]
for (i in fact_col) {
  set(train, j = i, value = factor(train[[i]]))
  set(test, j = i, value = factor(test[[i]]))
}
#impute missing values
# Integer columns are filled with the median and factor columns with the mode;
# "target" is passed as the target column.
impute_classes <- list(integer = imputeMedian(), factor = imputeMode())

# The table is passed positionally: the first argument of mlr's impute() has
# been named `data` in some releases and `obj` in others, so naming it
# `data =` fails with an "unused argument" error where it is called `obj`.
imp1 <- impute(train, target = "target", classes = impute_classes)
# NOTE(review): test is imputed from its own medians/modes rather than the
# values learned on train (mlr::reimpute() with imp1$desc would reuse those)
# -- confirm this is intended.
imp2 <- impute(test, target = "target", classes = impute_classes)

# Keep the imputed tables as data.tables.
train <- setDT(imp1$data)
test <- setDT(imp2$data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment