Created
July 25, 2017 14:49
-
-
Save bwv988/670d351c07e17524b845d6091002de98 to your computer and use it in GitHub Desktop.
Data Manipulation in H2O
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Data manipulation with H2O
# RS25072017

# library() stops with an error when the package is missing; require() would
# only warn and return FALSE, so the failure would surface later at h2o.init().
library(h2o)

# Set up H2O before any of the h2o.* calls further down.
h2o.init()
# 1. The R way. ---------------------
# Using the 1984 Congressional Voting Records from UCI.
voting.url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data"

# Load the data. stringsAsFactors is pinned so the columns are plain character
# vectors no matter which default the running R version uses.
party.data <- read.table(voting.url, sep = ",", stringsAsFactors = FALSE)

# First column is the party label, followed by the 16 vote columns.
colnames(party.data) <- c("party", paste0("vote", 1:16))

# Treat the "?" entries (this script's stand-in for "NA" values) as "n":
# one masked assignment over the whole data frame.
party.data[party.data == "?"] <- "n"

# No more "?" in the df:
head(party.data)
# 2. The H2O way. ---------------------
feature.cols <- paste0("vote", 1:16)
all.cols <- c("party", feature.cols)

# Use some extra args to specify column names and types.
# Every column is imported as a string; the number of types is derived from
# the column names so the two vectors cannot drift apart.
voting.data.raw <- h2o.importFile(path = voting.url,
                                  col.names = all.cols,
                                  col.types = rep("string", length(all.cols)))
# | |
# THIS IS THE PART I DON'T LIKE. | |
# | |
# Messy way of cleaning the data set. | |
# Replace a placeholder value in a single column of a frame.
#
# Args:
#   df:      Frame to clean. It only needs to support df[, col] comparison and
#            df[rows, col] assignment (a base data.frame does; this script
#            passes the imported H2O frame).
#   col:     Name of the column to clean.
#   na.val:  Placeholder that marks a missing entry (default "?").
#   rep.val: Value written in place of the placeholder (default "n").
#
# Returns:
#   df with the replacement applied. Use the returned value -- the caller in
#   this script reassigns it on every call.
fix.nas <- function(df, col, na.val = "?", rep.val = "n") {
  # Rows of `col` that hold the placeholder.
  is.placeholder <- df[, col] == na.val
  df[is.placeholder, col] <- rep.val
  df
}
# Clean every vote column in turn. fix.nas() handles a single column per call,
# so this makes one replacement pass per feature column -- it works, but it is
# neither pretty nor efficient.
for (vote.col in feature.cols) {
  voting.data.raw <- fix.nas(voting.data.raw, vote.col)
}
# Want variables to be of a categorical nature, so convert the cleaned
# string columns with h2o.asfactor().
voting.data <- h2o.asfactor(voting.data.raw)

# Looks better now.
head(voting.data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment