bwv988/h2o_sol.R

## h2o_sol.R
# Data manipulation with H2O
# RS25072017

# Attach h2o with library() so that a missing package stops the script right
# here. require() would only warn and return FALSE, and the failure would then
# surface later as a "could not find function" error.
library(h2o)

# Initialise H2O (default arguments) before any other h2o.* call is made.
h2o.init()


# 1. The R way. ---------------------

# Data source: the 1984 Congressional Voting Records hosted by UCI.
voting.url = "http://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data"

# Read the comma-separated file, then label the columns: the party first,
# followed by vote1 ... vote16.
party.data = read.table(file = voting.url, sep = ",")
colnames(party.data) = c("party", paste0("vote", seq_len(16)))

# Handle the "?" entries (this data set's stand-in for NA) by overwriting
# them with "n", across all columns in a single assignment.
party.data[party.data == "?"] = "n"

# Preview the first rows; no "?" should be left in the data frame.
head(party.data)

# 2. The H2O way. ---------------------

# Names of the sixteen vote columns: vote1 ... vote16.
feature.cols = paste0("vote", seq_len(16))

# Import the same file through H2O, supplying the column names up front and
# declaring every column (party plus the votes) as a string.
voting.data.raw = h2o.importFile(
  path = voting.url,
  col.names = c("party", feature.cols),
  col.types = rep("string", times = length(feature.cols) + 1)
)

#
# THIS IS THE PART I DON'T LIKE.
#
# Messy way of cleaning the data set.
# Replace a placeholder value with a substitute in one or more columns.
#
# Args:
#   df:      Frame-like object to clean; it is indexed and assigned through
#            `df[rows, col]`.
#   col:     Column name(s) or index(es) to clean. A vector is processed one
#            column at a time, so a single call can clean several columns.
#   na.val:  Value that marks an entry to replace (default "?").
#   rep.val: Value written in place of `na.val` (default "n").
#
# Returns:
#   `df` with every `na.val` in the selected columns replaced by `rep.val`.
fix.nas = function(df, col, na.val = "?", rep.val = "n") {
  # Each column needs its own row mask, so the columns are handled one by one.
  for (one.col in col) {
    df[df[, one.col] == na.val, one.col] = rep.val
  }
  df
}

# Clean every vote column in turn: fold fix.nas() over the column names,
# starting from the raw frame and feeding each result into the next call.
voting.data.raw = Reduce(fix.nas, feature.cols, voting.data.raw)

# The variables should be categorical, so convert the cleaned frame to factors.
voting.data = h2o.asfactor(voting.data.raw)

# Preview the cleaned frame.
head(voting.data)
	# Data manipulation with H2O
	# RS25072017

	# Attach h2o with library() so that a missing package stops the script right
	# here. require() would only warn and return FALSE, and the failure would then
	# surface later as a "could not find function" error.
	library(h2o)

	# Initialise H2O (default arguments) before any other h2o.* call is made.
	h2o.init()


	# 1. The R way. ---------------------

	# Data source: the 1984 Congressional Voting Records hosted by UCI.
	voting.url = "http://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data"

	# Read the comma-separated file, then label the columns: the party first,
	# followed by vote1 ... vote16.
	party.data = read.table(file = voting.url, sep = ",")
	colnames(party.data) = c("party", paste0("vote", seq_len(16)))

	# Handle the "?" entries (this data set's stand-in for NA) by overwriting
	# them with "n", across all columns in a single assignment.
	party.data[party.data == "?"] = "n"

	# Preview the first rows; no "?" should be left in the data frame.
	head(party.data)

	# 2. The H2O way. ---------------------

	# Names of the sixteen vote columns: vote1 ... vote16.
	feature.cols = paste0("vote", seq_len(16))

	# Import the same file through H2O, supplying the column names up front and
	# declaring every column (party plus the votes) as a string.
	voting.data.raw = h2o.importFile(
	  path = voting.url,
	  col.names = c("party", feature.cols),
	  col.types = rep("string", times = length(feature.cols) + 1)
	)

	#
	# THIS IS THE PART I DON'T LIKE.
	#
	# Messy way of cleaning the data set.
	# Replace a placeholder value with a substitute in one or more columns.
	#
	# Args:
	#   df:      Frame-like object to clean; it is indexed and assigned through
	#            `df[rows, col]`.
	#   col:     Column name(s) or index(es) to clean. A vector is processed one
	#            column at a time, so a single call can clean several columns.
	#   na.val:  Value that marks an entry to replace (default "?").
	#   rep.val: Value written in place of `na.val` (default "n").
	#
	# Returns:
	#   `df` with every `na.val` in the selected columns replaced by `rep.val`.
	fix.nas = function(df, col, na.val = "?", rep.val = "n") {
	  # Each column needs its own row mask, so the columns are handled one by one.
	  for (one.col in col) {
	    df[df[, one.col] == na.val, one.col] = rep.val
	  }
	  df
	}

	# Clean every vote column in turn: fold fix.nas() over the column names,
	# starting from the raw frame and feeding each result into the next call.
	voting.data.raw = Reduce(fix.nas, feature.cols, voting.data.raw)

	# The variables should be categorical, so convert the cleaned frame to factors.
	voting.data = h2o.asfactor(voting.data.raw)

	# Preview the cleaned frame.
	head(voting.data)