# bryantrobbins/filter.R

## filter.R
# I am a novice at R, so please forgive this example.
# After hours of Google-ing, tweaking, and trying a bunch of stuff, this was the only
# way I could come up with for constructing a set of columns to be dropped from a data.table,
# then dropping those columns.
#
# I expect that any R pros that stumble across this may have comments. Please feel free to suggest edits/leave comments.
#
# Library loading
library('data.table')

# The input file for this test (you will need to provide your own local copy)
input.file <- "subset.csv"

# Load the (potentially massive) data file from CSV.
# stringsAsFactors = TRUE asks fread to read string columns as factors, which
# is what makes the level counts below meaningful.
data <- fread(input.file, stringsAsFactors = TRUE)

# Count the factor levels of every column.
# A data.table is a list of columns, so vapply() hands each column itself
# (not a one-column table) to nlevels(). This replaces the earlier
# eval(parse(text = ...)) approach, which broke on column names containing
# quotes or backslashes and would execute code embedded in a CSV header.
# nlevels() is 0 for a column that is not a factor, so such columns are
# treated as having fewer than two levels, exactly as before.
level.counts <- vapply(data, nlevels, integer(1))

# Names of the columns to drop: anything with fewer than two levels.
# Fewer than two levels can occur when a) the feature is useless or b) you are
# working with a subset of the full data set.
# Swap in whatever per-column test you need here.
filter <- names(data)[level.counts < 2L]

# Drop the selected columns in place (by reference).
# Skipped when nothing was selected so that `:=` is never handed an empty
# column selection.
if (length(filter) > 0L) {
  data[, (filter) := NULL]
}

# Number of columns remaining; smaller than before if any were dropped
ncol(data)
	# I am a novice at R, so please forgive this example.
	# After hours of Google-ing, tweaking, and trying a bunch of stuff, this was the only
	# way I could come up with for constructing a set of columns to be dropped from a data.table,
	# then dropping those columns.
	#
	# I expect that any R pros that stumble across this may have comments. Please feel free to suggest edits/leave comments.
	#
	# Library loading
	library('data.table')

	# The input file for this test (you will need to provide your own local copy)
	input.file <- "subset.csv"

	# Load the (potentially massive) data file from CSV.
	# stringsAsFactors = TRUE asks fread to read string columns as factors, which
	# is what makes the level counts below meaningful.
	data <- fread(input.file, stringsAsFactors = TRUE)

	# Count the factor levels of every column.
	# A data.table is a list of columns, so vapply() hands each column itself
	# (not a one-column table) to nlevels(). This replaces the earlier
	# eval(parse(text = ...)) approach, which broke on column names containing
	# quotes or backslashes and would execute code embedded in a CSV header.
	# nlevels() is 0 for a column that is not a factor, so such columns are
	# treated as having fewer than two levels, exactly as before.
	level.counts <- vapply(data, nlevels, integer(1))

	# Names of the columns to drop: anything with fewer than two levels.
	# Fewer than two levels can occur when a) the feature is useless or b) you are
	# working with a subset of the full data set.
	# Swap in whatever per-column test you need here.
	filter <- names(data)[level.counts < 2L]

	# Drop the selected columns in place (by reference).
	# Skipped when nothing was selected so that `:=` is never handed an empty
	# column selection.
	if (length(filter) > 0L) {
	  data[, (filter) := NULL]
	}

	# Number of columns remaining; smaller than before if any were dropped
	ncol(data)