dylanjf/gist:5774446

## gistfile1.txt
# Align the dummy-encoded columns of the training and test matrices so that
# both end up with exactly the same columns, in the same order.
#
# Expects `amazon_test` to still be a data frame here and `amazon_train` to
# already be a sparse model matrix with column names.

# Placeholder response column: the test data has no ACTION1 of its own, so add
# a constant one. Without it the ACTION1 column of the training matrix would
# have no counterpart in the test matrix and would be dropped by the column
# matching below. Keep it in the same column position as in the training data
# so the dummy encoding below treats both data sets alike.
amazon_test$ACTION1 <- 1

# Turn the test set into a sparse model matrix (no intercept). The training
# set is in this format already.
amazon_test <- sparse.model.matrix(~ . - 1, data = amazon_test)

# Dummy columns that exist in both matrices. intersect() returns the shared
# names in the order they appear in its first argument, i.e. training order.
shared_cols <- intersect(colnames(amazon_train), colnames(amazon_test))

# Select by name rather than by position: this drops the dummies that exist in
# only one of the two sets AND puts the remaining columns in the same order in
# both matrices. drop = FALSE keeps the result a matrix even when only a
# single column is shared.
amazon_train <- amazon_train[, shared_cols, drop = FALSE]
amazon_test <- amazon_test[, shared_cols, drop = FALSE]

# Equal column counts alone would not catch columns that are out of order, so
# check the names themselves.
stopifnot(identical(colnames(amazon_train), colnames(amazon_test)))

# Should have the same number of columns now.
dim(amazon_train)
dim(amazon_test)
	# Align the dummy-encoded columns of the training and test matrices so that
	# both end up with exactly the same columns, in the same order.
	#
	# Expects `amazon_test` to still be a data frame here and `amazon_train` to
	# already be a sparse model matrix with column names.
	#
	# NOTE(review): this indented section repeats the same steps as the script
	# earlier in this file. If `amazon_test` has already been converted to a
	# sparse matrix by then, the `$<-` assignment below will presumably fail --
	# confirm that only one copy is meant to be run.

	# Placeholder response column: the test data has no ACTION1 of its own, so
	# add a constant one. Without it the ACTION1 column of the training matrix
	# would have no counterpart in the test matrix and would be dropped by the
	# column matching below. Keep it in the same column position as in the
	# training data so the dummy encoding below treats both data sets alike.
	amazon_test$ACTION1 <- 1

	# Turn the test set into a sparse model matrix (no intercept). The training
	# set is in this format already.
	amazon_test <- sparse.model.matrix(~ . - 1, data = amazon_test)

	# Dummy columns that exist in both matrices. intersect() returns the shared
	# names in the order they appear in its first argument, i.e. training order.
	shared_cols <- intersect(colnames(amazon_train), colnames(amazon_test))

	# Select by name rather than by position: this drops the dummies that exist
	# in only one of the two sets AND puts the remaining columns in the same
	# order in both matrices. drop = FALSE keeps the result a matrix even when
	# only a single column is shared.
	amazon_train <- amazon_train[, shared_cols, drop = FALSE]
	amazon_test <- amazon_test[, shared_cols, drop = FALSE]

	# Equal column counts alone would not catch columns that are out of order,
	# so check the names themselves.
	stopifnot(identical(colnames(amazon_train), colnames(amazon_test)))

	# Should have the same number of columns now.
	dim(amazon_train)
	dim(amazon_test)