Skip to content

Instantly share code, notes, and snippets.

@truncs
Created April 6, 2012 05:24
Show Gist options
  • Save truncs/2317226 to your computer and use it in GitHub Desktop.
Save truncs/2317226 to your computer and use it in GitHub Desktop.
Regression Tree using random samples
# Regression tree for DaysInHospital_Y2, evaluated on a random hold-out sample.
# Expects a data frame `y1_data` to exist already; it is not created here.
library(tree)

# Transform days spent to log(1 + x)
y1_data$DaysInHospital_Y2 <- log1p(y1_data$DaysInHospital_Y2)

# Divide the set, 80% for train and 20% for test.
# NOTE(review): no seed is set, so the split differs between runs; call
# set.seed() beforehand if reproducible results are needed.
n_rows <- nrow(y1_data)
indexes <- sample.int(n_rows, size = floor(0.2 * n_rows))
test <- y1_data[indexes, ]
# setdiff() rather than y1_data[-indexes, ]: a negative subscript built from an
# empty index vector selects zero rows instead of all of them.
train <- y1_data[setdiff(seq_len(n_rows), indexes), ]

# Remove unwanted features from both the sets. The list is defined once so the
# train and test sets cannot drift apart.
target_col <- "DaysInHospital_Y2"
drop_cols <- c(
  "MemberID_t", "YEAR_t", "DaysInHospital", "trainset", "DaysInHospital_Y3",
  "age_05", "PayDelay_max", "PayDelay_min", "PayDelay_stdev",
  "LOS_max", "LOS_min", "LOS_stdev",
  "LOS_TOT_UNKNOWN", "LOS_TOT_SUPRESSED", "LOS_TOT_KNOWN",
  "dsfs_max", "dsfs_min", "dsfs_range", "dsfs_stdev",
  "CharlsonIndexI_max", "CharlsonIndexI_min", "CharlsonIndexI_range",
  "CharlsonIndexI_stdev", "drugCount_max", "drugCount_min",
  "memberID_lc", "YEAR_lc", "labCount_max", "labCount_min",
  "labNull", "drugNull"
)

# Fail loudly on a misspelled or absent column instead of silently keeping it.
missing_cols <- setdiff(c(drop_cols, target_col), names(y1_data))
if (length(missing_cols) > 0) {
  stop(
    "y1_data is missing expected columns: ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

# The training set keeps the target; the test set drops it as well.
train <- train[, !(names(train) %in% drop_cols), drop = FALSE]
test <- test[, !(names(test) %in% c(drop_cols, target_col)), drop = FALSE]

# Model a regression tree using the training data
tr <- tree(DaysInHospital_Y2 ~ ., train)

# Get the visual for the tree
plot(tr, type = "uniform")
text(tr, pretty = 0)

# Use the tree to predict the test set values and
# append the predicted values to the test set
result <- predict(tr, newdata = test, type = "vector")
test$predicted <- result

# Observed (log1p-transformed) target for the held-out rows, in the same row
# order as `test`.
actual <- y1_data[[target_col]][indexes]

# Root mean squared error between two numeric vectors of equal length.
rmse <- function(observed, predicted) {
  sqrt(mean((observed - predicted)^2))
}

# Calculate the RMSE without truncation
rmse(actual, test$predicted)

# Truncate the values and then calculate the RMSE again.
# expm1() is the accurate inverse of log1p(); exp(x) - 1 can land just below a
# whole number through rounding and then be truncated one day too low.
test$new_predicted <- log1p(trunc(expm1(test$predicted)))
rmse(actual, test$new_predicted)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment