Skip to content

Instantly share code, notes, and snippets.

@primaryobjects
Last active March 6, 2018 19:33
Show Gist options
  • Save primaryobjects/6288c2a5e9ff4e1b9a4fbd0edbc500c4 to your computer and use it in GitHub Desktop.
Save primaryobjects/6288c2a5e9ff4e1b9a4fbd0edbc500c4 to your computer and use it in GitHub Desktop.
Linear regression on California housing data for median house value. Creation of a synthetic variable. Plotting predictions vs actuals and removing outliers. See also https://colab.research.google.com/notebooks/mlcc/first_steps_with_tensor_flow.ipynb and https://colab.research.google.com/notebooks/mlcc/synthetic_features_and_outliers.ipynb
library(Metrics)
# Fetch the California housing training CSV into a data frame.
data <- read.csv(
  'https://storage.googleapis.com/mledu-datasets/california_housing_train.csv'
)
# Add the target expressed in thousands (median_house_value / 1000) so the
# model works with smaller numbers.
data[['median_house_value_scaled']] <- data[['median_house_value']] / 1000
# Shuffle the rows by indexing with a random permutation of the row numbers.
data <- data[sample(nrow(data)), ]
# Fit a single-feature linear model: the scaled median house value as a
# function of total_rooms.
fit <- lm(median_house_value_scaled ~ total_rooms, data = data)
# Score the model on the same rows it was fitted on.
predictions <- predict(fit, newdata = data)
# Measure the error as MSE, then take its square root to obtain the RMSE.
mean_squared_error <- mse(data[['median_house_value_scaled']], predictions)
root_mean_squared_error <- sqrt(mean_squared_error)
# Put the RMSE in context by comparing it with the spread of the target.
min_house_value <- with(data, min(median_house_value_scaled))
max_house_value <- with(data, max(median_house_value_scaled))
min_max_difference <- max_house_value - min_house_value
c(
  min = min_house_value,
  max = max_house_value,
  difference = min_max_difference,
  rmse = root_mean_squared_error
)
# Plot a random sample of (at most) 300 rows of median_house_value_scaled
# against total_rooms.
# Note: sample(300) only returns a permutation of 1..300, i.e. it reorders the
# first 300 rows and yields all-NA rows when the data has fewer than 300 rows.
# Draw the row indices from the whole dataset instead, capped at nrow(data).
sample_size <- min(300, nrow(data))
small <- data[sample.int(nrow(data), sample_size), ]
plot(median_house_value_scaled ~ total_rooms, data = small, pch=16, col='darkblue')
# Overlay the regression line of the single-feature model.
abline(fit, col='red')
# Fit a richer model to see whether the RMSE drops: use every column as a
# predictor except the unscaled target, population and households.
fit2 <- lm(
  median_house_value_scaled ~ . - median_house_value - population - households,
  data = data
)
# Score on the fitted rows and derive the RMSE of the richer model.
predictions2 <- predict(fit2, newdata = data)
mean_squared_error2 <- mse(data[['median_house_value_scaled']], predictions2)
root_mean_squared_error2 <- sqrt(mean_squared_error2)
# Show the new RMSE next to the single-feature RMSE and the target range.
c(
  min = min_house_value,
  max = max_house_value,
  difference = min_max_difference,
  original_rmse = root_mean_squared_error,
  new_rmse = root_mean_squared_error2
)
min max difference original_rmse new_rmse
14.99900 500.00100 485.00200 114.98101 71.85101
min max difference original_rmse new_rmse rmse_roomsPerPerson rmse_roomsPerPerson2_outliers_removed
14.99900 500.00100 485.00200 114.98101 71.85101 113.46910 104.86142
#
# This code continues from the above housing.R code.
#
# Derive a synthetic feature per row: total_rooms divided by population.
data[['roomsPerPerson']] <- with(data, total_rooms / population)
# Fit a model that uses only the synthetic feature.
fit3 <- lm(median_house_value_scaled ~ roomsPerPerson, data = data)
# Score on the fitted rows and derive the RMSE.
predictions3 <- predict(fit3, newdata = data)
mean_squared_error3 <- mse(data[['median_house_value_scaled']], predictions3)
root_mean_squared_error3 <- sqrt(mean_squared_error3)
# Extend the comparison vector with the RMSE of the roomsPerPerson model.
c(
  min = min_house_value,
  max = max_house_value,
  difference = min_max_difference,
  original_rmse = root_mean_squared_error,
  new_rmse = root_mean_squared_error2,
  rmse_roomsPerPerson = root_mean_squared_error3
)
# Visualize outliers by plotting the actual median house values (y-axis)
# against the predictions of the roomsPerPerson model (x-axis).
plot(data$median_house_value_scaled ~ predictions3)
# Author's observation: most dots align to a vertical line. However, some outlier predictions emerge towards the right.
# Look at how far the x-axis (predictions) scales outward, all the way past 1200, while the actuals max out at 500.
# Next, inspect the distribution of the synthetic feature itself with a histogram.
hist(data$roomsPerPerson)
# The histogram shows a few outliers in the tiny right-hand bar, while the
# majority of roomsPerPerson values are below 5.
# Keep only the rows whose roomsPerPerson is below 5; rows at or above 5 are
# removed from the data (not clipped to 5) before predicting again.
keep_rows <- data[['roomsPerPerson']] < 5
data2 <- data[keep_rows, ]
# Report how many rows were removed as outliers.
print(paste('Outliers', nrow(data) - nrow(data2), sep = ': '))
# Refit the roomsPerPerson model on the filtered rows (data2) only.
fit4 <- lm(median_house_value_scaled ~ roomsPerPerson, data = data2)
# Score on the filtered rows; this RMSE is therefore measured on data2, not on
# the full dataset used for the earlier RMSE values.
predictions4 <- predict(fit4, newdata = data2)
mean_squared_error4 <- mse(data2[['median_house_value_scaled']], predictions4)
root_mean_squared_error4 <- sqrt(mean_squared_error4)
# Extend the comparison vector with the RMSE obtained after outlier removal.
c(
  min = min_house_value,
  max = max_house_value,
  difference = min_max_difference,
  original_rmse = root_mean_squared_error,
  new_rmse = root_mean_squared_error2,
  rmse_roomsPerPerson = root_mean_squared_error3,
  rmse_roomsPerPerson2 = root_mean_squared_error4
)
# Repeat both diagnostic plots for the filtered data (data2) and the refitted model.
# Author's observation: notice the more uniform distribution, and how the
# predictions on the x-axis now max out at about 500; the same as the actual median house values.
# Actual values (y-axis) against the predictions of the refitted model (x-axis).
plot(data2$median_house_value_scaled ~ predictions4)
# Histogram of the synthetic feature after the rows with roomsPerPerson >= 5 were removed.
hist(data2$roomsPerPerson)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment