primaryobjects/housing.R

## housing.R
library(Metrics)

# Fetch the California housing training set (needs network access).
data <- read.csv(
  "https://storage.googleapis.com/mledu-datasets/california_housing_train.csv"
)

# Add a copy of median_house_value divided by 1000; the models below use this
# rescaled column as their target.
data[["median_house_value_scaled"]] <- data[["median_house_value"]] / 1000

# Shuffle the row order.
data <- data[sample(x = nrow(data)), ]

# Baseline: simple linear regression of the scaled house value on total_rooms.
fit <- lm(median_house_value_scaled ~ total_rooms, data = data)

# In-sample predictions: the model is scored on the same rows it was fitted to.
predictions <- predict(fit, newdata = data)

# Score the fit: mean squared error first, then its square root (RMSE).
mean_squared_error <- mse(
  actual = data[["median_house_value_scaled"]],
  predicted = predictions
)
root_mean_squared_error <- sqrt(mean_squared_error)

# Smallest and largest target values, plus their spread, as a yardstick for
# judging the size of the RMSE.
min_house_value <- min(data[["median_house_value_scaled"]])
max_house_value <- max(data[["median_house_value_scaled"]])
min_max_difference <- max_house_value - min_house_value

c(
  min = min_house_value,
  max = max_house_value,
  difference = min_max_difference,
  rmse = root_mean_squared_error
)

# Plot a random sample of up to 300 rows with the baseline regression line
# overlaid.
# The row indices are drawn from the whole data set (not just 1..300), and the
# sample size is capped at nrow(data) so a data set with fewer than 300 rows
# cannot produce out-of-range, all-NA rows.
small <- data[sample.int(nrow(data), min(300L, nrow(data))), ]
plot(median_house_value_scaled ~ total_rooms, data = small, pch = 16, col = "darkblue")
abline(fit, col = "red")

# Second model: use every column as a predictor except the unscaled target,
# population and households, to see whether the RMSE drops.
fit2 <- lm(
  median_house_value_scaled ~ . - median_house_value - population - households,
  data = data
)

# In-sample predictions and the RMSE of the second model.
predictions2 <- predict(fit2, newdata = data)
mean_squared_error2 <- mse(
  actual = data[["median_house_value_scaled"]],
  predicted = predictions2
)
root_mean_squared_error2 <- sqrt(mean_squared_error2)
c(
  min = min_house_value,
  max = max_house_value,
  difference = min_max_difference,
  original_rmse = root_mean_squared_error,
  new_rmse = root_mean_squared_error2
)

## output.txt
min          max           difference    original_rmse  new_rmse
14.99900     500.00100     485.00200     114.98101      71.85101

min          max           difference    original_rmse  new_rmse  rmse_roomsPerPerson rmse_roomsPerPerson2_outliers_removed
14.99900     500.00100     485.00200     114.98101      71.85101  113.46910           104.86142

## plot-1.png

      
    Raw
  

              plot-1.png
            
          
## plot-2.png

      
    Raw
  

              plot-2.png
            
          
## plot-3.png

      
    Raw
  

              plot-3.png
            
          
## synthetic.R
#
# This code continues from the above housing.R code.
#

# Derive a synthetic feature: rooms per person (total_rooms / population).
data[["roomsPerPerson"]] <- with(data, total_rooms / population)

# Third model: regress the scaled house value on the synthetic feature alone,
# then score it in-sample.
fit3 <- lm(median_house_value_scaled ~ roomsPerPerson, data = data)
predictions3 <- predict(fit3, newdata = data)
mean_squared_error3 <- mse(
  actual = data[["median_house_value_scaled"]],
  predicted = predictions3
)
root_mean_squared_error3 <- sqrt(mean_squared_error3)
c(
  min = min_house_value,
  max = max_house_value,
  difference = min_max_difference,
  original_rmse = root_mean_squared_error,
  new_rmse = root_mean_squared_error2,
  rmse_roomsPerPerson = root_mean_squared_error3
)

# Visualize outliers by plotting the actual median house values (y-axis)
# against the predictions of fit3 (x-axis).
# NOTE: the default axis labels are taken from the expressions in the formula,
# so the call is written with the full data$... form on purpose.
plot(data$median_house_value_scaled ~ predictions3)

# Most dots align to a vertical line. However, some outlier predictions emerge towards the right.
# Look at how far the x-axis (predictions) scales outward, all the way past 1200, while the actuals max out at 500.
# Let's look at a histogram of our synthetic feature.
hist(data$roomsPerPerson)

# There indeed appear to be a few outliers in the tiny right bar of the
# histogram, while the majority of values are < 5.
# Keep only the rows whose roomsPerPerson is below 5 (an upper cut-off) and try
# to predict again.
# which() discards NA/NaN comparisons, so a missing ratio cannot insert all-NA
# rows into data2; such rows would propagate NA into the metrics computed from
# data2.
data2 <- data[which(data$roomsPerPerson < 5), ]

# Report how many rows the filter removed.
print(paste('Outliers', nrow(data) - nrow(data2), sep = ': '))

# Refit the single-feature model on the filtered rows (outliers removed) and
# score it on those same rows; the author observed a lower RMSE than fit3.
fit4 <- lm(median_house_value_scaled ~ roomsPerPerson, data = data2)
predictions4 <- predict(fit4, newdata = data2)
mean_squared_error4 <- mse(
  actual = data2[["median_house_value_scaled"]],
  predicted = predictions4
)
root_mean_squared_error4 <- sqrt(mean_squared_error4)
c(
  min = min_house_value,
  max = max_house_value,
  difference = min_max_difference,
  original_rmse = root_mean_squared_error,
  new_rmse = root_mean_squared_error2,
  rmse_roomsPerPerson = root_mean_squared_error3,
  rmse_roomsPerPerson2 = root_mean_squared_error4
)

# Now plot the filtered data (actuals on the y-axis, fit4 predictions on the
# x-axis) and notice the more uniform distribution.
# Notice how the predictions on the x-axis now max out at about 500, the same as the actual median house values.
# NOTE: default titles and axis labels are taken from the argument expressions,
# so the calls are written with the full data2$... form on purpose.
plot(data2$median_house_value_scaled ~ predictions4)
hist(data2$roomsPerPerson)
	library(Metrics)

	# Fetch the California housing training set (needs network access).
	data <- read.csv(
	  "https://storage.googleapis.com/mledu-datasets/california_housing_train.csv"
	)

	# Add a copy of median_house_value divided by 1000; the models below use this
	# rescaled column as their target.
	data[["median_house_value_scaled"]] <- data[["median_house_value"]] / 1000

	# Shuffle the row order.
	data <- data[sample(x = nrow(data)), ]

	# Baseline: simple linear regression of the scaled house value on total_rooms.
	fit <- lm(median_house_value_scaled ~ total_rooms, data = data)

	# In-sample predictions: the model is scored on the same rows it was fitted to.
	predictions <- predict(fit, newdata = data)

	# Score the fit: mean squared error first, then its square root (RMSE).
	mean_squared_error <- mse(
	  actual = data[["median_house_value_scaled"]],
	  predicted = predictions
	)
	root_mean_squared_error <- sqrt(mean_squared_error)

	# Smallest and largest target values, plus their spread, as a yardstick for
	# judging the size of the RMSE.
	min_house_value <- min(data[["median_house_value_scaled"]])
	max_house_value <- max(data[["median_house_value_scaled"]])
	min_max_difference <- max_house_value - min_house_value

	c(
	  min = min_house_value,
	  max = max_house_value,
	  difference = min_max_difference,
	  rmse = root_mean_squared_error
	)

	# Plot a random sample of up to 300 rows with the baseline regression line
	# overlaid.
	# The row indices are drawn from the whole data set (not just 1..300), and the
	# sample size is capped at nrow(data) so a data set with fewer than 300 rows
	# cannot produce out-of-range, all-NA rows.
	small <- data[sample.int(nrow(data), min(300L, nrow(data))), ]
	plot(median_house_value_scaled ~ total_rooms, data = small, pch = 16, col = "darkblue")
	abline(fit, col = "red")

	# Second model: use every column as a predictor except the unscaled target,
	# population and households, to see whether the RMSE drops.
	fit2 <- lm(
	  median_house_value_scaled ~ . - median_house_value - population - households,
	  data = data
	)

	# In-sample predictions and the RMSE of the second model.
	predictions2 <- predict(fit2, newdata = data)
	mean_squared_error2 <- mse(
	  actual = data[["median_house_value_scaled"]],
	  predicted = predictions2
	)
	root_mean_squared_error2 <- sqrt(mean_squared_error2)
	c(
	  min = min_house_value,
	  max = max_house_value,
	  difference = min_max_difference,
	  original_rmse = root_mean_squared_error,
	  new_rmse = root_mean_squared_error2
	)
	min max difference original_rmse new_rmse
	14.99900 500.00100 485.00200 114.98101 71.85101

	min max difference original_rmse new_rmse rmse_roomsPerPerson rmse_roomsPerPerson2_outliers_removed
	14.99900 500.00100 485.00200 114.98101 71.85101 113.46910 104.86142
	#
	# This code continues from the above housing.R code.
	#

	# Derive a synthetic feature: rooms per person (total_rooms / population).
	data[["roomsPerPerson"]] <- with(data, total_rooms / population)

	# Third model: regress the scaled house value on the synthetic feature alone,
	# then score it in-sample.
	fit3 <- lm(median_house_value_scaled ~ roomsPerPerson, data = data)
	predictions3 <- predict(fit3, newdata = data)
	mean_squared_error3 <- mse(
	  actual = data[["median_house_value_scaled"]],
	  predicted = predictions3
	)
	root_mean_squared_error3 <- sqrt(mean_squared_error3)
	c(
	  min = min_house_value,
	  max = max_house_value,
	  difference = min_max_difference,
	  original_rmse = root_mean_squared_error,
	  new_rmse = root_mean_squared_error2,
	  rmse_roomsPerPerson = root_mean_squared_error3
	)

	# Visualize outliers by plotting the actual median house values (y-axis)
	# against the predictions of fit3 (x-axis).
	# NOTE: the default axis labels are taken from the expressions in the formula,
	# so the call is written with the full data$... form on purpose.
	plot(data$median_house_value_scaled ~ predictions3)

	# Most dots align to a vertical line. However, some outlier predictions emerge towards the right.
	# Look at how far the x-axis (predictions) scales outward, all the way past 1200, while the actuals max out at 500.
	# Let's look at a histogram of our synthetic feature.
	hist(data$roomsPerPerson)

	# There indeed appear to be a few outliers in the tiny right bar of the
	# histogram, while the majority of values are < 5.
	# Keep only the rows whose roomsPerPerson is below 5 (an upper cut-off) and try
	# to predict again.
	# which() discards NA/NaN comparisons, so a missing ratio cannot insert all-NA
	# rows into data2; such rows would propagate NA into the metrics computed from
	# data2.
	data2 <- data[which(data$roomsPerPerson < 5), ]

	# Report how many rows the filter removed.
	print(paste('Outliers', nrow(data) - nrow(data2), sep = ': '))

	# Refit the single-feature model on the filtered rows (outliers removed) and
	# score it on those same rows; the author observed a lower RMSE than fit3.
	fit4 <- lm(median_house_value_scaled ~ roomsPerPerson, data = data2)
	predictions4 <- predict(fit4, newdata = data2)
	mean_squared_error4 <- mse(
	  actual = data2[["median_house_value_scaled"]],
	  predicted = predictions4
	)
	root_mean_squared_error4 <- sqrt(mean_squared_error4)
	c(
	  min = min_house_value,
	  max = max_house_value,
	  difference = min_max_difference,
	  original_rmse = root_mean_squared_error,
	  new_rmse = root_mean_squared_error2,
	  rmse_roomsPerPerson = root_mean_squared_error3,
	  rmse_roomsPerPerson2 = root_mean_squared_error4
	)

	# Now plot the filtered data (actuals on the y-axis, fit4 predictions on the
	# x-axis) and notice the more uniform distribution.
	# Notice how the predictions on the x-axis now max out at about 500, the same as the actual median house values.
	# NOTE: default titles and axis labels are taken from the argument expressions,
	# so the calls are written with the full data2$... form on purpose.
	plot(data2$median_house_value_scaled ~ predictions4)
	hist(data2$roomsPerPerson)