Cool R article/tutorial on PCA pitfalls that I did as an exercise.
# Code below is based on the tutorial here:
#
# http://www.r-bloggers.com/unprincipled-component-analysis/
#
# Load and prepare the data
urlBase <- 'http://www.win-vector.com/dfiles/PCA/'
pv <- read.table(paste(urlBase, 'ProxyVariables.csv', sep=''),
                 sep=',', header=TRUE, comment.char='')
ot <- read.table(paste(urlBase, 'ObservedTemps.csv', sep=''),
                 sep=',', header=TRUE, comment.char='')
keyName <- 'Year'
varNames <- setdiff(colnames(pv), keyName)
yName <- setdiff(colnames(ot), keyName)
d <- merge(pv, ot, by=c(keyName))
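# Optional sanity check (added; not in the original tutorial): confirm the
# merge kept exactly one row per year before going further.
stopifnot(!any(duplicated(d[, keyName])))
print(dim(d))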
# Perform a regression on new PCA variables
pcomp <- prcomp(d[,varNames]) # Pitfall: should be prcomp(d[,varNames], scale.=TRUE); fixed below
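# Optional peek (added): with no scaling, whichever proxy has the largest raw
# variance tends to dominate the leading components. summary() shows how the
# variance is apportioned.
print(summary(pcomp))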
synthNames <- colnames(pcomp$rotation)
d <- cbind(d, as.matrix(d[,varNames]) %*% pcomp$rotation)
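# Note (added): pcomp$x already holds the centered scores; the matrix product
# above skips centering, so each synthetic column differs from pcomp$x by a
# constant. That offset is absorbed by the intercept in lm(), so the fit is
# unaffected.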
f <- as.formula(paste(yName, paste(synthNames, collapse=' + '), sep=' ~ '))
model <- step(lm(f, data=d))
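# Optional (added): see which synthetic variables survived stepwise selection.
print(formula(model))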
# Print and plot a little
print(summary(model)$r.squared)
d$pred <- predict(model, newdata=d)
library(ggplot2)
ggplot(data=d) +
  geom_point(aes_string(x=keyName, y=yName)) +
  geom_line(aes_string(x=keyName, y='pred'))
# Test and train: split on the median year (later years train, earlier years test)
d$trainGroup <- d[, keyName] >= median(d[, keyName])
dTrain <- subset(d, trainGroup)
dTest <- subset(d, !trainGroup)
model <- step(lm(f, data=dTrain))
dTest$pred <- predict(model, newdata=dTest)
dTrain$pred <- predict(model, newdata=dTrain)
ggplot() +
  geom_point(data=dTest, aes_string(x=keyName, y=yName)) +
  geom_line(data=dTest, aes_string(x=keyName, y='pred'),
            color='blue', linetype=2) +
  geom_point(data=dTrain, aes_string(x=keyName, y=yName)) +
  geom_line(data=dTrain, aes_string(x=keyName, y='pred'),
            color='red', linetype=1)
# Use RMSE to quantify how badly we just did!
rmse <- function(x, y) { sqrt(mean((x - y)^2))}
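# Quick usage check for rmse() (added): rmse(c(0, 0), c(3, 4)) is sqrt((9 + 16)/2)
stopifnot(isTRUE(all.equal(rmse(c(0, 0), c(3, 4)), sqrt(12.5))))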
print(rmse(dTrain[, yName], mean(dTrain[, yName]))) # train baseline: always predict the mean
print(rmse(dTrain[, yName], dTrain$pred))           # train: model beats the mean baseline
print(rmse(dTest[, yName], mean(dTest[, yName])))   # test baseline
print(rmse(dTest[, yName], dTest$pred))             # test: model does worse than the mean baseline
# Re-plot the data with a smoothing line fit through it
ggplot() +
  geom_point(data=dTest, aes_string(x=keyName, y=yName)) +
  geom_line(data=dTest, aes_string(x=keyName, y='pred'),
            color='blue', linetype=2) +
  geom_segment(aes(x=min(dTest[,keyName]),
                   xend=max(dTest[,keyName]),
                   y=mean(dTest[,yName]),
                   yend=mean(dTest[,yName]))) +
  geom_point(data=dTrain, aes_string(x=keyName, y=yName)) +
  geom_line(data=dTrain, aes_string(x=keyName, y='pred'),
            color='red', linetype=1) +
  geom_segment(aes(x=min(dTrain[,keyName]),
                   xend=max(dTrain[,keyName]),
                   y=mean(dTrain[,yName]),
                   yend=mean(dTrain[,yName]))) +
  geom_smooth(data=d, aes_string(x=keyName, y=yName),
              color='black')
# Did PCA help us at all?
g <- as.formula(paste(yName, paste(varNames, collapse=' + '), sep=' ~ '))
model <- lm(g, data=dTrain)
dTest$pred <- predict(model, newdata=dTest)
dTrain$pred <- predict(model, newdata=dTrain)
print(rmse(dTrain[, yName], mean(dTrain[, yName])))
print(rmse(dTrain[, yName], dTrain$pred))
print(rmse(dTest[, yName], mean(dTest[, yName])))
print(rmse(dTest[, yName], dTest$pred)) # 0.1973 vs. 0.2004 with PCA, i.e. PCA gave little benefit
# Re-run the PCA with scaling fixed, throwing out synthetic variables with small variation
pcomp <- prcomp(d[,varNames], scale.=TRUE)
synthNames <- colnames(pcomp$rotation)[pcomp$sdev>1]
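# Note (added): the sdev > 1 cut is the Kaiser-style rule. After scaling, each
# input variable contributes variance 1, so components with sdev^2 > 1 explain
# more than any single original variable. Inspect the spectrum if curious:
print(pcomp$sdev^2)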
f <- as.formula(paste(yName, paste(synthNames, collapse=' + '),sep=' ~ '))
model <- step(lm(f,data=dTrain))
dTest$pred <- predict(model,newdata=dTest)
dTrain$pred <- predict(model,newdata=dTrain)
print(rmse(dTrain[, yName], mean(dTrain[, yName])))
print(rmse(dTrain[, yName], dTrain$pred))
print(rmse(dTest[, yName], mean(dTest[, yName])))
print(rmse(dTest[, yName], dTest$pred)) # RMSE worsens to 0.2778, somewhat unexpectedly
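# One likely reason (added; not from the original post): the PC columns used
# above still hold scores from the *unscaled* PCA bound into d earlier; only
# the column names were reselected against the scaled fit. A sketch of the
# rerun with properly recomputed scaled scores, assuming that was the intent.
# predict.prcomp applies the stored center/scale automatically:
d[, colnames(pcomp$rotation)] <- predict(pcomp, d[, varNames])
dTrain <- subset(d, trainGroup)
dTest <- subset(d, !trainGroup)
model <- step(lm(f, data=dTrain))
dTest$pred <- predict(model, newdata=dTest)
print(rmse(dTest[, yName], dTest$pred)) # re-check the holdout RMSE with corrected scores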