Cool R article/tutorial on PCA pitfalls that I did as an exercise.
# Code below is based on the tutorial here:
#
# http://www.r-bloggers.com/unprincipled-component-analysis/
#
# Load and prepare the data
urlBase <- 'http://www.win-vector.com/dfiles/PCA/'
pv <- read.table(paste(urlBase, 'ProxyVariables.csv', sep=''),
                 sep=',', header=TRUE, comment.char='')
ot <- read.table(paste(urlBase, 'ObservedTemps.csv', sep=''),
                 sep=',', header=TRUE, comment.char='')
keyName <- 'Year'
varNames <- setdiff(colnames(pv), keyName)
yName <- setdiff(colnames(ot), keyName)
d <- merge(pv, ot, by=c(keyName))
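# Optional sanity check (added; not in the original tutorial): confirm the
# merge kept exactly one row per year before going further.
stopifnot(!any(duplicated(d[, keyName])))
print(dim(d))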
# Perform a regression on new PCA variables
pcomp <- prcomp(d[,varNames]) # Pitfall: should be prcomp(d[,varNames], scale.=TRUE); fixed below
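# Optional peek (added): with no scaling, whichever proxy has the largest raw
# variance tends to dominate the leading components. summary() shows how the
# variance is apportioned.
print(summary(pcomp))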
synthNames <- colnames(pcomp$rotation)
d <- cbind(d, as.matrix(d[,varNames]) %*% pcomp$rotation)
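# Note (added): pcomp$x already holds the centered scores; the matrix product
# above skips centering, so each synthetic column differs from pcomp$x by a
# constant. That offset is absorbed by the intercept in lm(), so the fit is
# unaffected.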
f <- as.formula(paste(yName, paste(synthNames, collapse=' + '), sep=' ~ '))
model <- step(lm(f, data=d))
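# Optional (added): see which synthetic variables survived stepwise selection.
print(formula(model))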
# Print and plot a little
print(summary(model)$r.squared)
d$pred <- predict(model, newdata=d)
library(ggplot2)
ggplot(data=d) +
  geom_point(aes_string(x=keyName, y=yName)) +
  geom_line(aes_string(x=keyName, y='pred'))
# Test and train: split on the median year (later years train, earlier years test)
d$trainGroup <- d[, keyName] >= median(d[, keyName])
dTrain <- subset(d, trainGroup)
dTest <- subset(d, !trainGroup)
model <- step(lm(f, data=dTrain))
dTest$pred <- predict(model, newdata=dTest)
dTrain$pred <- predict(model, newdata=dTrain)
ggplot() +
  geom_point(data=dTest, aes_string(x=keyName, y=yName)) +
  geom_line(data=dTest, aes_string(x=keyName, y='pred'),
            color='blue', linetype=2) +
  geom_point(data=dTrain, aes_string(x=keyName, y=yName)) +
  geom_line(data=dTrain, aes_string(x=keyName, y='pred'),
            color='red', linetype=1)
# Use RMSE to quantify how badly we just did!
rmse <- function(x, y) { sqrt(mean((x - y)^2))}
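# Quick usage check for rmse() (added): rmse(c(0, 0), c(3, 4)) is sqrt((9 + 16)/2)
stopifnot(isTRUE(all.equal(rmse(c(0, 0), c(3, 4)), sqrt(12.5))))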
print(rmse(dTrain[, yName], mean(dTrain[, yName]))) # train baseline: always predict the mean
print(rmse(dTrain[, yName], dTrain$pred))           # train: model beats the mean baseline
print(rmse(dTest[, yName], mean(dTest[, yName])))   # test baseline
print(rmse(dTest[, yName], dTest$pred))             # test: model does worse than the mean baseline
# Re-plot the data with a smoothing line fit through it
ggplot() +
  geom_point(data=dTest, aes_string(x=keyName, y=yName)) +
  geom_line(data=dTest, aes_string(x=keyName, y='pred'),
            color='blue', linetype=2) +
  geom_segment(aes(x=min(dTest[,keyName]),
                   xend=max(dTest[,keyName]),
                   y=mean(dTest[,yName]),
                   yend=mean(dTest[,yName]))) +
  geom_point(data=dTrain, aes_string(x=keyName, y=yName)) +
  geom_line(data=dTrain, aes_string(x=keyName, y='pred'),
            color='red', linetype=1) +
  geom_segment(aes(x=min(dTrain[,keyName]),
                   xend=max(dTrain[,keyName]),
                   y=mean(dTrain[,yName]),
                   yend=mean(dTrain[,yName]))) +
  geom_smooth(data=d, aes_string(x=keyName, y=yName),
              color='black')
# Did PCA help us at all?
g <- as.formula(paste(yName, paste(varNames, collapse=' + '), sep=' ~ '))
model <- lm(g, data=dTrain)
dTest$pred <- predict(model, newdata=dTest)
dTrain$pred <- predict(model, newdata=dTrain)
print(rmse(dTrain[, yName], mean(dTrain[, yName])))
print(rmse(dTrain[, yName], dTrain$pred))
print(rmse(dTest[, yName], mean(dTest[, yName])))
print(rmse(dTest[, yName], dTest$pred)) # 0.1973 vs. 0.2004 with PCA, i.e. PCA gave little benefit
# Re-run the PCA with scaling fixed, throwing out synthetic variables with small variation
pcomp <- prcomp(d[,varNames], scale.=TRUE)
synthNames <- colnames(pcomp$rotation)[pcomp$sdev>1]
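# Note (added): the sdev > 1 cut is the Kaiser-style rule. After scaling, each
# input variable contributes variance 1, so components with sdev^2 > 1 explain
# more than any single original variable. Inspect the spectrum if curious:
print(pcomp$sdev^2)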
f <- as.formula(paste(yName, paste(synthNames, collapse=' + '),sep=' ~ '))
model <- step(lm(f,data=dTrain))
dTest$pred <- predict(model,newdata=dTest)
dTrain$pred <- predict(model,newdata=dTrain)
print(rmse(dTrain[, yName], mean(dTrain[, yName])))
print(rmse(dTrain[, yName], dTrain$pred))
print(rmse(dTest[, yName], mean(dTest[, yName])))
print(rmse(dTest[, yName], dTest$pred)) # RMSE worsens to 0.2778, somewhat unexpectedly
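# One likely reason (added; not from the original post): the PC columns used
# above still hold scores from the *unscaled* PCA bound into d earlier; only
# the column names were reselected against the scaled fit. A sketch of the
# rerun with properly recomputed scaled scores, assuming that was the intent.
# predict.prcomp applies the stored center/scale automatically:
d[, colnames(pcomp$rotation)] <- predict(pcomp, d[, varNames])
dTrain <- subset(d, trainGroup)
dTest <- subset(d, !trainGroup)
model <- step(lm(f, data=dTrain))
dTest$pred <- predict(model, newdata=dTest)
print(rmse(dTest[, yName], dTest$pred)) # re-check the holdout RMSE with corrected scores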