Created
October 28, 2012 20:30
-
-
Save michaelbarton/3969797 to your computer and use it in GitHub Desktop.
R code for examining principle components analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*.png | |
*.tab |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
S3 = s3cmd -c ~/.s3.bioinformaticszen | |
BUCKET = s3://bioinformatics-zen/principal_components_analysis/ | |
all: upload | |
upload: regression pca | |
ls *.png | xargs -I {} $(S3) put {} $(BUCKET) | |
ls *.png | xargs -I {} $(S3) setacl $(BUCKET){} --acl-public | |
regression: regression.r | |
./$< | |
pca: pca.r | |
./$< | |
clean: | |
rm -f *.png *.tab |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript | |
# load the crab data | |
library(ggplot2) | |
library(MASS) | |
data(crabs) | |
crabs <- within(crabs,{ | |
levels(sp) <- c('Blue','Orange') | |
levels(sex) <- c('Female','Male') | |
}) | |
# Perform PCA on the data | |
# retx returns the scores for each crab | |
# Append the components for each crab to the original data | |
crab.pca <- prcomp(crabs[,4:8],retx=TRUE) | |
crabs$PC1 <- crab.pca$x[,1] | |
crabs$PC2 <- crab.pca$x[,2] | |
crabs$PC3 <- crab.pca$x[,3] | |
# Print the rounded scores for the first three components | |
write.table(signif(crab.pca$rotation[,1:3],3),"rotations.tab") | |
# | |
# Plot discrimating factors found in the second component | |
# | |
plot <- ggplot(crabs, aes(y = CW, x = RW)) + | |
geom_point() + | |
scale_x_continuous("Rear width (mm)") + | |
scale_y_continuous("Carapace width (mm)") | |
png("second_component_dotplot.png"); print(plot) | |
# | |
# The second principle component corresponds to differences in | |
# the ratio of carapace width to rear width | |
# | |
plot <- ggplot(crabs, aes(x = CW/RW)) + | |
stat_density() + | |
scale_x_continuous("Ratio of Carapace width to Rear width") + | |
scale_y_continuous("Density") | |
png("second_component_density.png"); print(plot) | |
# | |
# Discriminating factors for the third component | |
# | |
plot <- ggplot(crabs, aes(x = CW, y = BD)) + | |
geom_point() + | |
scale_x_continuous("Carapace width (mm)") + | |
scale_y_continuous("Body depth (mm)") | |
png("third_component_dotplot.png"); print(plot) | |
# | |
# The third component corresponds to body depth and carapace width | |
# | |
plot <- ggplot(crabs, aes(x = BD/CW)) + | |
stat_density() + | |
scale_x_continuous("Ratio of Body Depth to Carapace Width") + | |
scale_y_continuous("Density") | |
png("third_component_density.png"); print(plot) | |
# | |
# Plot crab species for each of the two variables | |
# | |
plot <- ggplot(crabs, aes(x = BD/CW, y = CW/RW, pch=sex, col=sp, size=sex)) + | |
geom_point() + | |
scale_x_continuous("Ratio of body depth to carapace width") + | |
scale_y_continuous("Ratio of carapace width to rear width") + | |
scale_shape_manual("Sex",values=c(1,15)) + | |
scale_color_manual("Crab\nSpecies",values=c("blue","orange3")) + | |
scale_size_manual(values=c(4,3),guide=FALSE) + | |
theme(legend.position = "bottom") | |
png("morphology.png",height=600); print(plot) | |
# | |
# Plot first two components | |
# | |
plot <- ggplot(crabs, aes(x = PC1, y = PC2, pch=sex, col=sp, size=sex)) + | |
geom_point() + | |
scale_x_continuous("First Principle Component") + | |
scale_y_continuous("Second Principle Component") + | |
scale_shape_manual("Sex",values=c(1,15)) + | |
scale_color_manual("Crab\nSpecies",values=c("blue","orange3")) + | |
scale_size_manual(values=c(4,3),guide=FALSE) + | |
theme(legend.position = "bottom") | |
png("first_components.png",height=600); print(plot) | |
# | |
# Plot second two components | |
# | |
plot <- ggplot(crabs, aes(x = PC2, y = PC3, pch=sex, col=sp, size=sex)) + | |
geom_point() + | |
scale_x_continuous("Second Principle Component") + | |
scale_y_continuous("Third Principle Component") + | |
scale_shape_manual("Sex",values=c(1,15)) + | |
scale_color_manual("Crab\nSpecies",values=c("blue","orange3")) + | |
scale_size_manual(values=c(4,3),guide=FALSE) + | |
theme(legend.position = "bottom") | |
png("second_components.png",height=600); print(plot) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript | |
library(ggplot2) | |
# load the crab data | |
library(MASS) | |
data(crabs) | |
crabs <- within(crabs,{ | |
levels(sp) <- c('Blue','Orange') | |
levels(sex) <- c('Male','Female') | |
orange <- ifelse(sp == 'Orange',1,0) | |
male <- ifelse(sex == 'Male' ,1,0) | |
}) | |
accuracy <- function(model,variable,testing){ | |
predictions <- ifelse(predict(model,testing) > 0.5, 1, 0) | |
1 - sum((testing[,variable] - predictions)^2) / dim(testing)[1] | |
} | |
simulate.accuracy <- function(formula,variable){ | |
replicate(100,{ | |
n <- nrow(crabs) | |
n.training <- 1:120 | |
n.testing <- 121:200 | |
shuffle <- sample(n) | |
training <- crabs[shuffle[n.training],] | |
testing <- crabs[shuffle[n.testing],] | |
accuracy(glm(formula, data=training, family="binomial"),variable,testing) | |
}) | |
} | |
simulations <- data.frame( | |
accuracy = c( | |
simulate.accuracy(male ~ CW + RW, 'male'), | |
simulate.accuracy(male ~ CW + RW + I(CW/RW),'male'), | |
simulate.accuracy(orange ~ CW + BD, 'orange'), | |
simulate.accuracy(orange ~ CW + BD + I(BD/CW),'orange')), | |
predictor = rep(c('sex', 'species'),1,each=200), | |
model.type = rep(c('simple','ratio'), 2,each=100)) | |
p <- ggplot(simulations,aes(x=predictor, y=accuracy * 100, fill=model.type)) + | |
geom_boxplot() + | |
scale_x_discrete("Predicted Variable") + | |
scale_y_continuous("Model Accuracy (%)") | |
png('accuracy.png') | |
print(p) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment