Create a gist now

Instantly share code, notes, and snippets.

R code for examining principle components analysis
*.png
*.tab
S3 = s3cmd -c ~/.s3.bioinformaticszen
BUCKET = s3://bioinformatics-zen/principal_components_analysis/
all: upload
upload: regression pca
ls *.png | xargs -I {} $(S3) put {} $(BUCKET)
ls *.png | xargs -I {} $(S3) setacl $(BUCKET){} --acl-public
regression: regression.r
./$<
pca: pca.r
./$<
clean:
rm -f *.png *.tab
#!/usr/bin/env Rscript
# load the crab data
library(ggplot2)
library(MASS)
data(crabs)
crabs <- within(crabs,{
levels(sp) <- c('Blue','Orange')
levels(sex) <- c('Female','Male')
})
# Perform PCA on the data
# retx returns the scores for each crab
# Append the components for each crab to the original data
crab.pca <- prcomp(crabs[,4:8],retx=TRUE)
crabs$PC1 <- crab.pca$x[,1]
crabs$PC2 <- crab.pca$x[,2]
crabs$PC3 <- crab.pca$x[,3]
# Print the rounded scores for the first three components
write.table(signif(crab.pca$rotation[,1:3],3),"rotations.tab")
#
# Plot discrimating factors found in the second component
#
plot <- ggplot(crabs, aes(y = CW, x = RW)) +
geom_point() +
scale_x_continuous("Rear width (mm)") +
scale_y_continuous("Carapace width (mm)")
png("second_component_dotplot.png"); print(plot)
#
# The second principle component corresponds to differences in
# the ratio of carapace width to rear width
#
plot <- ggplot(crabs, aes(x = CW/RW)) +
stat_density() +
scale_x_continuous("Ratio of Carapace width to Rear width") +
scale_y_continuous("Density")
png("second_component_density.png"); print(plot)
#
# Discriminating factors for the third component
#
plot <- ggplot(crabs, aes(x = CW, y = BD)) +
geom_point() +
scale_x_continuous("Carapace width (mm)") +
scale_y_continuous("Body depth (mm)")
png("third_component_dotplot.png"); print(plot)
#
# The third component corresponds to body depth and carapace width
#
plot <- ggplot(crabs, aes(x = BD/CW)) +
stat_density() +
scale_x_continuous("Ratio of Body Depth to Carapace Width") +
scale_y_continuous("Density")
png("third_component_density.png"); print(plot)
#
# Plot crab species for each of the two variables
#
plot <- ggplot(crabs, aes(x = BD/CW, y = CW/RW, pch=sex, col=sp, size=sex)) +
geom_point() +
scale_x_continuous("Ratio of body depth to carapace width") +
scale_y_continuous("Ratio of carapace width to rear width") +
scale_shape_manual("Sex",values=c(1,15)) +
scale_color_manual("Crab\nSpecies",values=c("blue","orange3")) +
scale_size_manual(values=c(4,3),guide=FALSE) +
theme(legend.position = "bottom")
png("morphology.png",height=600); print(plot)
#
# Plot first two components
#
plot <- ggplot(crabs, aes(x = PC1, y = PC2, pch=sex, col=sp, size=sex)) +
geom_point() +
scale_x_continuous("First Principle Component") +
scale_y_continuous("Second Principle Component") +
scale_shape_manual("Sex",values=c(1,15)) +
scale_color_manual("Crab\nSpecies",values=c("blue","orange3")) +
scale_size_manual(values=c(4,3),guide=FALSE) +
theme(legend.position = "bottom")
png("first_components.png",height=600); print(plot)
#
# Plot second two components
#
plot <- ggplot(crabs, aes(x = PC2, y = PC3, pch=sex, col=sp, size=sex)) +
geom_point() +
scale_x_continuous("Second Principle Component") +
scale_y_continuous("Third Principle Component") +
scale_shape_manual("Sex",values=c(1,15)) +
scale_color_manual("Crab\nSpecies",values=c("blue","orange3")) +
scale_size_manual(values=c(4,3),guide=FALSE) +
theme(legend.position = "bottom")
png("second_components.png",height=600); print(plot)
#!/usr/bin/env Rscript
library(ggplot2)
# load the crab data
library(MASS)
data(crabs)
crabs <- within(crabs,{
levels(sp) <- c('Blue','Orange')
levels(sex) <- c('Male','Female')
orange <- ifelse(sp == 'Orange',1,0)
male <- ifelse(sex == 'Male' ,1,0)
})
accuracy <- function(model,variable,testing){
predictions <- ifelse(predict(model,testing) > 0.5, 1, 0)
1 - sum((testing[,variable] - predictions)^2) / dim(testing)[1]
}
simulate.accuracy <- function(formula,variable){
replicate(100,{
n <- nrow(crabs)
n.training <- 1:120
n.testing <- 121:200
shuffle <- sample(n)
training <- crabs[shuffle[n.training],]
testing <- crabs[shuffle[n.testing],]
accuracy(glm(formula, data=training, family="binomial"),variable,testing)
})
}
simulations <- data.frame(
accuracy = c(
simulate.accuracy(male ~ CW + RW, 'male'),
simulate.accuracy(male ~ CW + RW + I(CW/RW),'male'),
simulate.accuracy(orange ~ CW + BD, 'orange'),
simulate.accuracy(orange ~ CW + BD + I(BD/CW),'orange')),
predictor = rep(c('sex', 'species'),1,each=200),
model.type = rep(c('simple','ratio'), 2,each=100))
p <- ggplot(simulations,aes(x=predictor, y=accuracy * 100, fill=model.type)) +
geom_boxplot() +
scale_x_discrete("Predicted Variable") +
scale_y_continuous("Model Accuracy (%)")
png('accuracy.png')
print(p)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment