Skip to content

Instantly share code, notes, and snippets.

@vsoch
Last active December 26, 2015 00:09
Show Gist options
  • Save vsoch/7061899 to your computer and use it in GitHub Desktop.
Save vsoch/7061899 to your computer and use it in GitHub Desktop.
R Functions in the Cloud!
# This function will use plotly to create an online boxplot!
plotlyBox = function(username,key,datastore) {
# Username is your plotly username
# API key is found via plot.ly, login --> Access plotly --> settings
# datalist is a list of data
# The number of plots, N, is determined by length(N)
# Eg, put your data into a list
# datastore = list()
# datastore[[1]] = boxplot1
# datastore[[2]] = boxplot2
# Other packages that you need installed!
# Rcurl (for postForm)
# RJSONIO (for toJSON)
# You can install all with devtools:
# install.packages('devtools')
# library('devtools')
# devtools::install_github("plotly/R-api")
# Plotly functions code lovely and available at:
# https://plot.ly/r/script-demos/spectral-density-box-plot-example/
# ------------------------------------------------
# Install plotly (or re-install, heh)
require('devtools')
require('plotly')
py <- plotly(username=username, key=key)
# Define number of boxes
N = length(datastore)
# Source makeColors function to define boxplot colors
source('http://tinyurl.com/vsochR')
# Define colors
col = makeColors(N)
# Now package up colors and data!
package = lapply(
seq(N),
function(i){
list(
y = datastore[[i]],type = 'box', marker=list(color = col[i][[1]][1])
)
}
)
# Define the layout
layout <- list(
xaxis = list(
showgrid = FALSE,
zeroline = FALSE,
tickangle = 60,
showticklabels = FALSE
),
yaxis = list(
zeroline = FALSE,
gridcolor = 'white'
),
paper_bgcolor = 'rgb(233,233,233)',
plot_bgcolor = 'rgb(233,233,233)',
showlegend = FALSE
)
# Now send to plotly!
response <- py$plotly(package, kwargs = list(layout=layout))
return(response)
}
#-- PREPROCESSING------------------------------------------------
# Normalize data
normalize <- function(x) {
x <- sweep(x, 2, apply(x, 2, min))
sweep(x, 2, apply(x, 2, max), "/")
}
# Merge data based on row name and drop nonshared rows
mergeByRows = function(data1,data2){
# Find values shared by both data
shared.names = intersect(rownames(data1),rownames(data2))
data1.sub = data1[which(rownames(data1) %in% shared.names),]
data2.sub = data2[which(rownames(data2) %in% shared.names),]
data = cbind(data1.sub,data2.sub)
return(data)
}
#-K-MEANS CLUSTERING --------------------------------------------
# Run kmeans and visualize error against number of clusters
runKmeans = function(data) {
require(graphics)
x = normalize(data)
# Determine number of clusters
wss <- (nrow(x)-1)*sum(apply(x,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(x, centers=i)$withinss)
plot(1:15, wss, type="b", xlab="Number of Clusters",ylab="Within groups sum of squares")
# cluster based on 2,3,4,5 classes dimensional example
for (k in 1:5) {
(cl <- kmeans(x, k))
points(cl$centers, col = 1:2, pch = 8, cex = 2)
}
}
# Run kmeans and color based on a particular metric
runKmeansColor = function(data,colorBy) {
require(graphics)
# Normalize data
normalize <- function(x) {
x <- sweep(x, 2, apply(x, 2, min))
sweep(x, 2, apply(x, 2, max), "/")
}
x = normalize(data)
# Determine number of clusters
wss <- (nrow(x)-1)*sum(apply(x,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(x, centers=i)$withinss)
plot(1:15, wss, type="b", xlab="Number of Clusters",ylab="Within groups sum of squares")
# cluster based on 2,3,4,5 classes dimensional example
for (k in 1:5) {
(cl <- kmeans(x, k))
plot(colorBy, col = cl$cluster)
points(cl$centers, col = 1:2, pch = 8, cex = 2)
}
}
#-HIERARCHICAL CLUSTERING -----------------------------------------
# Standard Hierarchical Clustering
# data matrix, number of clusters, and labels
WardHierCluster = function(data,numclus,lab) {
# Ward Hierarchical Clustering
d <- dist(data, method = "euclidean") # distance matrix
fit <- hclust(d, method="ward")
plot(fit,main="Clustering of Cortical Metrics",labels=lab) # display dendogram
groups <- cutree(fit, k=numclus) # cut tree into 6 clusters
# draw dendogram with red borders around the 5 clusters
rect.hclust(fit, k=numclus, border="red")
return(fit)
}
# Sparse Hierarchical clustering (sparcl)
# This function returns a primary and complementary sparse hierarchical clustering,
# colored by some labels, y. You should next see if the groups have biological validity
runSparclHier = function(x,y) {
# Load sparcl library
library('sparcl')
# Do tuning parameter selection for sparse hierarchical clustering
perm.out <- HierarchicalSparseCluster.permute(as.matrix(x), wbounds=c(1.5,2:6),nperms=5)
print(perm.out)
plot(perm.out)
# Perform sparse hierarchical clustering
sparsehc <- HierarchicalSparseCluster(dists=perm.out$dists,wbound=perm.out$bestw, method="complete")
par(mfrow=c(1,2))
plot(sparsehc)
plot(sparsehc$hc, labels=rep("", length(y)))
print(sparsehc)
# Plot using knowledge of class labels in order to compare true class
# labels to clustering obtained
par(mfrow=c(1,1))
ColorDendrogram(sparsehc$hc,y=y,main="Cortical Morphometry Clustering",branchlength=.007)
# Now look for secondary clustering after removing signal from the first
sparsehc.comp <- HierarchicalSparseCluster(as.matrix(x),wbound=perm.out$bestw,method="complete",uorth=sparsehc$u)
# Redo the analysis, but this time use "absolute value" dissimilarity:
perm.out <- HierarchicalSparseCluster.permute(as.matrix(x), wbounds=c(1.5,2:6),nperms=5, dissimilarity="absolute.value")
print(perm.out)
plot(perm.out)
# Perform sparse hierarchical clustering
sparsehc.comp <- HierarchicalSparseCluster(dists=perm.out$dists, wbound=perm.out$bestw, method="complete", dissimilarity="absolute.value")
par(mfrow=c(1,2))
plot(sparsehc.comp)
toreturn = list("sparsehc" = sparsehc, "comphc" = sparsehc.comp, "label" = y)
return(toreturn)
}
# - CLUSTERING VALIDATION-----------------------------------------------
# EVALUATE SPARCL CLUSTERING
evalSparcl = function(sparcl,y) {
library('clValid')
# First validate with internal metrics
internal <- clValid(sparcl$u,validation="internal",neighbSize=50, nClust=2:20)
summary(internal)
plot(internal)
# Validate biologically - first create two labels based on Rx
fc <- tapply(rownames(sparcl$u),y, c)
bio <- clValid(sparcl$u, nClust = 2,validation="biological", annotation=fc)
optimalScores(bio)
toreturn = list("bio" = bio, "internal" = internal)
return(toreturn)
}
# Some useful functions to remember for R!
# 1) DELETE FROM A DATA FRAME
# First define the indices that you want to delete
delete = c(16,25,26,73)
# Now use subset to do it
df = subset(df, select = -delete )
# - MAKE COLORS FOR N PLOTS -------------------------------------------------
makeColors = function(N) {
cols = lapply(seq(0,360,length = N),
function(i){
paste('hsl(',as.numeric(i),',50%,50%)',sep='')
}
)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment