Skip to content

Instantly share code, notes, and snippets.

@dinovski
Last active December 23, 2020 09:19
Show Gist options
  • Save dinovski/e9aaa99880a3354b58dcbd50a59d5039 to your computer and use it in GitHub Desktop.
Save dinovski/e9aaa99880a3354b58dcbd50a59d5039 to your computer and use it in GitHub Desktop.
import cell sorting data and perform automated gating using unsupervised learning (k-means)
#!/usr/bin/Rscript
## Perform automated gating of FACS data using unsupervised clustering and regression analysis.
## Data is first filtered to exclude debris (low FSC and high SSC) and doublets (FSC-A v FSC-H)
## and the CSV is exported using FlowJo (flowjo.com).
## This analysis was designed to assess the effects of various microsatellite lengths on gene expression
## using a reporter assay. Briefly, varying 'eSTRs' (gBlock gene fragments) were cloned upstream of a CMV
## promoter driving GFP reporter gene expression and co-transfected with an RFP expression plasmid at a
## raio of 2:1. Reference samples of GFP:RFP were co-transfected at various ratios (1:1, 2:1, 3:1, 4:1) and
## prepared in triplicate for each experiment.
## WORKFLOW
## 1. Import CSV with fluorescent measures
## 2. Filter extreme values
## 3. Perform k-means clustering to gate cell populations
## 4. Fit linear model and compute MSE
## 5. Export slope of regression to compare signal across conditions
## LOAD ENVIRONMENT
library(mclust)
library(fpc)
library(tclust)
library(dplyr)
library(plotrix)
interactive()
## INPUT
inPath='/path/to/csvs/'
outPath='/path/to/export/'
sampMarker='FITC.A'
refMarker='E.Texas.Red.A'
## References
refs<-list.files(path=inPath, recursive=FALSE, pattern=".csv", full.names=TRUE)
x<-c(1, 1.5, 2, 2.5, 3, 4) ## Transfection ratio of sampMarker to refMarker
xlab<-"Tfxn ratio GFP:RFP"
main<-"Reference plate"
lrmain=Sys.date()
## Samples
samps<-list.files(path=inPath, recursive=FALSE, pattern=".csv", full.names=TRUE)
x<-c(0, 0, 0, 11, 11, 11, 16, 16, 16, 21, 21, 21) ## STR allele length
xlab<-"STR allele length"
main<-"500ng gblock 2:1 GFP:RFP" ## Experiment name
lrmain<-Sys.date()
##---------------
## FUNCTIONS
## fit lm with automated gating
clust_analysis <- function (filenames) {
slopes<-numeric()
up<-numeric()
low<-numeric()
lr<-numeric()
for (i in 1:length(filenames)) {
print(filenames[i])
f <- read.csv(filenames[i], header=TRUE)
## Filter extreme values
f<-f[f[,c(sampMarker)] < max(f[,c(sampMarker)]),]
#f<-f[f[,c(refMarker)] < max(f[,c(refMarker)]),]
## Select samp and ref marker columns
g<-f[,c(sampMarker)]
r<-f[,c(refMarker)]
## BEGIN LOG
#plot log scale
plot(xy.coords(r,g),xlim=c(1,max(r)),ylim=c(1,max(g)),log="xy",xlab="RFP",ylab="GFP")
#plot linear
#gr<-cbind(r,g)
#plot(gr[,1], gr[,2])
grlinear<-cbind(r, g)
## gate on log scale
gr<-grlinear + 1-min(grlinear)
gr<-log10(gr)
## END LOG
#within SS
#wss <- (nrow(gr)-1)*sum(apply(gr,2,var))
#for (i in 2:10) wss[i] <- sum(kmeans(gr, centers=i)$withinss) #SS
#plot(1:10, wss, type="b", xlab="num clusters", ylab="within group sum of squares")
#tclus<-tkmeans(gr, k=2, alpha = 0.01)
tclus<-tclust(gr, k=2, alpha=0.01)
plot(tclus, main="tclust", xlab=c(refMarker), ylab=c(sampMarker))
#readline(prompt="enter")
#append cluster info to linear data
gr <- data.frame(grlinear, tclus$cluster)
gr1<-gr[gr[,3]==1,]
gr2<-gr[gr[,3]==2,]
#gr3<-gr[gr[,3]==3,]
#gr4<-gr[gr[,3]==4,]
## plot diff clusters
#plot(gr1[,1], gr1[,2], main="cluster 1")
#readline(prompt="enter to continue")
#plot(gr2[,1], gr2[,2], main="cluster 2")
#readline(prompt="enter to continue")
#plot(gr3[,1], gr3[,2], main="cluster 3")
#readline(prompt="enter to continue")
#plot(gr4[,1], gr4[,2], main="cluster 4")
#readline(prompt="enter to continue")
## linear regression sampMarker ~ refMarker
fit<-lm(gr2[,c(sampMarker)] ~ gr2[,c(refMarker)])
## slope and intercept
b <- fit$coefficients[2]
a <- fit$coefficients[1]
## plot sampMarker v. refMarker cluster
plot(gr1[,1], gr1[,2], pch=20, bg='blue', col='blue')
abline(a,b, col = 'red')
slopes<-append(slopes, b)
## confidence intervals
ci <- confint(fit, level=0.95) # CI for slope
u <-ci[4]
l <- ci[2]
up<-append(up, u)
low<-append(low, l)
}
plotCI(x, slopes, up-slopes, slopes-low, main=main, xlab=xlab, ylab=paste("slope(", sampMarker, "~" ,refMarker, ")"))
print(u)
print(l)
}
## fit lm without automated gating
facs_analyze <- function (filenames) {
slopes <- numeric()
up<-numeric()
low<-numeric()
summaries<-NULL
for (i in 1:length(filenames)) {
print(filenames[i])
f <- read.csv(filenames[i], header=TRUE)
## remove extreme values
f<-f[f[,(sampMarker)]<max(f[,c(sampMarker)]),]
f<-f[f[,c(refMarker)]<max(f[,c(refMarker)]),]
#c<-cor(f[,c(refMarker)], f[,c(sampMarker)])
#c<-cor.test(f[,c(refMarker)], f[,c(sampMarker)], method="pearson")
#print(c)
## fit linear model
fit <- lm(f[,c(sampMarker)] ~ f[,c(refMarker)])
## MSE
sm<-summary(fit)
fitsum<-mean(sm$residuals^2)
summaries<-append(summaries, fitsum)
## slope and intercept
b <- fit$coefficients[2]
a <- fit$coefficients[1]
## plot sampMarker v. refMarker
#lr<-plot(f[,c(refMarker)], f[,c(sampMarker)], pch=20, bg='blue', col='blue', main=lrmain)
#abline(a,b, col = 'red')
#read <- readline( paste(sampMarker, "~", refMarker, ": press [enter] to continue"))
## append slope
slopes<-append(slopes, b)
## confidence intervals
ci <- confint(fit, "f[,c(refMarker)]", level=0.95) # CI of slope
u <-ci[2]
l <- ci[1]
up<-append(up, u)
low<-append(low, l)
}
## print MSE info with condition IDs(x)
names(summaries)<-c(x)
print(summaries)
names(slopes)<-c(x)
print(slopes)
## plot transfection ratios and slopes of samp:ref regression with CIs
plotCI(x, slopes, up-slopes, slopes-low, main=main, xlab=xlab, ylab=paste("slope(", sampMarker, "~" ,refMarker, ")"))
}
## Execute functions
facs_analyze(refs)
facs_analyze(samps)
clust_analysis(refs)
clust_analysis(samps)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment