SRA Sequencing Platform Statistics
#' --- | |
#' title: "SRA Sequencing Platform Analysis" | |
#' author: "Phil Ewels" | |
#' date: September 2014 | |
#' --- | |
#' The Sequence Read Archive [(SRA)](http://www.ncbi.nlm.nih.gov/sra) | |
#' contains the raw DNA-sequencing data for many published datasets. | |
#' We were interested in which sequencing platforms were prevalent, | |
#' or rather, how prevalent Illumina Sequencing is. We were specifically | |
#' interested in Human data, notably Whole Genome Sequencing (WGS). | |
#+ include=FALSE | |
setwd('~/Work/SRA_illumina_stats/') | |
#+ par=TRUE | |
# SETUP | |
library('scales') | |
#' To get at the metadata contained within the SRA, I did a search for | |
#' all human samples - `"homo sapiens"[Organism]`. This returns 184586 | |
#' results at the time of writing. The summaries of these search results | |
#' can be downloaded as a `.csv` file, which is used in this analysis. | |
# Load in the data | |
data <- read.csv('sra_result_human.csv') | |
# Look at the column headers | |
colnames(data) | |
# Split up by Illumina and Not Illumina | |
illumina_machines <- c('Illumina Genome Analyzer', 'Illumina Genome Analyzer II', 'Illumina Genome Analyzer IIx', | |
'Illumina HiScanSQ', 'Illumina HiSeq 1000', 'Illumina HiSeq 1500', 'Illumina HiSeq 2000', | |
'Illumina HiSeq 2500', 'Illumina MiSeq') | |
total_illumina <- data[(data$Instrument %in% illumina_machines),] | |
total_not_illumina <- data[!(data$Instrument %in% illumina_machines),] | |
#' # Initial Overview | |
#' We're most interested in the number of samples and the total number | |
#' of base pairs sequenced - these give a feel of the dominancy of certain | |
#' platforms. | |
# Count samples and base pairs | |
num_samples <- nrow(data) | |
num_samples.illumina <- nrow(total_illumina) | |
num_samples.not_illumina <- nrow(total_not_illumina) | |
num_bp <-sum(data$Total.Bases, na.rm=TRUE) | |
num_bp.illumina <- sum(total_illumina$Total.Bases, na.rm=TRUE) | |
num_bp.not_illumina <- sum(total_not_illumina$Total.Bases, na.rm=TRUE) | |
# What numbers are we talking about here? | |
num_samples | |
num_bp | |
# Plot the number of samples for Illumina / Not Illumina | |
par(mai=c(1,1,1,1)) | |
pie(c(num_samples.illumina, num_samples.not_illumina), init.angle=90, | |
labels=c(paste('Illumina -', percent(num_samples.illumina/num_samples)), | |
paste('Not Illumina -', percent(num_samples.not_illumina/num_samples)))) | |
title("Number of Samples per Platform") | |
# Plot the total base pairs sequenced for Illumina / Not Illumina | |
pie(c(num_bp.illumina, num_bp.not_illumina), init.angle=0, | |
labels=c(paste('Illumina -', percent(num_bp.illumina/num_bp)), | |
paste('Not Illumina -', percent(num_bp.not_illumina/num_bp)))) | |
title("Number of Base Pairs per Platform") | |
#' ## Hang on, what was that? | |
#' The above plot is really surprising! The vast majority of samples are | |
#' sequences using Illumina machines, but when looking at the total | |
#' number of base pairs this drops significantly. | |
#' | |
#' # Explaining the skew in percentage of base pairs | |
#+ fig.width=8, fig.height=8, par=TRUE | |
# Back to the original data - plot number of samples for all instruments | |
par(mai=c(1,3,1,1)) | |
samples_by_instrument <- table(data$Instrument) | |
samples_by_instrument <- sort(samples_by_instrument) | |
barplot(samples_by_instrument, horiz=TRUE, las=1, main="Samples per Instrument", cex.names=0.7) | |
# Plot the total base pairs for all instruments | |
bp_by_instrument <- rowsum(data$Total.Bases, data$Instrument, na.rm = TRUE) | |
bp_by_instrument <- bp_by_instrument[order(bp_by_instrument[,1]),] # sort by frequency | |
barplot(bp_by_instrument, horiz=TRUE, las=1, main="BP Per Instrument", cex.names=0.7) | |
#' Ok, So it looks like this discrepancy is caused by a handful of samples | |
#' sequenced using the `Complete Genomics` platform. What kinds of studies | |
#' were these? | |
#+ fig.width=8, fig.height=8, par=TRUE | |
# Split this up for library types | |
# I should group the low frequency types into "other". For now, it's just | |
# sorted by frequency so ignore the stuff at the bottom of the legend. | |
par(mai=c(1,3,1,1)) | |
samples_type_per_instrument <- table(data$Library.Strategy, data$Instrument) | |
samples_type_per_instrument <- samples_type_per_instrument[order(rowSums(samples_type_per_instrument),decreasing=TRUE),order(colSums(samples_type_per_instrument))] | |
library('RColorBrewer') | |
colours<-brewer.pal(11, 'Spectral') | |
cols<-colorRampPalette(colours)(nrow(samples_type_per_instrument)) | |
barplot(samples_type_per_instrument, horiz=TRUE, las=1, main="Samples per Instrument", col=colours, cex.names=0.7) | |
legend("bottomright", rownames(samples_type_per_instrument), fill=colours, cex=0.7) | |
# Base pairs per platform split by library types | |
# This is horrible code. If anyone has a better way of writing this I'd love to know it! | |
library('plyr') | |
library('reshape2') | |
bp_type_per_instrument <- aggregate(Total.Bases ~ Instrument + Library.Strategy, data = data, sum, simplify=TRUE) | |
# Unmelt these two variables to make a matrix | |
bp_type_per_instrument <- dcast(bp_type_per_instrument, Library.Strategy ~ Instrument, fill=0) | |
# Massage into the correct format | |
rownames(bp_type_per_instrument) <- bp_type_per_instrument[,1] | |
bp_type_per_instrument <- as.matrix(bp_type_per_instrument[,-1]) | |
# Sort that bad boy | |
bp_type_per_instrument <- bp_type_per_instrument[order(rowSums(bp_type_per_instrument),decreasing=TRUE),order(colSums(bp_type_per_instrument))] | |
cols<-colorRampPalette(colours)(nrow(bp_type_per_instrument)) | |
barplot(bp_type_per_instrument, horiz=TRUE, las=1, main="Base Pairs per Instrument", col=colours, cex.names=0.7) | |
legend("bottomright", rownames(bp_type_per_instrument), fill=colours, cex=0.7) | |
#' Ok, so that explains it - there are some Whole Genome Sequencing (WGS) | |
#' and Whole Exome Sequencing (WXS) projects done by Complete Genomics which | |
#' are massive and are accounting for a large proportion of the base pairs | |
#' stored on the SRA. | |
#' | |
#' [Complete Genomics](http://www.completegenomics.com/) are a sequencing | |
#' facility based in the USA. They have a very complicated library prep | |
#' and bioinformatics pipline. As such, they don't sell sequencing machines, | |
#' but do #' genome sequencing of samples sent in. They were recently bought by | |
#' [BGI](http://www.genomics.cn/). | |
#' | |
#' # Repeating the Summary Stats with Complete Genomics | |
# Repeat counts with Complete genomics | |
total_cg <- data[data$Instrument == 'Complete Genomics',] | |
num_samples.cg <- nrow(total_cg) | |
num_samples.not_illumina_cg <- num_samples.not_illumina - num_samples.cg | |
num_bp.cg <- sum(total_cg$Total.Bases, na.rm=TRUE) | |
num_bp.not_illumina_cg <- num_bp.not_illumina - num_bp.cg | |
# Summarise again with Complete genomics | |
par(mai=c(1,1,1,1)) | |
pie(c(num_samples.illumina, num_samples.cg, num_samples.not_illumina_cg), init.angle=90, | |
labels=c(paste('Illumina -', percent(num_samples.illumina/num_samples)), | |
paste('Complete Genomics -', percent(num_samples.cg/num_samples)), | |
paste('Not Illumina -', percent(num_samples.not_illumina_cg/num_samples)))) | |
title("Number of Samples per Platform") | |
pie(c(num_bp.illumina, num_bp.cg, num_bp.not_illumina_cg), init.angle=140, | |
labels=c(paste('Illumina -', percent(num_bp.illumina/num_bp)), | |
paste('Complete Genomics -', percent(num_bp.cg/num_bp)), | |
paste('Not Illumina -', percent(num_bp.not_illumina_cg/num_bp)))) | |
title("Number of Base Pairs per Platform") | |
#' # WGS Summary, without Complete Genomics | |
#' Finally, it would be nice to know the precentages for just Whole | |
#' Genome Sequencing (WGS), ignoring the Complete Genomics platform. | |
# Find summary stats in WGS only, without Complete Genomics | |
total_illumina.WGS <- total_illumina[total_illumina$Library.Strategy=='WGS',] | |
total_not_illumina_cg <- data[!(data$Instrument %in% c(illumina_machines, 'Complete Genomics')),] | |
total_not_illumina_cg.WGS <- total_not_illumina_cg[total_not_illumina_cg$Library.Strategy=='WGS',] | |
num_samples.wgs.illumina <- nrow(total_illumina.WGS) | |
num_samples.wgs.not_illumina_cg <- nrow(total_not_illumina_cg.WGS) | |
num_samples.wgs.not_cg <- num_samples.wgs.illumina + num_samples.wgs.not_illumina_cg | |
num_bp.wgs.illumina <- sum(total_illumina.WGS$Total.Bases, na.rm=TRUE) | |
num_bp.wgs.not_illumina_cg <- sum(total_not_illumina_cg.WGS$Total.Bases, na.rm=TRUE) | |
num_bp.wgs.not_cg <- num_bp.wgs.illumina + num_bp.wgs.not_illumina_cg | |
pie(c(num_samples.wgs.illumina, num_samples.wgs.not_illumina_cg), init.angle=90, | |
labels=c(paste('Illumina -', percent(num_samples.wgs.illumina/num_samples.wgs.not_cg)), | |
paste('Not Illumina or CG -', percent(num_samples.wgs.not_illumina_cg/num_samples.wgs.not_cg)))) | |
title("Number of Samples per Platform - WGS, no CG") | |
pie(c(num_bp.wgs.illumina, num_bp.wgs.not_illumina_cg), init.angle=90, | |
labels=c(paste('Illumina -', percent(num_bp.wgs.illumina/num_bp.wgs.not_cg)), | |
paste('Not Illumina or CG -', percent(num_bp.wgs.not_illumina_cg/num_bp.wgs.not_cg)))) | |
title("Number of Base Pairs per Platform - WGS, no CG") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment