Created
September 18, 2014 16:57
-
-
Save ewels/6fc0717eac64c7fbc8bc to your computer and use it in GitHub Desktop.
SRA Sequencing Platform Statistics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' --- | |
#' title: "SRA Sequencing Platform Analysis" | |
#' author: "Phil Ewels" | |
#' date: September 2014 | |
#' --- | |
#' The Sequence Read Archive [(SRA)](http://www.ncbi.nlm.nih.gov/sra) | |
#' contains the raw DNA-sequencing data for many published datasets. | |
#' We were interested in which sequencing platforms were prevalent, | |
#' or rather, how prevalent Illumina Sequencing is. We were specifically | |
#' interested in Human data, notably Whole Genome Sequencing (WGS). | |
#+ include=FALSE | |
setwd('~/Work/SRA_illumina_stats/') | |
#+ par=TRUE | |
# SETUP | |
library('scales') | |
#' To get at the metadata contained within the SRA, I did a search for | |
#' all human samples - `"homo sapiens"[Organism]`. This returns 184586 | |
#' results at the time of writing. The summaries of these search results | |
#' can be downloaded as a `.csv` file, which is used in this analysis. | |
# Load in the data | |
data <- read.csv('sra_result_human.csv') | |
# Look at the column headers | |
colnames(data) | |
# Split up by Illumina and Not Illumina | |
illumina_machines <- c('Illumina Genome Analyzer', 'Illumina Genome Analyzer II', 'Illumina Genome Analyzer IIx', | |
'Illumina HiScanSQ', 'Illumina HiSeq 1000', 'Illumina HiSeq 1500', 'Illumina HiSeq 2000', | |
'Illumina HiSeq 2500', 'Illumina MiSeq') | |
total_illumina <- data[(data$Instrument %in% illumina_machines),] | |
total_not_illumina <- data[!(data$Instrument %in% illumina_machines),] | |
#' # Initial Overview | |
#' We're most interested in the number of samples and the total number | |
#' of base pairs sequenced - these give a feel of the dominancy of certain | |
#' platforms. | |
# Count samples and base pairs | |
num_samples <- nrow(data) | |
num_samples.illumina <- nrow(total_illumina) | |
num_samples.not_illumina <- nrow(total_not_illumina) | |
num_bp <-sum(data$Total.Bases, na.rm=TRUE) | |
num_bp.illumina <- sum(total_illumina$Total.Bases, na.rm=TRUE) | |
num_bp.not_illumina <- sum(total_not_illumina$Total.Bases, na.rm=TRUE) | |
# What numbers are we talking about here? | |
num_samples | |
num_bp | |
# Plot the number of samples for Illumina / Not Illumina | |
par(mai=c(1,1,1,1)) | |
pie(c(num_samples.illumina, num_samples.not_illumina), init.angle=90, | |
labels=c(paste('Illumina -', percent(num_samples.illumina/num_samples)), | |
paste('Not Illumina -', percent(num_samples.not_illumina/num_samples)))) | |
title("Number of Samples per Platform") | |
# Plot the total base pairs sequenced for Illumina / Not Illumina | |
pie(c(num_bp.illumina, num_bp.not_illumina), init.angle=0, | |
labels=c(paste('Illumina -', percent(num_bp.illumina/num_bp)), | |
paste('Not Illumina -', percent(num_bp.not_illumina/num_bp)))) | |
title("Number of Base Pairs per Platform") | |
#' ## Hang on, what was that? | |
#' The above plot is really surprising! The vast majority of samples are | |
#' sequences using Illumina machines, but when looking at the total | |
#' number of base pairs this drops significantly. | |
#' | |
#' # Explaining the skew in percentage of base pairs | |
#+ fig.width=8, fig.height=8, par=TRUE | |
# Back to the original data - plot number of samples for all instruments | |
par(mai=c(1,3,1,1)) | |
samples_by_instrument <- table(data$Instrument) | |
samples_by_instrument <- sort(samples_by_instrument) | |
barplot(samples_by_instrument, horiz=TRUE, las=1, main="Samples per Instrument", cex.names=0.7) | |
# Plot the total base pairs for all instruments | |
bp_by_instrument <- rowsum(data$Total.Bases, data$Instrument, na.rm = TRUE) | |
bp_by_instrument <- bp_by_instrument[order(bp_by_instrument[,1]),] # sort by frequency | |
barplot(bp_by_instrument, horiz=TRUE, las=1, main="BP Per Instrument", cex.names=0.7) | |
#' Ok, So it looks like this discrepancy is caused by a handful of samples | |
#' sequenced using the `Complete Genomics` platform. What kinds of studies | |
#' were these? | |
#+ fig.width=8, fig.height=8, par=TRUE | |
# Split this up for library types | |
# I should group the low frequency types into "other". For now, it's just | |
# sorted by frequency so ignore the stuff at the bottom of the legend. | |
par(mai=c(1,3,1,1)) | |
samples_type_per_instrument <- table(data$Library.Strategy, data$Instrument) | |
samples_type_per_instrument <- samples_type_per_instrument[order(rowSums(samples_type_per_instrument),decreasing=TRUE),order(colSums(samples_type_per_instrument))] | |
library('RColorBrewer') | |
colours<-brewer.pal(11, 'Spectral') | |
cols<-colorRampPalette(colours)(nrow(samples_type_per_instrument)) | |
barplot(samples_type_per_instrument, horiz=TRUE, las=1, main="Samples per Instrument", col=colours, cex.names=0.7) | |
legend("bottomright", rownames(samples_type_per_instrument), fill=colours, cex=0.7) | |
# Base pairs per platform split by library types | |
# This is horrible code. If anyone has a better way of writing this I'd love to know it! | |
library('plyr') | |
library('reshape2') | |
bp_type_per_instrument <- aggregate(Total.Bases ~ Instrument + Library.Strategy, data = data, sum, simplify=TRUE) | |
# Unmelt these two variables to make a matrix | |
bp_type_per_instrument <- dcast(bp_type_per_instrument, Library.Strategy ~ Instrument, fill=0) | |
# Massage into the correct format | |
rownames(bp_type_per_instrument) <- bp_type_per_instrument[,1] | |
bp_type_per_instrument <- as.matrix(bp_type_per_instrument[,-1]) | |
# Sort that bad boy | |
bp_type_per_instrument <- bp_type_per_instrument[order(rowSums(bp_type_per_instrument),decreasing=TRUE),order(colSums(bp_type_per_instrument))] | |
cols<-colorRampPalette(colours)(nrow(bp_type_per_instrument)) | |
barplot(bp_type_per_instrument, horiz=TRUE, las=1, main="Base Pairs per Instrument", col=colours, cex.names=0.7) | |
legend("bottomright", rownames(bp_type_per_instrument), fill=colours, cex=0.7) | |
#' Ok, so that explains it - there are some Whole Genome Sequencing (WGS) | |
#' and Whole Exome Sequencing (WXS) projects done by Complete Genomics which | |
#' are massive and are accounting for a large proportion of the base pairs | |
#' stored on the SRA. | |
#' | |
#' [Complete Genomics](http://www.completegenomics.com/) are a sequencing | |
#' facility based in the USA. They have a very complicated library prep | |
#' and bioinformatics pipline. As such, they don't sell sequencing machines, | |
#' but do #' genome sequencing of samples sent in. They were recently bought by | |
#' [BGI](http://www.genomics.cn/). | |
#' | |
#' # Repeating the Summary Stats with Complete Genomics | |
# Repeat counts with Complete genomics | |
total_cg <- data[data$Instrument == 'Complete Genomics',] | |
num_samples.cg <- nrow(total_cg) | |
num_samples.not_illumina_cg <- num_samples.not_illumina - num_samples.cg | |
num_bp.cg <- sum(total_cg$Total.Bases, na.rm=TRUE) | |
num_bp.not_illumina_cg <- num_bp.not_illumina - num_bp.cg | |
# Summarise again with Complete genomics | |
par(mai=c(1,1,1,1)) | |
pie(c(num_samples.illumina, num_samples.cg, num_samples.not_illumina_cg), init.angle=90, | |
labels=c(paste('Illumina -', percent(num_samples.illumina/num_samples)), | |
paste('Complete Genomics -', percent(num_samples.cg/num_samples)), | |
paste('Not Illumina -', percent(num_samples.not_illumina_cg/num_samples)))) | |
title("Number of Samples per Platform") | |
pie(c(num_bp.illumina, num_bp.cg, num_bp.not_illumina_cg), init.angle=140, | |
labels=c(paste('Illumina -', percent(num_bp.illumina/num_bp)), | |
paste('Complete Genomics -', percent(num_bp.cg/num_bp)), | |
paste('Not Illumina -', percent(num_bp.not_illumina_cg/num_bp)))) | |
title("Number of Base Pairs per Platform") | |
#' # WGS Summary, without Complete Genomics | |
#' Finally, it would be nice to know the precentages for just Whole | |
#' Genome Sequencing (WGS), ignoring the Complete Genomics platform. | |
# Find summary stats in WGS only, without Complete Genomics | |
total_illumina.WGS <- total_illumina[total_illumina$Library.Strategy=='WGS',] | |
total_not_illumina_cg <- data[!(data$Instrument %in% c(illumina_machines, 'Complete Genomics')),] | |
total_not_illumina_cg.WGS <- total_not_illumina_cg[total_not_illumina_cg$Library.Strategy=='WGS',] | |
num_samples.wgs.illumina <- nrow(total_illumina.WGS) | |
num_samples.wgs.not_illumina_cg <- nrow(total_not_illumina_cg.WGS) | |
num_samples.wgs.not_cg <- num_samples.wgs.illumina + num_samples.wgs.not_illumina_cg | |
num_bp.wgs.illumina <- sum(total_illumina.WGS$Total.Bases, na.rm=TRUE) | |
num_bp.wgs.not_illumina_cg <- sum(total_not_illumina_cg.WGS$Total.Bases, na.rm=TRUE) | |
num_bp.wgs.not_cg <- num_bp.wgs.illumina + num_bp.wgs.not_illumina_cg | |
pie(c(num_samples.wgs.illumina, num_samples.wgs.not_illumina_cg), init.angle=90, | |
labels=c(paste('Illumina -', percent(num_samples.wgs.illumina/num_samples.wgs.not_cg)), | |
paste('Not Illumina or CG -', percent(num_samples.wgs.not_illumina_cg/num_samples.wgs.not_cg)))) | |
title("Number of Samples per Platform - WGS, no CG") | |
pie(c(num_bp.wgs.illumina, num_bp.wgs.not_illumina_cg), init.angle=90, | |
labels=c(paste('Illumina -', percent(num_bp.wgs.illumina/num_bp.wgs.not_cg)), | |
paste('Not Illumina or CG -', percent(num_bp.wgs.not_illumina_cg/num_bp.wgs.not_cg)))) | |
title("Number of Base Pairs per Platform - WGS, no CG") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment