Skip to content

Instantly share code, notes, and snippets.

@ewels
Created September 18, 2014 16:57
Show Gist options
  • Save ewels/6fc0717eac64c7fbc8bc to your computer and use it in GitHub Desktop.
Save ewels/6fc0717eac64c7fbc8bc to your computer and use it in GitHub Desktop.
SRA Sequencing Platform Statistics
#' ---
#' title: "SRA Sequencing Platform Analysis"
#' author: "Phil Ewels"
#' date: September 2014
#' ---
#' The Sequence Read Archive [(SRA)](http://www.ncbi.nlm.nih.gov/sra)
#' contains the raw DNA-sequencing data for many published datasets.
#' We were interested in which sequencing platforms were prevalent,
#' or rather, how prevalent Illumina Sequencing is. We were specifically
#' interested in Human data, notably Whole Genome Sequencing (WGS).
#+ include=FALSE
setwd('~/Work/SRA_illumina_stats/')
#+ par=TRUE
# SETUP
library('scales')
#' To get at the metadata contained within the SRA, I did a search for
#' all human samples - `"homo sapiens"[Organism]`. This returns 184586
#' results at the time of writing. The summaries of these search results
#' can be downloaded as a `.csv` file, which is used in this analysis.
# Load in the data
data <- read.csv('sra_result_human.csv')
# Look at the column headers
colnames(data)
# Split up by Illumina and Not Illumina
illumina_machines <- c('Illumina Genome Analyzer', 'Illumina Genome Analyzer II', 'Illumina Genome Analyzer IIx',
'Illumina HiScanSQ', 'Illumina HiSeq 1000', 'Illumina HiSeq 1500', 'Illumina HiSeq 2000',
'Illumina HiSeq 2500', 'Illumina MiSeq')
total_illumina <- data[(data$Instrument %in% illumina_machines),]
total_not_illumina <- data[!(data$Instrument %in% illumina_machines),]
#' # Initial Overview
#' We're most interested in the number of samples and the total number
#' of base pairs sequenced - these give a feel of the dominancy of certain
#' platforms.
# Count samples and base pairs
num_samples <- nrow(data)
num_samples.illumina <- nrow(total_illumina)
num_samples.not_illumina <- nrow(total_not_illumina)
num_bp <-sum(data$Total.Bases, na.rm=TRUE)
num_bp.illumina <- sum(total_illumina$Total.Bases, na.rm=TRUE)
num_bp.not_illumina <- sum(total_not_illumina$Total.Bases, na.rm=TRUE)
# What numbers are we talking about here?
num_samples
num_bp
# Plot the number of samples for Illumina / Not Illumina
par(mai=c(1,1,1,1))
pie(c(num_samples.illumina, num_samples.not_illumina), init.angle=90,
labels=c(paste('Illumina -', percent(num_samples.illumina/num_samples)),
paste('Not Illumina -', percent(num_samples.not_illumina/num_samples))))
title("Number of Samples per Platform")
# Plot the total base pairs sequenced for Illumina / Not Illumina
pie(c(num_bp.illumina, num_bp.not_illumina), init.angle=0,
labels=c(paste('Illumina -', percent(num_bp.illumina/num_bp)),
paste('Not Illumina -', percent(num_bp.not_illumina/num_bp))))
title("Number of Base Pairs per Platform")
#' ## Hang on, what was that?
#' The above plot is really surprising! The vast majority of samples are
#' sequences using Illumina machines, but when looking at the total
#' number of base pairs this drops significantly.
#'
#' # Explaining the skew in percentage of base pairs
#+ fig.width=8, fig.height=8, par=TRUE
# Back to the original data - plot number of samples for all instruments
par(mai=c(1,3,1,1))
samples_by_instrument <- table(data$Instrument)
samples_by_instrument <- sort(samples_by_instrument)
barplot(samples_by_instrument, horiz=TRUE, las=1, main="Samples per Instrument", cex.names=0.7)
# Plot the total base pairs for all instruments
bp_by_instrument <- rowsum(data$Total.Bases, data$Instrument, na.rm = TRUE)
bp_by_instrument <- bp_by_instrument[order(bp_by_instrument[,1]),] # sort by frequency
barplot(bp_by_instrument, horiz=TRUE, las=1, main="BP Per Instrument", cex.names=0.7)
#' Ok, So it looks like this discrepancy is caused by a handful of samples
#' sequenced using the `Complete Genomics` platform. What kinds of studies
#' were these?
#+ fig.width=8, fig.height=8, par=TRUE
# Split this up for library types
# I should group the low frequency types into "other". For now, it's just
# sorted by frequency so ignore the stuff at the bottom of the legend.
par(mai=c(1,3,1,1))
samples_type_per_instrument <- table(data$Library.Strategy, data$Instrument)
samples_type_per_instrument <- samples_type_per_instrument[order(rowSums(samples_type_per_instrument),decreasing=TRUE),order(colSums(samples_type_per_instrument))]
library('RColorBrewer')
colours<-brewer.pal(11, 'Spectral')
cols<-colorRampPalette(colours)(nrow(samples_type_per_instrument))
barplot(samples_type_per_instrument, horiz=TRUE, las=1, main="Samples per Instrument", col=colours, cex.names=0.7)
legend("bottomright", rownames(samples_type_per_instrument), fill=colours, cex=0.7)
# Base pairs per platform split by library types
# This is horrible code. If anyone has a better way of writing this I'd love to know it!
library('plyr')
library('reshape2')
bp_type_per_instrument <- aggregate(Total.Bases ~ Instrument + Library.Strategy, data = data, sum, simplify=TRUE)
# Unmelt these two variables to make a matrix
bp_type_per_instrument <- dcast(bp_type_per_instrument, Library.Strategy ~ Instrument, fill=0)
# Massage into the correct format
rownames(bp_type_per_instrument) <- bp_type_per_instrument[,1]
bp_type_per_instrument <- as.matrix(bp_type_per_instrument[,-1])
# Sort that bad boy
bp_type_per_instrument <- bp_type_per_instrument[order(rowSums(bp_type_per_instrument),decreasing=TRUE),order(colSums(bp_type_per_instrument))]
cols<-colorRampPalette(colours)(nrow(bp_type_per_instrument))
barplot(bp_type_per_instrument, horiz=TRUE, las=1, main="Base Pairs per Instrument", col=colours, cex.names=0.7)
legend("bottomright", rownames(bp_type_per_instrument), fill=colours, cex=0.7)
#' Ok, so that explains it - there are some Whole Genome Sequencing (WGS)
#' and Whole Exome Sequencing (WXS) projects done by Complete Genomics which
#' are massive and are accounting for a large proportion of the base pairs
#' stored on the SRA.
#'
#' [Complete Genomics](http://www.completegenomics.com/) are a sequencing
#' facility based in the USA. They have a very complicated library prep
#' and bioinformatics pipline. As such, they don't sell sequencing machines,
#' but do #' genome sequencing of samples sent in. They were recently bought by
#' [BGI](http://www.genomics.cn/).
#'
#' # Repeating the Summary Stats with Complete Genomics
# Repeat counts with Complete genomics
total_cg <- data[data$Instrument == 'Complete Genomics',]
num_samples.cg <- nrow(total_cg)
num_samples.not_illumina_cg <- num_samples.not_illumina - num_samples.cg
num_bp.cg <- sum(total_cg$Total.Bases, na.rm=TRUE)
num_bp.not_illumina_cg <- num_bp.not_illumina - num_bp.cg
# Summarise again with Complete genomics
par(mai=c(1,1,1,1))
pie(c(num_samples.illumina, num_samples.cg, num_samples.not_illumina_cg), init.angle=90,
labels=c(paste('Illumina -', percent(num_samples.illumina/num_samples)),
paste('Complete Genomics -', percent(num_samples.cg/num_samples)),
paste('Not Illumina -', percent(num_samples.not_illumina_cg/num_samples))))
title("Number of Samples per Platform")
pie(c(num_bp.illumina, num_bp.cg, num_bp.not_illumina_cg), init.angle=140,
labels=c(paste('Illumina -', percent(num_bp.illumina/num_bp)),
paste('Complete Genomics -', percent(num_bp.cg/num_bp)),
paste('Not Illumina -', percent(num_bp.not_illumina_cg/num_bp))))
title("Number of Base Pairs per Platform")
#' # WGS Summary, without Complete Genomics
#' Finally, it would be nice to know the precentages for just Whole
#' Genome Sequencing (WGS), ignoring the Complete Genomics platform.
# Find summary stats in WGS only, without Complete Genomics
total_illumina.WGS <- total_illumina[total_illumina$Library.Strategy=='WGS',]
total_not_illumina_cg <- data[!(data$Instrument %in% c(illumina_machines, 'Complete Genomics')),]
total_not_illumina_cg.WGS <- total_not_illumina_cg[total_not_illumina_cg$Library.Strategy=='WGS',]
num_samples.wgs.illumina <- nrow(total_illumina.WGS)
num_samples.wgs.not_illumina_cg <- nrow(total_not_illumina_cg.WGS)
num_samples.wgs.not_cg <- num_samples.wgs.illumina + num_samples.wgs.not_illumina_cg
num_bp.wgs.illumina <- sum(total_illumina.WGS$Total.Bases, na.rm=TRUE)
num_bp.wgs.not_illumina_cg <- sum(total_not_illumina_cg.WGS$Total.Bases, na.rm=TRUE)
num_bp.wgs.not_cg <- num_bp.wgs.illumina + num_bp.wgs.not_illumina_cg
pie(c(num_samples.wgs.illumina, num_samples.wgs.not_illumina_cg), init.angle=90,
labels=c(paste('Illumina -', percent(num_samples.wgs.illumina/num_samples.wgs.not_cg)),
paste('Not Illumina or CG -', percent(num_samples.wgs.not_illumina_cg/num_samples.wgs.not_cg))))
title("Number of Samples per Platform - WGS, no CG")
pie(c(num_bp.wgs.illumina, num_bp.wgs.not_illumina_cg), init.angle=90,
labels=c(paste('Illumina -', percent(num_bp.wgs.illumina/num_bp.wgs.not_cg)),
paste('Not Illumina or CG -', percent(num_bp.wgs.not_illumina_cg/num_bp.wgs.not_cg))))
title("Number of Base Pairs per Platform - WGS, no CG")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment