ewels/sra_illumina_stats.r

## sra_illumina_stats.r
#' ---
#' title: "SRA Sequencing Platform Analysis"
#' author: "Phil Ewels"
#' date: September 2014
#' ---

#' The Sequence Read Archive [(SRA)](http://www.ncbi.nlm.nih.gov/sra)
#' contains the raw DNA-sequencing data for many published datasets.
#' We were interested in which sequencing platforms were prevalent,
#' or rather, how prevalent Illumina Sequencing is. We were specifically
#' interested in Human data, notably Whole Genome Sequencing (WGS).

#+ include=FALSE
setwd('~/Work/SRA_illumina_stats/')

#+ par=TRUE
# SETUP
library('scales')

#' To get at the metadata contained within the SRA, I did a search for
#' all human samples - `"homo sapiens"[Organism]`. This returns 184586
#' results at the time of writing. The summaries of these search results
#' can be downloaded as a `.csv` file, which is used in this analysis.
# Load in the data
data <- read.csv('sra_result_human.csv')

# Look at the column headers
colnames(data)

# Split up by Illumina and Not Illumina
illumina_machines <- c('Illumina Genome Analyzer', 'Illumina Genome Analyzer II', 'Illumina Genome Analyzer IIx',
                       'Illumina HiScanSQ', 'Illumina HiSeq 1000', 'Illumina HiSeq 1500', 'Illumina HiSeq 2000',
                       'Illumina HiSeq 2500', 'Illumina MiSeq')
total_illumina <- data[(data$Instrument %in% illumina_machines),]
total_not_illumina <- data[!(data$Instrument %in% illumina_machines),]

#' # Initial Overview
#' We're most interested in the number of samples and the total number
#' of base pairs sequenced - these give a feel of the dominancy of certain
#' platforms.

# Count samples and base pairs
num_samples <- nrow(data)
num_samples.illumina <- nrow(total_illumina)
num_samples.not_illumina <- nrow(total_not_illumina)
num_bp <-sum(data$Total.Bases, na.rm=TRUE)
num_bp.illumina <- sum(total_illumina$Total.Bases, na.rm=TRUE)
num_bp.not_illumina <- sum(total_not_illumina$Total.Bases, na.rm=TRUE)

# What numbers are we talking about here?
num_samples
num_bp

# Plot the number of samples for Illumina / Not Illumina
par(mai=c(1,1,1,1))
pie(c(num_samples.illumina, num_samples.not_illumina), init.angle=90,
    labels=c(paste('Illumina -', percent(num_samples.illumina/num_samples)),
             paste('Not Illumina -', percent(num_samples.not_illumina/num_samples))))
title("Number of Samples per Platform")

# Plot the total base pairs sequenced for Illumina / Not Illumina
pie(c(num_bp.illumina, num_bp.not_illumina), init.angle=0,
    labels=c(paste('Illumina -', percent(num_bp.illumina/num_bp)),
             paste('Not Illumina -', percent(num_bp.not_illumina/num_bp))))
title("Number of Base Pairs per Platform")

#' ## Hang on, what was that?
#' The above plot is really surprising! The vast majority of samples are
#' sequences using Illumina machines, but when looking at the total
#' number of base pairs this drops significantly.
#'
#' # Explaining the skew in percentage of base pairs
#+ fig.width=8, fig.height=8, par=TRUE
# Back to the original data - plot number of samples for all instruments
par(mai=c(1,3,1,1))
samples_by_instrument <- table(data$Instrument)
samples_by_instrument <- sort(samples_by_instrument)
barplot(samples_by_instrument, horiz=TRUE, las=1, main="Samples per Instrument", cex.names=0.7)

# Plot the total base pairs for all instruments
bp_by_instrument <- rowsum(data$Total.Bases, data$Instrument, na.rm = TRUE)
bp_by_instrument <- bp_by_instrument[order(bp_by_instrument[,1]),] # sort by frequency
barplot(bp_by_instrument, horiz=TRUE, las=1, main="BP Per Instrument", cex.names=0.7)

#' Ok, So it looks like this discrepancy is caused by a handful of samples
#' sequenced using the `Complete Genomics` platform. What kinds of studies
#' were these?

#+ fig.width=8, fig.height=8, par=TRUE
# Split this up for library types
# I should group the low frequency types into "other". For now, it's just
# sorted by frequency so ignore the stuff at the bottom of the legend.
par(mai=c(1,3,1,1))
samples_type_per_instrument <- table(data$Library.Strategy, data$Instrument)
samples_type_per_instrument <- samples_type_per_instrument[order(rowSums(samples_type_per_instrument),decreasing=TRUE),order(colSums(samples_type_per_instrument))]
library('RColorBrewer')
colours<-brewer.pal(11, 'Spectral')
cols<-colorRampPalette(colours)(nrow(samples_type_per_instrument))
barplot(samples_type_per_instrument, horiz=TRUE, las=1, main="Samples per Instrument", col=colours, cex.names=0.7)
legend("bottomright", rownames(samples_type_per_instrument), fill=colours, cex=0.7)

# Base pairs per platform split by library types
# This is horrible code. If anyone has a better way of writing this I'd love to know it!
library('plyr')
library('reshape2')
bp_type_per_instrument <- aggregate(Total.Bases ~ Instrument + Library.Strategy, data = data, sum, simplify=TRUE)
# Unmelt these two variables to make a matrix
bp_type_per_instrument <- dcast(bp_type_per_instrument, Library.Strategy ~ Instrument, fill=0)
# Massage into the correct format
rownames(bp_type_per_instrument) <- bp_type_per_instrument[,1]
bp_type_per_instrument <- as.matrix(bp_type_per_instrument[,-1])
# Sort that bad boy
bp_type_per_instrument <- bp_type_per_instrument[order(rowSums(bp_type_per_instrument),decreasing=TRUE),order(colSums(bp_type_per_instrument))]
cols<-colorRampPalette(colours)(nrow(bp_type_per_instrument))
barplot(bp_type_per_instrument, horiz=TRUE, las=1, main="Base Pairs per Instrument", col=colours, cex.names=0.7)
legend("bottomright", rownames(bp_type_per_instrument), fill=colours, cex=0.7)

#' Ok, so that explains it - there are some Whole Genome Sequencing (WGS)
#' and Whole Exome Sequencing (WXS) projects done by Complete Genomics which
#' are massive and are accounting for a large proportion of the base pairs
#' stored on the SRA.
#'
#' [Complete Genomics](http://www.completegenomics.com/) are a sequencing
#' facility based in the USA. They have a very complicated library prep
#' and bioinformatics pipline. As such, they don't sell sequencing machines,
#' but do #' genome sequencing of samples sent in. They were recently bought by
#' [BGI](http://www.genomics.cn/).
#'
#' # Repeating the Summary Stats with Complete Genomics

# Repeat counts with Complete genomics
total_cg <- data[data$Instrument == 'Complete Genomics',]
num_samples.cg <- nrow(total_cg)
num_samples.not_illumina_cg <- num_samples.not_illumina - num_samples.cg
num_bp.cg <- sum(total_cg$Total.Bases, na.rm=TRUE)
num_bp.not_illumina_cg <- num_bp.not_illumina - num_bp.cg

# Summarise again with Complete genomics
par(mai=c(1,1,1,1))

pie(c(num_samples.illumina, num_samples.cg, num_samples.not_illumina_cg), init.angle=90,
    labels=c(paste('Illumina -', percent(num_samples.illumina/num_samples)),
             paste('Complete Genomics -', percent(num_samples.cg/num_samples)),
             paste('Not Illumina -', percent(num_samples.not_illumina_cg/num_samples))))
title("Number of Samples per Platform")

pie(c(num_bp.illumina, num_bp.cg, num_bp.not_illumina_cg), init.angle=140,
    labels=c(paste('Illumina -', percent(num_bp.illumina/num_bp)),
             paste('Complete Genomics -', percent(num_bp.cg/num_bp)),
             paste('Not Illumina -', percent(num_bp.not_illumina_cg/num_bp))))
title("Number of Base Pairs per Platform")

#' # WGS Summary, without Complete Genomics
#' Finally, it would be nice to know the precentages for just Whole
#' Genome Sequencing (WGS), ignoring the Complete Genomics platform.
# Find summary stats in WGS only, without Complete Genomics
total_illumina.WGS <- total_illumina[total_illumina$Library.Strategy=='WGS',]
total_not_illumina_cg <- data[!(data$Instrument %in% c(illumina_machines, 'Complete Genomics')),]
total_not_illumina_cg.WGS <- total_not_illumina_cg[total_not_illumina_cg$Library.Strategy=='WGS',]

num_samples.wgs.illumina <- nrow(total_illumina.WGS)
num_samples.wgs.not_illumina_cg <- nrow(total_not_illumina_cg.WGS)
num_samples.wgs.not_cg <- num_samples.wgs.illumina + num_samples.wgs.not_illumina_cg
num_bp.wgs.illumina <- sum(total_illumina.WGS$Total.Bases, na.rm=TRUE)
num_bp.wgs.not_illumina_cg <- sum(total_not_illumina_cg.WGS$Total.Bases, na.rm=TRUE)
num_bp.wgs.not_cg <- num_bp.wgs.illumina + num_bp.wgs.not_illumina_cg

pie(c(num_samples.wgs.illumina, num_samples.wgs.not_illumina_cg), init.angle=90,
    labels=c(paste('Illumina -', percent(num_samples.wgs.illumina/num_samples.wgs.not_cg)),
             paste('Not Illumina or CG -', percent(num_samples.wgs.not_illumina_cg/num_samples.wgs.not_cg))))
title("Number of Samples per Platform - WGS, no CG")

pie(c(num_bp.wgs.illumina, num_bp.wgs.not_illumina_cg), init.angle=90,
    labels=c(paste('Illumina -', percent(num_bp.wgs.illumina/num_bp.wgs.not_cg)),
             paste('Not Illumina or CG -', percent(num_bp.wgs.not_illumina_cg/num_bp.wgs.not_cg))))
title("Number of Base Pairs per Platform - WGS, no CG")
	#' ---
	#' title: "SRA Sequencing Platform Analysis"
	#' author: "Phil Ewels"
	#' date: September 2014
	#' ---

	#' The Sequence Read Archive [(SRA)](http://www.ncbi.nlm.nih.gov/sra)
	#' contains the raw DNA-sequencing data for many published datasets.
	#' We were interested in which sequencing platforms were prevalent,
	#' or rather, how prevalent Illumina Sequencing is. We were specifically
	#' interested in Human data, notably Whole Genome Sequencing (WGS).

	#+ include=FALSE
	setwd('~/Work/SRA_illumina_stats/')

	#+ par=TRUE
	# SETUP
	library('scales')

	#' To get at the metadata contained within the SRA, I did a search for
	#' all human samples - `"homo sapiens"[Organism]`. This returns 184586
	#' results at the time of writing. The summaries of these search results
	#' can be downloaded as a `.csv` file, which is used in this analysis.
	# Load in the data
	data <- read.csv('sra_result_human.csv')

	# Look at the column headers
	colnames(data)

	# Split up by Illumina and Not Illumina
	illumina_machines <- c('Illumina Genome Analyzer', 'Illumina Genome Analyzer II', 'Illumina Genome Analyzer IIx',
	'Illumina HiScanSQ', 'Illumina HiSeq 1000', 'Illumina HiSeq 1500', 'Illumina HiSeq 2000',
	'Illumina HiSeq 2500', 'Illumina MiSeq')
	total_illumina <- data[(data$Instrument %in% illumina_machines),]
	total_not_illumina <- data[!(data$Instrument %in% illumina_machines),]

	#' # Initial Overview
	#' We're most interested in the number of samples and the total number
	#' of base pairs sequenced - these give a feel of the dominancy of certain
	#' platforms.

	# Count samples and base pairs
	num_samples <- nrow(data)
	num_samples.illumina <- nrow(total_illumina)
	num_samples.not_illumina <- nrow(total_not_illumina)
	num_bp <-sum(data$Total.Bases, na.rm=TRUE)
	num_bp.illumina <- sum(total_illumina$Total.Bases, na.rm=TRUE)
	num_bp.not_illumina <- sum(total_not_illumina$Total.Bases, na.rm=TRUE)

	# What numbers are we talking about here?
	num_samples
	num_bp

	# Plot the number of samples for Illumina / Not Illumina
	par(mai=c(1,1,1,1))
	pie(c(num_samples.illumina, num_samples.not_illumina), init.angle=90,
	labels=c(paste('Illumina -', percent(num_samples.illumina/num_samples)),
	paste('Not Illumina -', percent(num_samples.not_illumina/num_samples))))
	title("Number of Samples per Platform")

	# Plot the total base pairs sequenced for Illumina / Not Illumina
	pie(c(num_bp.illumina, num_bp.not_illumina), init.angle=0,
	labels=c(paste('Illumina -', percent(num_bp.illumina/num_bp)),
	paste('Not Illumina -', percent(num_bp.not_illumina/num_bp))))
	title("Number of Base Pairs per Platform")

	#' ## Hang on, what was that?
	#' The above plot is really surprising! The vast majority of samples are
	#' sequences using Illumina machines, but when looking at the total
	#' number of base pairs this drops significantly.
	#'
	#' # Explaining the skew in percentage of base pairs
	#+ fig.width=8, fig.height=8, par=TRUE
	# Back to the original data - plot number of samples for all instruments
	par(mai=c(1,3,1,1))
	samples_by_instrument <- table(data$Instrument)
	samples_by_instrument <- sort(samples_by_instrument)
	barplot(samples_by_instrument, horiz=TRUE, las=1, main="Samples per Instrument", cex.names=0.7)

	# Plot the total base pairs for all instruments
	bp_by_instrument <- rowsum(data$Total.Bases, data$Instrument, na.rm = TRUE)
	bp_by_instrument <- bp_by_instrument[order(bp_by_instrument[,1]),] # sort by frequency
	barplot(bp_by_instrument, horiz=TRUE, las=1, main="BP Per Instrument", cex.names=0.7)

	#' Ok, So it looks like this discrepancy is caused by a handful of samples
	#' sequenced using the `Complete Genomics` platform. What kinds of studies
	#' were these?

	#+ fig.width=8, fig.height=8, par=TRUE
	# Split this up for library types
	# I should group the low frequency types into "other". For now, it's just
	# sorted by frequency so ignore the stuff at the bottom of the legend.
	par(mai=c(1,3,1,1))
	samples_type_per_instrument <- table(data$Library.Strategy, data$Instrument)
	samples_type_per_instrument <- samples_type_per_instrument[order(rowSums(samples_type_per_instrument),decreasing=TRUE),order(colSums(samples_type_per_instrument))]
	library('RColorBrewer')
	colours<-brewer.pal(11, 'Spectral')
	cols<-colorRampPalette(colours)(nrow(samples_type_per_instrument))
	barplot(samples_type_per_instrument, horiz=TRUE, las=1, main="Samples per Instrument", col=colours, cex.names=0.7)
	legend("bottomright", rownames(samples_type_per_instrument), fill=colours, cex=0.7)

	# Base pairs per platform split by library types
	# This is horrible code. If anyone has a better way of writing this I'd love to know it!
	library('plyr')
	library('reshape2')
	bp_type_per_instrument <- aggregate(Total.Bases ~ Instrument + Library.Strategy, data = data, sum, simplify=TRUE)
	# Unmelt these two variables to make a matrix
	bp_type_per_instrument <- dcast(bp_type_per_instrument, Library.Strategy ~ Instrument, fill=0)
	# Massage into the correct format
	rownames(bp_type_per_instrument) <- bp_type_per_instrument[,1]
	bp_type_per_instrument <- as.matrix(bp_type_per_instrument[,-1])
	# Sort that bad boy
	bp_type_per_instrument <- bp_type_per_instrument[order(rowSums(bp_type_per_instrument),decreasing=TRUE),order(colSums(bp_type_per_instrument))]
	cols<-colorRampPalette(colours)(nrow(bp_type_per_instrument))
	barplot(bp_type_per_instrument, horiz=TRUE, las=1, main="Base Pairs per Instrument", col=colours, cex.names=0.7)
	legend("bottomright", rownames(bp_type_per_instrument), fill=colours, cex=0.7)

	#' Ok, so that explains it - there are some Whole Genome Sequencing (WGS)
	#' and Whole Exome Sequencing (WXS) projects done by Complete Genomics which
	#' are massive and are accounting for a large proportion of the base pairs
	#' stored on the SRA.
	#'
	#' [Complete Genomics](http://www.completegenomics.com/) are a sequencing
	#' facility based in the USA. They have a very complicated library prep
	#' and bioinformatics pipline. As such, they don't sell sequencing machines,
	#' but do #' genome sequencing of samples sent in. They were recently bought by
	#' [BGI](http://www.genomics.cn/).
	#'
	#' # Repeating the Summary Stats with Complete Genomics

	# Repeat counts with Complete genomics
	total_cg <- data[data$Instrument == 'Complete Genomics',]
	num_samples.cg <- nrow(total_cg)
	num_samples.not_illumina_cg <- num_samples.not_illumina - num_samples.cg
	num_bp.cg <- sum(total_cg$Total.Bases, na.rm=TRUE)
	num_bp.not_illumina_cg <- num_bp.not_illumina - num_bp.cg

	# Summarise again with Complete genomics
	par(mai=c(1,1,1,1))

	pie(c(num_samples.illumina, num_samples.cg, num_samples.not_illumina_cg), init.angle=90,
	labels=c(paste('Illumina -', percent(num_samples.illumina/num_samples)),
	paste('Complete Genomics -', percent(num_samples.cg/num_samples)),
	paste('Not Illumina -', percent(num_samples.not_illumina_cg/num_samples))))
	title("Number of Samples per Platform")

	pie(c(num_bp.illumina, num_bp.cg, num_bp.not_illumina_cg), init.angle=140,
	labels=c(paste('Illumina -', percent(num_bp.illumina/num_bp)),
	paste('Complete Genomics -', percent(num_bp.cg/num_bp)),
	paste('Not Illumina -', percent(num_bp.not_illumina_cg/num_bp))))
	title("Number of Base Pairs per Platform")

	#' # WGS Summary, without Complete Genomics
	#' Finally, it would be nice to know the precentages for just Whole
	#' Genome Sequencing (WGS), ignoring the Complete Genomics platform.
	# Find summary stats in WGS only, without Complete Genomics
	total_illumina.WGS <- total_illumina[total_illumina$Library.Strategy=='WGS',]
	total_not_illumina_cg <- data[!(data$Instrument %in% c(illumina_machines, 'Complete Genomics')),]
	total_not_illumina_cg.WGS <- total_not_illumina_cg[total_not_illumina_cg$Library.Strategy=='WGS',]

	num_samples.wgs.illumina <- nrow(total_illumina.WGS)
	num_samples.wgs.not_illumina_cg <- nrow(total_not_illumina_cg.WGS)
	num_samples.wgs.not_cg <- num_samples.wgs.illumina + num_samples.wgs.not_illumina_cg
	num_bp.wgs.illumina <- sum(total_illumina.WGS$Total.Bases, na.rm=TRUE)
	num_bp.wgs.not_illumina_cg <- sum(total_not_illumina_cg.WGS$Total.Bases, na.rm=TRUE)
	num_bp.wgs.not_cg <- num_bp.wgs.illumina + num_bp.wgs.not_illumina_cg

	pie(c(num_samples.wgs.illumina, num_samples.wgs.not_illumina_cg), init.angle=90,
	labels=c(paste('Illumina -', percent(num_samples.wgs.illumina/num_samples.wgs.not_cg)),
	paste('Not Illumina or CG -', percent(num_samples.wgs.not_illumina_cg/num_samples.wgs.not_cg))))
	title("Number of Samples per Platform - WGS, no CG")

	pie(c(num_bp.wgs.illumina, num_bp.wgs.not_illumina_cg), init.angle=90,
	labels=c(paste('Illumina -', percent(num_bp.wgs.illumina/num_bp.wgs.not_cg)),
	paste('Not Illumina or CG -', percent(num_bp.wgs.not_illumina_cg/num_bp.wgs.not_cg))))
	title("Number of Base Pairs per Platform - WGS, no CG")