BioSciEconomist/RNA seq analysis.R

## RNA seq analysis.R
# *-----------------------------------------------------------------
# | PROGRAM NAME: RNA seq analysis.R
# | DATE: 12/27/19
# | CREATED BY: MATT BOGARD
# | PROJECT FILE: /Users/amandabogard/Google Drive/R Training
# *----------------------------------------------------------------
# | PURPOSE: dc code and analysis
# *----------------------------------------------------------------


#--------------------------------
# Introduction to RNA-Seq theory and workflow
#--------------------------------

# Load library for DESeq2
library(DESeq2)

# Load library for RColorBrewer
library(RColorBrewer)

# Load library for pheatmap
library(pheatmap)

# Load library for tidyverse
library(tidyverse)


# For the RNA-Seq analysis workflow, we obtain the raw FASTQ sequencing files from the
# sequencing facility. We assess the quality of our sequence reads for each sample,
# then determine from where on the genome the reads originated by performing alignment.
# We will use the information about where the reads align to generate the count matrix
# (the number of reads aligning to the exons of each gene), which we will be using to start
# the differential expression analysis.

### Exploring the raw count matrix

# Explore the first six observations of smoc2_rawcounts
head(smoc2_rawcounts,6)

# Explore the structure of smoc2_rawcounts
str(smoc2_rawcounts)

### DGE Theory: Metadata

# Create genotype vector
genotype <- c("smoc2_oe","smoc2_oe","smoc2_oe","smoc2_oe","smoc2_oe","smoc2_oe","smoc2_oe")

# Create condition vector
condition <- c("fibrosis","fibrosis","fibrosis","fibrosis","normal","normal","normal")

# Create data frame
smoc2_metadata <- data.frame(genotype, condition)

# Assign the row names of the data frame
rownames(smoc2_metadata) <- c("smoc2_fibrosis1","smoc2_fibrosis2","smoc2_fibrosis3","smoc2_fibrosis4","smoc2_normal1","smoc2_normal3","smoc2_normal4")

#------------------------------------
# exploratory data analysis
#-----------------------------------

# In the videos, we are exploring gene expression differences between the normal and fibrosis samples
# of wild-type mice. In these exercises, we will be exploring gene expression between the normal and
# fibrosis samples from mice over-expressing the smoc2 gene.

### Matching metadata and counts data

# To perform any analysis with DESeq2, we need to create a DESeq2 object by providing the raw counts,
# metadata, and design formula. To do this, we need to read in the raw counts data and associated
# metadata we created previously, make sure the sample names are in the same order in both datasets,
# then create a DESeq2 object to use for differential expression analysis.
# We will use the design formula ~ condition to test for differential expression between
# conditions (normal and fibrosis).

# Use the match() function to reorder the columns of the raw counts
reorder_idx <- match(rownames(smoc2_metadata), colnames(smoc2_rawcounts))

# Reorder the columns of the count data
reordered_smoc2_rawcounts <- smoc2_rawcounts[ , reorder_idx]

# Create a DESeq2 object
dds_smoc2 <- DESeqDataSetFromMatrix(countData =  reordered_smoc2_rawcounts,
                                    colData =  smoc2_metadata,
                                    design = ~ condition)


### Normalizing counts with DESeq2

# we need to generate the normalized counts (normalized for library size,
# which is the total number of gene counts per sample, while accounting for library composition).
# To obtain the normalized counts, use the DESeq2 object and generate the normalized counts matrix.

# Determine the size factors to use for normalization
dds_smoc2 <- estimateSizeFactors(dds_smoc2)

# Extract the normalized counts
smoc2_normalized_counts <- counts(dds_smoc2,normalized = TRUE)

### Hierarchical heatmap by condition

# Transform the normalized counts
vsd_smoc2 <- vst(dds_smoc2, blind = TRUE)

# Extract the matrix of transformed counts
vsd_mat_smoc2 <- assay(vsd_smoc2)

# Compute the correlation values between samples
vsd_cor_smoc2 <- cor(vsd_mat_smoc2)

# Plot the heatmap
pheatmap(vsd_cor_smoc2, annotation = select(smoc2_metadata, condition))

### PCA analysis

# To continue with the quality assessment of our samples, in the first part of this exercise,
# we will perform PCA to look how our samples cluster and whether our condition of interest
# corresponds with the principal components explaining the most variation in the data.

# Transform the normalized counts
vsd_smoc2 <- vst(dds_smoc2, blind = TRUE)

# Plot the PCA of PC1 and PC2
plotPCA(vsd_smoc2, intgroup="condition")

#----------------------------------------
# Differential expression analysis with DESeq2
#---------------------------------------


### Creating the DE object

# Create DESeq2 object
dds_smoc2 <- DESeqDataSetFromMatrix(countData = reordered_smoc2_rawcounts,
                                    colData = smoc2_metadata,
                                    design = ~ condition)

# Run the DESeq2 analysis
DESeq2 <- DESeq(dds_smoc2)


### DESeq2 model - exploring dispersions

# After fitting the model in the previous exercise, let's explore the fit of our smoc2 data to
# the negative binomial model by plotting the dispersion estimates using the plotDispEsts() function.
# Remember that the dispersion estimates are used to model the raw counts; if the dispersions don't
# follow the assumptions made by DESeq2, then the variation in the data could be poorly estimated
# and the DE results could be less accurate.

# The assumptions DESeq2 makes are that the dispersions should generally decrease with increasing
# mean and that they should more or less follow the fitted line

# Plot dispersions
plotDispEsts(dds_smoc2)

### DESeq2 model - extracting results

# Extract the results of the differential expression analysis
smoc2_res <- results(dds_smoc2,
                     contrast = c("condition","fibrosis", "normal"),
                     alpha = .05)


# DESeq2 results - LFC shrinkage

# To improve the fold change estimates for our data, we want to take our results and shrink
# the log2 fold changes using the lfcShrink() function.

# Shrink the log2 fold change estimates to be more accurate
smoc2_res <- lfcShrink(dds_smoc2,
                       contrast = c("condition","fibrosis", "normal"),
                       res = smoc2_res)


# To reduce the number of DE genes that we are returning and to reduce the likelihood of the DE genes
# being biologically meaningful, we are going to use a small log2 fold change threshold to determine
# the DE genes.

# Explore the results() function
?results

# Extract results
smoc2_res <- results(dds_smoc2,
                     contrast = c("condition","fibrosis", "normal"),
                     alpha = .05,
                     lfcThreshold = .32)

# Shrink the log2 fold changes
smoc2_res <- lfcShrink(dds_smoc2,
                       contrast = c("condition","fibrosis", "normal"),
                       res = smoc2_res)


# Now that we have extracted our results, we can get a nice overview of the number
# of differentially expressed genes there are for our designated alpha level using
# the summary() function. It will output the numbers/percentages of up- and down-regulated genes,
# as well as, give information about independent filtering and outliers removed.


# Get an overview of the results
summary(smoc2_res)

# example building results table

smoc2_res_all <- data.frame(smoc2_res) %>%
  rownames_to_column(var = "ensgene") %>%
  left_join(x = wt_res_all,
            y = grcm38[, c("ensgene"
                           ,
                           "symbol"
                           ,
                           "description")],
            by = "ensgene")
View(smoc2_res_all)

# Save results as a data frame
smoc2_res_all <- data.frame(smoc2_res)

# Subset the results to only return the significant genes with p-adjusted values less than 0.05
smoc2_res_sig <- subset(smoc2_res_all,padj < 0.05)


#---------------------------------------------
# visualization of results
#---------------------------------------------

### DESeq2 visualizations - MA and volcano plots

# To explore the results, visualizations can be helpful to see a global view of the data,
# as well as, characteristics of the significant genes. Usually, we expect to see
# significant genes identified across the range of mean values, which we can plot using the
# MA plot. If we only see significant genes with high mean values, then it could indicate an
# issue with our data. The volcano plot helps us to get an idea of the range of fold changes
# needed to identify significance in our data.

# Create MA plot
plotMA(smoc2_res)

# Generate logical column
smoc2_res_all <- data.frame(smoc2_res) %>% mutate(threshold = padj < 0.05)

# Create the volcano plot
ggplot(smoc2_res_all) +
  geom_point(aes(x = log2FoldChange, y = -log10(padj), color = threshold)) +
  xlab("log2 fold change") +
  ylab("-log10 adjusted p-value") +
  theme(legend.position = "none",
        plot.title = element_text(size = rel(1.5), hjust = 0.5),
        axis.title = element_text(size = rel(1.25)))


### DESeq2 visualizations - heatmap

# Visualizations can also be helpful in exploring the significant genes in more detail.
# The expression heatmap can be helpful in looking at how different the expression of
# all significant genes are between sample groups, while the expression plot can look
# at the top significant genes or choose individual genes of interest to investigate
# the expression levels between samplegroups.


# Subset normalized counts to significant genes
sig_norm_counts_smoc2 <- normalized_counts_smoc2[rownames(smoc2_res_sig), ]

# Choose heatmap color palette
heat_colors <- brewer.pal(n = 6, name = "YlOrRd")

# Plot heatmap
pheatmap(sig_norm_counts_smoc2,
         color = heat_colors,
         cluster_rows = TRUE,
         show_rownames = FALSE,
         annotation = select(smoc2_metadata, condition),
         scale = "row")


### RNA-Seq DE workflow summary

# Check that all of the samples are in the same order in the metadata and count data
all(rownames(all_metadata) %in% colnames(all_rawcounts))

# DESeq object to test for the effect of fibrosis regardless of genotype
dds_all <- DESeqDataSetFromMatrix(countData = all_rawcounts,
                                  colData = all_metadata,
                                  design = ~ condition)


# DESeq object to test for the effect of genotype on the effect of fibrosis
dds_complex <- DESeqDataSetFromMatrix(countData = all_rawcounts,
                                      colData = all_metadata,
                                      design = ~ condition + genotype + genotype:condition)

### DE analysis

# We are going to continue using the full dataset comparing the genes that exhibit
# significant expression differences between normal and fibrosis samples regardless
# of genotype (design: ~ genotype + condition).


# In this exercise let's perform the unsupervised clustering analyses to
# explore the clustering of our samples and sources of variation.

# Log transform counts for QC
vsd_all <- vst(dds_all, blind = TRUE)

# Create heatmap of sample correlation values
vsd_all %>%
  assay() %>%
  cor() %>%
  pheatmap(annotation =  select(all_metadata, c("condition", "genotype")))

# Create the PCA plot for PC1 and PC2 and color by condition
plotPCA(vsd_all, intgroup = "condition")

# Create the PCA plot for PC1 and PC2 and color by genotype
plotPCA(vsd_all, intgroup = "genotype")

### DE analysis results


# After exploring the PCA and correlation heatmap, we found good clustering of our samples on PC1,
# which seemed to represent the variation in the data due to fibrosis, and PC2, which appeared
# to represent variation in the data due to smoc2 overexpression. We did not find additional
# sources of variation in the data, nor any outliers to remove. Therefore, we can proceed by
# running DESeq2, DE testing, and shrinking the fold changes. We performed these steps for you
# to generate the final results, res_all.

# In this exercise, we'll want to subset the significant genes from the results and output the
# top 10 DE genes by adjusted p-value.

# Select significant genese with padj < 0.05
smoc2_sig <- subset(res_all, padj < 0.05) %>%
  data.frame() %>%
  rownames_to_column(var = "geneID")

# Extract the top 6 genes with padj values
smoc2_sig %>%
  arrange(padj) %>%
  select(geneID, padj) %>%
  head()


# Also, remember that we can use the annotables package to convert the
# Ensembl IDs to the more recognizable gene symbols.
	# *-----------------------------------------------------------------
	# \| PROGRAM NAME: RNA seq analysis.R
	# \| DATE: 12/27/19
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE: /Users/amandabogard/Google Drive/R Training
	# *----------------------------------------------------------------
	# \| PURPOSE: dc code and analysis
	# *----------------------------------------------------------------


	#--------------------------------
	# Introduction to RNA-Seq theory and workflow
	#--------------------------------

	# Load library for DESeq2
	library(DESeq2)

	# Load library for RColorBrewer
	library(RColorBrewer)

	# Load library for pheatmap
	library(pheatmap)

	# Load library for tidyverse
	library(tidyverse)


	# For the RNA-Seq analysis workflow, we obtain the raw FASTQ sequencing files from the
	# sequencing facility. We assess the quality of our sequence reads for each sample,
	# then determine from where on the genome the reads originated by performing alignment.
	# We will use the information about where the reads align to generate the count matrix
	# (the number of reads aligning to the exons of each gene), which we will be using to start
	# the differential expression analysis.

	### Exploring the raw count matrix

	# Explore the first six observations of smoc2_rawcounts
	head(smoc2_rawcounts,6)

	# Explore the structure of smoc2_rawcounts
	str(smoc2_rawcounts)

	### DGE Theory: Metadata

	# Create genotype vector
	genotype <- c("smoc2_oe","smoc2_oe","smoc2_oe","smoc2_oe","smoc2_oe","smoc2_oe","smoc2_oe")

	# Create condition vector
	condition <- c("fibrosis","fibrosis","fibrosis","fibrosis","normal","normal","normal")

	# Create data frame
	smoc2_metadata <- data.frame(genotype, condition)

	# Assign the row names of the data frame
	rownames(smoc2_metadata) <- c("smoc2_fibrosis1","smoc2_fibrosis2","smoc2_fibrosis3","smoc2_fibrosis4","smoc2_normal1","smoc2_normal3","smoc2_normal4")

	#------------------------------------
	# exploratory data analysis
	#-----------------------------------

	# In the videos, we are exploring gene expression differences between the normal and fibrosis samples
	# of wild-type mice. In these exercises, we will be exploring gene expression between the normal and
	# fibrosis samples from mice over-expressing the smoc2 gene.

	### Matching metadata and counts data

	# To perform any analysis with DESeq2, we need to create a DESeq2 object by providing the raw counts,
	# metadata, and design formula. To do this, we need to read in the raw counts data and associated
	# metadata we created previously, make sure the sample names are in the same order in both datasets,
	# then create a DESeq2 object to use for differential expression analysis.
	# We will use the design formula ~ condition to test for differential expression between
	# conditions (normal and fibrosis).

	# Use the match() function to reorder the columns of the raw counts
	reorder_idx <- match(rownames(smoc2_metadata), colnames(smoc2_rawcounts))

	# Reorder the columns of the count data
	reordered_smoc2_rawcounts <- smoc2_rawcounts[ , reorder_idx]

	# Create a DESeq2 object
	dds_smoc2 <- DESeqDataSetFromMatrix(countData = reordered_smoc2_rawcounts,
	colData = smoc2_metadata,
	design = ~ condition)


	### Normalizing counts with DESeq2

	# we need to generate the normalized counts (normalized for library size,
	# which is the total number of gene counts per sample, while accounting for library composition).
	# To obtain the normalized counts, use the DESeq2 object and generate the normalized counts matrix.

	# Determine the size factors to use for normalization
	dds_smoc2 <- estimateSizeFactors(dds_smoc2)

	# Extract the normalized counts
	smoc2_normalized_counts <- counts(dds_smoc2,normalized = TRUE)

	### Hierarchical heatmap by condition

	# Transform the normalized counts
	vsd_smoc2 <- vst(dds_smoc2, blind = TRUE)

	# Extract the matrix of transformed counts
	vsd_mat_smoc2 <- assay(vsd_smoc2)

	# Compute the correlation values between samples
	vsd_cor_smoc2 <- cor(vsd_mat_smoc2)

	# Plot the heatmap
	pheatmap(vsd_cor_smoc2, annotation = select(smoc2_metadata, condition))

	### PCA analysis

	# To continue with the quality assessment of our samples, in the first part of this exercise,
	# we will perform PCA to look how our samples cluster and whether our condition of interest
	# corresponds with the principal components explaining the most variation in the data.

	# Transform the normalized counts
	vsd_smoc2 <- vst(dds_smoc2, blind = TRUE)

	# Plot the PCA of PC1 and PC2
	plotPCA(vsd_smoc2, intgroup="condition")

	#----------------------------------------
	# Differential expression analysis with DESeq2
	#---------------------------------------


	### Creating the DE object

	# Create DESeq2 object
	dds_smoc2 <- DESeqDataSetFromMatrix(countData = reordered_smoc2_rawcounts,
	colData = smoc2_metadata,
	design = ~ condition)

	# Run the DESeq2 analysis
	DESeq2 <- DESeq(dds_smoc2)


	### DESeq2 model - exploring dispersions

	# After fitting the model in the previous exercise, let's explore the fit of our smoc2 data to
	# the negative binomial model by plotting the dispersion estimates using the plotDispEsts() function.
	# Remember that the dispersion estimates are used to model the raw counts; if the dispersions don't
	# follow the assumptions made by DESeq2, then the variation in the data could be poorly estimated
	# and the DE results could be less accurate.

	# The assumptions DESeq2 makes are that the dispersions should generally decrease with increasing
	# mean and that they should more or less follow the fitted line

	# Plot dispersions
	plotDispEsts(dds_smoc2)

	### DESeq2 model - extracting results

	# Extract the results of the differential expression analysis
	smoc2_res <- results(dds_smoc2,
	contrast = c("condition","fibrosis", "normal"),
	alpha = .05)


	# DESeq2 results - LFC shrinkage

	# To improve the fold change estimates for our data, we want to take our results and shrink
	# the log2 fold changes using the lfcShrink() function.

	# Shrink the log2 fold change estimates to be more accurate
	smoc2_res <- lfcShrink(dds_smoc2,
	contrast = c("condition","fibrosis", "normal"),
	res = smoc2_res)


	# To reduce the number of DE genes that we are returning and to reduce the likelihood of the DE genes
	# being biologically meaningful, we are going to use a small log2 fold change threshold to determine
	# the DE genes.

	# Explore the results() function
	?results

	# Extract results
	smoc2_res <- results(dds_smoc2,
	contrast = c("condition","fibrosis", "normal"),
	alpha = .05,
	lfcThreshold = .32)

	# Shrink the log2 fold changes
	smoc2_res <- lfcShrink(dds_smoc2,
	contrast = c("condition","fibrosis", "normal"),
	res = smoc2_res)


	# Now that we have extracted our results, we can get a nice overview of the number
	# of differentially expressed genes there are for our designated alpha level using
	# the summary() function. It will output the numbers/percentages of up- and down-regulated genes,
	# as well as, give information about independent filtering and outliers removed.


	# Get an overview of the results
	summary(smoc2_res)

	# example building results table

	smoc2_res_all <- data.frame(smoc2_res) %>%
	rownames_to_column(var = "ensgene") %>%
	left_join(x = wt_res_all,
	y = grcm38[, c("ensgene"
	,
	"symbol"
	,
	"description")],
	by = "ensgene")
	View(smoc2_res_all)

	# Save results as a data frame
	smoc2_res_all <- data.frame(smoc2_res)

	# Subset the results to only return the significant genes with p-adjusted values less than 0.05
	smoc2_res_sig <- subset(smoc2_res_all,padj < 0.05)


	#---------------------------------------------
	# visualization of results
	#---------------------------------------------

	### DESeq2 visualizations - MA and volcano plots

	# To explore the results, visualizations can be helpful to see a global view of the data,
	# as well as, characteristics of the significant genes. Usually, we expect to see
	# significant genes identified across the range of mean values, which we can plot using the
	# MA plot. If we only see significant genes with high mean values, then it could indicate an
	# issue with our data. The volcano plot helps us to get an idea of the range of fold changes
	# needed to identify significance in our data.

	# Create MA plot
	plotMA(smoc2_res)

	# Generate logical column
	smoc2_res_all <- data.frame(smoc2_res) %>% mutate(threshold = padj < 0.05)

	# Create the volcano plot
	ggplot(smoc2_res_all) +
	geom_point(aes(x = log2FoldChange, y = -log10(padj), color = threshold)) +
	xlab("log2 fold change") +
	ylab("-log10 adjusted p-value") +
	theme(legend.position = "none",
	plot.title = element_text(size = rel(1.5), hjust = 0.5),
	axis.title = element_text(size = rel(1.25)))



	### DESeq2 visualizations - heatmap

	# Visualizations can also be helpful in exploring the significant genes in more detail.
	# The expression heatmap can be helpful in looking at how different the expression of
	# all significant genes are between sample groups, while the expression plot can look
	# at the top significant genes or choose individual genes of interest to investigate
	# the expression levels between samplegroups.


	# Subset normalized counts to significant genes
	sig_norm_counts_smoc2 <- normalized_counts_smoc2[rownames(smoc2_res_sig), ]

	# Choose heatmap color palette
	heat_colors <- brewer.pal(n = 6, name = "YlOrRd")

	# Plot heatmap
	pheatmap(sig_norm_counts_smoc2,
	color = heat_colors,
	cluster_rows = TRUE,
	show_rownames = FALSE,
	annotation = select(smoc2_metadata, condition),
	scale = "row")


	### RNA-Seq DE workflow summary

	# Check that all of the samples are in the same order in the metadata and count data
	all(rownames(all_metadata) %in% colnames(all_rawcounts))

	# DESeq object to test for the effect of fibrosis regardless of genotype
	dds_all <- DESeqDataSetFromMatrix(countData = all_rawcounts,
	colData = all_metadata,
	design = ~ condition)


	# DESeq object to test for the effect of genotype on the effect of fibrosis
	dds_complex <- DESeqDataSetFromMatrix(countData = all_rawcounts,
	colData = all_metadata,
	design = ~ condition + genotype + genotype:condition)

	### DE analysis

	# We are going to continue using the full dataset comparing the genes that exhibit
	# significant expression differences between normal and fibrosis samples regardless
	# of genotype (design: ~ genotype + condition).


	# In this exercise let's perform the unsupervised clustering analyses to
	# explore the clustering of our samples and sources of variation.

	# Log transform counts for QC
	vsd_all <- vst(dds_all, blind = TRUE)

	# Create heatmap of sample correlation values
	vsd_all %>%
	assay() %>%
	cor() %>%
	pheatmap(annotation = select(all_metadata, c("condition", "genotype")))

	# Create the PCA plot for PC1 and PC2 and color by condition
	plotPCA(vsd_all, intgroup = "condition")

	# Create the PCA plot for PC1 and PC2 and color by genotype
	plotPCA(vsd_all, intgroup = "genotype")

	### DE analysis results


	# After exploring the PCA and correlation heatmap, we found good clustering of our samples on PC1,
	# which seemed to represent the variation in the data due to fibrosis, and PC2, which appeared
	# to represent variation in the data due to smoc2 overexpression. We did not find additional
	# sources of variation in the data, nor any outliers to remove. Therefore, we can proceed by
	# running DESeq2, DE testing, and shrinking the fold changes. We performed these steps for you
	# to generate the final results, res_all.

	# In this exercise, we'll want to subset the significant genes from the results and output the
	# top 10 DE genes by adjusted p-value.

	# Select significant genese with padj < 0.05
	smoc2_sig <- subset(res_all, padj < 0.05) %>%
	data.frame() %>%
	rownames_to_column(var = "geneID")

	# Extract the top 6 genes with padj values
	smoc2_sig %>%
	arrange(padj) %>%
	select(geneID, padj) %>%
	head()


	# Also, remember that we can use the annotables package to convert the
	# Ensembl IDs to the more recognizable gene symbols.