Created
March 15, 2015 01:36
-
-
Save TATABOX42/da31a598bf4f59b2a093 to your computer and use it in GitHub Desktop.
Code used for post: http://btovar.com/2015/03/markov-chains-in-r
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Author: Benjamin Tovar | |
# Date: March 14, 2015 | |
# Post: http://btovar.com/2015/03/markov-chains-in-r | |
# | |
# Example of a Markov Chain of zero order (the current nucleotide is | |
# totally independent of the previous nucleotide). | |
# *********************************************************************** | |
# *********************************************************************** | |
# The multinomial model is: | |
# p(A)+p(C)+p(G)+p(T) = 1.0 | |
# 0.4 +0.1 +0.1 +0.4 = 1.0 | |
# Define the DNA alphabet that will be used to put names to the objects | |
alp <- c("A","C","G","T") | |
# Create the vector that represents the probability distribution of the model | |
zeroOrder <- c(0.4,0.1,0.1,0.4) | |
# Put the name of reference of each base | |
names(zeroOrder) <- alp | |
# compute the emission matrix of the model | |
zeroOrderMat <- matrix(zeroOrder,nr=4,nc=4,byrow=TRUE,dimnames=list(alp,alp)) | |
# > zeroOrderMat | |
# A C G T | |
# A 0.4 0.1 0.1 0.4 | |
# C 0.4 0.1 0.1 0.4 | |
# G 0.4 0.1 0.1 0.4 | |
# T 0.4 0.1 0.1 0.4 | |
# plot the probability matrix | |
library(qgraph) | |
qgraph(zeroOrderMat,layout = "circular", | |
edge.labels = TRUE,edge.color = rainbow(4),esize = 5) | |
# Create a sequence of 1000 bases using this model. | |
zeroOrderSeq <- sample(alp,1000,rep=T,prob=zeroOrder) | |
# ***** Study the composition bias of the sequence ***** | |
# We wil use the "seqinr" package. | |
# For the installation of the package, type: | |
# install.packages("seqinr") | |
# Then, load the package: | |
require("seqinr") | |
# Count the frequency of each base | |
# in the sequence using the "count" function | |
zeroOrderFreq <- count(zeroOrderSeq,1,alphabet=alp,freq=TRUE) | |
# Count the frequency of dinucleotides | |
# in the sequence using the "count" function | |
zeroOrderFreqDin <- count(zeroOrderSeq,2,alphabet=alp,freq=TRUE) | |
# set the tables to data frames | |
zeroOrderFreq <- data.frame(nucleotide=names(zeroOrderFreq), | |
frequency=as.numeric(zeroOrderFreq)) | |
zeroOrderFreqDin <- data.frame(dinucleotide=names(zeroOrderFreqDin), | |
frequency=as.numeric(zeroOrderFreqDin)) | |
# Now, plot the results in the same plot: | |
library(ggplot2) | |
library(gridExtra) | |
p1 <- ggplot(zeroOrderFreq)+ | |
aes(x=nucleotide,y=frequency) + | |
geom_bar(stat="identity",aes(fill=nucleotide),alpha=0.6) + | |
labs(title="Frequency of nucleotides\n(zero order Markov Chain)") + | |
ylim(0,0.5) + | |
theme_minimal() + | |
theme(legend.position="none") | |
p2 <- ggplot(zeroOrderFreqDin)+ | |
aes(x=dinucleotide,y=frequency) + | |
geom_bar(stat="identity",aes(fill=dinucleotide),alpha=0.6) + | |
labs(title="Frequency of dinucleotides\n(zero order Markov Chain)") + | |
ylim(0,0.2) + | |
theme_minimal() + | |
theme(legend.position="none") | |
# plot | |
grid.arrange(p1,p2) | |
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& | |
# Example of a Markov Chain of first order (the current nucleotide only | |
# depends on the previous nucleotide). | |
# The multinomial model per base is: | |
# p(A|A)+p(C|A)+p(G|A)+p(T|A) = 1.0 | |
# p(A|C)+p(C|C)+p(G|C)+p(T|C) = 1.0 | |
# p(A|G)+p(C|G)+p(G|G)+p(T|G) = 1.0 | |
# p(A|T)+p(C|T)+p(G|T)+p(T|T) = 1.0 | |
# So: | |
# 0.6 + 0.1 + 0.1 + 0.2 = 1.0 | |
# 0.1 + 0.5 + 0.3 + 0.1 = 1.0 | |
# 0.05+ 0.2 + 0.7 + 0.05 = 1.0 | |
# 0.4 + 0.05+0.05 + 0.5 = 1.0 | |
# Create the matrix that will store the probability distribution given | |
# a certain nucleotide: | |
firstOrderMat <- matrix(NA,nr=4,nc=4) | |
# Put names to the 2 dimensions of the matrix | |
colnames(firstOrderMat) <- rownames(firstOrderMat) <- alp | |
# Add the probability distribution per base: | |
firstOrderMat[1,] <- c(0.6,0.1,0.1,0.2) # Given an A in the 1st position | |
firstOrderMat[2,] <- c(0.1,0.5,0.3,0.1) # Given a C in the 1st position | |
firstOrderMat[3,] <- c(0.05,0.2,0.7,0.05)# Given a G in the 1st position | |
firstOrderMat[4,] <- c(0.4,0.05,0.05,0.5)# Given a T in the 1st position | |
# Now we got a matrix | |
# > firstOrderMat | |
# A C G T | |
# A 0.60 0.10 0.10 0.20 | |
# C 0.10 0.50 0.30 0.10 | |
# G 0.05 0.20 0.70 0.05 | |
# T 0.40 0.05 0.05 0.50 | |
# plot the probability matrix | |
library(qgraph) | |
qgraph(firstOrderMat,layout = "circular", | |
edge.labels = TRUE,edge.color = rainbow(4),esize = 5) | |
# In order to continue, we need an initial probability distribution to know | |
# which base is the most probable to start up the sequence. | |
inProb <- c(0.4,0.1,0.1,0.4); names(inProb) <- alp | |
# So, the sequence will have a 40% to start with an A or a T and 10% with C or G | |
# Create a function to generate the sequence. | |
# NOTE: To load the function to the current environment, just copy | |
# the entire function and paste it inside the R prompt. | |
generateFirstOrderSeq <- function(lengthSeq, | |
alphabet, | |
initialProb, | |
firstOrderMatrix){ | |
# lengthSeq = length of the sequence | |
# alphabet = alphabet that compounds the sequence | |
# initialProb = initial probability distribution | |
# firstOrderMatrix = matrix that stores the probability distribution of a | |
# first order Markov Chain | |
# Construct the object that stores the sequence | |
outputSeq <- rep(NA,lengthSeq) | |
# Which is the first base: | |
outputSeq[1] <- sample(alphabet,1,prob=initialProb) | |
# Let the computer decide: | |
for(i in 2:length(outputSeq)){ | |
prevNuc <- outputSeq[i-1] | |
currentProb <- firstOrderMatrix[prevNuc,] | |
outputSeq[i] <- sample(alphabet,1,prob=currentProb) | |
} | |
cat("** DONE: Sequence computation is complete **\n") | |
return(outputSeq) | |
} | |
# Use the generateFirstOrderSeq function to generate a sequence of 1000 bases | |
# long | |
firstOrderSeq <- generateFirstOrderSeq(1000,alp,inProb,firstOrderMat) | |
# ***** Study the composition bias of the sequence ***** | |
# We wil use the "seqinr" package. | |
# For the installation of the package, type: | |
# install.packages("seqinr") | |
# Then, load the package: | |
require("seqinr") | |
# Count the frequency of each base | |
# in the sequence using the "count" function | |
firstOrderFreq <- count(firstOrderSeq,1,alphabet=alp,freq=TRUE) | |
# Count the frequency of dinucleotides | |
# in the sequence using the "count" function | |
firstOrderFreqDin <- count(firstOrderSeq,2,alphabet=alp,freq=TRUE) | |
# set the tables to data frames | |
firstOrderFreq <- data.frame(nucleotide=names(firstOrderFreq), | |
frequency=as.numeric(firstOrderFreq)) | |
firstOrderFreqDin <- data.frame(dinucleotide=names(firstOrderFreqDin), | |
frequency=as.numeric(firstOrderFreqDin)) | |
# Now, plot the results in the same plot: | |
library(ggplot2) | |
library(gridExtra) | |
p3 <- ggplot(firstOrderFreq)+ | |
aes(x=nucleotide,y=frequency) + | |
geom_bar(stat="identity",aes(fill=nucleotide),alpha=0.6) + | |
labs(title="Frequency of nucleotides\n(first order Markov Chain)") + | |
ylim(0,0.5) + | |
theme_minimal() + | |
theme(legend.position="none") | |
p4 <- ggplot(firstOrderFreqDin)+ | |
aes(x=dinucleotide,y=frequency) + | |
geom_bar(stat="identity",aes(fill=dinucleotide),alpha=0.6) + | |
labs(title="Frequency of dinucleotides\n(first order Markov Chain)") + | |
ylim(0,0.25) + | |
theme_minimal() + | |
theme(legend.position="none") | |
# plot | |
grid.arrange(p3,p4) | |
## Lets plot the 4 plots in one window | |
grid.arrange(p1,p2,p3,p4) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment