TATABOX42/markov_chains.R

## markov_chains.R
#    Author: Benjamin Tovar
#    Date: March 14, 2015
#    Post: http://btovar.com/2015/03/markov-chains-in-r
#
#    Example of a Markov Chain of zero order (the current nucleotide is
#    totally independent of the previous nucleotide).

# ***********************************************************************
# ***********************************************************************

#    The multinomial model is:
#    p(A)+p(C)+p(G)+p(T) = 1.0
#    0.4 +0.1 +0.1 +0.4  = 1.0

# Define the DNA alphabet that will be used to put names to the objects
alp <- c("A","C","G","T")
# Create the vector that represents the probability distribution of the model
zeroOrder <- c(0.4,0.1,0.1,0.4)
# Put the name of reference of each base
names(zeroOrder) <- alp
# compute the emission matrix of the model
zeroOrderMat <- matrix(zeroOrder,nr=4,nc=4,byrow=TRUE,dimnames=list(alp,alp))

# > zeroOrderMat
#     A   C   G   T
# A 0.4 0.1 0.1 0.4
# C 0.4 0.1 0.1 0.4
# G 0.4 0.1 0.1 0.4
# T 0.4 0.1 0.1 0.4

# plot the probability matrix
library(qgraph)

qgraph(zeroOrderMat,layout = "circular",
        edge.labels = TRUE,edge.color = rainbow(4),esize = 5)


# Create a sequence of 1000 bases using this model.
zeroOrderSeq <- sample(alp,1000,rep=T,prob=zeroOrder)

# ***** Study the composition bias of the sequence *****
# We wil use the "seqinr" package.
# For the installation of the package, type:
# install.packages("seqinr")
# Then, load the package:
require("seqinr")

# Count the frequency of each base
# in the sequence using the "count" function
zeroOrderFreq <- count(zeroOrderSeq,1,alphabet=alp,freq=TRUE)

# Count the frequency of dinucleotides
# in the sequence using the "count" function
zeroOrderFreqDin <- count(zeroOrderSeq,2,alphabet=alp,freq=TRUE)

# set the tables to data frames
zeroOrderFreq <- data.frame(nucleotide=names(zeroOrderFreq),
                               frequency=as.numeric(zeroOrderFreq))

zeroOrderFreqDin <- data.frame(dinucleotide=names(zeroOrderFreqDin),
                               frequency=as.numeric(zeroOrderFreqDin))
# Now, plot the results in the same plot:

library(ggplot2)
library(gridExtra)

p1 <- ggplot(zeroOrderFreq)+
        aes(x=nucleotide,y=frequency) +
        geom_bar(stat="identity",aes(fill=nucleotide),alpha=0.6) +
        labs(title="Frequency of nucleotides\n(zero order Markov Chain)") +
        ylim(0,0.5) +
        theme_minimal() +
        theme(legend.position="none")

p2 <- ggplot(zeroOrderFreqDin)+
        aes(x=dinucleotide,y=frequency) +
        geom_bar(stat="identity",aes(fill=dinucleotide),alpha=0.6) +
        labs(title="Frequency of dinucleotides\n(zero order Markov Chain)") +
        ylim(0,0.2) +
        theme_minimal() +
        theme(legend.position="none")
# plot
grid.arrange(p1,p2)

# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

#    Example of a Markov Chain of first order (the current nucleotide only
#    depends on the previous nucleotide).

#    The multinomial model per base is:
#    p(A|A)+p(C|A)+p(G|A)+p(T|A) = 1.0
#    p(A|C)+p(C|C)+p(G|C)+p(T|C) = 1.0
#    p(A|G)+p(C|G)+p(G|G)+p(T|G) = 1.0
#    p(A|T)+p(C|T)+p(G|T)+p(T|T) = 1.0

#    So:
#    0.6 + 0.1 + 0.1 + 0.2  = 1.0
#    0.1 + 0.5 + 0.3 + 0.1  = 1.0
#    0.05+ 0.2 + 0.7 + 0.05 = 1.0
#    0.4 + 0.05+0.05 + 0.5  = 1.0


# Create the matrix that will store the probability distribution given
# a certain nucleotide:
firstOrderMat <- matrix(NA,nr=4,nc=4)
# Put names to the 2 dimensions of the matrix
colnames(firstOrderMat) <- rownames(firstOrderMat) <- alp
# Add the probability distribution per base:
firstOrderMat[1,] <- c(0.6,0.1,0.1,0.2)  # Given an A in the 1st position
firstOrderMat[2,] <- c(0.1,0.5,0.3,0.1)  # Given a  C in the 1st position
firstOrderMat[3,] <- c(0.05,0.2,0.7,0.05)# Given a  G in the 1st position
firstOrderMat[4,] <- c(0.4,0.05,0.05,0.5)# Given a  T in the 1st position

# Now we got a matrix
#    > firstOrderMat
#         A    C    G    T
#    A 0.60 0.10 0.10 0.20
#    C 0.10 0.50 0.30 0.10
#    G 0.05 0.20 0.70 0.05
#    T 0.40 0.05 0.05 0.50

# plot the probability matrix
library(qgraph)

qgraph(firstOrderMat,layout = "circular",
        edge.labels = TRUE,edge.color = rainbow(4),esize = 5)

# In order to continue, we need an initial probability distribution to know
# which base is the most probable to start up the sequence.
inProb <- c(0.4,0.1,0.1,0.4); names(inProb) <- alp
# So, the sequence will have a 40% to start with an A or a T and 10% with C or G

# Create a function to generate the sequence.
# NOTE: To load the function to the current environment, just copy
# the entire function and paste it inside the R prompt.

generateFirstOrderSeq <- function(lengthSeq,
                                  alphabet,
                                  initialProb,
                                  firstOrderMatrix){
    #    lengthSeq = length of the sequence
    #    alphabet = alphabet that compounds the sequence
    #    initialProb   = initial probability distribution
    #    firstOrderMatrix = matrix that stores the probability distribution of a
    #                       first order Markov Chain

    # Construct the object that stores the sequence
    outputSeq <- rep(NA,lengthSeq)
    # Which is the first base:
    outputSeq[1]  <- sample(alphabet,1,prob=initialProb)
    # Let the computer decide:
    for(i in 2:length(outputSeq)){
        prevNuc <- outputSeq[i-1]
        currentProb <- firstOrderMatrix[prevNuc,]
        outputSeq[i] <- sample(alphabet,1,prob=currentProb)
    }
    cat("** DONE: Sequence computation is complete **\n")
    return(outputSeq)
}

# Use the generateFirstOrderSeq function to generate a sequence of 1000 bases
# long
firstOrderSeq <- generateFirstOrderSeq(1000,alp,inProb,firstOrderMat)

# ***** Study the composition bias of the sequence *****
# We wil use the "seqinr" package.
# For the installation of the package, type:
# install.packages("seqinr")
# Then, load the package:
require("seqinr")

# Count the frequency of each base
# in the sequence using the "count" function
firstOrderFreq <- count(firstOrderSeq,1,alphabet=alp,freq=TRUE)

# Count the frequency of dinucleotides
# in the sequence using the "count" function
firstOrderFreqDin <- count(firstOrderSeq,2,alphabet=alp,freq=TRUE)


# set the tables to data frames
firstOrderFreq <- data.frame(nucleotide=names(firstOrderFreq),
                               frequency=as.numeric(firstOrderFreq))

firstOrderFreqDin <- data.frame(dinucleotide=names(firstOrderFreqDin),
                               frequency=as.numeric(firstOrderFreqDin))
# Now, plot the results in the same plot:

library(ggplot2)
library(gridExtra)

p3 <- ggplot(firstOrderFreq)+
        aes(x=nucleotide,y=frequency) +
        geom_bar(stat="identity",aes(fill=nucleotide),alpha=0.6) +
        labs(title="Frequency of nucleotides\n(first order Markov Chain)") +
        ylim(0,0.5) +
        theme_minimal() +
        theme(legend.position="none")

p4 <- ggplot(firstOrderFreqDin)+
        aes(x=dinucleotide,y=frequency) +
        geom_bar(stat="identity",aes(fill=dinucleotide),alpha=0.6) +
        labs(title="Frequency of dinucleotides\n(first order Markov Chain)") +
        ylim(0,0.25) +
        theme_minimal() +
        theme(legend.position="none")
# plot
grid.arrange(p3,p4)

## Lets plot the 4 plots in one window
grid.arrange(p1,p2,p3,p4)
	# Author: Benjamin Tovar
	# Date: March 14, 2015
	# Post: http://btovar.com/2015/03/markov-chains-in-r
	#
	# Example of a Markov Chain of zero order (the current nucleotide is
	# totally independent of the previous nucleotide).

	# ***********************************************************************
	# ***********************************************************************

	# The multinomial model is:
	# p(A)+p(C)+p(G)+p(T) = 1.0
	# 0.4 +0.1 +0.1 +0.4 = 1.0

	# Define the DNA alphabet that will be used to put names to the objects
	alp <- c("A","C","G","T")
	# Create the vector that represents the probability distribution of the model
	zeroOrder <- c(0.4,0.1,0.1,0.4)
	# Put the name of reference of each base
	names(zeroOrder) <- alp
	# compute the emission matrix of the model
	zeroOrderMat <- matrix(zeroOrder,nr=4,nc=4,byrow=TRUE,dimnames=list(alp,alp))

	# > zeroOrderMat
	# A C G T
	# A 0.4 0.1 0.1 0.4
	# C 0.4 0.1 0.1 0.4
	# G 0.4 0.1 0.1 0.4
	# T 0.4 0.1 0.1 0.4

	# plot the probability matrix
	library(qgraph)

	qgraph(zeroOrderMat,layout = "circular",
	edge.labels = TRUE,edge.color = rainbow(4),esize = 5)


	# Create a sequence of 1000 bases using this model.
	zeroOrderSeq <- sample(alp,1000,rep=T,prob=zeroOrder)

	# *** Study the composition bias of the sequence ***
	# We wil use the "seqinr" package.
	# For the installation of the package, type:
	# install.packages("seqinr")
	# Then, load the package:
	require("seqinr")

	# Count the frequency of each base
	# in the sequence using the "count" function
	zeroOrderFreq <- count(zeroOrderSeq,1,alphabet=alp,freq=TRUE)

	# Count the frequency of dinucleotides
	# in the sequence using the "count" function
	zeroOrderFreqDin <- count(zeroOrderSeq,2,alphabet=alp,freq=TRUE)

	# set the tables to data frames
	zeroOrderFreq <- data.frame(nucleotide=names(zeroOrderFreq),
	frequency=as.numeric(zeroOrderFreq))

	zeroOrderFreqDin <- data.frame(dinucleotide=names(zeroOrderFreqDin),
	frequency=as.numeric(zeroOrderFreqDin))
	# Now, plot the results in the same plot:

	library(ggplot2)
	library(gridExtra)

	p1 <- ggplot(zeroOrderFreq)+
	aes(x=nucleotide,y=frequency) +
	geom_bar(stat="identity",aes(fill=nucleotide),alpha=0.6) +
	labs(title="Frequency of nucleotides\n(zero order Markov Chain)") +
	ylim(0,0.5) +
	theme_minimal() +
	theme(legend.position="none")

	p2 <- ggplot(zeroOrderFreqDin)+
	aes(x=dinucleotide,y=frequency) +
	geom_bar(stat="identity",aes(fill=dinucleotide),alpha=0.6) +
	labs(title="Frequency of dinucleotides\n(zero order Markov Chain)") +
	ylim(0,0.2) +
	theme_minimal() +
	theme(legend.position="none")
	# plot
	grid.arrange(p1,p2)

	# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

	# Example of a Markov Chain of first order (the current nucleotide only
	# depends on the previous nucleotide).

	# The multinomial model per base is:
	# p(A\|A)+p(C\|A)+p(G\|A)+p(T\|A) = 1.0
	# p(A\|C)+p(C\|C)+p(G\|C)+p(T\|C) = 1.0
	# p(A\|G)+p(C\|G)+p(G\|G)+p(T\|G) = 1.0
	# p(A\|T)+p(C\|T)+p(G\|T)+p(T\|T) = 1.0

	# So:
	# 0.6 + 0.1 + 0.1 + 0.2 = 1.0
	# 0.1 + 0.5 + 0.3 + 0.1 = 1.0
	# 0.05+ 0.2 + 0.7 + 0.05 = 1.0
	# 0.4 + 0.05+0.05 + 0.5 = 1.0


	# Create the matrix that will store the probability distribution given
	# a certain nucleotide:
	firstOrderMat <- matrix(NA,nr=4,nc=4)
	# Put names to the 2 dimensions of the matrix
	colnames(firstOrderMat) <- rownames(firstOrderMat) <- alp
	# Add the probability distribution per base:
	firstOrderMat[1,] <- c(0.6,0.1,0.1,0.2) # Given an A in the 1st position
	firstOrderMat[2,] <- c(0.1,0.5,0.3,0.1) # Given a C in the 1st position
	firstOrderMat[3,] <- c(0.05,0.2,0.7,0.05)# Given a G in the 1st position
	firstOrderMat[4,] <- c(0.4,0.05,0.05,0.5)# Given a T in the 1st position

	# Now we got a matrix
	# > firstOrderMat
	# A C G T
	# A 0.60 0.10 0.10 0.20
	# C 0.10 0.50 0.30 0.10
	# G 0.05 0.20 0.70 0.05
	# T 0.40 0.05 0.05 0.50

	# plot the probability matrix
	library(qgraph)

	qgraph(firstOrderMat,layout = "circular",
	edge.labels = TRUE,edge.color = rainbow(4),esize = 5)

	# In order to continue, we need an initial probability distribution to know
	# which base is the most probable to start up the sequence.
	inProb <- c(0.4,0.1,0.1,0.4); names(inProb) <- alp
	# So, the sequence will have a 40% to start with an A or a T and 10% with C or G

	# Create a function to generate the sequence.
	# NOTE: To load the function to the current environment, just copy
	# the entire function and paste it inside the R prompt.

	generateFirstOrderSeq <- function(lengthSeq,
	alphabet,
	initialProb,
	firstOrderMatrix){
	# lengthSeq = length of the sequence
	# alphabet = alphabet that compounds the sequence
	# initialProb = initial probability distribution
	# firstOrderMatrix = matrix that stores the probability distribution of a
	# first order Markov Chain

	# Construct the object that stores the sequence
	outputSeq <- rep(NA,lengthSeq)
	# Which is the first base:
	outputSeq[1] <- sample(alphabet,1,prob=initialProb)
	# Let the computer decide:
	for(i in 2:length(outputSeq)){
	prevNuc <- outputSeq[i-1]
	currentProb <- firstOrderMatrix[prevNuc,]
	outputSeq[i] <- sample(alphabet,1,prob=currentProb)
	}
	cat(" DONE: Sequence computation is complete \n")
	return(outputSeq)
	}

	# Use the generateFirstOrderSeq function to generate a sequence of 1000 bases
	# long
	firstOrderSeq <- generateFirstOrderSeq(1000,alp,inProb,firstOrderMat)

	# *** Study the composition bias of the sequence ***
	# We wil use the "seqinr" package.
	# For the installation of the package, type:
	# install.packages("seqinr")
	# Then, load the package:
	require("seqinr")

	# Count the frequency of each base
	# in the sequence using the "count" function
	firstOrderFreq <- count(firstOrderSeq,1,alphabet=alp,freq=TRUE)

	# Count the frequency of dinucleotides
	# in the sequence using the "count" function
	firstOrderFreqDin <- count(firstOrderSeq,2,alphabet=alp,freq=TRUE)


	# set the tables to data frames
	firstOrderFreq <- data.frame(nucleotide=names(firstOrderFreq),
	frequency=as.numeric(firstOrderFreq))

	firstOrderFreqDin <- data.frame(dinucleotide=names(firstOrderFreqDin),
	frequency=as.numeric(firstOrderFreqDin))
	# Now, plot the results in the same plot:

	library(ggplot2)
	library(gridExtra)

	p3 <- ggplot(firstOrderFreq)+
	aes(x=nucleotide,y=frequency) +
	geom_bar(stat="identity",aes(fill=nucleotide),alpha=0.6) +
	labs(title="Frequency of nucleotides\n(first order Markov Chain)") +
	ylim(0,0.5) +
	theme_minimal() +
	theme(legend.position="none")

	p4 <- ggplot(firstOrderFreqDin)+
	aes(x=dinucleotide,y=frequency) +
	geom_bar(stat="identity",aes(fill=dinucleotide),alpha=0.6) +
	labs(title="Frequency of dinucleotides\n(first order Markov Chain)") +
	ylim(0,0.25) +
	theme_minimal() +
	theme(legend.position="none")
	# plot
	grid.arrange(p3,p4)

	## Lets plot the 4 plots in one window
	grid.arrange(p1,p2,p3,p4)