Skip to content

Instantly share code, notes, and snippets.

@TATABOX42
Created March 15, 2015 01:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TATABOX42/da31a598bf4f59b2a093 to your computer and use it in GitHub Desktop.
Save TATABOX42/da31a598bf4f59b2a093 to your computer and use it in GitHub Desktop.
# Author: Benjamin Tovar
# Date: March 14, 2015
# Post: http://btovar.com/2015/03/markov-chains-in-r
#
# Example of a Markov Chain of zero order (the current nucleotide is
# totally independent of the previous nucleotide).
# ***********************************************************************
# ***********************************************************************
# The multinomial model is:
# p(A)+p(C)+p(G)+p(T) = 1.0
# 0.4 +0.1 +0.1 +0.4 = 1.0
# Define the DNA alphabet that will be used to put names to the objects
alp <- c("A","C","G","T")
# Create the vector that represents the probability distribution of the model
zeroOrder <- c(0.4,0.1,0.1,0.4)
# Put the name of reference of each base
names(zeroOrder) <- alp
# compute the emission matrix of the model
zeroOrderMat <- matrix(zeroOrder,nr=4,nc=4,byrow=TRUE,dimnames=list(alp,alp))
# > zeroOrderMat
# A C G T
# A 0.4 0.1 0.1 0.4
# C 0.4 0.1 0.1 0.4
# G 0.4 0.1 0.1 0.4
# T 0.4 0.1 0.1 0.4
# plot the probability matrix
library(qgraph)
qgraph(zeroOrderMat,layout = "circular",
edge.labels = TRUE,edge.color = rainbow(4),esize = 5)
# Create a sequence of 1000 bases using this model.
zeroOrderSeq <- sample(alp,1000,rep=T,prob=zeroOrder)
# ***** Study the composition bias of the sequence *****
# We wil use the "seqinr" package.
# For the installation of the package, type:
# install.packages("seqinr")
# Then, load the package:
require("seqinr")
# Count the frequency of each base
# in the sequence using the "count" function
zeroOrderFreq <- count(zeroOrderSeq,1,alphabet=alp,freq=TRUE)
# Count the frequency of dinucleotides
# in the sequence using the "count" function
zeroOrderFreqDin <- count(zeroOrderSeq,2,alphabet=alp,freq=TRUE)
# set the tables to data frames
zeroOrderFreq <- data.frame(nucleotide=names(zeroOrderFreq),
frequency=as.numeric(zeroOrderFreq))
zeroOrderFreqDin <- data.frame(dinucleotide=names(zeroOrderFreqDin),
frequency=as.numeric(zeroOrderFreqDin))
# Now, plot the results in the same plot:
library(ggplot2)
library(gridExtra)
p1 <- ggplot(zeroOrderFreq)+
aes(x=nucleotide,y=frequency) +
geom_bar(stat="identity",aes(fill=nucleotide),alpha=0.6) +
labs(title="Frequency of nucleotides\n(zero order Markov Chain)") +
ylim(0,0.5) +
theme_minimal() +
theme(legend.position="none")
p2 <- ggplot(zeroOrderFreqDin)+
aes(x=dinucleotide,y=frequency) +
geom_bar(stat="identity",aes(fill=dinucleotide),alpha=0.6) +
labs(title="Frequency of dinucleotides\n(zero order Markov Chain)") +
ylim(0,0.2) +
theme_minimal() +
theme(legend.position="none")
# plot
grid.arrange(p1,p2)
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
# Example of a Markov Chain of first order (the current nucleotide only
# depends on the previous nucleotide).
# The multinomial model per base is:
# p(A|A)+p(C|A)+p(G|A)+p(T|A) = 1.0
# p(A|C)+p(C|C)+p(G|C)+p(T|C) = 1.0
# p(A|G)+p(C|G)+p(G|G)+p(T|G) = 1.0
# p(A|T)+p(C|T)+p(G|T)+p(T|T) = 1.0
# So:
# 0.6 + 0.1 + 0.1 + 0.2 = 1.0
# 0.1 + 0.5 + 0.3 + 0.1 = 1.0
# 0.05+ 0.2 + 0.7 + 0.05 = 1.0
# 0.4 + 0.05+0.05 + 0.5 = 1.0
# Create the matrix that will store the probability distribution given
# a certain nucleotide:
firstOrderMat <- matrix(NA,nr=4,nc=4)
# Put names to the 2 dimensions of the matrix
colnames(firstOrderMat) <- rownames(firstOrderMat) <- alp
# Add the probability distribution per base:
firstOrderMat[1,] <- c(0.6,0.1,0.1,0.2) # Given an A in the 1st position
firstOrderMat[2,] <- c(0.1,0.5,0.3,0.1) # Given a C in the 1st position
firstOrderMat[3,] <- c(0.05,0.2,0.7,0.05)# Given a G in the 1st position
firstOrderMat[4,] <- c(0.4,0.05,0.05,0.5)# Given a T in the 1st position
# Now we got a matrix
# > firstOrderMat
# A C G T
# A 0.60 0.10 0.10 0.20
# C 0.10 0.50 0.30 0.10
# G 0.05 0.20 0.70 0.05
# T 0.40 0.05 0.05 0.50
# plot the probability matrix
library(qgraph)
qgraph(firstOrderMat,layout = "circular",
edge.labels = TRUE,edge.color = rainbow(4),esize = 5)
# In order to continue, we need an initial probability distribution to know
# which base is the most probable to start up the sequence.
inProb <- c(0.4,0.1,0.1,0.4); names(inProb) <- alp
# So, the sequence will have a 40% to start with an A or a T and 10% with C or G
# Create a function to generate the sequence.
# NOTE: To load the function to the current environment, just copy
# the entire function and paste it inside the R prompt.
generateFirstOrderSeq <- function(lengthSeq,
alphabet,
initialProb,
firstOrderMatrix){
# lengthSeq = length of the sequence
# alphabet = alphabet that compounds the sequence
# initialProb = initial probability distribution
# firstOrderMatrix = matrix that stores the probability distribution of a
# first order Markov Chain
# Construct the object that stores the sequence
outputSeq <- rep(NA,lengthSeq)
# Which is the first base:
outputSeq[1] <- sample(alphabet,1,prob=initialProb)
# Let the computer decide:
for(i in 2:length(outputSeq)){
prevNuc <- outputSeq[i-1]
currentProb <- firstOrderMatrix[prevNuc,]
outputSeq[i] <- sample(alphabet,1,prob=currentProb)
}
cat("** DONE: Sequence computation is complete **\n")
return(outputSeq)
}
# Use the generateFirstOrderSeq function to generate a sequence of 1000 bases
# long
firstOrderSeq <- generateFirstOrderSeq(1000,alp,inProb,firstOrderMat)
# ***** Study the composition bias of the sequence *****
# We wil use the "seqinr" package.
# For the installation of the package, type:
# install.packages("seqinr")
# Then, load the package:
require("seqinr")
# Count the frequency of each base
# in the sequence using the "count" function
firstOrderFreq <- count(firstOrderSeq,1,alphabet=alp,freq=TRUE)
# Count the frequency of dinucleotides
# in the sequence using the "count" function
firstOrderFreqDin <- count(firstOrderSeq,2,alphabet=alp,freq=TRUE)
# set the tables to data frames
firstOrderFreq <- data.frame(nucleotide=names(firstOrderFreq),
frequency=as.numeric(firstOrderFreq))
firstOrderFreqDin <- data.frame(dinucleotide=names(firstOrderFreqDin),
frequency=as.numeric(firstOrderFreqDin))
# Now, plot the results in the same plot:
library(ggplot2)
library(gridExtra)
p3 <- ggplot(firstOrderFreq)+
aes(x=nucleotide,y=frequency) +
geom_bar(stat="identity",aes(fill=nucleotide),alpha=0.6) +
labs(title="Frequency of nucleotides\n(first order Markov Chain)") +
ylim(0,0.5) +
theme_minimal() +
theme(legend.position="none")
p4 <- ggplot(firstOrderFreqDin)+
aes(x=dinucleotide,y=frequency) +
geom_bar(stat="identity",aes(fill=dinucleotide),alpha=0.6) +
labs(title="Frequency of dinucleotides\n(first order Markov Chain)") +
ylim(0,0.25) +
theme_minimal() +
theme(legend.position="none")
# plot
grid.arrange(p3,p4)
## Lets plot the 4 plots in one window
grid.arrange(p1,p2,p3,p4)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment