Skip to content

Instantly share code, notes, and snippets.

@BenLangmead
Last active December 28, 2015 08:08
Show Gist options
  • Save BenLangmead/7468937 to your computer and use it in GitHub Desktop.
Save BenLangmead/7468937 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import math\n",
"from collections import defaultdict\n",
"\n",
"class MarkovChainOrder(object):\n",
" ''' Simple higher-order Markov chain, specifically for DNA\n",
" sequences. User provides training data (DNA strings). '''\n",
" \n",
" def __init__(self, examples, order=1):\n",
" ''' Initialize the model given collection of example DNA\n",
" strings. '''\n",
" self.order = order\n",
" self.mers = defaultdict(int)\n",
" self.longestMer = longestMer = order + 1\n",
" for ex in examples:\n",
" # count up all k-mers of length 'longestMer' or shorter\n",
" for i in xrange(len(ex) - longestMer + 1):\n",
" for j in xrange(longestMer, -1, -1):\n",
" self.mers[ex[i:i+j]] += 1\n",
" \n",
" def condProb(self, obs, given):\n",
" ''' Return conditional probability of seeing nucleotide \"obs\"\n",
" given we just saw nucleotide string \"given\". Length of\n",
" \"given\" can't exceed model order. Return None if \"given\"\n",
" was never observed. '''\n",
" assert len(given) <= self.order\n",
" ngiven = self.mers[given]\n",
" if ngiven == 0: return None\n",
" return float(self.mers[given + obs]) / self.mers[given]\n",
" \n",
" def jointProb(self, s):\n",
" ''' Return joint probability of observing string s '''\n",
" cum = 1.0\n",
" for i in xrange(self.longestMer-1, len(s)):\n",
" obs, given = s[i], s[i-self.longestMer+1:i]\n",
" p = self.condProb(obs, given)\n",
" if p is not None:\n",
" cum *= p\n",
" # include the smaller k-mers at the very beginning\n",
" for j in xrange(self.longestMer-2, -1, -1):\n",
" obs, given = s[j], s[:j]\n",
" p = self.condProb(obs, given)\n",
" if p is not None:\n",
" cum *= p\n",
" return cum\n",
" \n",
" def jointProbL(self, s):\n",
" ''' Return log2 of joint probability of observing string s '''\n",
" cum = 0.0\n",
" for i in xrange(self.longestMer-1, len(s)):\n",
" obs, given = s[i], s[i-self.longestMer+1:i]\n",
" p = self.condProb(obs, given)\n",
" if p is not None:\n",
" cum += math.log(p, 2)\n",
" # include the smaller k-mers at the very beginning\n",
" for j in xrange(self.longestMer-2, -1, -1):\n",
" obs, given = s[j], s[:j]\n",
" p = self.condProb(obs, given)\n",
" if p is not None:\n",
" cum += math.log(p, 2)\n",
" return cum"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mc1 = MarkovChainOrder(['AC' * 10], 1)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mc1.condProb('A', 'C') # should be 1; C always followed by A"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 3,
"text": [
"1.0"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mc1.condProb('C', 'A') # should be 1; A always followed by C"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": [
"1.0"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mc1.condProb('G', 'A') # should be 0; A occurs but is never followed by G"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 5,
"text": [
"0.0"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mc2 = MarkovChainOrder(['AC' * 10], 2)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mc2.condProb('A', 'AC') # AC always followed by A"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
"1.0"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mc2.condProb('C', 'CA') # CA always followed by C"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
"1.0"
]
}
],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mc2.condProb('C', 'AA') is None # because AA doesn't occur"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 9,
"text": [
"True"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mc3 = MarkovChainOrder(['AAA1AAA2AAA2AAA3AAA3AAA3'], 3)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mc3.condProb('2', 'AAA') # 1/3"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 11,
"text": [
"0.3333333333333333"
]
}
],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mc3.condProb('3', 'AAA') # 1/2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 12,
"text": [
"0.5"
]
}
],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"p1 = mc3.condProb('A', '')\n",
"p1"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 13,
"text": [
"0.7619047619047619"
]
}
],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"p2 = mc3.condProb('A', 'A')\n",
"p2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 14,
"text": [
"0.6875"
]
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"p3 = mc3.condProb('A', 'AA')\n",
"p3"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 15,
"text": [
"0.5454545454545454"
]
}
],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"p4 = mc3.condProb('1', 'AAA')\n",
"p4"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 16,
"text": [
"0.16666666666666666"
]
}
],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"p1 * p2 * p3 * p4, mc3.jointProb('AAA1') # should be equal"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 17,
"text": [
"(0.0476190476190476, 0.04761904761904761)"
]
}
],
"prompt_number": 17
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import math\n",
"math.log(mc3.jointProb('AAA1'), 2), mc3.jointProbL('AAA1') # should be equal"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 18,
"text": [
"(-4.392317422778761, -4.392317422778761)"
]
}
],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 18
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment