Last active
December 28, 2015 08:08
-
-
Save BenLangmead/7468937 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import math\n", | |
"from collections import defaultdict\n", | |
"\n", | |
"class MarkovChainOrder(object):\n", | |
" ''' Simple higher-order Markov chain, specifically for DNA\n", | |
" sequences. User provides training data (DNA strings). '''\n", | |
" \n", | |
" def __init__(self, examples, order=1):\n", | |
" ''' Initialize the model given collection of example DNA\n", | |
" strings. '''\n", | |
" self.order = order\n", | |
" self.mers = defaultdict(int)\n", | |
" self.longestMer = longestMer = order + 1\n", | |
" for ex in examples:\n", | |
" # count up all k-mers of length 'longestMer' or shorter\n", | |
" for i in xrange(len(ex) - longestMer + 1):\n", | |
" for j in xrange(longestMer, -1, -1):\n", | |
" self.mers[ex[i:i+j]] += 1\n", | |
" \n", | |
" def condProb(self, obs, given):\n", | |
" ''' Return conditional probability of seeing nucleotide \"obs\"\n", | |
" given we just saw nucleotide string \"given\". Length of\n", | |
" \"given\" can't exceed model order. Return None if \"given\"\n", | |
" was never observed. '''\n", | |
" assert len(given) <= self.order\n", | |
" ngiven = self.mers[given]\n", | |
" if ngiven == 0: return None\n", | |
" return float(self.mers[given + obs]) / self.mers[given]\n", | |
" \n", | |
" def jointProb(self, s):\n", | |
" ''' Return joint probability of observing string s '''\n", | |
" cum = 1.0\n", | |
" for i in xrange(self.longestMer-1, len(s)):\n", | |
" obs, given = s[i], s[i-self.longestMer+1:i]\n", | |
" p = self.condProb(obs, given)\n", | |
" if p is not None:\n", | |
" cum *= p\n", | |
" # include the smaller k-mers at the very beginning\n", | |
" for j in xrange(self.longestMer-2, -1, -1):\n", | |
" obs, given = s[j], s[:j]\n", | |
" p = self.condProb(obs, given)\n", | |
" if p is not None:\n", | |
" cum *= p\n", | |
" return cum\n", | |
" \n", | |
" def jointProbL(self, s):\n", | |
" ''' Return log2 of joint probability of observing string s '''\n", | |
" cum = 0.0\n", | |
" for i in xrange(self.longestMer-1, len(s)):\n", | |
" obs, given = s[i], s[i-self.longestMer+1:i]\n", | |
" p = self.condProb(obs, given)\n", | |
" if p is not None:\n", | |
" cum += math.log(p, 2)\n", | |
" # include the smaller k-mers at the very beginning\n", | |
" for j in xrange(self.longestMer-2, -1, -1):\n", | |
" obs, given = s[j], s[:j]\n", | |
" p = self.condProb(obs, given)\n", | |
" if p is not None:\n", | |
" cum += math.log(p, 2)\n", | |
" return cum" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"mc1 = MarkovChainOrder(['AC' * 10], 1)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"mc1.condProb('A', 'C') # should be 1; C always followed by A" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 3, | |
"text": [ | |
"1.0" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"mc1.condProb('C', 'A') # should be 1; A always followed by C" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 4, | |
"text": [ | |
"1.0" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"mc1.condProb('G', 'A') # should be 0; A occurs but is never followed by G" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 5, | |
"text": [ | |
"0.0" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"mc2 = MarkovChainOrder(['AC' * 10], 2)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"mc2.condProb('A', 'AC') # AC always followed by A" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 7, | |
"text": [ | |
"1.0" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"mc2.condProb('C', 'CA') # CA always followed by C" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 8, | |
"text": [ | |
"1.0" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"mc2.condProb('C', 'AA') is None # because AA doesn't occur" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 9, | |
"text": [ | |
"True" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"mc3 = MarkovChainOrder(['AAA1AAA2AAA2AAA3AAA3AAA3'], 3)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"mc3.condProb('2', 'AAA') # 1/3" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 11, | |
"text": [ | |
"0.3333333333333333" | |
] | |
} | |
], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"mc3.condProb('3', 'AAA') # 1/2" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 12, | |
"text": [ | |
"0.5" | |
] | |
} | |
], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"p1 = mc3.condProb('A', '')\n", | |
"p1" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 13, | |
"text": [ | |
"0.7619047619047619" | |
] | |
} | |
], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"p2 = mc3.condProb('A', 'A')\n", | |
"p2" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 14, | |
"text": [ | |
"0.6875" | |
] | |
} | |
], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"p3 = mc3.condProb('A', 'AA')\n", | |
"p3" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 15, | |
"text": [ | |
"0.5454545454545454" | |
] | |
} | |
], | |
"prompt_number": 15 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"p4 = mc3.condProb('1', 'AAA')\n", | |
"p4" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 16, | |
"text": [ | |
"0.16666666666666666" | |
] | |
} | |
], | |
"prompt_number": 16 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"p1 * p2 * p3 * p4, mc3.jointProb('AAA1') # should be equal" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 17, | |
"text": [ | |
"(0.0476190476190476, 0.04761904761904761)" | |
] | |
} | |
], | |
"prompt_number": 17 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import math\n", | |
"math.log(mc3.jointProb('AAA1'), 2), mc3.jointProbL('AAA1') # should be equal" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 18, | |
"text": [ | |
"(-4.392317422778761, -4.392317422778761)" | |
] | |
} | |
], | |
"prompt_number": 18 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 18 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment