Last active
December 25, 2015 01:19
-
-
Save BenLangmead/6894694 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def hammingDistance(x, y):\n", | |
" ''' Return Hamming distance between x and y '''\n", | |
" assert len(x) == len(y)\n", | |
" nmm = 0\n", | |
" for i in xrange(0, len(x)):\n", | |
" if x[i] != y[i]:\n", | |
" nmm += 1\n", | |
" return nmm" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"hammingDistance('brown', 'blown')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 2, | |
"text": [ | |
"1" | |
] | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"hammingDistance('cringe', 'orange')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 3, | |
"text": [ | |
"2" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def boundEditDistance(x, y):\n", | |
" ''' Return loose lower and upper bounds on the edit distance\n", | |
" between x and y in O(|x| + |y|) time. '''\n", | |
" if x == y: return 0, 0\n", | |
" if len(x) == 0: return len(y), len(y)\n", | |
" if len(y) == 0: return len(x), len(x)\n", | |
" diff = abs(len(x) - len(y))\n", | |
" lower = diff\n", | |
" if lower == 0 and x != y:\n", | |
" lower = 1\n", | |
" minlen = min(len(x), len(y))\n", | |
" upper = hammingDistance(x[:minlen], y[:minlen]) + diff\n", | |
" return lower, upper" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"boundEditDistance('brown', 'blown')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 5, | |
"text": [ | |
"(1, 1)" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"boundEditDistance('create', 'creation')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 6, | |
"text": [ | |
"(2, 3)" | |
] | |
} | |
], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"boundEditDistance('aaa', 'bbb')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 7, | |
"text": [ | |
"(1, 3)" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# note: bounds can be pretty rough\n", | |
"boundEditDistance('the longest', 'longest day')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 8, | |
"text": [ | |
"(1, 11)" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"boundEditDistance('Shakespeare', 'shake spear')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 9, | |
"text": [ | |
"(1, 7)" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def edDistRecursive(x, y):\n", | |
" if len(x) == 0: return len(y)\n", | |
" if len(y) == 0: return len(x)\n", | |
" delt = 1 if x[-1] != y[-1] else 0\n", | |
" diag = edDistRecursive(x[:-1], y[:-1]) + delt \n", | |
" vert = edDistRecursive(x[:-1], y) + 1\n", | |
" horz = edDistRecursive(x, y[:-1]) + 1\n", | |
" return min(diag, vert, horz)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"edDistRecursive('Shakespeare', 'shake spear') # this takes a while!" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 11, | |
"text": [ | |
"3" | |
] | |
} | |
], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# let's see how long it takes\n", | |
"from time import time\n", | |
"st = time()\n", | |
"edDistRecursive('Shakespeare', 'shake spear')\n", | |
"print 'Took %0.3f seconds' % (time() - st)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Took 30.490 seconds\n" | |
] | |
} | |
], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def edDistRecursiveMemo(x, y, memo=None):\n", | |
" ''' A version of edDistRecursive with memoization. For each x, y we see, we\n", | |
" record result from edDistRecursiveMemo(x, y). In the future, we retrieve\n", | |
" recorded result rather than re-run the function. '''\n", | |
" if memo is None: memo = {}\n", | |
" if len(x) == 0: return len(y)\n", | |
" if len(y) == 0: return len(x)\n", | |
" if (len(x), len(y)) in memo:\n", | |
" return memo[(len(x), len(y))]\n", | |
" delt = 1 if x[-1] != y[-1] else 0\n", | |
" diag = edDistRecursiveMemo(x[:-1], y[:-1], memo) + delt\n", | |
" vert = edDistRecursiveMemo(x[:-1], y, memo) + 1\n", | |
" horz = edDistRecursiveMemo(x, y[:-1], memo) + 1\n", | |
" ans = min(diag, vert, horz)\n", | |
" memo[(len(x), len(y))] = ans\n", | |
" return ans" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"edDistRecursiveMemo('Shakespeare', 'shake spear')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 14, | |
"text": [ | |
"3" | |
] | |
} | |
], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# this time it's instantaneous\n", | |
"from time import time\n", | |
"st = time()\n", | |
"edDistRecursiveMemo('Shakespeare', 'shake spear')\n", | |
"print 'Took %0.3f seconds' % (time() - st)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Took 0.001 seconds\n" | |
] | |
} | |
], | |
"prompt_number": 15 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from numpy import zeros\n", | |
"\n", | |
"def edDistDp(x, y):\n", | |
" \"\"\" Calculate edit distance between sequences x and y using\n", | |
" matrix dynamic programming. Return distance. \"\"\"\n", | |
" D = zeros((len(x)+1, len(y)+1), dtype=int)\n", | |
" D[0, 1:] = range(1, len(y)+1)\n", | |
" D[1:, 0] = range(1, len(x)+1)\n", | |
" for i in xrange(1, len(x)+1):\n", | |
" for j in xrange(1, len(y)+1):\n", | |
" delt = 1 if x[i-1] != y[j-1] else 0\n", | |
" D[i, j] = min(D[i-1, j-1]+delt, D[i-1, j]+1, D[i, j-1]+1)\n", | |
" return D[len(x), len(y)]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 16 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"edDistDp('Shakespeare', 'shake spear')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 17, | |
"text": [ | |
"3" | |
] | |
} | |
], | |
"prompt_number": 17 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# again, instantaneous\n", | |
"from time import time\n", | |
"st = time()\n", | |
"edDistDp('Shakespeare', 'shake spear')\n", | |
"print 'Took %0.3f seconds' % (time() - st)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Took 0.001 seconds\n" | |
] | |
} | |
], | |
"prompt_number": 18 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"edDistDp('GCGTATGCACGC', 'GCTATGCCACGC')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 19, | |
"text": [ | |
"2" | |
] | |
} | |
], | |
"prompt_number": 19 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment