Created
November 13, 2013 17:11
-
-
Save BenLangmead/7452696 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Using an edit-distance-like dynamic programming formulation, we can\n", | |
"# look for approximate occurrences of p in t.\n", | |
"\n", | |
"import sys\n", | |
"import numpy\n", | |
"\n", | |
"# Assume x is the string labeling rows of the matrix and y is the\n", | |
"# string labeling the columns\n", | |
"\n", | |
"def trace(D, x, y):\n", | |
" ''' Backtrace edit-distance matrix D for strings x and y '''\n", | |
" i, j = len(x), len(y)\n", | |
" xscript = []\n", | |
" while i > 0:\n", | |
" diag, vert, horz = sys.maxint, sys.maxint, sys.maxint\n", | |
" delt = None\n", | |
" if i > 0 and j > 0:\n", | |
" delt = 0 if x[i-1] == y[j-1] else 1\n", | |
" diag = D[i-1, j-1] + delt\n", | |
" if i > 0:\n", | |
" vert = D[i-1, j] + 1\n", | |
" if j > 0:\n", | |
" horz = D[i, j-1] + 1\n", | |
" if diag <= vert and diag <= horz:\n", | |
" # diagonal was best\n", | |
" xscript.append('R' if delt == 1 else 'M')\n", | |
" i -= 1; j -= 1\n", | |
" elif vert <= horz:\n", | |
" # vertical was best; this is an insertion in x w/r/t y\n", | |
" xscript.append('I')\n", | |
" i -= 1\n", | |
" else:\n", | |
" # horizontal was best\n", | |
" xscript.append('D')\n", | |
" j -= 1\n", | |
" # j = offset of the first (leftmost) character of t involved in the\n", | |
" # alignment\n", | |
" return j, (''.join(xscript))[::-1] # reverse and string-ize\n", | |
"\n", | |
"def kEditDp(p, t):\n", | |
" ''' Find and return the alignment of p to a substring of t with the\n", | |
" fewest edits. We return the edit distance, the offset of the\n", | |
" substring aligned to, and the edit transcript. If multiple\n", | |
" alignments tie for best, we report the leftmost. '''\n", | |
" D = numpy.zeros((len(p)+1, len(t)+1), dtype=int)\n", | |
" # Note: First row gets zeros. First column initialized as usual.\n", | |
" D[1:, 0] = range(1, len(p)+1)\n", | |
" for i in xrange(1, len(p)+1):\n", | |
" for j in xrange(1, len(t)+1):\n", | |
" delt = 1 if p[i-1] != t[j-1] else 0\n", | |
" D[i, j] = min(D[i-1, j-1] + delt, D[i-1, j] + 1, D[i, j-1] + 1)\n", | |
" # Find minimum edit distance in last row\n", | |
" mnJ, mn = None, len(p) + len(t)\n", | |
" for j in xrange(0, len(t)+1):\n", | |
" if D[len(p), j] < mn:\n", | |
" mnJ, mn = j, D[len(p), j]\n", | |
" # Backtrace; note: stops as soon as it gets to first row\n", | |
" off, xcript = trace(D, p, t[:mnJ])\n", | |
" # Return edit distance, offset into T, edit transcript\n", | |
" return mn, off, xcript, D" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"p, t = 'TACGTCAGC', 'AACCCTATGTCATGCCTTGGA'\n", | |
"mn, off, xscript, D = kEditDp(p, t)\n", | |
"mn, off, xscript" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 2, | |
"text": [ | |
"(2, 5, 'MMRMMMMDMM')" | |
] | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"D" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 3, | |
"text": [ | |
"array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", | |
" [1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1],\n", | |
" [2, 1, 1, 2, 2, 2, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1],\n", | |
" [3, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2],\n", | |
" [4, 3, 3, 2, 2, 3, 3, 2, 2, 1, 2, 2, 2, 3, 2, 2, 2, 3, 3, 2, 2, 3],\n", | |
" [5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 1, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3],\n", | |
" [6, 5, 5, 4, 3, 3, 4, 4, 3, 3, 2, 1, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4],\n", | |
" [7, 6, 5, 5, 4, 4, 4, 4, 4, 4, 3, 2, 1, 2, 3, 4, 4, 4, 4, 4, 5, 4],\n", | |
" [8, 7, 6, 6, 5, 5, 5, 5, 5, 4, 4, 3, 2, 2, 2, 3, 4, 5, 5, 4, 4, 5],\n", | |
" [9, 8, 7, 6, 6, 5, 6, 6, 6, 5, 5, 4, 3, 3, 3, 2, 3, 4, 5, 5, 5, 5]])" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 3 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment