Skip to content

Instantly share code, notes, and snippets.

@BenLangmead
Created November 13, 2013 17:11
Show Gist options
  • Save BenLangmead/7452696 to your computer and use it in GitHub Desktop.
Save BenLangmead/7452696 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Using an edit-distance-like dynamic programming formulation, we can\n",
"# look for approximate occurrences of p in t.\n",
"\n",
"import sys\n",
"import numpy\n",
"\n",
"# Assume x is the string labeling rows of the matrix and y is the\n",
"# string labeling the columns\n",
"\n",
"def trace(D, x, y):\n",
" ''' Backtrace edit-distance matrix D for strings x and y '''\n",
" i, j = len(x), len(y)\n",
" xscript = []\n",
" while i > 0:\n",
" diag, vert, horz = sys.maxint, sys.maxint, sys.maxint\n",
" delt = None\n",
" if i > 0 and j > 0:\n",
" delt = 0 if x[i-1] == y[j-1] else 1\n",
" diag = D[i-1, j-1] + delt\n",
" if i > 0:\n",
" vert = D[i-1, j] + 1\n",
" if j > 0:\n",
" horz = D[i, j-1] + 1\n",
" if diag <= vert and diag <= horz:\n",
" # diagonal was best\n",
" xscript.append('R' if delt == 1 else 'M')\n",
" i -= 1; j -= 1\n",
" elif vert <= horz:\n",
" # vertical was best; this is an insertion in x w/r/t y\n",
" xscript.append('I')\n",
" i -= 1\n",
" else:\n",
" # horizontal was best\n",
" xscript.append('D')\n",
" j -= 1\n",
" # j = offset of the first (leftmost) character of t involved in the\n",
" # alignment\n",
" return j, (''.join(xscript))[::-1] # reverse and string-ize\n",
"\n",
"def kEditDp(p, t):\n",
" ''' Find and return the alignment of p to a substring of t with the\n",
" fewest edits. We return the edit distance, the offset of the\n",
" substring aligned to, and the edit transcript. If multiple\n",
" alignments tie for best, we report the leftmost. '''\n",
" D = numpy.zeros((len(p)+1, len(t)+1), dtype=int)\n",
" # Note: First row gets zeros. First column initialized as usual.\n",
" D[1:, 0] = range(1, len(p)+1)\n",
" for i in xrange(1, len(p)+1):\n",
" for j in xrange(1, len(t)+1):\n",
" delt = 1 if p[i-1] != t[j-1] else 0\n",
" D[i, j] = min(D[i-1, j-1] + delt, D[i-1, j] + 1, D[i, j-1] + 1)\n",
" # Find minimum edit distance in last row\n",
" mnJ, mn = None, len(p) + len(t)\n",
" for j in xrange(0, len(t)+1):\n",
" if D[len(p), j] < mn:\n",
" mnJ, mn = j, D[len(p), j]\n",
" # Backtrace; note: stops as soon as it gets to first row\n",
" off, xcript = trace(D, p, t[:mnJ])\n",
" # Return edit distance, offset into T, edit transcript\n",
" return mn, off, xcript, D"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"p, t = 'TACGTCAGC', 'AACCCTATGTCATGCCTTGGA'\n",
"mn, off, xscript, D = kEditDp(p, t)\n",
"mn, off, xscript"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 2,
"text": [
"(2, 5, 'MMRMMMMDMM')"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"D"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 3,
"text": [
"array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1],\n",
" [2, 1, 1, 2, 2, 2, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1],\n",
" [3, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2],\n",
" [4, 3, 3, 2, 2, 3, 3, 2, 2, 1, 2, 2, 2, 3, 2, 2, 2, 3, 3, 2, 2, 3],\n",
" [5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 1, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3],\n",
" [6, 5, 5, 4, 3, 3, 4, 4, 3, 3, 2, 1, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4],\n",
" [7, 6, 5, 5, 4, 4, 4, 4, 4, 4, 3, 2, 1, 2, 3, 4, 4, 4, 4, 4, 5, 4],\n",
" [8, 7, 6, 6, 5, 5, 5, 5, 5, 4, 4, 3, 2, 2, 2, 3, 4, 5, 5, 4, 4, 5],\n",
" [9, 8, 7, 6, 6, 5, 6, 6, 6, 5, 5, 4, 3, 3, 3, 2, 3, 4, 5, 5, 5, 5]])"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment