Created
October 16, 2013 17:49
-
-
Save BenLangmead/7011945 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# For space reasons,I'm not showing the implementations of these\n", | |
"# imported functions\n", | |
"from index_edit import kEditDp, Index, partition" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def queryIndexEdit(p, t, k, index):\n", | |
" ''' Look for occurrences of p in t with up to k edits using an\n", | |
" index combined with dynamic-programming alignment. '''\n", | |
" l = index.ln\n", | |
" occurrences = []\n", | |
" seen = set() # for avoiding reporting same hit twice\n", | |
" for part, poff in partition(p, k+1):\n", | |
" for hit in index.occurrences(part): # query index w/ partition\n", | |
" # left edge of T to include in DP matrix\n", | |
" lf = max(0, hit - poff - k)\n", | |
" # right edge of T to include in DP matrix\n", | |
" rt = min(len(t), hit - poff + len(p) + k)\n", | |
" mn, off, xcript = kEditDp(p, t[lf:rt])\n", | |
" off += lf\n", | |
" if mn <= k and (mn, off) not in seen:\n", | |
" occurrences.append((mn, off, xcript))\n", | |
" seen.add((mn, off))\n", | |
" return occurrences" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 30 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"t = 'I had two banana splits'\n", | |
"ind = Index(t, ln=4)\n", | |
"queryIndexEdit('bananasplit', t, 1, ind)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 31, | |
"text": [ | |
"[(1, 10, 'MMMMMMDMMMMM')]" | |
] | |
} | |
], | |
"prompt_number": 31 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"t = 'haystack needle haystack noodle haystack nedle haystack'\n", | |
"# needle noodle ne-dle\n", | |
"# |||||| | ||| || |||\n", | |
"# needle needle needle\n", | |
"p = 'needle'\n", | |
"# Find the two occurrences that are within edit distance 1\n", | |
"# The substring length for the index has to be <= 3 (why?)\n", | |
"queryIndexEdit(p, t, 1, Index(t, ln=3))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 38, | |
"text": [ | |
"[(0, 9, 'MMMMMM'), (1, 41, 'MIMMMM')]" | |
] | |
} | |
], | |
"prompt_number": 38 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Find the three occurrences that are within edit distance 2\n", | |
"# The substring length for the index has to be <= 2 (why?)\n", | |
"queryIndexEdit('needle', t, 2, Index(t, ln=2))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 39, | |
"text": [ | |
"[(0, 9, 'MMMMMM'), (1, 41, 'MIMMMM'), (2, 25, 'MRRMMM')]" | |
] | |
} | |
], | |
"prompt_number": 39 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment