Skip to content

Instantly share code, notes, and snippets.

@BenLangmead
Created October 16, 2013 17:49
Show Gist options
  • Save BenLangmead/7011945 to your computer and use it in GitHub Desktop.
Save BenLangmead/7011945 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"# For space reasons,I'm not showing the implementations of these\n",
"# imported functions\n",
"from index_edit import kEditDp, Index, partition"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def queryIndexEdit(p, t, k, index):\n",
" ''' Look for occurrences of p in t with up to k edits using an\n",
" index combined with dynamic-programming alignment. '''\n",
" l = index.ln\n",
" occurrences = []\n",
" seen = set() # for avoiding reporting same hit twice\n",
" for part, poff in partition(p, k+1):\n",
" for hit in index.occurrences(part): # query index w/ partition\n",
" # left edge of T to include in DP matrix\n",
" lf = max(0, hit - poff - k)\n",
" # right edge of T to include in DP matrix\n",
" rt = min(len(t), hit - poff + len(p) + k)\n",
" mn, off, xcript = kEditDp(p, t[lf:rt])\n",
" off += lf\n",
" if mn <= k and (mn, off) not in seen:\n",
" occurrences.append((mn, off, xcript))\n",
" seen.add((mn, off))\n",
" return occurrences"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 30
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"t = 'I had two banana splits'\n",
"ind = Index(t, ln=4)\n",
"queryIndexEdit('bananasplit', t, 1, ind)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 31,
"text": [
"[(1, 10, 'MMMMMMDMMMMM')]"
]
}
],
"prompt_number": 31
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"t = 'haystack needle haystack noodle haystack nedle haystack'\n",
"# needle noodle ne-dle\n",
"# |||||| | ||| || |||\n",
"# needle needle needle\n",
"p = 'needle'\n",
"# Find the two occurrences that are within edit distance 1\n",
"# The substring length for the index has to be <= 3 (why?)\n",
"queryIndexEdit(p, t, 1, Index(t, ln=3))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 38,
"text": [
"[(0, 9, 'MMMMMM'), (1, 41, 'MIMMMM')]"
]
}
],
"prompt_number": 38
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Find the three occurrences that are within edit distance 2\n",
"# The substring length for the index has to be <= 2 (why?)\n",
"queryIndexEdit('needle', t, 2, Index(t, ln=2))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 39,
"text": [
"[(0, 9, 'MMMMMM'), (1, 41, 'MIMMMM'), (2, 25, 'MRRMMM')]"
]
}
],
"prompt_number": 39
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment