Skip to content

Instantly share code, notes, and snippets.

@BenLangmead
Last active December 25, 2015 01:19
Show Gist options
  • Save BenLangmead/6894694 to your computer and use it in GitHub Desktop.
Save BenLangmead/6894694 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"def hammingDistance(x, y):\n",
" ''' Return Hamming distance between x and y '''\n",
" assert len(x) == len(y)\n",
" nmm = 0\n",
" for i in xrange(0, len(x)):\n",
" if x[i] != y[i]:\n",
" nmm += 1\n",
" return nmm"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"hammingDistance('brown', 'blown')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 2,
"text": [
"1"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"hammingDistance('cringe', 'orange')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 3,
"text": [
"2"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def boundEditDistance(x, y):\n",
" ''' Return loose lower and upper bounds on the edit distance\n",
" between x and y in O(|x| + |y|) time. '''\n",
" if x == y: return 0, 0\n",
" if len(x) == 0: return len(y), len(y)\n",
" if len(y) == 0: return len(x), len(x)\n",
" diff = abs(len(x) - len(y))\n",
" lower = diff\n",
" if lower == 0 and x != y:\n",
" lower = 1\n",
" minlen = min(len(x), len(y))\n",
" upper = hammingDistance(x[:minlen], y[:minlen]) + diff\n",
" return lower, upper"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"boundEditDistance('brown', 'blown')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 5,
"text": [
"(1, 1)"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"boundEditDistance('create', 'creation')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 6,
"text": [
"(2, 3)"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"boundEditDistance('aaa', 'bbb')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
"(1, 3)"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# note: bounds can be pretty rough\n",
"boundEditDistance('the longest', 'longest day')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
"(1, 11)"
]
}
],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"boundEditDistance('Shakespeare', 'shake spear')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 9,
"text": [
"(1, 7)"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def edDistRecursive(x, y):\n",
" if len(x) == 0: return len(y)\n",
" if len(y) == 0: return len(x)\n",
" delt = 1 if x[-1] != y[-1] else 0\n",
" diag = edDistRecursive(x[:-1], y[:-1]) + delt \n",
" vert = edDistRecursive(x[:-1], y) + 1\n",
" horz = edDistRecursive(x, y[:-1]) + 1\n",
" return min(diag, vert, horz)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"edDistRecursive('Shakespeare', 'shake spear') # this takes a while!"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 11,
"text": [
"3"
]
}
],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# let's see how long it takes\n",
"from time import time\n",
"st = time()\n",
"edDistRecursive('Shakespeare', 'shake spear')\n",
"print 'Took %0.3f seconds' % (time() - st)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Took 30.490 seconds\n"
]
}
],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def edDistRecursiveMemo(x, y, memo=None):\n",
" ''' A version of edDistRecursive with memoization. For each x, y we see, we\n",
" record result from edDistRecursiveMemo(x, y). In the future, we retrieve\n",
" recorded result rather than re-run the function. '''\n",
" if memo is None: memo = {}\n",
" if len(x) == 0: return len(y)\n",
" if len(y) == 0: return len(x)\n",
" if (len(x), len(y)) in memo:\n",
" return memo[(len(x), len(y))]\n",
" delt = 1 if x[-1] != y[-1] else 0\n",
" diag = edDistRecursiveMemo(x[:-1], y[:-1], memo) + delt\n",
" vert = edDistRecursiveMemo(x[:-1], y, memo) + 1\n",
" horz = edDistRecursiveMemo(x, y[:-1], memo) + 1\n",
" ans = min(diag, vert, horz)\n",
" memo[(len(x), len(y))] = ans\n",
" return ans"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"edDistRecursiveMemo('Shakespeare', 'shake spear')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 14,
"text": [
"3"
]
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# this time it's instantaneous\n",
"from time import time\n",
"st = time()\n",
"edDistRecursiveMemo('Shakespeare', 'shake spear')\n",
"print 'Took %0.3f seconds' % (time() - st)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Took 0.001 seconds\n"
]
}
],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from numpy import zeros\n",
"\n",
"def edDistDp(x, y):\n",
" \"\"\" Calculate edit distance between sequences x and y using\n",
" matrix dynamic programming. Return distance. \"\"\"\n",
" D = zeros((len(x)+1, len(y)+1), dtype=int)\n",
" D[0, 1:] = range(1, len(y)+1)\n",
" D[1:, 0] = range(1, len(x)+1)\n",
" for i in xrange(1, len(x)+1):\n",
" for j in xrange(1, len(y)+1):\n",
" delt = 1 if x[i-1] != y[j-1] else 0\n",
" D[i, j] = min(D[i-1, j-1]+delt, D[i-1, j]+1, D[i, j-1]+1)\n",
" return D[len(x), len(y)]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"edDistDp('Shakespeare', 'shake spear')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 17,
"text": [
"3"
]
}
],
"prompt_number": 17
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# again, instantaneous\n",
"from time import time\n",
"st = time()\n",
"edDistDp('Shakespeare', 'shake spear')\n",
"print 'Took %0.3f seconds' % (time() - st)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Took 0.001 seconds\n"
]
}
],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"edDistDp('GCGTATGCACGC', 'GCTATGCCACGC')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 19,
"text": [
"2"
]
}
],
"prompt_number": 19
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment