Skip to content

Instantly share code, notes, and snippets.

@BenLangmead
Created September 16, 2013 18:28
Show Gist options
  • Save BenLangmead/6584538 to your computer and use it in GitHub Desktop.
Save BenLangmead/6584538 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": "CG_InvertedIndex2"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import bisect\n",
"import sys\n",
"\n",
"class Index(object):\n",
" \n",
" def __init__(self, t, ln=2):\n",
" \"\"\" Create index, extracting substrings of length 'ln' \"\"\"\n",
" self.ln = ln\n",
" self.index = []\n",
" for i in xrange(0, len(t)-ln+1):\n",
" self.index.append((t[i:i+ln], i)) # add <substr, offset> pair\n",
" self.index.sort() # sort pairs\n",
" \n",
" def query(self, p):\n",
" \"\"\" Return candidate alignments for p \"\"\"\n",
" st = bisect.bisect_left(self.index, (p[:self.ln], -1))\n",
" en = bisect.bisect_right(self.index, (p[:self.ln], sys.maxint))\n",
" hits = self.index[st:en]\n",
" return [ h[1] for h in hits ] # return just the offsets\n",
"\n",
"def queryIndex(p, t, index):\n",
" \"\"\" Look for occurrences of p in t with help of index \"\"\"\n",
" ln = index.ln\n",
" occurrences = []\n",
" for i in index.query(p):\n",
" if t[i+ln:i+len(p)] == p[ln:]:\n",
" occurrences.append(i)\n",
" return occurrences"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"t = 'This is an example text, example text, example text'\n",
"p = 'example'\n",
"ind = Index(t)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"queryIndex(p, t, ind)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 15,
"text": [
"[11, 25, 39]"
]
}
],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def naive(p, t):\n",
" assert len(p) <= len(t) # assume text at least as long as pattern\n",
" occurrences = []\n",
" # For each alignment of p to t\n",
" for i in xrange(0, len(t)-len(p)+1):\n",
" match = True # assume the pattern matches until proven wrong\n",
" # For each position of p\n",
" for j in xrange(0, len(p)):\n",
" if t[i+j] != p[j]:\n",
" match = False # at least 1 char mismatches, so no match\n",
" break\n",
" if match:\n",
" occurrences.append(i)\n",
" return occurrences"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"naive(p, t)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 17,
"text": [
"[11, 25, 39]"
]
}
],
"prompt_number": 17
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import bisect\n",
"import sys\n",
"\n",
"class Index2(object):\n",
" \n",
" def __init__(self, t, ln=2, interval=2):\n",
" \"\"\" Create index, extracting substrings of length 'ln' every\n",
" 'interval' positions \"\"\"\n",
" self.ln = ln\n",
" self.interval = interval\n",
" self.index = []\n",
" for i in xrange(0, len(t)-ln+1, interval):\n",
" self.index.append((t[i:i+ln], i)) # add <substr, offset> pair\n",
" self.index.sort() # sort pairs\n",
" \n",
" def query(self, p):\n",
" \"\"\" Return candidate alignments for p \"\"\"\n",
" st = bisect.bisect_left(self.index, (p[:self.ln], -1))\n",
" en = bisect.bisect_right(self.index, (p[:self.ln], sys.maxint))\n",
" hits = self.index[st:en]\n",
" return [ h[1] for h in hits ] # return just the offsets\n",
"\n",
"def queryIndex2(p, t, index):\n",
" \"\"\" Look for occurrences of p in t with help of index \"\"\"\n",
" ln, interval = index.ln, index.interval\n",
" occurrences = []\n",
" for k in xrange(0, interval): # For each offset into interval\n",
" for i in index.query(p[k:]): # For each index hit\n",
" # Test for match\n",
" if t[i-k:i] == p[:k] and t[i+ln:i-k+len(p)] == p[k+ln:]:\n",
" occurrences.append(i-k)\n",
" return sorted(occurrences)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 19
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"ind2 = Index2(t, ln=2, interval=2)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 21
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"queryIndex2(p, t, ind2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 22,
"text": [
"[11, 25, 39]"
]
}
],
"prompt_number": 22
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment