Skip to content

Instantly share code, notes, and snippets.

@BenLangmead
Created September 23, 2013 02:34
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save BenLangmead/6665861 to your computer and use it in GitHub Desktop.
Save BenLangmead/6665861 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"class SuffixTree(object):\n",
" \n",
" class Node(object):\n",
" def __init__(self, lab):\n",
" self.lab = lab # label on path leading to this node\n",
" self.out = {} # outgoing edges; maps characters to nodes\n",
" \n",
" def __init__(self, s):\n",
" \"\"\" Make suffix tree, without suffix links, from s in quadratic time\n",
" and linear space \"\"\"\n",
" s += '$'\n",
" self.root = self.Node(None)\n",
" self.root.out[s[0]] = self.Node(s) # trie for just longest suf\n",
" # add the rest of the suffixes, from longest to shortest\n",
" for i in xrange(1, len(s)):\n",
" # start at root; we\u2019ll walk down as far as we can go\n",
" cur = self.root\n",
" j = i\n",
" while j < len(s):\n",
" if s[j] in cur.out:\n",
" child = cur.out[s[j]]\n",
" lab = child.lab\n",
" # Walk along edge until we exhaust edge label or\n",
" # until we mismatch\n",
" k = j+1 \n",
" while k-j < len(lab) and s[k] == lab[k-j]:\n",
" k += 1\n",
" if k-j == len(lab):\n",
" cur = child # we exhausted the edge\n",
" j = k\n",
" else:\n",
" # we fell off in middle of edge\n",
" cExist, cNew = lab[k-j], s[k]\n",
" # create \u201cmid\u201d: new node bisecting edge\n",
" mid = self.Node(lab[:k-j])\n",
" mid.out[cNew] = self.Node(s[k:])\n",
" # original child becomes mid\u2019s child\n",
" mid.out[cExist] = child\n",
" # original child\u2019s label is curtailed\n",
" child.lab = lab[k-j:]\n",
" # mid becomes new child of original parent\n",
" cur.out[s[j]] = mid\n",
" else:\n",
" # Fell off tree at a node: make new edge hanging off it\n",
" cur.out[s[j]] = self.Node(s[j:])\n",
" \n",
" def followPath(self, s):\n",
" \"\"\" Follow path given by s. If we fall off tree, return None. If we\n",
" finish mid-edge, return (node, offset) where 'node' is child and\n",
" 'offset' is label offset. If we finish on a node, return (node,\n",
" None). \"\"\"\n",
" cur = self.root\n",
" i = 0\n",
" while i < len(s):\n",
" c = s[i]\n",
" if c not in cur.out:\n",
" return (None, None) # fell off at a node\n",
" child = cur.out[s[i]]\n",
" lab = child.lab\n",
" j = i+1\n",
" while j-i < len(lab) and j < len(s) and s[j] == lab[j-i]:\n",
" j += 1\n",
" if j-i == len(lab):\n",
" cur = child # exhausted edge\n",
" i = j\n",
" elif j == len(s):\n",
" return (child, j-i) # exhausted query string in middle of edge\n",
" else:\n",
" return (None, None) # fell off in the middle of the edge\n",
" return (cur, None) # exhausted query string at internal node\n",
" \n",
" def hasSubstring(self, s):\n",
" \"\"\" Return true iff s appears as a substring \"\"\"\n",
" node, off = self.followPath(s)\n",
" return node is not None\n",
" \n",
" def hasSuffix(self, s):\n",
" \"\"\" Return true iff s is a suffix \"\"\"\n",
" node, off = self.followPath(s)\n",
" if node is None:\n",
" return False # fell off the tree\n",
" if off is None:\n",
" # finished on top of a node\n",
" return '$' in node.out\n",
" else:\n",
" # finished at offset 'off' within an edge leading to 'node'\n",
" return node.lab[off] == '$'"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"stree = SuffixTree('there would have been a time for such a word')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"stree.hasSubstring('nope')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 3,
"text": [
"False"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"stree.hasSubstring('would have been')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": [
"True"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"stree.hasSubstring('such a word')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 5,
"text": [
"True"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"stree.hasSuffix('would have been')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 6,
"text": [
"False"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"stree.hasSuffix('such a word')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
"True"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment