Created
September 23, 2013 02:34
-
-
Save BenLangmead/6665861 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"class SuffixTree(object):\n", | |
" \n", | |
" class Node(object):\n", | |
" def __init__(self, lab):\n", | |
" self.lab = lab # label on path leading to this node\n", | |
" self.out = {} # outgoing edges; maps characters to nodes\n", | |
" \n", | |
" def __init__(self, s):\n", | |
" \"\"\" Make suffix tree, without suffix links, from s in quadratic time\n", | |
" and linear space \"\"\"\n", | |
" s += '$'\n", | |
" self.root = self.Node(None)\n", | |
" self.root.out[s[0]] = self.Node(s) # trie for just longest suf\n", | |
" # add the rest of the suffixes, from longest to shortest\n", | |
" for i in xrange(1, len(s)):\n", | |
" # start at root; we\u2019ll walk down as far as we can go\n", | |
" cur = self.root\n", | |
" j = i\n", | |
" while j < len(s):\n", | |
" if s[j] in cur.out:\n", | |
" child = cur.out[s[j]]\n", | |
" lab = child.lab\n", | |
" # Walk along edge until we exhaust edge label or\n", | |
" # until we mismatch\n", | |
" k = j+1 \n", | |
" while k-j < len(lab) and s[k] == lab[k-j]:\n", | |
" k += 1\n", | |
" if k-j == len(lab):\n", | |
" cur = child # we exhausted the edge\n", | |
" j = k\n", | |
" else:\n", | |
" # we fell off in middle of edge\n", | |
" cExist, cNew = lab[k-j], s[k]\n", | |
" # create \u201cmid\u201d: new node bisecting edge\n", | |
" mid = self.Node(lab[:k-j])\n", | |
" mid.out[cNew] = self.Node(s[k:])\n", | |
" # original child becomes mid\u2019s child\n", | |
" mid.out[cExist] = child\n", | |
" # original child\u2019s label is curtailed\n", | |
" child.lab = lab[k-j:]\n", | |
" # mid becomes new child of original parent\n", | |
" cur.out[s[j]] = mid\n", | |
" else:\n", | |
" # Fell off tree at a node: make new edge hanging off it\n", | |
" cur.out[s[j]] = self.Node(s[j:])\n", | |
" \n", | |
" def followPath(self, s):\n", | |
" \"\"\" Follow path given by s. If we fall off tree, return None. If we\n", | |
" finish mid-edge, return (node, offset) where 'node' is child and\n", | |
" 'offset' is label offset. If we finish on a node, return (node,\n", | |
" None). \"\"\"\n", | |
" cur = self.root\n", | |
" i = 0\n", | |
" while i < len(s):\n", | |
" c = s[i]\n", | |
" if c not in cur.out:\n", | |
" return (None, None) # fell off at a node\n", | |
" child = cur.out[s[i]]\n", | |
" lab = child.lab\n", | |
" j = i+1\n", | |
" while j-i < len(lab) and j < len(s) and s[j] == lab[j-i]:\n", | |
" j += 1\n", | |
" if j-i == len(lab):\n", | |
" cur = child # exhausted edge\n", | |
" i = j\n", | |
" elif j == len(s):\n", | |
" return (child, j-i) # exhausted query string in middle of edge\n", | |
" else:\n", | |
" return (None, None) # fell off in the middle of the edge\n", | |
" return (cur, None) # exhausted query string at internal node\n", | |
" \n", | |
" def hasSubstring(self, s):\n", | |
" \"\"\" Return true iff s appears as a substring \"\"\"\n", | |
" node, off = self.followPath(s)\n", | |
" return node is not None\n", | |
" \n", | |
" def hasSuffix(self, s):\n", | |
" \"\"\" Return true iff s is a suffix \"\"\"\n", | |
" node, off = self.followPath(s)\n", | |
" if node is None:\n", | |
" return False # fell off the tree\n", | |
" if off is None:\n", | |
" # finished on top of a node\n", | |
" return '$' in node.out\n", | |
" else:\n", | |
" # finished at offset 'off' within an edge leading to 'node'\n", | |
" return node.lab[off] == '$'" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"stree = SuffixTree('there would have been a time for such a word')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"stree.hasSubstring('nope')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 3, | |
"text": [ | |
"False" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"stree.hasSubstring('would have been')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 4, | |
"text": [ | |
"True" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"stree.hasSubstring('such a word')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 5, | |
"text": [ | |
"True" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"stree.hasSuffix('would have been')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 6, | |
"text": [ | |
"False" | |
] | |
} | |
], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"stree.hasSuffix('such a word')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 7, | |
"text": [ | |
"True" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 7 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment