Skip to content

Instantly share code, notes, and snippets.

@BenLangmead
Last active December 24, 2015 12:19
Show Gist options
  • Save BenLangmead/6796858 to your computer and use it in GitHub Desktop.
Save BenLangmead/6796858 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Suffix tree built with simple O(m^2)-time algorithm.\n",
"class SuffixTree(object):\n",
" \n",
" class Node(object):\n",
" def __init__(self, depth, off, ln, lab=None):\n",
" self.depth = depth\n",
" self.off = off # offset into T of substring\n",
" self.ln = ln # length of substring\n",
" self.out = {} # outgoing edges; maps characters to nodes\n",
" \n",
" def __init__(self, t):\n",
" \"\"\" Make suffix tree, without suffix links, from s in quadratic time\n",
" and linear space \"\"\"\n",
" t += '$'\n",
" self.t = t\n",
" self.root = self.Node(0, 0, 0, None)\n",
" self.root.out[t[0]] = self.Node(len(t), 0, len(t), t)\n",
" self.nodes = []\n",
" for i in xrange(1, len(t)):\n",
" cur = self.root\n",
" j = i\n",
" while j < len(t):\n",
" if t[j] in cur.out:\n",
" child = cur.out[t[j]]\n",
" lab = t[child.off:child.off+child.ln]\n",
" k = j+1 # Walk along edge\n",
" while k-j < len(lab) and t[k] == lab[k-j]:\n",
" k += 1\n",
" if k-j == len(lab):\n",
" cur = child # exhausted the edge\n",
" j = k\n",
" else:\n",
" # fell off in middle of edge\n",
" cExist, cNew = lab[k-j], t[k]\n",
" mid = self.Node(cur.depth + k-j, child.off, k-j, lab[:k-j])\n",
" mid.out[cNew] = self.Node(mid.depth + len(t[k:]), k, len(t[k:]), t[k:])\n",
" self.nodes.append(mid)\n",
" self.nodes.append(mid.out[cNew])\n",
" mid.out[cExist] = child\n",
" child.off += (k-j)\n",
" child.ln -= (k-j)\n",
" cur.out[t[j]] = mid\n",
" else:\n",
" # Create a new edge hanging off of this node\n",
" cur.out[t[j]] = self.Node(cur.depth + len(t[j:]), j, len(t[j:]), t[j:])\n",
" self.nodes.append(cur.out[t[j]])\n",
" \n",
" def saLcp(self):\n",
" # Return suffix array and an LCP1 array corresponding to this\n",
" # suffix tree. self.root is root, self.t is the text.\n",
" self.minSinceLeaf = 0\n",
" sa, lcp1 = [], []\n",
" def __visit(n):\n",
" if len(n.out) == 0:\n",
" # leaf node, record offset and LCP1 with previous leaf\n",
" sa.append(len(self.t) - n.depth)\n",
" lcp1.append(self.minSinceLeaf)\n",
" # reset LCP1 to depth of this leaf\n",
" self.minSinceLeaf = n.depth\n",
" # visit children in lexicographical order\n",
" for c, child in sorted(n.out.iteritems()):\n",
" __visit(child)\n",
" # after each child visit, perhaps decrease\n",
" # minimum-depth-since-last-leaf value\n",
" self.minSinceLeaf = min(self.minSinceLeaf, n.depth)\n",
" __visit(self.root)\n",
" return sa, lcp1[1:]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"st = SuffixTree('abaaba')\n",
"sa, lcp1 = st.saLcp()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"sa, lcp1"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 3,
"text": [
"([6, 5, 2, 3, 0, 4, 1], [0, 1, 1, 3, 0, 2])"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment