Last active
December 18, 2015 01:29
-
-
Save yongpitt/5704156 to your computer and use it in GitHub Desktop.
This is a demonstration class for Suffix Tree data structure
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /*This is a demonstration class for Suffix Tree data structure*/ | |
| /* | |
| Before we move on, it's nice to learn from http://www.cs.cmu.edu/~ckingsf/bioinfo-lectures/suffixtrees.pdf | |
| the basics about a suffix tree and what it does, as we have done with the Suffix Array data structure. | |
| Don't get confused between the suffix tree data structure and other related structures. Recall that when | |
| we learn suffix array we have a comparison among several data structures that are closely related to each | |
| other: | |
| A high-level description on suffix array vs suffix tree: | |
| A suffix array is a space-efficient datastructure, which, if stored together with the original string, | |
| provides the same functions as a suffix tree. | |
| So yes, you can think of a suffix array as a storage mechanism for suffix trees. Depending on the details, | |
| there may be performance costs to using arrays over full-blown trees, which typically are outweighed by | |
| the space-use benefits. I believe the arrays are constructed almost identically to the way trees are | |
| constructed, and, in practice, suffix arrays are the preferred method for storing/representing a suffix tree. | |
| Trie vs Suffix tree vs Suffix array (http://stackoverflow.com/questions/2487576/trie-vs-suffix-tree-vs-suffix-array): | |
| A TRIE was the first data structure of this kind discovered. The suffix tree is an improvement over the | |
| TRIE ( it has suffix links which allow for example linear error search, the suffix tree trims unnecessary | |
| branches of the TRIE therefore it does not require as much space ). | |
| The suffix array is a stripped down data structure based on the suffix tree (no suffix links (slow error | |
| matches), yet pattern matching is very fast). | |
| The TRIE is not for real world use because it consumes too much space. | |
| The suffix tree is lighter and faster than the Trie and is used to index ADN or optimize some large web | |
| search engines. | |
| The suffix array is slower in some pattern searches than the suffix tree but uses less space, and is more | |
| widely used than the Suffix tree. | |
| */ | |
| //Now let's get start on implementing the real thing: | |
| class SuffixTreeNode | |
| { | |
| friend class SuffixTree; | |
| private: | |
| unordered_map<char, SuffixTreeNode*> children; | |
| char v; | |
| vector<int> indices; | |
| public: | |
| SuffixTreeNode() {} | |
| void insertString(string s, int idx){ | |
| indices.push_back(idx); | |
| if(!s.empty()){ | |
| v=s[0]; | |
| SuffixTreeNode *child; | |
| if(children.find(v)!=children.end()) | |
| child = children[v]; | |
| else{ | |
| child = new SuffixTreeNode(); | |
| children.insert(make_pair(v,child)); | |
| } | |
| child->insertString(s.substr(1),idx); | |
| } | |
| } | |
| vector<int> Search(string pattern){ | |
| if(pattern.empty()) | |
| return indices; | |
| if(children.find(pattern[0])!=children.end()) | |
| return children[pattern[0]]->Search(pattern.substr(1)); | |
| return vector<int>(); | |
| } | |
| }; | |
| class SuffixTree | |
| { | |
| private: | |
| SuffixTreeNode *root; | |
| public: | |
| SuffixTree(string s){ | |
| root = new SuffixTreeNode(); | |
| for(int i=0; i<s.size(); i++) | |
| root->insertString(s.substr(i),i); | |
| } | |
| vector<int> Search(string &pattern){ | |
| return root->Search(pattern); | |
| } | |
| bool isSubstr(string &pattern){ | |
| if(pattern.empty()) | |
| return true; | |
| SuffixTreeNode *stn = root; | |
| for(int i=0; i<pattern.size(); i++){ | |
| if(stn->children.find(pattern[i])==stn->children.end()) | |
| return false; | |
| else | |
| stn = stn->children[pattern[i]]; | |
| } | |
| return true; | |
| } | |
| bool isSuffix(string &pattern){ | |
| if(pattern.empty()) | |
| return true; | |
| SuffixTreeNode *stn = root; | |
| for(int i=0; i<pattern.size(); i++){ | |
| if(stn->children.find(pattern[i])==stn->children.end()) | |
| return false; | |
| else | |
| stn = stn->children[pattern[i]]; | |
| } | |
| if(stn->children.empty()) | |
| return true; | |
| else | |
| return false; | |
| } | |
| int numOccurrences(string pattern){ | |
| if(pattern.empty()) | |
| return -1; | |
| SuffixTreeNode *stn = root; | |
| for(int i=0; i<pattern.size(); i++){ | |
| if(stn->children.find(pattern[i])==stn->children.end()) | |
| return 0; | |
| else | |
| stn = stn->children[pattern[i]]; | |
| } | |
| return stn->children.size(); //Fix this: We should return all children, include children's children. | |
| } | |
| //int longestRepeat() | |
| //Find the longest repeat in T: | |
| //Find the deepest node that has at least 2 leaves under it | |
| ~SuffixTree() {} | |
| }; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment