yongpitt/SuffixTree.cpp

## SuffixTree.cpp
/*This is a demonstration class for Suffix Tree data structure*/

/*
Before we move on, it's nice to learn from http://www.cs.cmu.edu/~ckingsf/bioinfo-lectures/suffixtrees.pdf
the basics about a suffix tree and what it does, as we have done with the Suffix Array data structure.
Don't get confused between the suffix tree data structure and other related structures. Recall that when
we learn suffix array we have a comparison among several data structures that are closely related to each
other:

       A high-level description on suffix array vs suffix tree:

       A suffix array is a space-efficient datastructure, which, if stored together with the original string,
       provides the same functions as a suffix tree.
       So yes, you can think of a suffix array as a storage mechanism for suffix trees. Depending on the details,
       there may be performance costs to using arrays over full-blown trees, which typically are outweighed by
       the space-use benefits. I believe the arrays are constructed almost identically to the way trees are
       constructed, and, in practice, suffix arrays are the preferred method for storing/representing a suffix tree.

       Trie vs Suffix tree vs Suffix array (http://stackoverflow.com/questions/2487576/trie-vs-suffix-tree-vs-suffix-array):

       A TRIE was the first data structure of this kind discovered.   The suffix tree is an improvement over the
       TRIE ( it has suffix links which allow for example linear error search, the suffix tree trims unnecessary
       branches of the TRIE therefore it does not require as much space ).

       The suffix array is a stripped down data structure based on the suffix tree (no suffix links (slow error
       matches), yet pattern matching is very fast).

       The TRIE is not for real world use because it consumes too much space.

       The suffix tree is lighter and faster than the Trie and is used to index ADN or optimize some large web
       search engines.

       The suffix array is slower in some pattern searches than the suffix tree but uses less space, and is more
       widely used than the Suffix tree.
*/

//Now let's get start on implementing the real thing:
class SuffixTreeNode
{
  friend class SuffixTree;
private:
	unordered_map<char, SuffixTreeNode*> children;
	char v;
	vector<int> indices;
public:

	SuffixTreeNode() {}
	void insertString(string s, int idx){
		indices.push_back(idx);
		if(!s.empty()){
			v=s[0];
			SuffixTreeNode *child;
			if(children.find(v)!=children.end())
				child = children[v];
			else{
				child = new SuffixTreeNode();
				children.insert(make_pair(v,child));
			}
			child->insertString(s.substr(1),idx);
		}
	}
	vector<int> Search(string pattern){
		if(pattern.empty())
			return indices;
		if(children.find(pattern[0])!=children.end())
			return children[pattern[0]]->Search(pattern.substr(1));
		return vector<int>();
	}
};
class SuffixTree
{
private:
	SuffixTreeNode *root;
public:
	SuffixTree(string s){
		root = new SuffixTreeNode();
		for(int i=0; i<s.size(); i++)
		   root->insertString(s.substr(i),i);
	}
	vector<int> Search(string &pattern){
		return root->Search(pattern);
	}
	bool isSubstr(string &pattern){
		if(pattern.empty())
			return true;
		SuffixTreeNode *stn = root;
		for(int i=0; i<pattern.size(); i++){
			if(stn->children.find(pattern[i])==stn->children.end())
				return false;
			else
				stn = stn->children[pattern[i]];
		}
		return true;
	}
	bool isSuffix(string &pattern){
		if(pattern.empty())
			return true;
		SuffixTreeNode *stn = root;
		for(int i=0; i<pattern.size(); i++){
			if(stn->children.find(pattern[i])==stn->children.end())
				return false;
			else
				stn = stn->children[pattern[i]];
		}
		if(stn->children.empty())
		    return true;
		else
			return false;
	}
	int numOccurrences(string pattern){
		if(pattern.empty())
			return -1;
		SuffixTreeNode *stn = root;
		for(int i=0; i<pattern.size(); i++){
			if(stn->children.find(pattern[i])==stn->children.end())
				return 0;
			else
				stn = stn->children[pattern[i]];
		}
		return stn->children.size(); //Fix this: We should return all children, include children's children.
	}
	//int longestRepeat()
	//Find the longest repeat in T:
        //Find the deepest node that has at least 2 leaves under it

	~SuffixTree() {}
};
	/This is a demonstration class for Suffix Tree data structure/

	/*
	Before we move on, it's nice to learn from http://www.cs.cmu.edu/~ckingsf/bioinfo-lectures/suffixtrees.pdf
	the basics about a suffix tree and what it does, as we have done with the Suffix Array data structure.
	Don't get confused between the suffix tree data structure and other related structures. Recall that when
	we learn suffix array we have a comparison among several data structures that are closely related to each
	other:

	A high-level description on suffix array vs suffix tree:

	A suffix array is a space-efficient datastructure, which, if stored together with the original string,
	provides the same functions as a suffix tree.
	So yes, you can think of a suffix array as a storage mechanism for suffix trees. Depending on the details,
	there may be performance costs to using arrays over full-blown trees, which typically are outweighed by
	the space-use benefits. I believe the arrays are constructed almost identically to the way trees are
	constructed, and, in practice, suffix arrays are the preferred method for storing/representing a suffix tree.

	Trie vs Suffix tree vs Suffix array (http://stackoverflow.com/questions/2487576/trie-vs-suffix-tree-vs-suffix-array):

	A TRIE was the first data structure of this kind discovered. The suffix tree is an improvement over the
	TRIE ( it has suffix links which allow for example linear error search, the suffix tree trims unnecessary
	branches of the TRIE therefore it does not require as much space ).

	The suffix array is a stripped down data structure based on the suffix tree (no suffix links (slow error
	matches), yet pattern matching is very fast).

	The TRIE is not for real world use because it consumes too much space.

	The suffix tree is lighter and faster than the Trie and is used to index ADN or optimize some large web
	search engines.

	The suffix array is slower in some pattern searches than the suffix tree but uses less space, and is more
	widely used than the Suffix tree.
	*/

	//Now let's get start on implementing the real thing:
	class SuffixTreeNode
	{
	friend class SuffixTree;
	private:
	unordered_map<char, SuffixTreeNode*> children;
	char v;
	vector<int> indices;
	public:

	SuffixTreeNode() {}
	void insertString(string s, int idx){
	indices.push_back(idx);
	if(!s.empty()){
	v=s[0];
	SuffixTreeNode *child;
	if(children.find(v)!=children.end())
	child = children[v];
	else{
	child = new SuffixTreeNode();
	children.insert(make_pair(v,child));
	}
	child->insertString(s.substr(1),idx);
	}
	}
	vector<int> Search(string pattern){
	if(pattern.empty())
	return indices;
	if(children.find(pattern[0])!=children.end())
	return children[pattern[0]]->Search(pattern.substr(1));
	return vector<int>();
	}
	};
	class SuffixTree
	{
	private:
	SuffixTreeNode *root;
	public:
	SuffixTree(string s){
	root = new SuffixTreeNode();
	for(int i=0; i<s.size(); i++)
	root->insertString(s.substr(i),i);
	}
	vector<int> Search(string &pattern){
	return root->Search(pattern);
	}
	bool isSubstr(string &pattern){
	if(pattern.empty())
	return true;
	SuffixTreeNode *stn = root;
	for(int i=0; i<pattern.size(); i++){
	if(stn->children.find(pattern[i])==stn->children.end())
	return false;
	else
	stn = stn->children[pattern[i]];
	}
	return true;
	}
	bool isSuffix(string &pattern){
	if(pattern.empty())
	return true;
	SuffixTreeNode *stn = root;
	for(int i=0; i<pattern.size(); i++){
	if(stn->children.find(pattern[i])==stn->children.end())
	return false;
	else
	stn = stn->children[pattern[i]];
	}
	if(stn->children.empty())
	return true;
	else
	return false;
	}
	int numOccurrences(string pattern){
	if(pattern.empty())
	return -1;
	SuffixTreeNode *stn = root;
	for(int i=0; i<pattern.size(); i++){
	if(stn->children.find(pattern[i])==stn->children.end())
	return 0;
	else
	stn = stn->children[pattern[i]];
	}
	return stn->children.size(); //Fix this: We should return all children, include children's children.
	}
	//int longestRepeat()
	//Find the longest repeat in T:
	//Find the deepest node that has at least 2 leaves under it

	~SuffixTree() {}
	};