htfy96/sa.cpp

## sa.cpp
#include <iostream>
#include <algorithm>
#include <string>
#include <vector>
#include <climits>

using namespace std;

/*
 * Suffix array is a useful data structure which stores
 * suffix of a string in alphabetical order.
 * (https://en.wikipedia.org/wiki/Suffix_array)
 *
 * For example:
 * raw = abaab
 * Suffix: abaab, baab, aab, ab, b
 * Sorted suffix: aab, ab, abaab, b, baab .
 * Null character is considered less than any other character.
 *
 * Suffix array of raw:
 * [2, 3, 0, 4, 1]
 *
 * because suffix starting from position 2(aab) is the smallest, then
 * comes suffix starting from position 3(ab), ..., and suffix starting
 * from position 1 (baab) is the largest
 */


/*
 * This algorithm builds suffix array in O(nlogn) time complexity
 * and O(CHAR_MAX + n) space complexity, based on sorting segments
 * with length 1, 2, 4, 8, ..., until all segments are different,
 * which is called "doubling algorithm"
 *
 * ======================================
 *
 * Still take "abaab" as an example:
 *
 * Iteration 1
 * --------------------
 *  We sort all segments with length 1 in 'raw'(i.e. a b a a b), and relabel the original string
 *  x   =   01001 // 'a' => 0, 'b' => 1. two different segments. still have repeated segments, continue iteration
 *  sa  =   02314 // because letter 'a' appears at 0, 2, 3, then letter 'b' at 1, 4
 *  Here, x[i] stands for the order of raw[i]
 *
 * Iteration 2 [Step = 1]
 * --------------------
 *  We sort all segments with length 2 in 'raw'(i.e. 01 10 00 01 1$), and relabel it
 *  x   =   13012 // '00' => 0, 01 => 1, 1$ => 2, 10 => 3. 4 different segments. still have repeated segments, continue iteration.
 *  sa  =   20341 // 00 at position 2, 01 at position 0 and 3, 1$ at position 4, 10 at position 1
 *
 *  Here, x[i] stands for the order of raw[i...i+1]
 *
 * Iteration 3 [Step = 2]
 * --------------------
 *  We sort all segments with length 4 in 'x'(i.e. 10 31 02 1$ 2$,
 *  because if we want x[i] stands for raw[i...i+3], then we need to let the concatenation of
 *  old_x[i](which stands for raw[i...i+1]) and old_x[i+2](which stands for raw[i+2...i+3]) become the
 *  key of length-4 segment)
 *
 *  x   =   24013 // 02 => 0, 1$ => 1, 10 => 2, 2$ => 3, 31 => 4. All different, done!
 *  sa  =   23041 // 02 at position 2, 1$ at position 3, 10 at position 0, 2$ at position 4, 31 at position 1
 *
 *  Here, x[i] stands for the order of raw[i...i+3] of all length-4 substrings of raw
 *
 *  ==========================================
 *  In each iteration, we need sort key pairs (x[i], x[i + step])  (0 <= i < n)
 *  The naive method is to use quicksort, thus we need to use O(nlogn) for each iteration,
 *  and O(nlognlogn) for the whole algorithm.
 *
 *  A better way is to look into the range of x[i]/raw[i]: the maximum is bounded by
 *  both the length of raw (n) [since our goal is to make each elements of x different] and
 *  the character set size CHAR_MAX [for raw[i]]. Therefore, we can use bucket sort.
 *
 *  For 2-keyword bucket sort, the basic idea is:
 *  1) sort elements based on the 2nd keyword
 *  2) sort elements based on the 1st keyword (must be stable)
 *
 *  ==========================================
 *  How can we perform a stable bukkit sort?
 *
 *  // Assume origin is [1, 2, 2, 0, 0]
 *  for (int item : origin)
 *      bukkit[item]++;
 *  // bukkit: [2, 1, 2]
 *  for (size_t i=1; i<bukkit_size; ++i)
 *      bukkit[i] += bukkit[i-1];
 *  // bukkit: [2, 3, 5]
 *  for  (int i = origin.size() - 1; i >= 0; --i)
 *      result[ --bukkit[ origin[i] ] ] = i
 *  // result: [3, 4, 0, 1, 2]
 *  ==========================================
 *
 *  So how can we get the 2nd keyword? Remember that the 2nd keyword for position i is x[i+step] (if i + step < N)
 *  or $ (if i + step >= N).
 *
 *  Take a look at the result of the 2nd iteration:
 *  x   =   13012
 *  and the key pairs to sort of the 3rd iteration:
 *      10 31 02 1$ 2$,
 *  2nd keyword:
 *      0  1  2  $  $
 *
 *  ==========================================
 *
 *  But the result of sort based on the 2nd keyword is already done!
 *  Sa of iteration 2:
 *  sa  =   23041
 *
 *  new_sa == { N - step, N - step + 1, ..., N - 1, sa[origin1] - step, sa[origin2] - step, ... }
 *  the smallest "step" elements must be the last "step" elements, because the second keyword of them is always $.
 *  Sort result of 2nd keyword in iteration 3:
 *  new_sa= 3 4 0 1 2.
 *
 *  step = 2, thus 3 4 are the smallest elements. position 0, 1 are impossible for a second keyword:
 *  ignore them from sa, the remaining items in sa (2 3 4) is the order of 2nd keywords of position (0 1 2)
 *
 */
vector<int> build_sa(const string& raw)
{
    vector<int> pool(max(size_t(CHAR_MAX), raw.size()));
    vector<int> sa(raw.size());
    string x(raw.size() * 2 , '\0'); // trick to avoid x[i + step] outbounds

    // The next 6 lines are stable bukkit sort(keyword=raw[i], result: sa and x)
    for (size_t i=0; i<raw.size(); ++i)
        ++pool[x[i]=raw[i]];
    for (size_t i=1; i< pool.size(); ++i)
        pool[i] += pool[i-1];
    for (int i=raw.size() - 1; i>=0; --i)
        sa[--pool[raw[i]]] = i;
    for (int step = 1; step < raw.size(); step *=2)
    {
        cout << "Step="<<step << "  ";
        cout << endl;
        for (int i=0; i<raw.size(); ++i)
            cout << sa[i] << " ";
        vector<int> new_sa;
        // the 2nd keyword of position N - step, ... must be smallest
        for (size_t i=raw.size() - step; i < raw.size(); ++i)
            new_sa.push_back(i);
        // Then, for each elements that could be 2nd keyword, keep its original order with position sa[i] - step
        for (size_t i=0; i< raw.size(); ++i)
            if (sa[i] >= step)
                new_sa.push_back(sa[i] - step);
        for (int i=0; i<raw.size(); ++i)
            cout << new_sa[i] << " ";
        cout << endl;
        fill(pool.begin(), pool.end(), 0);

        // The next 9 lines perform stable bukkit sort(keyword: x[new_sa[i]] [the first keyword], result: sa)
        for (size_t i=0; i<new_sa.size(); ++i)
        {
            cout << "Adding to pool " << x[new_sa[i]] << endl;
            pool[x[new_sa[i]]]++;
        }
        for (size_t i=1; i<pool.size(); ++i)
            pool[i] += pool[i-1];
        for (int i=new_sa.size() - 1; i>=0; --i)
            sa[--pool[x[new_sa[i]]]] = new_sa[i];

        for (int i=0; i<new_sa.size(); ++i)
            cout << sa[i] << " ";
        cout << endl;

        // To rebuild x from sa
        // Need decide the number based on whether two key pairs are identical,
        // Two identical key pairs (10, 10) should be assigned with the same number in next iteration
        string newx = x;
        int p=0;
        newx[sa[0]] = 0;
        for (int i=1; i<sa.size(); ++i)
            newx[sa[i]] = x[sa[i]] == x[sa[i-1]] && x[sa[i] + step] == x[sa[i-1]+step] ? p : ++p;
        x = newx;
        for (size_t i=0; i<raw.size(); ++i)
            cout << x[i] << endl;
        cout << endl;
        // All different, go away
        if (p == raw.size() - 1) break;
    }
    return sa;

}

int main()
{
    string s;
    cin >> s;
    vector<int> sa = build_sa(s);
    for (size_t i=0; i<sa.size(); ++i)
        cout << sa[i] << " ";
}
	#include <iostream>
	#include <algorithm>
	#include <string>
	#include <vector>
	#include <climits>

	using namespace std;

	/*
	* Suffix array is a useful data structure which stores
	* suffix of a string in alphabetical order.
	* (https://en.wikipedia.org/wiki/Suffix_array)
	*
	* For example:
	* raw = abaab
	* Suffix: abaab, baab, aab, ab, b
	* Sorted suffix: aab, ab, abaab, b, baab .
	* Null character is considered less than any other character.
	*
	* Suffix array of raw:
	* [2, 3, 0, 4, 1]
	*
	* because suffix starting from position 2(aab) is the smallest, then
	* comes suffix starting from position 3(ab), ..., and suffix starting
	* from position 1 (baab) is the largest
	*/


	/*
	* This algorithm builds suffix array in O(nlogn) time complexity
	* and O(CHAR_MAX + n) space complexity, based on sorting segments
	* with length 1, 2, 4, 8, ..., until all segments are different,
	* which is called "doubling algorithm"
	*
	* ======================================
	*
	* Still take "abaab" as an example:
	*
	* Iteration 1
	* --------------------
	* We sort all segments with length 1 in 'raw'(i.e. a b a a b), and relabel the original string
	* x = 01001 // 'a' => 0, 'b' => 1. two different segments. still have repeated segments, continue iteration
	* sa = 02314 // because letter 'a' appears at 0, 2, 3, then letter 'b' at 1, 4
	* Here, x[i] stands for the order of raw[i]
	*
	* Iteration 2 [Step = 1]
	* --------------------
	* We sort all segments with length 2 in 'raw'(i.e. 01 10 00 01 1$), and relabel it
	* x = 13012 // '00' => 0, 01 => 1, 1$ => 2, 10 => 3. 4 different segments. still have repeated segments, continue iteration.
	* sa = 20341 // 00 at position 2, 01 at position 0 and 3, 1$ at position 4, 10 at position 1
	*
	* Here, x[i] stands for the order of raw[i...i+1]
	*
	* Iteration 3 [Step = 2]
	* --------------------
	* We sort all segments with length 4 in 'x'(i.e. 10 31 02 1$ 2$,
	* because if we want x[i] stands for raw[i...i+3], then we need to let the concatenation of
	* old_x[i](which stands for raw[i...i+1]) and old_x[i+2](which stands for raw[i+2...i+3]) become the
	* key of length-4 segment)
	*
	* x = 24013 // 02 => 0, 1$ => 1, 10 => 2, 2$ => 3, 31 => 4. All different, done!
	* sa = 23041 // 02 at position 2, 1$ at position 3, 10 at position 0, 2$ at position 4, 31 at position 1
	*
	* Here, x[i] stands for the order of raw[i...i+3] of all length-4 substrings of raw
	*
	* ==========================================
	* In each iteration, we need sort key pairs (x[i], x[i + step]) (0 <= i < n)
	* The naive method is to use quicksort, thus we need to use O(nlogn) for each iteration,
	* and O(nlognlogn) for the whole algorithm.
	*
	* A better way is to look into the range of x[i]/raw[i]: the maximum is bounded by
	* both the length of raw (n) [since our goal is to make each elements of x different] and
	* the character set size CHAR_MAX [for raw[i]]. Therefore, we can use bucket sort.
	*
	* For 2-keyword bucket sort, the basic idea is:
	* 1) sort elements based on the 2nd keyword
	* 2) sort elements based on the 1st keyword (must be stable)
	*
	* ==========================================
	* How can we perform a stable bukkit sort?
	*
	* // Assume origin is [1, 2, 2, 0, 0]
	* for (int item : origin)
	* bukkit[item]++;
	* // bukkit: [2, 1, 2]
	* for (size_t i=1; i<bukkit_size; ++i)
	* bukkit[i] += bukkit[i-1];
	* // bukkit: [2, 3, 5]
	* for (int i = origin.size() - 1; i >= 0; --i)
	* result[ --bukkit[ origin[i] ] ] = i
	* // result: [3, 4, 0, 1, 2]
	* ==========================================
	*
	* So how can we get the 2nd keyword? Remember that the 2nd keyword for position i is x[i+step] (if i + step < N)
	* or $ (if i + step >= N).
	*
	* Take a look at the result of the 2nd iteration:
	* x = 13012
	* and the key pairs to sort of the 3rd iteration:
	* 10 31 02 1$ 2$,
	* 2nd keyword:
	* 0 1 2 $ $
	*
	* ==========================================
	*
	* But the result of sort based on the 2nd keyword is already done!
	* Sa of iteration 2:
	* sa = 23041
	*
	* new_sa == { N - step, N - step + 1, ..., N - 1, sa[origin1] - step, sa[origin2] - step, ... }
	* the smallest "step" elements must be the last "step" elements, because the second keyword of them is always $.
	* Sort result of 2nd keyword in iteration 3:
	* new_sa= 3 4 0 1 2.
	*
	* step = 2, thus 3 4 are the smallest elements. position 0, 1 are impossible for a second keyword:
	* ignore them from sa, the remaining items in sa (2 3 4) is the order of 2nd keywords of position (0 1 2)
	*
	*/
	vector<int> build_sa(const string& raw)
	{
	vector<int> pool(max(size_t(CHAR_MAX), raw.size()));
	vector<int> sa(raw.size());
	string x(raw.size() * 2 , '\0'); // trick to avoid x[i + step] outbounds

	// The next 6 lines are stable bukkit sort(keyword=raw[i], result: sa and x)
	for (size_t i=0; i<raw.size(); ++i)
	++pool[x[i]=raw[i]];
	for (size_t i=1; i< pool.size(); ++i)
	pool[i] += pool[i-1];
	for (int i=raw.size() - 1; i>=0; --i)
	sa[--pool[raw[i]]] = i;
	for (int step = 1; step < raw.size(); step *=2)
	{
	cout << "Step="<<step << " ";
	cout << endl;
	for (int i=0; i<raw.size(); ++i)
	cout << sa[i] << " ";
	vector<int> new_sa;
	// the 2nd keyword of position N - step, ... must be smallest
	for (size_t i=raw.size() - step; i < raw.size(); ++i)
	new_sa.push_back(i);
	// Then, for each elements that could be 2nd keyword, keep its original order with position sa[i] - step
	for (size_t i=0; i< raw.size(); ++i)
	if (sa[i] >= step)
	new_sa.push_back(sa[i] - step);
	for (int i=0; i<raw.size(); ++i)
	cout << new_sa[i] << " ";
	cout << endl;
	fill(pool.begin(), pool.end(), 0);

	// The next 9 lines perform stable bukkit sort(keyword: x[new_sa[i]] [the first keyword], result: sa)
	for (size_t i=0; i<new_sa.size(); ++i)
	{
	cout << "Adding to pool " << x[new_sa[i]] << endl;
	pool[x[new_sa[i]]]++;
	}
	for (size_t i=1; i<pool.size(); ++i)
	pool[i] += pool[i-1];
	for (int i=new_sa.size() - 1; i>=0; --i)
	sa[--pool[x[new_sa[i]]]] = new_sa[i];

	for (int i=0; i<new_sa.size(); ++i)
	cout << sa[i] << " ";
	cout << endl;

	// To rebuild x from sa
	// Need decide the number based on whether two key pairs are identical,
	// Two identical key pairs (10, 10) should be assigned with the same number in next iteration
	string newx = x;
	int p=0;
	newx[sa[0]] = 0;
	for (int i=1; i<sa.size(); ++i)
	newx[sa[i]] = x[sa[i]] == x[sa[i-1]] && x[sa[i] + step] == x[sa[i-1]+step] ? p : ++p;
	x = newx;
	for (size_t i=0; i<raw.size(); ++i)
	cout << x[i] << endl;
	cout << endl;
	// All different, go away
	if (p == raw.size() - 1) break;
	}
	return sa;

	}

	int main()
	{
	string s;
	cin >> s;
	vector<int> sa = build_sa(s);
	for (size_t i=0; i<sa.size(); ++i)
	cout << sa[i] << " ";
	}