HYPJUDY/split-kmp.cpp

## split-kmp.cpp
/*
*  Description: Splitting a string per the given seperator/delimiter
*               similar to split() funtion in Python with KMP algorithm.
*
*  Function: vector<string> split(const string &s, const string &sep)
*  Refer to https://docs.python.org/2/library/stdtypes.html str.split([sep[, maxsplit]]):
*  - Return a list of the words in the string, using sep as the delimiter string.
*  - If sep is given, consecutive delimiters are not grouped together
*    and are deemed to delimit empty strings. The sep argument may consist
*    of multiple characters. Splitting an empty string with a specified
*    separator returns [''].
*  - If sep is not specified or is null(""): runs of consecutive whitespace(or \n, \t)
*    are regarded as a single separator, and the result will contain no empty
*    strings at the start or end if the string has leading or trailing whitespace.
*
*  Details: https://hypjudy.github.io/2017/04/18/KMP-algorithm-split/
*  Author: HYPJUDY 2017.4.18
*/

#include <iostream>
#include <string>
#include <vector>
using namespace std;

/* Used when sep is not specific or is null(""):
*  Change all spaces(or \n, \t) to one space.
*  Remove leading or trailing whitespace. */
string preprocess(const string &s) {
	string str = s;
	string res = "";
	unsigned int len = str.length();
	unsigned int i, j;
	for (i = 0; i < len; ++i) {
		if (str[i] == '\n' || str[i] == '\t') str[i] = ' ';
	}
	// find first and last none space character str[i] and str[j]
	for (i = 0; i < len && str[i] == ' ' ||
		str[i] == '\n' || str[i] == '\t'; ++i);
	for (j = len - 1; j > 0 && str[j] == ' ' ||
		str[j] == '\n' || str[j] == '\t'; --j);
	int space = 0;
	for (i; i <= j; ++i) {
		if (str[i] != ' '&&space) {
			res += ' ';
			space = 0;
		}
		else if (str[i] == ' ') space = 1;
		else space = 0;
		if (!space) res += s[i];
	}
	return res;
}

/* Return a table indicates the length of partial
*  matched string for each substring
*  Time complexity: O(n), n is the length of delimiter */
vector<int> partial_match_table(string delimiter) {
	vector<int> res;
	res.push_back(0);
	unsigned int i = 1; // iterate over delimiter, the end of prefix
	unsigned int len = 0; // the end of suffix, the length of matched substr
	while (i < delimiter.size()) {
		if (delimiter[i] == delimiter[len]) {
			++len;
			res.push_back(len);
			++i;
		}
		else {
			if(len) len = res[len - 1]; // trick
			else {
				++i;
				res.push_back(0);
			}
		}
	}
	return res;
}


/* Split string using KMP algorithm.
*  Return a list of the words in the string,
*  using sep as the delimiter string.
*  Time complexity: O(m), m is the length of s */
vector<string> split(const string &s, const string &sep = "") {
	vector<string> res;
	// deal with null seperator
	string str = s;
	string delimiter = sep;
	if (str == "" && delimiter == "") return res;
	if (delimiter == "") {
		str = preprocess(s);
		if (str == "") return res;
		delimiter = " ";
	}

	vector<int> table = partial_match_table(delimiter);
	int i = 0, j = 0; // point to string and delimiter respectively
	int last = 0; // to split position
	int m = str.size(), n = delimiter.size();
	while(i < m) {
		if (str[i] == delimiter[j]) {
			++i;
			++j;
		}
		else if (j == 0) {
			++i;
		}
		else { // skip matched string of prefix of sep and suffix of s
			j = table[j - 1];
		}
		if (j == n) { // find a seperator
			res.push_back(str.substr(last, i - n - last));
			last = i;
			j = 0; // start to match again
		}
	}
	res.push_back(str.substr(last, i - last));
	return res;
}

string print_str(vector<string> v) {
	if (v.size() == 0) return "[]";
	string res = "[";
	unsigned int i = 0;
	for (i; i < v.size() - 1; ++i)
		res+= "'" + v[i] + "', ";
	res += "'" + v[i] + "'" + "]";
	return res;
}

void test(const int test_num, const string &in,
	const string &expected_out, const string &sep = "") {
	string out = print_str(split(in, sep));
	string flag;
	if (out == expected_out) flag = "Passed";
	else flag = "Failed";
	cout << "**************** Test" << test_num << ": " << flag << endl;
	cout << "          Input string: " << in << endl;
	cout << "       Split by string: " << sep << endl;
	cout << "         Output string: " << out << endl;
	cout << "Expected output string: " << expected_out << endl << endl;
}

int main() {
	/* ************************** sep is given  *************************** */
	// mormal test
	test(0, "1<>2<>3", "['1', '2', '3']", "<>");
	test(1, "1,,2", "['1', '', '2']", ",");
	test(2, "Line1-abcdef \nLine2-abc \nLine4-abcd", "['Line1-abcdef', '\nLine2-abc', '\nLine4-abcd']", " ");
	test(3, "topeka,kansas city,wichita,olathe", "['topeka', 'kansas city', 'wichita', 'olathe']", ",");
	test(4, "I am a student.", "['I ', 'm ', ' student.']", "a");
	test(5, "I am\n a \tstudent.", "['I ', 'm\n ', ' \tstudent.']", "a");

	// boundary test
	test(6, "a*", "['a', '']", "*"); // sepertor in the end
	test(7, "**abc**abc**", "['', 'abc', 'abc', '']", "**"); // seperator in the start and end  of string
	test(8, "", "['']", "a"); // null string
	test(9, " a", "['', 'a']", " "); // leading whitespace
	test(10, "a ", "['a', '']", " "); // trailing whitespace
	test(11, " a ", "['', 'a', '']", " "); // leading and trailing whitespaces
	test(12, "   ", "['', '', '', '']", " "); // whitespaces
	test(13, "***", "['', '', '', '']", "*"); // string consist of several seperators
	test(14, "*", "['', '']", "*"); // string equals seperator
	test(15, "aaa", "['', 'a']", "aa");

        /* *************** sep is  not specified or is null("") ******************** */
	test(16, "I am a student.", "['I', 'am', 'a', 'student.']");
	test(17, "Hello ,    world\t!\n", "['Hello', ',', 'world', '!']");
	test(18, "", "[]"); // empty string
	test(19, "", "[]", ""); // empty string
	test(20, "Hello ,    world\t!\n", "['Hello', ',', 'world', '!']", ""); // string with spaces
	test(21, "a", "['a']", ""); // string without space
	test(22, "  ", "[]"); // leading or trailing whitespace
	test(23, " 1  2   3  ", "['1', '2', '3']");

	// system("pause");
	return 0;
}
	/*
	* Description: Splitting a string per the given seperator/delimiter
	* similar to split() funtion in Python with KMP algorithm.
	*
	* Function: vector<string> split(const string &s, const string &sep)
	* Refer to https://docs.python.org/2/library/stdtypes.html str.split([sep[, maxsplit]]):
	* - Return a list of the words in the string, using sep as the delimiter string.
	* - If sep is given, consecutive delimiters are not grouped together
	* and are deemed to delimit empty strings. The sep argument may consist
	* of multiple characters. Splitting an empty string with a specified
	* separator returns [''].
	* - If sep is not specified or is null(""): runs of consecutive whitespace(or \n, \t)
	* are regarded as a single separator, and the result will contain no empty
	* strings at the start or end if the string has leading or trailing whitespace.
	*
	* Details: https://hypjudy.github.io/2017/04/18/KMP-algorithm-split/
	* Author: HYPJUDY 2017.4.18
	*/

	#include <iostream>
	#include <string>
	#include <vector>
	using namespace std;

	/* Used when sep is not specific or is null(""):
	* Change all spaces(or \n, \t) to one space.
	* Remove leading or trailing whitespace. */
	string preprocess(const string &s) {
	string str = s;
	string res = "";
	unsigned int len = str.length();
	unsigned int i, j;
	for (i = 0; i < len; ++i) {
	if (str[i] == '\n' \|\| str[i] == '\t') str[i] = ' ';
	}
	// find first and last none space character str[i] and str[j]
	for (i = 0; i < len && str[i] == ' ' \|\|
	str[i] == '\n' \|\| str[i] == '\t'; ++i);
	for (j = len - 1; j > 0 && str[j] == ' ' \|\|
	str[j] == '\n' \|\| str[j] == '\t'; --j);
	int space = 0;
	for (i; i <= j; ++i) {
	if (str[i] != ' '&&space) {
	res += ' ';
	space = 0;
	}
	else if (str[i] == ' ') space = 1;
	else space = 0;
	if (!space) res += s[i];
	}
	return res;
	}

	/* Return a table indicates the length of partial
	* matched string for each substring
	* Time complexity: O(n), n is the length of delimiter */
	vector<int> partial_match_table(string delimiter) {
	vector<int> res;
	res.push_back(0);
	unsigned int i = 1; // iterate over delimiter, the end of prefix
	unsigned int len = 0; // the end of suffix, the length of matched substr
	while (i < delimiter.size()) {
	if (delimiter[i] == delimiter[len]) {
	++len;
	res.push_back(len);
	++i;
	}
	else {
	if(len) len = res[len - 1]; // trick
	else {
	++i;
	res.push_back(0);
	}
	}
	}
	return res;
	}


	/* Split string using KMP algorithm.
	* Return a list of the words in the string,
	* using sep as the delimiter string.
	* Time complexity: O(m), m is the length of s */
	vector<string> split(const string &s, const string &sep = "") {
	vector<string> res;
	// deal with null seperator
	string str = s;
	string delimiter = sep;
	if (str == "" && delimiter == "") return res;
	if (delimiter == "") {
	str = preprocess(s);
	if (str == "") return res;
	delimiter = " ";
	}

	vector<int> table = partial_match_table(delimiter);
	int i = 0, j = 0; // point to string and delimiter respectively
	int last = 0; // to split position
	int m = str.size(), n = delimiter.size();
	while(i < m) {
	if (str[i] == delimiter[j]) {
	++i;
	++j;
	}
	else if (j == 0) {
	++i;
	}
	else { // skip matched string of prefix of sep and suffix of s
	j = table[j - 1];
	}
	if (j == n) { // find a seperator
	res.push_back(str.substr(last, i - n - last));
	last = i;
	j = 0; // start to match again
	}
	}
	res.push_back(str.substr(last, i - last));
	return res;
	}

	string print_str(vector<string> v) {
	if (v.size() == 0) return "[]";
	string res = "[";
	unsigned int i = 0;
	for (i; i < v.size() - 1; ++i)
	res+= "'" + v[i] + "', ";
	res += "'" + v[i] + "'" + "]";
	return res;
	}

	void test(const int test_num, const string &in,
	const string &expected_out, const string &sep = "") {
	string out = print_str(split(in, sep));
	string flag;
	if (out == expected_out) flag = "Passed";
	else flag = "Failed";
	cout << "**************** Test" << test_num << ": " << flag << endl;
	cout << " Input string: " << in << endl;
	cout << " Split by string: " << sep << endl;
	cout << " Output string: " << out << endl;
	cout << "Expected output string: " << expected_out << endl << endl;
	}

	int main() {
	/* ************************ sep is given ************************* */
	// mormal test
	test(0, "1<>2<>3", "['1', '2', '3']", "<>");
	test(1, "1,,2", "['1', '', '2']", ",");
	test(2, "Line1-abcdef \nLine2-abc \nLine4-abcd", "['Line1-abcdef', '\nLine2-abc', '\nLine4-abcd']", " ");
	test(3, "topeka,kansas city,wichita,olathe", "['topeka', 'kansas city', 'wichita', 'olathe']", ",");
	test(4, "I am a student.", "['I ', 'm ', ' student.']", "a");
	test(5, "I am\n a \tstudent.", "['I ', 'm\n ', ' \tstudent.']", "a");

	// boundary test
	test(6, "a", "['a', '']", ""); // sepertor in the end
	test(7, "abcabc", "['', 'abc', 'abc', '']", ""); // seperator in the start and end of string
	test(8, "", "['']", "a"); // null string
	test(9, " a", "['', 'a']", " "); // leading whitespace
	test(10, "a ", "['a', '']", " "); // trailing whitespace
	test(11, " a ", "['', 'a', '']", " "); // leading and trailing whitespaces
	test(12, " ", "['', '', '', '']", " "); // whitespaces
	test(13, "**", "['', '', '', '']", ""); // string consist of several seperators
	test(14, "", "['', '']", ""); // string equals seperator
	test(15, "aaa", "['', 'a']", "aa");

	/* ************* sep is not specified or is null("") ****************** */
	test(16, "I am a student.", "['I', 'am', 'a', 'student.']");
	test(17, "Hello , world\t!\n", "['Hello', ',', 'world', '!']");
	test(18, "", "[]"); // empty string
	test(19, "", "[]", ""); // empty string
	test(20, "Hello , world\t!\n", "['Hello', ',', 'world', '!']", ""); // string with spaces
	test(21, "a", "['a']", ""); // string without space
	test(22, " ", "[]"); // leading or trailing whitespace
	test(23, " 1 2 3 ", "['1', '2', '3']");

	// system("pause");
	return 0;
	}