Last active
May 3, 2017 04:17
-
-
Save HYPJUDY/fed1ef68be4045d433d0c5263f7ebe4c to your computer and use it in GitHub Desktop.
Pattern search: Splitting a string per the given seperator/delimiter similar to split() funtion in Python with KML algorithm.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Description: Splitting a string per the given seperator/delimiter | |
* similar to split() funtion in Python with KMP algorithm. | |
* | |
* Function: vector<string> split(const string &s, const string &sep) | |
* Refer to https://docs.python.org/2/library/stdtypes.html str.split([sep[, maxsplit]]): | |
* - Return a list of the words in the string, using sep as the delimiter string. | |
* - If sep is given, consecutive delimiters are not grouped together | |
* and are deemed to delimit empty strings. The sep argument may consist | |
* of multiple characters. Splitting an empty string with a specified | |
* separator returns ['']. | |
* - If sep is not specified or is null(""): runs of consecutive whitespace(or \n, \t) | |
* are regarded as a single separator, and the result will contain no empty | |
* strings at the start or end if the string has leading or trailing whitespace. | |
* | |
* Details: https://hypjudy.github.io/2017/04/18/KMP-algorithm-split/ | |
* Author: HYPJUDY 2017.4.18 | |
*/ | |
#include <iostream> | |
#include <string> | |
#include <vector> | |
using namespace std; | |
/* Used when sep is not specific or is null(""): | |
* Change all spaces(or \n, \t) to one space. | |
* Remove leading or trailing whitespace. */ | |
string preprocess(const string &s) { | |
string str = s; | |
string res = ""; | |
unsigned int len = str.length(); | |
unsigned int i, j; | |
for (i = 0; i < len; ++i) { | |
if (str[i] == '\n' || str[i] == '\t') str[i] = ' '; | |
} | |
// find first and last none space character str[i] and str[j] | |
for (i = 0; i < len && str[i] == ' ' || | |
str[i] == '\n' || str[i] == '\t'; ++i); | |
for (j = len - 1; j > 0 && str[j] == ' ' || | |
str[j] == '\n' || str[j] == '\t'; --j); | |
int space = 0; | |
for (i; i <= j; ++i) { | |
if (str[i] != ' '&&space) { | |
res += ' '; | |
space = 0; | |
} | |
else if (str[i] == ' ') space = 1; | |
else space = 0; | |
if (!space) res += s[i]; | |
} | |
return res; | |
} | |
/* Return a table indicates the length of partial | |
* matched string for each substring | |
* Time complexity: O(n), n is the length of delimiter */ | |
vector<int> partial_match_table(string delimiter) { | |
vector<int> res; | |
res.push_back(0); | |
unsigned int i = 1; // iterate over delimiter, the end of prefix | |
unsigned int len = 0; // the end of suffix, the length of matched substr | |
while (i < delimiter.size()) { | |
if (delimiter[i] == delimiter[len]) { | |
++len; | |
res.push_back(len); | |
++i; | |
} | |
else { | |
if(len) len = res[len - 1]; // trick | |
else { | |
++i; | |
res.push_back(0); | |
} | |
} | |
} | |
return res; | |
} | |
/* Split string using KMP algorithm. | |
* Return a list of the words in the string, | |
* using sep as the delimiter string. | |
* Time complexity: O(m), m is the length of s */ | |
vector<string> split(const string &s, const string &sep = "") { | |
vector<string> res; | |
// deal with null seperator | |
string str = s; | |
string delimiter = sep; | |
if (str == "" && delimiter == "") return res; | |
if (delimiter == "") { | |
str = preprocess(s); | |
if (str == "") return res; | |
delimiter = " "; | |
} | |
vector<int> table = partial_match_table(delimiter); | |
int i = 0, j = 0; // point to string and delimiter respectively | |
int last = 0; // to split position | |
int m = str.size(), n = delimiter.size(); | |
while(i < m) { | |
if (str[i] == delimiter[j]) { | |
++i; | |
++j; | |
} | |
else if (j == 0) { | |
++i; | |
} | |
else { // skip matched string of prefix of sep and suffix of s | |
j = table[j - 1]; | |
} | |
if (j == n) { // find a seperator | |
res.push_back(str.substr(last, i - n - last)); | |
last = i; | |
j = 0; // start to match again | |
} | |
} | |
res.push_back(str.substr(last, i - last)); | |
return res; | |
} | |
string print_str(vector<string> v) { | |
if (v.size() == 0) return "[]"; | |
string res = "["; | |
unsigned int i = 0; | |
for (i; i < v.size() - 1; ++i) | |
res+= "'" + v[i] + "', "; | |
res += "'" + v[i] + "'" + "]"; | |
return res; | |
} | |
void test(const int test_num, const string &in, | |
const string &expected_out, const string &sep = "") { | |
string out = print_str(split(in, sep)); | |
string flag; | |
if (out == expected_out) flag = "Passed"; | |
else flag = "Failed"; | |
cout << "**************** Test" << test_num << ": " << flag << endl; | |
cout << " Input string: " << in << endl; | |
cout << " Split by string: " << sep << endl; | |
cout << " Output string: " << out << endl; | |
cout << "Expected output string: " << expected_out << endl << endl; | |
} | |
int main() { | |
/* ************************** sep is given *************************** */ | |
// mormal test | |
test(0, "1<>2<>3", "['1', '2', '3']", "<>"); | |
test(1, "1,,2", "['1', '', '2']", ","); | |
test(2, "Line1-abcdef \nLine2-abc \nLine4-abcd", "['Line1-abcdef', '\nLine2-abc', '\nLine4-abcd']", " "); | |
test(3, "topeka,kansas city,wichita,olathe", "['topeka', 'kansas city', 'wichita', 'olathe']", ","); | |
test(4, "I am a student.", "['I ', 'm ', ' student.']", "a"); | |
test(5, "I am\n a \tstudent.", "['I ', 'm\n ', ' \tstudent.']", "a"); | |
// boundary test | |
test(6, "a*", "['a', '']", "*"); // sepertor in the end | |
test(7, "**abc**abc**", "['', 'abc', 'abc', '']", "**"); // seperator in the start and end of string | |
test(8, "", "['']", "a"); // null string | |
test(9, " a", "['', 'a']", " "); // leading whitespace | |
test(10, "a ", "['a', '']", " "); // trailing whitespace | |
test(11, " a ", "['', 'a', '']", " "); // leading and trailing whitespaces | |
test(12, " ", "['', '', '', '']", " "); // whitespaces | |
test(13, "***", "['', '', '', '']", "*"); // string consist of several seperators | |
test(14, "*", "['', '']", "*"); // string equals seperator | |
test(15, "aaa", "['', 'a']", "aa"); | |
/* *************** sep is not specified or is null("") ******************** */ | |
test(16, "I am a student.", "['I', 'am', 'a', 'student.']"); | |
test(17, "Hello , world\t!\n", "['Hello', ',', 'world', '!']"); | |
test(18, "", "[]"); // empty string | |
test(19, "", "[]", ""); // empty string | |
test(20, "Hello , world\t!\n", "['Hello', ',', 'world', '!']", ""); // string with spaces | |
test(21, "a", "['a']", ""); // string without space | |
test(22, " ", "[]"); // leading or trailing whitespace | |
test(23, " 1 2 3 ", "['1', '2', '3']"); | |
// system("pause"); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment