Skip to content

Instantly share code, notes, and snippets.

@HYPJUDY
Last active May 3, 2017 04:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save HYPJUDY/fed1ef68be4045d433d0c5263f7ebe4c to your computer and use it in GitHub Desktop.
Save HYPJUDY/fed1ef68be4045d433d0c5263f7ebe4c to your computer and use it in GitHub Desktop.
Pattern search: Splitting a string per the given seperator/delimiter similar to split() funtion in Python with KML algorithm.
/*
* Description: Splitting a string per the given seperator/delimiter
* similar to split() funtion in Python with KMP algorithm.
*
* Function: vector<string> split(const string &s, const string &sep)
* Refer to https://docs.python.org/2/library/stdtypes.html str.split([sep[, maxsplit]]):
* - Return a list of the words in the string, using sep as the delimiter string.
* - If sep is given, consecutive delimiters are not grouped together
* and are deemed to delimit empty strings. The sep argument may consist
* of multiple characters. Splitting an empty string with a specified
* separator returns [''].
* - If sep is not specified or is null(""): runs of consecutive whitespace(or \n, \t)
* are regarded as a single separator, and the result will contain no empty
* strings at the start or end if the string has leading or trailing whitespace.
*
* Details: https://hypjudy.github.io/2017/04/18/KMP-algorithm-split/
* Author: HYPJUDY 2017.4.18
*/
#include <iostream>
#include <string>
#include <vector>
using namespace std;
/* Used when sep is not specific or is null(""):
* Change all spaces(or \n, \t) to one space.
* Remove leading or trailing whitespace. */
string preprocess(const string &s) {
string str = s;
string res = "";
unsigned int len = str.length();
unsigned int i, j;
for (i = 0; i < len; ++i) {
if (str[i] == '\n' || str[i] == '\t') str[i] = ' ';
}
// find first and last none space character str[i] and str[j]
for (i = 0; i < len && str[i] == ' ' ||
str[i] == '\n' || str[i] == '\t'; ++i);
for (j = len - 1; j > 0 && str[j] == ' ' ||
str[j] == '\n' || str[j] == '\t'; --j);
int space = 0;
for (i; i <= j; ++i) {
if (str[i] != ' '&&space) {
res += ' ';
space = 0;
}
else if (str[i] == ' ') space = 1;
else space = 0;
if (!space) res += s[i];
}
return res;
}
/* Return a table indicates the length of partial
* matched string for each substring
* Time complexity: O(n), n is the length of delimiter */
vector<int> partial_match_table(string delimiter) {
vector<int> res;
res.push_back(0);
unsigned int i = 1; // iterate over delimiter, the end of prefix
unsigned int len = 0; // the end of suffix, the length of matched substr
while (i < delimiter.size()) {
if (delimiter[i] == delimiter[len]) {
++len;
res.push_back(len);
++i;
}
else {
if(len) len = res[len - 1]; // trick
else {
++i;
res.push_back(0);
}
}
}
return res;
}
/* Split string using KMP algorithm.
* Return a list of the words in the string,
* using sep as the delimiter string.
* Time complexity: O(m), m is the length of s */
vector<string> split(const string &s, const string &sep = "") {
vector<string> res;
// deal with null seperator
string str = s;
string delimiter = sep;
if (str == "" && delimiter == "") return res;
if (delimiter == "") {
str = preprocess(s);
if (str == "") return res;
delimiter = " ";
}
vector<int> table = partial_match_table(delimiter);
int i = 0, j = 0; // point to string and delimiter respectively
int last = 0; // to split position
int m = str.size(), n = delimiter.size();
while(i < m) {
if (str[i] == delimiter[j]) {
++i;
++j;
}
else if (j == 0) {
++i;
}
else { // skip matched string of prefix of sep and suffix of s
j = table[j - 1];
}
if (j == n) { // find a seperator
res.push_back(str.substr(last, i - n - last));
last = i;
j = 0; // start to match again
}
}
res.push_back(str.substr(last, i - last));
return res;
}
string print_str(vector<string> v) {
if (v.size() == 0) return "[]";
string res = "[";
unsigned int i = 0;
for (i; i < v.size() - 1; ++i)
res+= "'" + v[i] + "', ";
res += "'" + v[i] + "'" + "]";
return res;
}
void test(const int test_num, const string &in,
const string &expected_out, const string &sep = "") {
string out = print_str(split(in, sep));
string flag;
if (out == expected_out) flag = "Passed";
else flag = "Failed";
cout << "**************** Test" << test_num << ": " << flag << endl;
cout << " Input string: " << in << endl;
cout << " Split by string: " << sep << endl;
cout << " Output string: " << out << endl;
cout << "Expected output string: " << expected_out << endl << endl;
}
int main() {
/* ************************** sep is given *************************** */
// mormal test
test(0, "1<>2<>3", "['1', '2', '3']", "<>");
test(1, "1,,2", "['1', '', '2']", ",");
test(2, "Line1-abcdef \nLine2-abc \nLine4-abcd", "['Line1-abcdef', '\nLine2-abc', '\nLine4-abcd']", " ");
test(3, "topeka,kansas city,wichita,olathe", "['topeka', 'kansas city', 'wichita', 'olathe']", ",");
test(4, "I am a student.", "['I ', 'm ', ' student.']", "a");
test(5, "I am\n a \tstudent.", "['I ', 'm\n ', ' \tstudent.']", "a");
// boundary test
test(6, "a*", "['a', '']", "*"); // sepertor in the end
test(7, "**abc**abc**", "['', 'abc', 'abc', '']", "**"); // seperator in the start and end of string
test(8, "", "['']", "a"); // null string
test(9, " a", "['', 'a']", " "); // leading whitespace
test(10, "a ", "['a', '']", " "); // trailing whitespace
test(11, " a ", "['', 'a', '']", " "); // leading and trailing whitespaces
test(12, " ", "['', '', '', '']", " "); // whitespaces
test(13, "***", "['', '', '', '']", "*"); // string consist of several seperators
test(14, "*", "['', '']", "*"); // string equals seperator
test(15, "aaa", "['', 'a']", "aa");
/* *************** sep is not specified or is null("") ******************** */
test(16, "I am a student.", "['I', 'am', 'a', 'student.']");
test(17, "Hello , world\t!\n", "['Hello', ',', 'world', '!']");
test(18, "", "[]"); // empty string
test(19, "", "[]", ""); // empty string
test(20, "Hello , world\t!\n", "['Hello', ',', 'world', '!']", ""); // string with spaces
test(21, "a", "['a']", ""); // string without space
test(22, " ", "[]"); // leading or trailing whitespace
test(23, " 1 2 3 ", "['1', '2', '3']");
// system("pause");
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment