Last active
August 29, 2015 14:26
-
-
Save santa4nt/df22ba79fbc310ac38bc to your computer and use it in GitHub Desktop.
Various implementations of string manipulations in C++.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CC = g++ | |
CFLAGS = -std=c++11 -Wall -g | |
LFLAGS = -lboost_unit_test_framework | |
DEFINES = -D_DEBUG | |
TARGET = stringmisc-test | |
all: $(TARGET) | |
stringmisc-test: stringmisc.o stringmisc-test.o | |
$(CC) stringmisc.o stringmisc-test.o $(LFLAGS) -o $(TARGET) | |
stringmisc.o: stringmisc.cpp | |
$(CC) $(CFLAGS) $(DEFINES) -c stringmisc.cpp | |
stringmisc-test.o: stringmisc-test.cpp | |
$(CC) $(CFLAGS) $(DEFINES) -c stringmisc-test.cpp | |
run: all | |
./$(TARGET) | |
valg: all | |
valgrind --leak-check=full -v ./$(TARGET) | |
clean: | |
rm -f *.o *.gch $(TARGET) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "stringmisc.hpp" | |
#include <vector> | |
#include <string> | |
#include <memory> | |
#include <utility> | |
#include <cstring> | |
#define BOOST_TEST_DYN_LINK | |
#define BOOST_TEST_MODULE string-misc-test | |
#include <boost/test/unit_test.hpp> | |
using std::string; | |
using std::vector; | |
using std::pair; | |
using std::unique_ptr; | |
BOOST_AUTO_TEST_CASE( has_all_unique_chars__empty_string__returns_true ) | |
{ | |
BOOST_REQUIRE( has_all_unique_chars("") ); | |
BOOST_REQUIRE( has_all_unique_chars_ex("") ); | |
string s = ""; | |
BOOST_REQUIRE( has_all_unique_chars_ex(s) ); | |
BOOST_CHECK_EQUAL( "", s ); | |
} | |
BOOST_AUTO_TEST_CASE( has_all_unique_chars__unique_chars_input__returns_true ) | |
{ | |
BOOST_REQUIRE( has_all_unique_chars("abcdefghijklmnoprstuvwxyzABCDEFGHIJKLMNOPRSTUVWXYZ") ); | |
BOOST_REQUIRE( has_all_unique_chars_ex("abcdefghijklmnoprstuvwxyz ABCDEFGHIJKLMNOPRSTUVWXYZ") ); | |
BOOST_REQUIRE( has_all_unique_chars("@bcdefghijklmnoprstuvwxyz ABCDEFGHIJKLMNOPRSTUVWXYZ") ); | |
BOOST_REQUIRE( has_all_unique_chars_ex(" abcdefghijkl#$%prstuvwxyzABCDEFGHIJKLMN0123456789") ); | |
string s = "Helo W0rd"; | |
BOOST_REQUIRE( has_all_unique_chars_ex(s) ); | |
BOOST_CHECK_EQUAL( " 0HWdelor", s ); | |
} | |
BOOST_AUTO_TEST_CASE( has_all_unique_chars__has_duplicate_chars_input__returns_false ) | |
{ | |
BOOST_REQUIRE( !has_all_unique_chars("abcdefghijklmnoprstuvwxyzaBCDEFGHIJKLMNOPRSTUVWXYZ") ); | |
BOOST_REQUIRE( !has_all_unique_chars_ex("abcdefghijklmnoprstuvwxyz ABCDEFGHIJKLMNOPRSTUVWXYZ") ); | |
BOOST_REQUIRE( !has_all_unique_chars("@bcdefghijklmnoprstuvwxyz ABCDEFGHIJKLMNOPRSTUVWXZZ") ); | |
BOOST_REQUIRE( !has_all_unique_chars_ex(" abcdefghijkl#$%%rstuvwxyzABCDEFGHIJKLMN0123456789") ); | |
string s = "Hello World"; | |
BOOST_REQUIRE( !has_all_unique_chars_ex(s) ); | |
BOOST_CHECK_EQUAL( " HWdellloor", s ); | |
} | |
BOOST_AUTO_TEST_CASE( reverse_in_place__empty_string__do_nothing ) | |
{ | |
string test = ""; // basic edge case | |
reverse_in_place(test); | |
BOOST_REQUIRE_EQUAL( "", test ); | |
char ctest[1] = ""; | |
reverse_in_place(ctest); | |
BOOST_REQUIRE_EQUAL( "", ctest ); | |
} | |
BOOST_AUTO_TEST_CASE( reverse_in_place__one_char_string__do_nothing ) | |
{ | |
string test = "a"; // another edge case | |
reverse_in_place(test); | |
BOOST_REQUIRE_EQUAL( "a", test ); | |
char ctest[2] = {0}; | |
ctest[0] = 'a'; | |
reverse_in_place(ctest); | |
BOOST_REQUIRE_EQUAL( "a", ctest ); | |
} | |
BOOST_AUTO_TEST_CASE( reverse_in_place__test_string__do_reverse ) | |
{ | |
string test = "A "; // even # of chars, and only two chars (another edge case) | |
reverse_in_place(test); | |
BOOST_REQUIRE_EQUAL( " A", test ); | |
test = "Hello World"; // odd # of chars, and arbitrary | |
reverse_in_place(test); | |
BOOST_REQUIRE_EQUAL( "dlroW olleH", test ); | |
// now, using raw C-strings ... | |
test = "A "; // even # of chars, and only two chars (another edge case) | |
char ctest[32] = {0}; | |
memcpy(ctest, test.c_str(), test.length()); | |
reverse_in_place(ctest); | |
BOOST_REQUIRE_EQUAL( " A", ctest ); | |
test = "Hello World"; // odd # of chars, and arbitrary | |
memset(ctest, 0, sizeof(char) * 32); | |
memcpy(ctest, test.c_str(), test.length()); | |
reverse_in_place(ctest); | |
BOOST_REQUIRE_EQUAL( "dlroW olleH", ctest ); | |
} | |
BOOST_AUTO_TEST_CASE( is_permutation__empty_strings__returns_expected ) | |
{ | |
BOOST_REQUIRE( is_permutation("", "") ); | |
BOOST_REQUIRE( !is_permutation("", " ") ); | |
BOOST_REQUIRE( !is_permutation("\t", "") ); | |
} | |
BOOST_AUTO_TEST_CASE( is_permutation__permuted_strings__returns_true ) | |
{ | |
BOOST_REQUIRE( is_permutation("abcdefg", "gfedcba") ); | |
BOOST_REQUIRE( is_permutation("cafdgbe", "abcdefg") ); | |
BOOST_REQUIRE( is_permutation("aaa", "aaa") ); | |
BOOST_REQUIRE( is_permutation("A", "A") ); | |
} | |
BOOST_AUTO_TEST_CASE( is_permutation__not_permuted_strings__returns_false ) | |
{ | |
BOOST_REQUIRE( !is_permutation("abcdefg", "gfedcbA") ); | |
BOOST_REQUIRE( !is_permutation("cafdgbe", "abccefg") ); | |
BOOST_REQUIRE( !is_permutation("aa", "a") ); | |
} | |
BOOST_AUTO_TEST_CASE( encode_num__empty_string__returns_empty_string ) | |
{ | |
string enc = encode_num(""); | |
BOOST_REQUIRE_EQUAL( "", enc ); | |
} | |
BOOST_AUTO_TEST_CASE( encode_num__shorter_input__returns_input ) | |
{ | |
vector<string> inputs = | |
{ | |
"a", | |
"AAaa", | |
"ab", | |
"abbcccd" | |
}; | |
for (const string &input : inputs) | |
{ | |
string enc = encode_num(input); | |
BOOST_REQUIRE_EQUAL( input, enc ); | |
} | |
} | |
BOOST_AUTO_TEST_CASE( encode_num__longer_input__returns_encoded ) | |
{ | |
vector<pair<string, string>> inputs_encoded = | |
{ | |
{"aaa", "a3"}, | |
{"AAaaa", "A2a3"}, | |
{"abbcccccd", "a1b2c5d1"} | |
}; | |
for (const pair<string, string> &input_encoded : inputs_encoded) | |
{ | |
string enc = encode_num(input_encoded.first); | |
BOOST_REQUIRE_EQUAL( input_encoded.second, enc ); | |
} | |
} | |
BOOST_AUTO_TEST_CASE( sanitize_spaces__empty_string__returns_empty_string ) | |
{ | |
BOOST_REQUIRE_EQUAL( "", sanitize_spaces("").c_str() ); | |
char buf[1] = {0}; | |
_sanitize_spaces(buf, 0); | |
BOOST_REQUIRE_EQUAL( '\0', buf[0] ); | |
} | |
BOOST_AUTO_TEST_CASE( sanitize_spaces__no_space_string__returns_input_string ) | |
{ | |
vector<string> inputs = | |
{ | |
"a", | |
"AA", | |
"123", | |
"\t", | |
"HelloWorld!" | |
}; | |
for (const string &input : inputs) | |
{ | |
BOOST_REQUIRE_EQUAL( input, sanitize_spaces(input).c_str() ); | |
} | |
} | |
BOOST_AUTO_TEST_CASE( sanitize_spaces__contains_spaces__returns_sanitized ) | |
{ | |
vector<pair<string, string>> inputs_sanitized = | |
{ | |
{" ", "%20"}, | |
{" ", "%20%20"}, | |
{"a !", "a%20!"}, | |
{"a b", "a%20%20b"}, | |
{" bcd", "%20bcd"}, | |
{"abc ", "abc%20%20"} | |
}; | |
for (const pair<string, string> &input_sanitized : inputs_sanitized) | |
{ | |
// test the wrapper | |
string sanitized = sanitize_spaces(input_sanitized.first); | |
BOOST_REQUIRE_EQUAL( input_sanitized.second, sanitized.c_str() ); | |
// test the direct array access version | |
size_t buffer_count = input_sanitized.second.length() + 1; | |
unique_ptr<char[]> buffer( new char[buffer_count] ); | |
memset(buffer.get(), 0, sizeof(char) * buffer_count); | |
strcpy(buffer.get(), input_sanitized.first.c_str()); | |
_sanitize_spaces(buffer.get(), buffer_count - 1); | |
BOOST_REQUIRE( strcmp(buffer.get(), input_sanitized.second.c_str()) == 0 ); | |
} | |
} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "stringmisc.hpp" | |
#include <set> | |
#include <map> | |
#include <memory> | |
#include <sstream> | |
#include <iostream> | |
#include <iterator> | |
#include <algorithm> | |
#include <cassert> | |
#include <cstring> | |
using std::set; | |
using std::map; | |
using std::sort; | |
using std::unique_ptr; | |
using std::ostringstream; | |
using std::cout; | |
using std::endl; | |
// private functions' declarations | |
inline static void _swap(string &input, unsigned int i, unsigned int j); | |
inline static void _swap(char *input, unsigned int i, unsigned int j); | |
using char_count = map<char, unsigned int>; | |
static char_count get_char_count(const string& input); | |
template <typename K, typename V> | |
static bool is_equivalent(const map<K,V> &a, const map<K,V> &b); | |
#ifdef _DEBUG | |
template <typename K, typename V> | |
static void print_map(const map<K,V> &input); | |
#else | |
template <typename K, typename V> | |
static void print_map(const map<K,V> &input) {} | |
#endif | |
static void _do_sanitize_spaces(char *buffer, size_t ccount); | |
// end private functions' declarations | |
// O(n) time and O(1) space | |
bool has_all_unique_chars(const string &input) | |
{ | |
bool seen[1 << (sizeof(char) * 8)] = {0}; // 256 possible char values | |
// more precisely, this will use 256 booleans of space (~256 * 8 bytes) | |
// we can do better by using bitfields, so that each boolean only takes 1 bit of space | |
for (unsigned int i = 0; i < input.length(); ++i) | |
{ | |
if (seen[(int)input[i]]) | |
{ | |
return false; | |
} | |
seen[(int)input[i]] = true; | |
} | |
return true; | |
} | |
// O(n^2) time and O(1) space, but no extra space needed | |
bool has_all_unique_chars_ex(const string &input) | |
{ | |
for (unsigned int i = 0; i < input.length(); ++i) | |
{ | |
for (unsigned int j = 0; j < input.length(); ++j) | |
{ | |
if (i == j) continue; | |
if (input[i] == input[j]) | |
{ | |
return false; | |
} | |
} | |
} | |
return true; | |
} | |
// O(n log n) time and O(1) space, no extra space, but input is modified | |
bool has_all_unique_chars_ex(string &input) | |
{ | |
if (input.length() == 0) | |
{ | |
return true; | |
} | |
// first, sort the characters in the string in-place | |
std::sort (input.begin(), input.end()); | |
// then, find consecutive duplicate characters | |
for (unsigned int i = 1; i < input.length(); ++i) | |
{ | |
if (input[i] == input[i-1]) | |
{ | |
return false; | |
} | |
} | |
return true; | |
} | |
void reverse_in_place(string &input) | |
{ | |
// NOTE: this will only work for implementation of std::string where | |
// its underlying buffer is accessible in this manner, AND it is null-terminated! | |
reverse_in_place(&input[0]); | |
} | |
// we really can't, but we have to assume that `input` is null-terminated | |
void reverse_in_place(char *input) | |
{ | |
size_t len = strlen(input); | |
if (len <= 1) | |
{ | |
return; | |
} | |
unsigned int i = 0; | |
unsigned int j = len - 1; | |
for (; i != j && i < j;) | |
{ | |
_swap(input, i, j); | |
++i; | |
--j; | |
} | |
} | |
void _swap(string &input, unsigned int i, unsigned int j) | |
{ | |
assert (i < input.length()); | |
assert (j < input.length()); | |
_swap(&input[0], i, j); | |
} | |
void _swap(char *input, unsigned int i, unsigned int j) | |
{ | |
char temp = input[i]; | |
input[i] = input[j]; | |
input[j] = temp; | |
} | |
// O(n) | |
bool is_permutation(const string &a, const string &b) | |
{ | |
// some obvious shortcuts | |
if (a.length() != b.length()) | |
{ | |
return false; | |
} | |
// get the char counts of both strings | |
char_count ccnt_a = get_char_count(a); | |
char_count ccnt_b = get_char_count(b); | |
//print_map(ccnt_a); | |
//print_map(ccnt_b); | |
// then compare them, the two strings are permutations of each other | |
// iff the two char counts are identical | |
return is_equivalent(ccnt_a, ccnt_b); | |
// NOTE: this solution runs in O(n), more precisely, O(3n) | |
} | |
char_count get_char_count(const string& input) | |
{ | |
char_count ccnt; | |
for (unsigned int i = 0; i < input.length(); ++i) | |
{ | |
char c = input[i]; | |
if (0 == ccnt.count(c)) | |
{ | |
ccnt[c] = 1; | |
} | |
else | |
{ | |
ccnt[c] += 1; | |
} | |
} | |
return ccnt; | |
} | |
template <typename K, typename V> | |
bool is_equivalent(const map<K,V> &a, const map<K,V> &b) | |
{ | |
// first, get the key sets of both maps | |
set<K> keys_a, keys_b; | |
for (typename map<K,V>::const_iterator cit = a.cbegin(); cit != a.cend(); ++cit) | |
{ | |
keys_a.insert(cit->first); | |
} | |
for (typename map<K,V>::const_iterator cit = b.cbegin(); cit != b.cend(); ++cit) | |
{ | |
keys_b.insert(cit->first); | |
} | |
// obvious shortcuts | |
if (keys_a.size() != keys_b.size()) | |
{ | |
return false; | |
} | |
// now, since both sets of keys are of equal length, we only need to iterate | |
// through one of them | |
for (typename set<K>::iterator it = keys_a.begin(); it != keys_a.end(); ++it) | |
{ | |
// if the other map doesn't has this key, then game over | |
if (0 == b.count(*it)) | |
{ | |
return false; | |
} | |
// if they do, the value must be equal | |
if (a.at(*it) != b.at(*it)) | |
{ | |
return false; | |
} | |
} | |
// if we get here, then the two maps have equal set of keys with equal values | |
return true; | |
} | |
#ifdef _DEBUG | |
template <typename K, typename V> | |
void print_map(const map<K,V> &input) | |
{ | |
cout << "{" << endl; | |
for (typename map<K,V>::const_iterator cit = input.cbegin(); cit != input.cend(); ++cit) | |
{ | |
cout << " " << cit->first << ": " << cit->second << "," << endl; | |
} | |
cout << "}" << endl; | |
} | |
#endif | |
string encode_num(const string &input) | |
{ | |
size_t inlen = input.length(); | |
if (inlen <= 1) | |
{ | |
return string(input); | |
} | |
char seen = input[0]; | |
int count = 1; | |
ostringstream encoded; | |
// NOTE: this loop, and therefore this implementation, only works for input | |
// strings of length 2 or more! | |
for (unsigned int i = 1; i < inlen; ++i) | |
{ | |
if (input[i] == seen) | |
{ | |
// seeing the same character as before, simply increment count | |
++count; | |
continue; | |
} | |
else | |
{ | |
// seeing a different character for the first time; | |
// flush the previous character seen and its running count | |
encoded << seen << count; | |
// then reset and check if the encoded string so far has | |
// exceeded the input length | |
seen = input[i]; | |
count = 1; | |
if ((int)encoded.tellp() >= (int)inlen) | |
{ | |
return string(input); | |
} | |
} | |
} | |
// don't forget to flush the last seen character and its run count! | |
encoded << seen << count; | |
if ((int)encoded.tellp() >= (int)inlen) | |
{ | |
return string(input); | |
} | |
return encoded.str(); | |
} | |
string sanitize_spaces(const string &input) | |
{ | |
// count the number of space characters in the input string | |
int count = 0; | |
for (char c : input) | |
{ | |
if (c == ' ') | |
{ | |
++count; | |
} | |
} | |
// allocate an array of char that is expanded to accomodate | |
// expected additional characters: | |
// original length + additional 2 chars per space found (+ null terminator) | |
size_t expected_count = input.length() + count * 2 + 1; | |
unique_ptr<char[]> output_chars( new char[expected_count] ); | |
memset(output_chars.get(), 0, sizeof(char) * expected_count); | |
// copy the input characters into this expanded vector | |
memcpy(output_chars.get(), input.c_str(), sizeof(char) * input.length()); | |
_sanitize_spaces(output_chars.get(), expected_count - 1); | |
return string(output_chars.get()); | |
} | |
// buffer is a char array, and ccount is the number of characters that buffer | |
// can hold; | |
// buffer should be modified in-place to replace ' ' with "%20" | |
// ccount should hold enough room to accomodate the final character count, | |
// not including the null terminator | |
void _sanitize_spaces(char *buffer, size_t ccount) | |
{ | |
// input sanitation | |
if (buffer == nullptr || ccount == 0) | |
{ | |
return; | |
} | |
// the input array is completely filled with original input, | |
// so that tells us that original input string was not expanded, | |
// which means that there was no space (' ') character in the original input | |
if (buffer[ccount-1] != '\0') | |
{ | |
#ifdef _DEBUG | |
for (unsigned int i = 0; i < ccount; ++i) | |
{ | |
assert (buffer[i] != ' '); | |
} | |
#endif | |
return; | |
} | |
// validate that we have enough room in the input buffer | |
unsigned int num_spaces = 0; | |
for (unsigned int i = 0; i < ccount; ++i) | |
{ | |
if (buffer[i] == ' ') | |
{ | |
++num_spaces; | |
} | |
} | |
size_t orig_len = strlen(buffer); | |
if (ccount - orig_len != 2 * num_spaces) | |
{ | |
assert (false); | |
return; | |
} | |
_do_sanitize_spaces(buffer, ccount); | |
} | |
// assume input is already validated | |
void _do_sanitize_spaces(char *buffer, size_t ccount) | |
{ | |
int i, j; | |
#ifdef _DEBUG | |
const string &orig(buffer); | |
#endif | |
// first, move the padding from the end of the buffer to the beginning | |
for (i = strlen(buffer) - 1, j = ccount - 1; | |
i >= 0; | |
--i, --j) | |
{ | |
buffer[j] = buffer[i]; | |
} | |
++j; // this now points to the start of the original string, | |
// somewhere in the middle of the buffer | |
#ifdef _DEBUG | |
assert (string(&buffer[j]) == orig); | |
#endif | |
// now, move the string back to the head of the buffer, one character | |
// at a time, while looking for space (' ') characters | |
i = 0; | |
for (; j < (int)ccount; ++j) | |
{ | |
assert (i >= 0 && i < (int)ccount); | |
if (buffer[j] == ' ') | |
{ | |
buffer[i++] = '%'; | |
buffer[i++] = '2'; | |
buffer[i++] = '0'; | |
} | |
else | |
{ | |
buffer[i++] = buffer[j]; | |
} | |
} | |
} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <string> | |
#include <vector> | |
using std::string; | |
using std::vector; | |
/** | |
* Given a string, find out whether it contains only unique characters. | |
*/ | |
bool has_all_unique_chars(const string &input); | |
bool has_all_unique_chars_ex(const string &input); | |
bool has_all_unique_chars_ex(string &input); | |
/** | |
* Reverse a string in-place. | |
*/ | |
void reverse_in_place(string &input); | |
void reverse_in_place(char *input); | |
/** | |
* Given two string inputs, find out whether they are permutations of | |
* each other. | |
*/ | |
bool is_permutation(const string &a, const string &b); | |
/** | |
* Encode a string in this manner: | |
* e.g. "abbcccccd" -> "a1b2c5d1" | |
* But only if the output is shorter than the input. | |
*/ | |
string encode_num(const string &input); | |
/** | |
* Sanitize space (' ') characters by replacing each one into "%20". | |
*/ | |
string sanitize_spaces(const string &input); | |
void _sanitize_spaces(char *buffer, size_t ccount); | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment