Skip to content

Instantly share code, notes, and snippets.

@santa4nt
Last active August 29, 2015 14:26
Show Gist options
  • Save santa4nt/df22ba79fbc310ac38bc to your computer and use it in GitHub Desktop.
Save santa4nt/df22ba79fbc310ac38bc to your computer and use it in GitHub Desktop.
Various implementations of string manipulations in C++.
CC = g++
CFLAGS = -std=c++11 -Wall -g
LFLAGS = -lboost_unit_test_framework
DEFINES = -D_DEBUG
TARGET = stringmisc-test
all: $(TARGET)
stringmisc-test: stringmisc.o stringmisc-test.o
$(CC) stringmisc.o stringmisc-test.o $(LFLAGS) -o $(TARGET)
stringmisc.o: stringmisc.cpp
$(CC) $(CFLAGS) $(DEFINES) -c stringmisc.cpp
stringmisc-test.o: stringmisc-test.cpp
$(CC) $(CFLAGS) $(DEFINES) -c stringmisc-test.cpp
run: all
./$(TARGET)
valg: all
valgrind --leak-check=full -v ./$(TARGET)
clean:
rm -f *.o *.gch $(TARGET)
#include "stringmisc.hpp"
#include <vector>
#include <string>
#include <memory>
#include <utility>
#include <cstring>
#define BOOST_TEST_DYN_LINK
#define BOOST_TEST_MODULE string-misc-test
#include <boost/test/unit_test.hpp>
using std::string;
using std::vector;
using std::pair;
using std::unique_ptr;
BOOST_AUTO_TEST_CASE( has_all_unique_chars__empty_string__returns_true )
{
BOOST_REQUIRE( has_all_unique_chars("") );
BOOST_REQUIRE( has_all_unique_chars_ex("") );
string s = "";
BOOST_REQUIRE( has_all_unique_chars_ex(s) );
BOOST_CHECK_EQUAL( "", s );
}
BOOST_AUTO_TEST_CASE( has_all_unique_chars__unique_chars_input__returns_true )
{
BOOST_REQUIRE( has_all_unique_chars("abcdefghijklmnoprstuvwxyzABCDEFGHIJKLMNOPRSTUVWXYZ") );
BOOST_REQUIRE( has_all_unique_chars_ex("abcdefghijklmnoprstuvwxyz ABCDEFGHIJKLMNOPRSTUVWXYZ") );
BOOST_REQUIRE( has_all_unique_chars("@bcdefghijklmnoprstuvwxyz ABCDEFGHIJKLMNOPRSTUVWXYZ") );
BOOST_REQUIRE( has_all_unique_chars_ex(" abcdefghijkl#$%prstuvwxyzABCDEFGHIJKLMN0123456789") );
string s = "Helo W0rd";
BOOST_REQUIRE( has_all_unique_chars_ex(s) );
BOOST_CHECK_EQUAL( " 0HWdelor", s );
}
BOOST_AUTO_TEST_CASE( has_all_unique_chars__has_duplicate_chars_input__returns_false )
{
BOOST_REQUIRE( !has_all_unique_chars("abcdefghijklmnoprstuvwxyzaBCDEFGHIJKLMNOPRSTUVWXYZ") );
BOOST_REQUIRE( !has_all_unique_chars_ex("abcdefghijklmnoprstuvwxyz ABCDEFGHIJKLMNOPRSTUVWXYZ") );
BOOST_REQUIRE( !has_all_unique_chars("@bcdefghijklmnoprstuvwxyz ABCDEFGHIJKLMNOPRSTUVWXZZ") );
BOOST_REQUIRE( !has_all_unique_chars_ex(" abcdefghijkl#$%%rstuvwxyzABCDEFGHIJKLMN0123456789") );
string s = "Hello World";
BOOST_REQUIRE( !has_all_unique_chars_ex(s) );
BOOST_CHECK_EQUAL( " HWdellloor", s );
}
BOOST_AUTO_TEST_CASE( reverse_in_place__empty_string__do_nothing )
{
string test = ""; // basic edge case
reverse_in_place(test);
BOOST_REQUIRE_EQUAL( "", test );
char ctest[1] = "";
reverse_in_place(ctest);
BOOST_REQUIRE_EQUAL( "", ctest );
}
BOOST_AUTO_TEST_CASE( reverse_in_place__one_char_string__do_nothing )
{
string test = "a"; // another edge case
reverse_in_place(test);
BOOST_REQUIRE_EQUAL( "a", test );
char ctest[2] = {0};
ctest[0] = 'a';
reverse_in_place(ctest);
BOOST_REQUIRE_EQUAL( "a", ctest );
}
BOOST_AUTO_TEST_CASE( reverse_in_place__test_string__do_reverse )
{
string test = "A "; // even # of chars, and only two chars (another edge case)
reverse_in_place(test);
BOOST_REQUIRE_EQUAL( " A", test );
test = "Hello World"; // odd # of chars, and arbitrary
reverse_in_place(test);
BOOST_REQUIRE_EQUAL( "dlroW olleH", test );
// now, using raw C-strings ...
test = "A "; // even # of chars, and only two chars (another edge case)
char ctest[32] = {0};
memcpy(ctest, test.c_str(), test.length());
reverse_in_place(ctest);
BOOST_REQUIRE_EQUAL( " A", ctest );
test = "Hello World"; // odd # of chars, and arbitrary
memset(ctest, 0, sizeof(char) * 32);
memcpy(ctest, test.c_str(), test.length());
reverse_in_place(ctest);
BOOST_REQUIRE_EQUAL( "dlroW olleH", ctest );
}
BOOST_AUTO_TEST_CASE( is_permutation__empty_strings__returns_expected )
{
BOOST_REQUIRE( is_permutation("", "") );
BOOST_REQUIRE( !is_permutation("", " ") );
BOOST_REQUIRE( !is_permutation("\t", "") );
}
BOOST_AUTO_TEST_CASE( is_permutation__permuted_strings__returns_true )
{
BOOST_REQUIRE( is_permutation("abcdefg", "gfedcba") );
BOOST_REQUIRE( is_permutation("cafdgbe", "abcdefg") );
BOOST_REQUIRE( is_permutation("aaa", "aaa") );
BOOST_REQUIRE( is_permutation("A", "A") );
}
BOOST_AUTO_TEST_CASE( is_permutation__not_permuted_strings__returns_false )
{
BOOST_REQUIRE( !is_permutation("abcdefg", "gfedcbA") );
BOOST_REQUIRE( !is_permutation("cafdgbe", "abccefg") );
BOOST_REQUIRE( !is_permutation("aa", "a") );
}
BOOST_AUTO_TEST_CASE( encode_num__empty_string__returns_empty_string )
{
string enc = encode_num("");
BOOST_REQUIRE_EQUAL( "", enc );
}
BOOST_AUTO_TEST_CASE( encode_num__shorter_input__returns_input )
{
vector<string> inputs =
{
"a",
"AAaa",
"ab",
"abbcccd"
};
for (const string &input : inputs)
{
string enc = encode_num(input);
BOOST_REQUIRE_EQUAL( input, enc );
}
}
BOOST_AUTO_TEST_CASE( encode_num__longer_input__returns_encoded )
{
vector<pair<string, string>> inputs_encoded =
{
{"aaa", "a3"},
{"AAaaa", "A2a3"},
{"abbcccccd", "a1b2c5d1"}
};
for (const pair<string, string> &input_encoded : inputs_encoded)
{
string enc = encode_num(input_encoded.first);
BOOST_REQUIRE_EQUAL( input_encoded.second, enc );
}
}
BOOST_AUTO_TEST_CASE( sanitize_spaces__empty_string__returns_empty_string )
{
BOOST_REQUIRE_EQUAL( "", sanitize_spaces("").c_str() );
char buf[1] = {0};
_sanitize_spaces(buf, 0);
BOOST_REQUIRE_EQUAL( '\0', buf[0] );
}
BOOST_AUTO_TEST_CASE( sanitize_spaces__no_space_string__returns_input_string )
{
vector<string> inputs =
{
"a",
"AA",
"123",
"\t",
"HelloWorld!"
};
for (const string &input : inputs)
{
BOOST_REQUIRE_EQUAL( input, sanitize_spaces(input).c_str() );
}
}
BOOST_AUTO_TEST_CASE( sanitize_spaces__contains_spaces__returns_sanitized )
{
vector<pair<string, string>> inputs_sanitized =
{
{" ", "%20"},
{" ", "%20%20"},
{"a !", "a%20!"},
{"a b", "a%20%20b"},
{" bcd", "%20bcd"},
{"abc ", "abc%20%20"}
};
for (const pair<string, string> &input_sanitized : inputs_sanitized)
{
// test the wrapper
string sanitized = sanitize_spaces(input_sanitized.first);
BOOST_REQUIRE_EQUAL( input_sanitized.second, sanitized.c_str() );
// test the direct array access version
size_t buffer_count = input_sanitized.second.length() + 1;
unique_ptr<char[]> buffer( new char[buffer_count] );
memset(buffer.get(), 0, sizeof(char) * buffer_count);
strcpy(buffer.get(), input_sanitized.first.c_str());
_sanitize_spaces(buffer.get(), buffer_count - 1);
BOOST_REQUIRE( strcmp(buffer.get(), input_sanitized.second.c_str()) == 0 );
}
}
#include "stringmisc.hpp"
#include <set>
#include <map>
#include <memory>
#include <sstream>
#include <iostream>
#include <iterator>
#include <algorithm>
#include <cassert>
#include <cstring>
using std::set;
using std::map;
using std::sort;
using std::unique_ptr;
using std::ostringstream;
using std::cout;
using std::endl;
// private functions' declarations
inline static void _swap(string &input, unsigned int i, unsigned int j);
inline static void _swap(char *input, unsigned int i, unsigned int j);
using char_count = map<char, unsigned int>;
static char_count get_char_count(const string& input);
template <typename K, typename V>
static bool is_equivalent(const map<K,V> &a, const map<K,V> &b);
#ifdef _DEBUG
template <typename K, typename V>
static void print_map(const map<K,V> &input);
#else
template <typename K, typename V>
static void print_map(const map<K,V> &input) {}
#endif
static void _do_sanitize_spaces(char *buffer, size_t ccount);
// end private functions' declarations
// O(n) time and O(1) space
bool has_all_unique_chars(const string &input)
{
bool seen[1 << (sizeof(char) * 8)] = {0}; // 256 possible char values
// more precisely, this will use 256 booleans of space (~256 * 8 bytes)
// we can do better by using bitfields, so that each boolean only takes 1 bit of space
for (unsigned int i = 0; i < input.length(); ++i)
{
if (seen[(int)input[i]])
{
return false;
}
seen[(int)input[i]] = true;
}
return true;
}
// O(n^2) time and O(1) space, but no extra space needed
bool has_all_unique_chars_ex(const string &input)
{
for (unsigned int i = 0; i < input.length(); ++i)
{
for (unsigned int j = 0; j < input.length(); ++j)
{
if (i == j) continue;
if (input[i] == input[j])
{
return false;
}
}
}
return true;
}
// O(n log n) time and O(1) space, no extra space, but input is modified
bool has_all_unique_chars_ex(string &input)
{
if (input.length() == 0)
{
return true;
}
// first, sort the characters in the string in-place
std::sort (input.begin(), input.end());
// then, find consecutive duplicate characters
for (unsigned int i = 1; i < input.length(); ++i)
{
if (input[i] == input[i-1])
{
return false;
}
}
return true;
}
void reverse_in_place(string &input)
{
// NOTE: this will only work for implementation of std::string where
// its underlying buffer is accessible in this manner, AND it is null-terminated!
reverse_in_place(&input[0]);
}
// we really can't, but we have to assume that `input` is null-terminated
void reverse_in_place(char *input)
{
size_t len = strlen(input);
if (len <= 1)
{
return;
}
unsigned int i = 0;
unsigned int j = len - 1;
for (; i != j && i < j;)
{
_swap(input, i, j);
++i;
--j;
}
}
void _swap(string &input, unsigned int i, unsigned int j)
{
assert (i < input.length());
assert (j < input.length());
_swap(&input[0], i, j);
}
void _swap(char *input, unsigned int i, unsigned int j)
{
char temp = input[i];
input[i] = input[j];
input[j] = temp;
}
// O(n)
bool is_permutation(const string &a, const string &b)
{
// some obvious shortcuts
if (a.length() != b.length())
{
return false;
}
// get the char counts of both strings
char_count ccnt_a = get_char_count(a);
char_count ccnt_b = get_char_count(b);
//print_map(ccnt_a);
//print_map(ccnt_b);
// then compare them, the two strings are permutations of each other
// iff the two char counts are identical
return is_equivalent(ccnt_a, ccnt_b);
// NOTE: this solution runs in O(n), more precisely, O(3n)
}
char_count get_char_count(const string& input)
{
char_count ccnt;
for (unsigned int i = 0; i < input.length(); ++i)
{
char c = input[i];
if (0 == ccnt.count(c))
{
ccnt[c] = 1;
}
else
{
ccnt[c] += 1;
}
}
return ccnt;
}
template <typename K, typename V>
bool is_equivalent(const map<K,V> &a, const map<K,V> &b)
{
// first, get the key sets of both maps
set<K> keys_a, keys_b;
for (typename map<K,V>::const_iterator cit = a.cbegin(); cit != a.cend(); ++cit)
{
keys_a.insert(cit->first);
}
for (typename map<K,V>::const_iterator cit = b.cbegin(); cit != b.cend(); ++cit)
{
keys_b.insert(cit->first);
}
// obvious shortcuts
if (keys_a.size() != keys_b.size())
{
return false;
}
// now, since both sets of keys are of equal length, we only need to iterate
// through one of them
for (typename set<K>::iterator it = keys_a.begin(); it != keys_a.end(); ++it)
{
// if the other map doesn't has this key, then game over
if (0 == b.count(*it))
{
return false;
}
// if they do, the value must be equal
if (a.at(*it) != b.at(*it))
{
return false;
}
}
// if we get here, then the two maps have equal set of keys with equal values
return true;
}
#ifdef _DEBUG
template <typename K, typename V>
void print_map(const map<K,V> &input)
{
cout << "{" << endl;
for (typename map<K,V>::const_iterator cit = input.cbegin(); cit != input.cend(); ++cit)
{
cout << " " << cit->first << ": " << cit->second << "," << endl;
}
cout << "}" << endl;
}
#endif
string encode_num(const string &input)
{
size_t inlen = input.length();
if (inlen <= 1)
{
return string(input);
}
char seen = input[0];
int count = 1;
ostringstream encoded;
// NOTE: this loop, and therefore this implementation, only works for input
// strings of length 2 or more!
for (unsigned int i = 1; i < inlen; ++i)
{
if (input[i] == seen)
{
// seeing the same character as before, simply increment count
++count;
continue;
}
else
{
// seeing a different character for the first time;
// flush the previous character seen and its running count
encoded << seen << count;
// then reset and check if the encoded string so far has
// exceeded the input length
seen = input[i];
count = 1;
if ((int)encoded.tellp() >= (int)inlen)
{
return string(input);
}
}
}
// don't forget to flush the last seen character and its run count!
encoded << seen << count;
if ((int)encoded.tellp() >= (int)inlen)
{
return string(input);
}
return encoded.str();
}
string sanitize_spaces(const string &input)
{
// count the number of space characters in the input string
int count = 0;
for (char c : input)
{
if (c == ' ')
{
++count;
}
}
// allocate an array of char that is expanded to accomodate
// expected additional characters:
// original length + additional 2 chars per space found (+ null terminator)
size_t expected_count = input.length() + count * 2 + 1;
unique_ptr<char[]> output_chars( new char[expected_count] );
memset(output_chars.get(), 0, sizeof(char) * expected_count);
// copy the input characters into this expanded vector
memcpy(output_chars.get(), input.c_str(), sizeof(char) * input.length());
_sanitize_spaces(output_chars.get(), expected_count - 1);
return string(output_chars.get());
}
// buffer is a char array, and ccount is the number of characters that buffer
// can hold;
// buffer should be modified in-place to replace ' ' with "%20"
// ccount should hold enough room to accomodate the final character count,
// not including the null terminator
void _sanitize_spaces(char *buffer, size_t ccount)
{
// input sanitation
if (buffer == nullptr || ccount == 0)
{
return;
}
// the input array is completely filled with original input,
// so that tells us that original input string was not expanded,
// which means that there was no space (' ') character in the original input
if (buffer[ccount-1] != '\0')
{
#ifdef _DEBUG
for (unsigned int i = 0; i < ccount; ++i)
{
assert (buffer[i] != ' ');
}
#endif
return;
}
// validate that we have enough room in the input buffer
unsigned int num_spaces = 0;
for (unsigned int i = 0; i < ccount; ++i)
{
if (buffer[i] == ' ')
{
++num_spaces;
}
}
size_t orig_len = strlen(buffer);
if (ccount - orig_len != 2 * num_spaces)
{
assert (false);
return;
}
_do_sanitize_spaces(buffer, ccount);
}
// assume input is already validated
void _do_sanitize_spaces(char *buffer, size_t ccount)
{
int i, j;
#ifdef _DEBUG
const string &orig(buffer);
#endif
// first, move the padding from the end of the buffer to the beginning
for (i = strlen(buffer) - 1, j = ccount - 1;
i >= 0;
--i, --j)
{
buffer[j] = buffer[i];
}
++j; // this now points to the start of the original string,
// somewhere in the middle of the buffer
#ifdef _DEBUG
assert (string(&buffer[j]) == orig);
#endif
// now, move the string back to the head of the buffer, one character
// at a time, while looking for space (' ') characters
i = 0;
for (; j < (int)ccount; ++j)
{
assert (i >= 0 && i < (int)ccount);
if (buffer[j] == ' ')
{
buffer[i++] = '%';
buffer[i++] = '2';
buffer[i++] = '0';
}
else
{
buffer[i++] = buffer[j];
}
}
}
#include <string>
#include <vector>
using std::string;
using std::vector;
/**
* Given a string, find out whether it contains only unique characters.
*/
bool has_all_unique_chars(const string &input);
bool has_all_unique_chars_ex(const string &input);
bool has_all_unique_chars_ex(string &input);
/**
* Reverse a string in-place.
*/
void reverse_in_place(string &input);
void reverse_in_place(char *input);
/**
* Given two string inputs, find out whether they are permutations of
* each other.
*/
bool is_permutation(const string &a, const string &b);
/**
* Encode a string in this manner:
* e.g. "abbcccccd" -> "a1b2c5d1"
* But only if the output is shorter than the input.
*/
string encode_num(const string &input);
/**
* Sanitize space (' ') characters by replacing each one into "%20".
*/
string sanitize_spaces(const string &input);
void _sanitize_spaces(char *buffer, size_t ccount);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment