Skip to content

Instantly share code, notes, and snippets.

@tbl3rd
Created March 19, 2013 00:14
Show Gist options
  • Save tbl3rd/5192339 to your computer and use it in GitHub Desktop.
Save tbl3rd/5192339 to your computer and use it in GitHub Desktop.
tbl.local # ls
stl.cc
tbl.local # cat stl.cc
// C++ has strong support for internationalization and localization, and
// it is better developed than the C equivalents. Here's a small sample
// program illustrating some techniques. This program is pure ANSI/ISO
// standard C++. It should compile and run on any conforming hosted C++
// implementation on any OS. If you want it to run on a platform that
// uses wide characters for its native data representation, just change
// 'std::string' to 'std::wstring' in main(). It parses words and frobs
// case according to locale. In C++ it is fairly simple to have multiple
// locales active in a process too -- so-called "non-global locales".
//
// Notice that this program's whole reason for being is to manipulate
// characters and strings, but it doesn't mention any char or string
// types except in main().
//
// I wanted to parameterize on character -- rather than string -- type,
// but bugs in the trait classes in the MSVC++ standard library prevent
// that. So instead, I parameterize string type and use its value_type
// as the character type. Only main() knows the character/string type.
//
// This program is basically the old Unix scripting example
//
// cat "$@" |
// tr -cs 'A-Za-z' '\012' |
// sed '/^$/d' |
// tr 'A-Z' 'a-z' |
// sort |
// uniq -c |
// sort -n -r
//
// written in C++ and abstracted over character types
// and locales. It counts word frequency in a list of
// files, and prints a table sorted from most frequent
// to least.
// #pragma warning(disable: 4786)
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iostream>
#include <iterator>
#include <locale>
#include <map>
#include <sstream>
#include <string>
#include <vector>
namespace {
// Fill the multimap u with (mapped, key) pairs made by swapping every
// (key, mapped) entry of m. The entries of m are visited from last to
// first, and each insertion is hinted with the position returned by the
// previous insertion.
//
template<class Multi, class Map>
inline void invertMapReverse(Multi &u, const Map &m)
{
    typedef typename Multi::value_type Swapped;
    typename Multi::iterator where = u.begin();
    typename Map::const_reverse_iterator entry = m.rbegin();
    while (entry != m.rend()) {
        where = u.insert(where, Swapped(entry->second, entry->first));
        ++entry;
    }
}
// Function object bound to a stream: each call writes the given
// std::pair to that stream as "first second\n".
//
template<class Pair, class Stream> class ShowPair {
public:
    ShowPair(Stream &s): itsStream(s) {}
    void operator()(const Pair &p) {
        itsStream << p.first
                  << ' '
                  << p.second
                  << '\n';
    }
private:
    Stream &itsStream;  // Not owned; must outlive this object.
};
// Return the one locale object shared by the whole program. It is
// default-constructed on first use, which makes it a copy of the global
// C++ locale in effect at that moment.
//
inline const std::locale &theLocale()
{
    static const std::locale shared = std::locale();
    return shared;
}
// Function object that counts one word per call. Given a string, it
// takes the first unbroken run of alphanumeric characters (as judged by
// theLocale()) and increments that word's count in wtc. A string with
// no alphanumeric character is ignored.
//
template<class WordsToCounts> struct CollectOneWord {
    typedef typename WordsToCounts::key_type StringT;
    typedef typename StringT::value_type CharT;
    typedef typename StringT::const_iterator ConstIterator;
    CollectOneWord(WordsToCounts &wtc): itsWtc(wtc) {}
    void operator()(const StringT &s) {
        const ConstIterator stop = s.end();
        // Skip leading punctuation; nothing to do if that is all there is.
        const ConstIterator first = std::find_if(s.begin(), stop, isAlNum);
        if (first == stop) return;
        // The word ends at the next non-alphanumeric character.
        const ConstIterator last = std::find_if(first, stop, isNotAlNum);
        ++itsWtc[StringT(first, last)];
    }
private:
    static bool isAlNum(CharT c) { return std::isalnum(c, theLocale()); }
    static bool isNotAlNum(CharT c) { return !std::isalnum(c, theLocale()); }
    WordsToCounts &itsWtc;  // Not owned; receives the counts.
};
// Function object that lowercases a string according to theLocale() and
// then hands the result to CollectOneWord, which counts the word in it.
//
template<class WordsToCounts> class CollectFromLowercaseString {
    typedef typename WordsToCounts::key_type StringT;
    typedef typename StringT::value_type CharT;
    CollectOneWord<WordsToCounts> itsCow;
    static CharT toLowercase(CharT c) { return std::tolower(c, theLocale()); }
public:
    // Lowercase a copy of s and collect the word it contains.
    void operator()(const StringT &s) {
        // Transform a copy in place instead of going through a stream
        // iterator: std::ostream_iterator<CharT> writes to a char stream
        // by default, so it does not compile when CharT is a wide
        // character type.
        StringT lowered(s);
        std::transform(lowered.begin(), lowered.end(), lowered.begin(),
                       toLowercase);
        itsCow(lowered);
    }
    // Counts are accumulated into wtc, which must outlive this object.
    CollectFromLowercaseString(WordsToCounts &wtc): itsCow(wtc) {}
};
// Function object that reads the file at pathname as a sequence of
// whitespace-separated tokens and feeds each token to
// CollectFromLowercaseString, accumulating word counts into wtc.
//
template<class WordsToCounts> class CollectAllWordsFromFile {
    typedef typename WordsToCounts::key_type StringT;
    typedef typename StringT::value_type CharT;
    CollectFromLowercaseString<WordsToCounts> itsCollector;
public:
    // Collect every token of the named file.
    // Throws pathname (as const char *) if the file cannot be opened.
    void operator()(const char *pathname) {
        std::basic_ifstream<CharT> inputFile(pathname);
        if (!inputFile) throw pathname;
        // Name the stream's character type explicitly: the second
        // template argument of std::istream_iterator defaults to char,
        // which would not match a wide-character input stream.
        std::istream_iterator<StringT, CharT> begin(inputFile), end;
        std::for_each(begin, end, itsCollector);
    }
    // Counts are accumulated into wtc, which must outlive this object.
    CollectAllWordsFromFile(WordsToCounts &wtc): itsCollector(wtc) {}
};
// Represent (ac, av) as a minimal read-only container of the arguments
// that follow the program name, and validate it.
//
class CommandLine {
    int itsAc;
    char **itsAv;
public:
    // First argument after the program name. Equal to end() when there
    // are no arguments, including the case where ac is 0 and not even a
    // program name was supplied.
    char **begin() const { return itsAc > 0 ? itsAv + 1 : end(); }
    // One past the last argument.
    char **end() const { return itsAv + itsAc; }
    // The program name, or an empty string when none is available, so
    // the result is always safe to stream.
    const char *name() const {
        return (itsAc > 0 && itsAv[0]) ? itsAv[0] : "";
    }
    // Write a one-line usage message to std::cerr.
    void showUsage() const {
        std::cerr << "Usage: " << name() << " [<file> ...]" << std::endl;
    }
    // Return true if at least one file argument was given; otherwise
    // show the usage message and return false.
    bool validate() const {
        const bool ok = itsAc > 1;
        if (!ok) showUsage();
        return ok;
    }
    // ac and av are the arguments of main(); av is not copied and must
    // outlive this object.
    CommandLine(int ac, char **av): itsAc(ac), itsAv(av) {}
};
// Count the frequency of words occuring in files named on cl.
//
template<class String, class Stream>
inline void countWordFrequency(const CommandLine &cl, Stream &outS)
{
typedef std::map<String, int, std::less<String> > WordsToCounts;
typedef std::multimap<int, String, std::less<int> > CountsToWords;
WordsToCounts wordsIn;
CollectAllWordsFromFile<WordsToCounts> caw(wordsIn);
std::for_each(cl.begin(), cl.end(), caw);
CountsToWords wordsOut;
invertMapReverse(wordsOut, wordsIn);
ShowPair<typename CountsToWords::value_type, Stream> showPair(outS);
std::for_each(wordsOut.rbegin(), wordsOut.rend(), showPair);
}
}
int main(int ac, char *av[])
{
CommandLine cl(ac, av);
bool ok = cl.validate();
try {
if (ok) countWordFrequency<std::string>(cl, std::cout);
} catch (const char *pathname) {
std::cerr << cl.name() << ": cannot open file named \""
<< pathname << "\"." << std::endl;
cl.showUsage();
ok = false;
}
return ok? EXIT_SUCCESS: EXIT_FAILURE;
}
// The CommandLine class is a minimal STL-style container -- implemented
// only enough to use conveniently with for_each().
//
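// invertMapReverse() is a new "generic algorithm" that inverts the
// mapping relation in any "associative container". Every insertion is
// hinted, so it runs in linear time when each hint lands next to the
// insertion point; otherwise an insertion costs O(log n).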
tbl.local # make stl
g++ stl.cc -o stl
tbl.local # ./stl stl.cc | head
26 std
17 the
15 a
13 it
13 in
13 const
11 typename
11 and
10 typedef
10 stringt
tbl.local #
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment