Skip to content

Instantly share code, notes, and snippets.

@reeFridge
Last active March 12, 2017 16:54
Show Gist options
  • Save reeFridge/950b697daf0c3a7437daf6f7bacc5453 to your computer and use it in GitHub Desktop.
Save reeFridge/950b697daf0c3a7437daf6f7bacc5453 to your computer and use it in GitHub Desktop.
Word frequency counter.
/**
* Author: reefridge <reefridgerator@gmail.com>
* Compilation: `clang++ freqs.cpp -o freqs`
* Synopsis: `freqs <input> <output>`
*/
#include<string>
#include<vector>
#include<cctype>
#include<fstream>
#include<iostream>
using namespace std;
/**
* Indices into argv for the two positional command-line arguments.
*/
enum Arg {
    INPUT = 1,  // argv index of the file that is read
    OUTPUT = 2  // argv index of the file that is written
};
/**
* Process exit codes returned from main().
*/
enum ExitCode {
    OK = 0,               // normal termination
    NOT_ENOUGH_ARGS = 1,  // fewer than two file arguments were given
    READ_DATA_ERROR = 2   // the input stream was not usable after opening
};
/**
* A word together with the number of times it has been counted.
*
* A freshly constructed entry has a count of zero; call inc() once per
* occurrence of the word.
*/
struct WordFreq {
private:
    unsigned int count;
    std::string token;
public:
    /**
    * Creates an entry for `w` with a count of zero.
    * Members are initialized in declaration order (count, then token).
    */
    WordFreq(const std::string& w) : count(0), token(w) {}

    /** Returns the stored word. Usable on const objects. */
    const std::string& getWord() const {
        return token;
    }

    /** Returns the current occurrence count. Usable on const objects. */
    const unsigned int& getCount() const {
        return count;
    }

    /** Records one more occurrence of the word. */
    void inc() {
        ++count;
    }
};
// Characters that may appear inside a word; splitPart() cuts a token at the
// first character that is not listed here.
const std::string alph("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ");
// Forward declarations of the helpers defined after main().

// Reads whitespace-separated tokens from `file` and returns the lower-cased words.
std::vector<std::string> getWordsFromFile(std::ifstream& file);
// Cuts `part` at its first character outside `alph`; empty result if there is none.
std::vector<std::string> splitPart(std::string& part);
// Lower-cases `str` in place and returns a reference to it.
std::string& toLowerCase(std::string& str);
// Builds one WordFreq per distinct word in `words`.
std::vector<WordFreq> wordsToFreqs(std::vector<std::string>& words);
// Returns the entry for `word` in `freqs`, appending a new one if it is missing.
WordFreq& getFreq(std::vector<WordFreq>& freqs, std::string& word);
// Orders `freqs` by descending count, then ascending word.
void sortFreqs(std::vector<WordFreq>& freqs);
// Writes one "<count> <word>" line per entry to `output`.
void writeFreqsToFile(std::ofstream& output, std::vector<WordFreq>& freqs);
int main(int argc, char** argv) {
if (argc < 3) {
cout << "Synopsis: freqs <input> <output>" << endl;
return NOT_ENOUGH_ARGS;
}
ifstream input(argv[INPUT]);
vector<string> words;
if (!input.good()) {
input.close();
return READ_DATA_ERROR;
}
words = getWordsFromFile(input);
input.close();
vector<WordFreq> freqs = wordsToFreqs(words);
sortFreqs(freqs);
ofstream output(argv[OUTPUT]);
writeFreqsToFile(output, freqs);
output.close();
return OK;
}
vector<WordFreq> wordsToFreqs(vector<string>& words) {
vector<WordFreq> freqs;
for (int i = 0; i < words.size(); i++)
getFreq(freqs, words[i]).inc();
return freqs;
}
/**
* Looks up the entry for `word` in `freqs` with a linear scan.
*
* When no entry matches, a new one with a count of zero is appended and
* returned. The returned reference points into `freqs`.
*/
WordFreq& getFreq(std::vector<WordFreq>& freqs, std::string& word) {
    typedef std::vector<WordFreq>::iterator FreqIter;
    for (FreqIter it = freqs.begin(); it != freqs.end(); ++it) {
        if (it->getWord() == word) {
            return *it;
        }
    }
    // Not seen before: register the word and hand back the new entry.
    freqs.push_back(WordFreq(word));
    return freqs.back();
}
/**
* Extracts the words contained in `file`.
*
* The stream is read as whitespace-separated tokens. Each token is cut
* repeatedly with splitPart(), so any character outside `alph` acts as a
* separator. Pieces shorter than two characters are dropped; the rest are
* lower-cased and returned in reading order.
*/
std::vector<std::string> getWordsFromFile(std::ifstream& file) {
    std::vector<std::string> collected;
    std::string chunk;
    for (;;) {
        // Skip leading whitespace; stop once that reaches the end of the stream.
        if ((file >> std::ws).eof())
            break;
        file >> chunk;
        if (file.fail())
            break;

        bool more = true;
        while (more) {
            std::vector<std::string> pieces = splitPart(chunk);
            std::string candidate;
            if (pieces.empty()) {
                // No separator left: the remainder is the final piece.
                candidate = chunk;
                more = false;
            } else {
                // Take the text before the separator, keep the rest for the next round.
                candidate = pieces.front();
                chunk = pieces.back();
            }
            if (candidate.length() > 1)
                collected.push_back(toLowerCase(candidate));
        }
    }
    return collected;
}
/**
* Cuts `part` at its first character that is not in `alph`.
*
* Returns an empty vector when every character is in `alph`. Otherwise
* returns two elements: the text before the offending character and the text
* after it (the character itself is discarded). Either element may be empty.
*/
std::vector<std::string> splitPart(std::string& part) {
    std::vector<std::string> pieces;
    const std::string::size_type cut = part.find_first_not_of(alph);
    if (cut == std::string::npos)
        return pieces;
    pieces.push_back(part.substr(0, cut));
    pieces.push_back(part.substr(cut + 1));
    return pieces;
}
/**
* Converts every character of `str` to lower case in place.
*
* Returns a reference to `str` itself so the call can be used inline.
*/
std::string& toLowerCase(std::string& str) {
    for (std::string::size_type i = 0; i < str.length(); ++i) {
        // std::tolower requires a value representable as unsigned char (or EOF);
        // passing a negative plain char is undefined behaviour.
        const unsigned char ch = static_cast<unsigned char>(str[i]);
        str[i] = static_cast<char>(std::tolower(ch));
    }
    return str;
}
void swapFreqs(vector<WordFreq>& freqs, int i, int j) {
WordFreq temp = freqs[i];
freqs[i] = freqs[j];
freqs[j] = temp;
}
/**
* Sorts `freqs` in place: highest count first, and entries with equal counts
* in ascending word order.
*
* Bubble sort with two standard refinements: each pass stops before the tail
* that earlier passes already settled, and the sort ends as soon as a pass
* performs no swap. The resulting order is the same as a full bubble sort.
*/
void sortFreqs(std::vector<WordFreq>& freqs) {
    // Indices stay `int` because swapFreqs() takes int positions.
    const int size = static_cast<int>(freqs.size());
    for (int unsettled = size - 1; unsettled > 0; --unsettled) {
        bool swapped = false;
        for (int j = 0; j < unsettled; ++j) {
            const unsigned int current = freqs[j].getCount();
            const unsigned int next = freqs[j + 1].getCount();
            const bool needSwap =
                current < next ||
                (current == next && freqs[j].getWord() > freqs[j + 1].getWord());
            if (needSwap) {
                swapFreqs(freqs, j, j + 1);
                swapped = true;
            }
        }
        // A pass without swaps means everything is already in order.
        if (!swapped)
            break;
    }
}
void writeFreqsToFile(ofstream& output, vector<WordFreq>& freqs) {
for (int i = 0; i < freqs.size(); i++) {
output << freqs[i].getCount() << ' ' << freqs[i].getWord() << endl;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment