Last active
March 12, 2017 16:54
-
-
Save reeFridge/950b697daf0c3a7437daf6f7bacc5453 to your computer and use it in GitHub Desktop.
Word frequency counter.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Author: reefridge <reefridgerator@gmail.com>
 * Compilation: `clang++ freqs.cpp -o freqs`
 * Synopsis: `freqs <input> <output>`
 */
#include<algorithm>
#include<cctype>
#include<cstddef>
#include<fstream>
#include<iostream>
#include<map>
#include<string>
#include<utility>
#include<vector>
using namespace std;
/** Indices into argv at which main() expects its two file paths. */
enum Arg {
    INPUT  = 1,  // argv index of the file to read words from
    OUTPUT = 2   // argv index of the file the frequency table is written to
};
/** Process exit statuses returned from main(). */
enum ExitCode {
    OK              = 0,  // run completed
    NOT_ENOUGH_ARGS = 1,  // fewer than two file arguments were supplied
    READ_DATA_ERROR = 2   // the input stream was not usable after opening
};
/**
 * A word paired with the number of times it has been counted.
 *
 * A freshly constructed entry has a count of zero; the count can only be
 * raised, one step at a time, through inc().
 */
struct WordFreq {
private:
    unsigned int count;
    std::string token;
public:
    /** Creates an entry for the word `w` with a count of zero. */
    // Initialisers are listed in declaration order (count, then token) so the
    // written order matches the order in which the members are really built.
    WordFreq(std::string w) : count(0), token(w) {}

    /** Returns the word this entry stands for. */
    const std::string& getWord() const {
        return token;
    }

    /** Returns how many times inc() has been called on this entry. */
    const unsigned int& getCount() const {
        return count;
    }

    /** Records one more occurrence of the word. */
    void inc() {
        ++count;
    }
};
// Characters that may appear inside a word: the 52 ASCII letters. splitPart
// cuts its input at the first character that is not in this set.
const std::string alph("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ");
vector<string> getWordsFromFile(ifstream&); | |
vector<string> splitPart(string&); | |
string& toLowerCase(string& str); | |
vector<WordFreq> wordsToFreqs(vector<string>&); | |
WordFreq& getFreq(vector<WordFreq>&, string&); | |
void sortFreqs(vector<WordFreq>&); | |
void writeFreqsToFile(ofstream&, vector<WordFreq>&); | |
int main(int argc, char** argv) { | |
if (argc < 3) { | |
cout << "Synopsis: freqs <input> <output>" << endl; | |
return NOT_ENOUGH_ARGS; | |
} | |
ifstream input(argv[INPUT]); | |
vector<string> words; | |
if (!input.good()) { | |
input.close(); | |
return READ_DATA_ERROR; | |
} | |
words = getWordsFromFile(input); | |
input.close(); | |
vector<WordFreq> freqs = wordsToFreqs(words); | |
sortFreqs(freqs); | |
ofstream output(argv[OUTPUT]); | |
writeFreqsToFile(output, freqs); | |
output.close(); | |
return OK; | |
} | |
vector<WordFreq> wordsToFreqs(vector<string>& words) { | |
vector<WordFreq> freqs; | |
for (int i = 0; i < words.size(); i++) | |
getFreq(freqs, words[i]).inc(); | |
return freqs; | |
} | |
/**
 * Returns the entry of `freqs` whose word equals `word`, appending a new
 * zero-count entry first when no such entry exists.
 *
 * The returned reference points into `freqs`, so it is only safe to use
 * until the vector is next modified.
 */
WordFreq& getFreq(std::vector<WordFreq>& freqs, std::string& word) {
    std::vector<WordFreq>::iterator entry = freqs.begin();
    while (entry != freqs.end() && entry->getWord() != word)
        ++entry;

    if (entry != freqs.end())
        return *entry;

    freqs.push_back(WordFreq(word));
    return freqs.back();
}
vector<string> getWordsFromFile(ifstream& file) { | |
vector<string> words, split; | |
string part, word; | |
while (!(file >> ws).eof()) { | |
file >> part; | |
if (file.fail()) | |
break; | |
do { | |
split = splitPart(part); | |
word = part; | |
if (!split.empty()) { | |
part = split.back(); | |
split.pop_back(); | |
word = split.back(); | |
} | |
if (word.length() > 1) | |
words.push_back(toLowerCase(word)); | |
} while (!split.empty()); | |
} | |
return words; | |
} | |
vector<string> splitPart(string& part) { | |
size_t charPos = part.find_first_not_of(alph); | |
vector<string> res; | |
if (charPos != string::npos) { | |
res.push_back(part.substr(0, charPos)); | |
res.push_back(part.substr(charPos + 1)); | |
} | |
return res; | |
} | |
/**
 * Converts every character of `str` to lower case in place, using
 * std::tolower.
 *
 * Returns a reference to the same string that was passed in.
 */
std::string& toLowerCase(std::string& str) {
    for (std::string::size_type i = 0; i < str.length(); ++i) {
        // std::tolower requires a value representable as unsigned char (or
        // EOF); a plain char may be negative, so convert it first.
        const unsigned char ch = static_cast<unsigned char>(str[i]);
        str[i] = static_cast<char>(std::tolower(ch));
    }
    return str;
}
void swapFreqs(vector<WordFreq>& freqs, int i, int j) { | |
WordFreq temp = freqs[i]; | |
freqs[i] = freqs[j]; | |
freqs[j] = temp; | |
} | |
void sortFreqs(vector<WordFreq>& freqs) { | |
int size = freqs.size(); | |
for (int i = 0; i < size - 1; i++) { | |
for (int j = 0; j < size - 1; j++) { | |
unsigned int current = freqs[j].getCount(); | |
unsigned int next = freqs[j+1].getCount(); | |
bool needSwap = false; | |
if (current < next) { | |
needSwap = true; | |
} else if (current == next) { | |
if (freqs[j].getWord() > freqs[j+1].getWord()) | |
needSwap = true; | |
} | |
if (needSwap) | |
swapFreqs(freqs, j, j+1); | |
} | |
} | |
} | |
void writeFreqsToFile(ofstream& output, vector<WordFreq>& freqs) { | |
for (int i = 0; i < freqs.size(); i++) { | |
output << freqs[i].getCount() << ' ' << freqs[i].getWord() << endl; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment