Created
April 16, 2019 16:52
-
-
Save backsy/a08bf9dcb07c7874ae5839f16d781fb0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <errno.h>

#include <algorithm>
#include <cctype>
#include <cstdio>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <set>
#include <string>
#include <utility>
#include <vector>
// Number of entries main() asks most_common() for when printing the report.
const unsigned int DISPLAY_N_RESULTS = 10;
// Path of the text file main() opens for analysis. Both the characters and
// the pointer are const so the path cannot be reseated at runtime.
const char* const FILENAME = "test_text.txt";
// Ordering functor for pair-like values (anything exposing .first and .second).
// Values are ranked by .second; when two values have equal .second the
// ranking falls back to comparing .first.
struct comp
{
    template<typename T>
    bool operator()(const T& l, const T& r) const
    {
        // equal .second -> decide on .first, otherwise .second decides alone
        return (l.second == r.second)
            ? (l.first < r.first)
            : (l.second < r.second);
    }
};
void check_file_opened(std::ifstream &stream); | |
void increment_counter(std::vector<std::pair<std::string, int> > &vector, std::string &word); | |
std::set<std::pair<std::string, int>, comp> most_common( | |
std::vector<std::pair<std::string, int> > &vector, | |
unsigned int n); | |
int main () { | |
std::ifstream filestream(FILENAME); | |
check_file_opened(filestream); | |
char ch; | |
int total_combinations = 0; | |
std::string word_buffer; | |
std::string temporary_subword; | |
std::vector<std::pair<std::string, int> > counter; | |
// loop getting single characters | |
while (filestream.get(ch)) | |
{ | |
if (isalpha(ch)){ | |
if (isupper(ch)) ch = tolower(ch); | |
// add valid chars to word_buffer | |
word_buffer.push_back(ch); | |
// if more than 3 chars in word_buffer add the combinations to counter | |
if (word_buffer.length() > 3) { | |
temporary_subword = word_buffer; | |
while (temporary_subword.length() > 3) | |
{ | |
increment_counter(counter, temporary_subword); | |
total_combinations += 1; | |
// remove first character from word and try adding that combination too | |
temporary_subword.erase(temporary_subword.begin()); | |
} | |
// empty the string for next combination | |
temporary_subword.clear(); | |
} | |
// only invalid characters reset the word_buffer | |
} else { | |
word_buffer.clear(); | |
} | |
} | |
if (filestream.bad()) | |
{ | |
perror("error while reading file"); | |
} | |
filestream.close(); | |
// print the results | |
std::set<std::pair<std::string, int>, comp> top = most_common(counter, DISPLAY_N_RESULTS); | |
std::set<std::pair<std::string, int> >::reverse_iterator it2; | |
double percent = 0; | |
std::cout << "Frequency:" << std::endl; | |
for(it2 = top.rbegin(); it2 != top.rend(); it2++) | |
{ | |
percent = (it2->second/(double) total_combinations) * 100; | |
std::cout | |
<< std::left | |
<< std::setw(10) | |
<< it2->first | |
<< ": " | |
<< std::right | |
<< std::fixed | |
<< std::setprecision(2) | |
<< std::setw(6) | |
<< percent << "% " | |
<< std::left | |
<< std::setw(50) | |
<< std::string(percent, '=') | |
<< std::endl; | |
} | |
return 0; | |
} | |
/*
 * Reports why `stream` is in a failed state; does nothing for a healthy stream.
 *
 * The reason is taken from errno, so this is meant to be called right after
 * the open attempt. All diagnostics are written to the standard error stream.
 * The function only reports: the caller still has to check the stream itself.
 */
void check_file_opened(std::ifstream &stream)
{
    // snapshot errno before any further library call gets a chance to change it
    const int open_error = errno;
    if (!stream.fail())
    {
        return;
    }
    switch (open_error)
    {
    case EACCES:
        // this is set if the drive is not ready in DOS
        std::cerr << "Drive not ready or permission denied" << '\n';
        break;
    case ENOENT:
        std::cerr << "Could not find this file" << '\n';
        break;
    case 0:
        // no error code recorded; perror() would misleadingly print "Success"
        std::cerr << "opening data file: unknown error" << '\n';
        break;
    default:
        // make sure perror() describes the error captured on entry
        errno = open_error;
        std::perror("opening data file");
    }
}
/*
 * Records one occurrence of `word` in `vector`.
 *
 * The entries are searched front to back. The first entry whose string
 * equals `word` gets its count raised by one; when there is no such entry a
 * new (word, 1) pair is appended at the end. `word` itself is not modified.
 */
void increment_counter(
    std::vector<std::pair<std::string, int> > &vector,
    std::string &word)
{
    typedef std::vector<std::pair<std::string, int> >::size_type index_type;

    const index_type known = vector.size();
    index_type slot = 0;
    // walk forward until the word is found or the entries run out
    while (slot < known && !(vector[slot].first == word))
    {
        ++slot;
    }

    if (slot == known)
    {
        // first sighting of this word
        vector.push_back(std::pair<std::string, int>(word, 1));
    }
    else
    {
        vector[slot].second += 1;
    }
}
std::set<std::pair<std::string, int>, comp> most_common( | |
std::vector<std::pair<std::string, int> > &vector, | |
unsigned int n) | |
{ | |
std::set<std::pair<std::string, int>, comp> top_n; | |
std::vector<std::pair<std::string,int> >::iterator it; | |
// make sure you dont try to access values outside the vector | |
if(n > vector.size()) | |
{ | |
n = vector.size(); | |
} | |
// limiting the scope of i | |
{ | |
// add first n items to set | |
unsigned int i = 0; | |
it = vector.begin(); | |
while(i < n) | |
{ | |
top_n.insert(std::pair<std::string, int>(it->first, it->second)); | |
i++; | |
it++; | |
} | |
} | |
for(it = vector.begin() + n; it != vector.end(); it++) | |
{ | |
// compare with top most value, if greater then add it to the set | |
// remove the top most value which is the smallest | |
// this uses reverse order than desired because | |
// erasing and comparing at the bottom runs into random segfaults | |
if(it->second > top_n.begin()->second) | |
{ | |
top_n.insert(std::pair<std::string, int>(it->first, it->second)); | |
top_n.erase(top_n.begin()); | |
} | |
} | |
return top_n; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment