Skip to content

Instantly share code, notes, and snippets.

@alipha
Created March 14, 2019 14:32
Show Gist options
  • Save alipha/a9bafec8a7d2a4fe5209eb3ea0c88413 to your computer and use it in GitHub Desktop.
Save alipha/a9bafec8a7d2a4fe5209eb3ea0c88413 to your computer and use it in GitHub Desktop.
Find all strings in a set which are not a substring of another string within the set
#include <string>
#include <iostream>
#include <fstream>
#include <ctime>
#include <cstring>
#include <set>
#include <algorithm>
using namespace std;
class cstring {
public:
cstring(const char *s) : str(s) {}
const char *str;
};
bool operator<(const cstring &left, const cstring &right) {
return strcmp(left.str, right.str) < 0;
}
cstring to_lower(char *str) {
transform(str, str + strlen(str), str, ::tolower);
return str;
}
void explode(set<cstring> &segments, const char *str) {
for(size_t i = 1, len = strlen(str); i < len; i++)
segments.insert(str + i);
}
void explode_all(set<cstring> &segments, set<cstring> &words) {
for(set<cstring>::iterator word = words.begin(); word != words.end(); word++)
explode(segments, word->str);
}
bool word_found(const set<cstring> &segments, const cstring &word, bool use_lower) {
set<cstring>::iterator it = use_lower ? segments.lower_bound(word) : segments.upper_bound(word);
if(it == segments.end())
return false;
//cout << word.str << " == " << it->str << endl;
return strncmp(word.str, it->str, strlen(word.str)) == 0 && it->str != word.str;
}
bool slow_word_found(const set<cstring> &words, const cstring &word) {
for(set<cstring>::iterator it = words.begin(); it != words.end(); it++) {
if(strstr(it->str, word.str) && it->str != word.str)
return true;
}
return false;
}
void load_file(const char *filename, set<cstring> &words) {
string line;
ifstream file(filename);
while(getline(file, line)) {
char *word = new char[line.size() + 1];
strcpy(word, line.c_str());
words.insert(to_lower(word));
}
file.close();
}
void save(const char *filename, const set<cstring> &output) {
ofstream file(filename);
for(set<cstring>::const_iterator it = output.begin(); it != output.end(); it++) {
file << it->str << endl;
}
file.close();
}
int main(int argc, char **argv) {
set<cstring> words;
set<cstring> segments;
set<cstring> output;
set<cstring> slow_output;
if(argc < 2) {
cout << "Must specify input filename" << endl;
return 1;
}
load_file(argv[1], words);
clock_t start = clock();
explode_all(segments, words);
clock_t end = clock();
cout << "Explode: " << ((double)end - start) / CLOCKS_PER_SEC << endl;
/*
for(set<cstring>::iterator it = segments.begin(); it != segments.end(); it++) {
cout << it->str << endl;
}
cout << endl;
*/
start = clock();
for(set<cstring>::iterator word = words.begin(); word != words.end(); word++) {
if(!word_found(words, *word, false) && !word_found(segments, *word, true))
output.insert(*word);
}
end = clock();
cout << " Filter: " << ((double)end - start) / CLOCKS_PER_SEC << endl;
save("output.txt", output);
time_t sec_start = time(0);
start = clock();
for(set<cstring>::iterator word = words.begin(); word != words.end(); word++) {
if(!slow_word_found(words, *word))
slow_output.insert(*word);
}
end = clock();
time_t sec_end = time(0);
cout << " Slow: " << ((double)end - start) / CLOCKS_PER_SEC << endl;
cout << "in secs: " << sec_end - sec_start << endl;
save("slow_output.txt", slow_output);
cout << words.size() << " words" << endl;
cout << segments.size() << " segments" << endl;
cout << output.size() << " unique" << endl;
cout << slow_output.size() << " unique (double check)" << endl;
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment