Created
September 29, 2016 15:01
-
-
Save TinoDidriksen/7c8f7a2eee178073dcfc41578e9b455d to your computer and use it in GitHub Desktop.
DM555 Assignment 1: Apriori Algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <set> | |
#include <vector> | |
#include <map> | |
#include <algorithm> | |
#include <string> | |
using namespace std; // Wrong, but easier to read in homework | |
// Helper types | |
typedef set<char> row_t; | |
typedef vector<row_t> db_t; | |
typedef map<row_t, size_t> level_t; | |
typedef vector<level_t> levels_t; | |
// Helper function for pretty-printing rows | |
string to_string(const row_t& row) { | |
string rv{ "{ " }; | |
for (auto c : row) { | |
rv += c; | |
rv += ' '; | |
} | |
rv += '}'; | |
return rv; | |
} | |
// Helper function for making all combinations | |
db_t combinations(const row_t& row, size_t c) { | |
db_t cs; | |
if (row.size() < c) { | |
return cs; | |
} | |
vector<bool> bitset(row.size() - c, 0); | |
bitset.resize(row.size(), 1); | |
do { | |
row_t nr; | |
auto it = row.begin(); | |
for (size_t i = 0; i < row.size(); ++i, ++it) { | |
if (bitset[i]) { | |
nr.insert(*it); | |
} | |
} | |
cs.push_back(nr); | |
} while (next_permutation(bitset.begin(), bitset.end())); | |
return cs; | |
} | |
// Helper function to remove candidates below support threshold | |
level_t remove_infrequent(level_t cands, size_t e) { | |
for (auto it = cands.begin() ; it != cands.end() ; ) { | |
if (it->second < e) { | |
cout << "Discarding " << to_string(it->first) << " for e=" << it->second << endl; | |
it = cands.erase(it); | |
} | |
else { | |
++it; | |
} | |
} | |
return cands; | |
} | |
// Actual worker function | |
levels_t apriori(const db_t& db, size_t e) { | |
// Set up 1-itemsets | |
levels_t levels(1); | |
for (auto& row : db) { | |
for (auto c : row) { | |
++levels[0][row_t{ c }]; | |
} | |
} | |
levels[0] = remove_infrequent(levels[0], e); | |
for (size_t k = 2 ; ; ++k) { | |
cout << "Level " << k << endl; | |
// Construct candidates from previous level | |
level_t cands; | |
for (auto& a : levels[k - 2]) { | |
for (auto& b : levels[k - 2]) { | |
bool good = (a.first != b.first); // Don't even try to union same row | |
// Check that the first | |
if (good && k > 2) { | |
auto end = a.first.begin(); | |
advance(end, k - 2); | |
good = equal(a.first.begin(), end, b.first.begin()); | |
} | |
if (good) { | |
row_t row = a.first; | |
row.insert(b.first.begin(), b.first.end()); | |
cands[row]; | |
} | |
} | |
} | |
cout << "Candidate sets are:" << endl; | |
for (auto& row : cands) { | |
cout << to_string(row.first) << endl; | |
} | |
cout << endl; | |
// Determine which candidates can't possibly be frequent | |
for (auto cand = cands.begin() ; cand != cands.end() ; ) { | |
bool good = true; | |
for (auto& comb : combinations(cand->first, k - 1)) { | |
if (levels[k - 2].count(comb) == 0) { | |
cout << "Discarding " << to_string(cand->first) << " for subset " << to_string(comb) << endl; | |
good = false; | |
break; | |
} | |
} | |
if (good == false) { | |
cand = cands.erase(cand); | |
} | |
else { | |
++cand; | |
} | |
} | |
// Determine frequency, then prune infrequent | |
for (auto& row : db) { | |
for (auto& cand : cands) { | |
if (includes(row.begin(), row.end(), cand.first.begin(), cand.first.end())) { | |
++cand.second; | |
} | |
} | |
} | |
cands = remove_infrequent(cands, e); | |
cout << endl; | |
if (cands.empty()) { | |
cout << "No candidates remain - ending search" << endl << endl; | |
break; | |
} | |
cout << "Supported sets are:" << endl; | |
for (auto& row : cands) { | |
cout << to_string(row.first) << " e=" << row.second << endl; | |
} | |
levels.push_back(cands); | |
cout << endl; | |
} | |
cout << "Final supported sets are:" << endl; | |
for (auto& cands : levels) { | |
for (auto& row : cands) { | |
cout << to_string(row.first) << " e=" << row.second << endl; | |
} | |
} | |
return levels; | |
} | |
int main() { | |
cout << boolalpha; | |
db_t db{ | |
{ 'A', 'B', 'E' }, | |
{ 'B', 'D' }, | |
{ 'C', 'D', 'F' }, | |
{ 'A', 'B', 'D' }, | |
{ 'A', 'C', 'E' }, | |
{ 'B', 'C', 'E', 'F' }, | |
{ 'A', 'C', 'E' }, | |
{ 'A', 'B', 'C', 'E' }, | |
{ 'A', 'B', 'C', 'D', 'F' }, | |
{ 'B', 'C', 'D', 'E' }, | |
}; | |
auto levels = apriori(db, 2); | |
for (size_t i = 0 ; i < levels.size() ; ++i) { | |
for (auto& row : levels[i]) { | |
bool mfi = true; | |
bool cfi = true; | |
if (i < levels.size() - 1) { | |
for (auto& sup : levels[i + 1]) { | |
if (includes(sup.first.begin(), sup.first.end(), row.first.begin(), row.first.end())) { | |
mfi = false; | |
if (sup.second == row.second) { | |
cfi = false; | |
} | |
} | |
} | |
} | |
cout << to_string(row.first) << " is mfi=" << mfi << " and cfi=" << cfi << endl; | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Level 2 | |
Candidate sets are: | |
{ A B } | |
{ A C } | |
{ A D } | |
{ A E } | |
{ A F } | |
{ B C } | |
{ B D } | |
{ B E } | |
{ B F } | |
{ C D } | |
{ C E } | |
{ C F } | |
{ D E } | |
{ D F } | |
{ E F } | |
Discarding { A F } for e=1 | |
Discarding { D E } for e=1 | |
Discarding { E F } for e=1 | |
Supported sets are: | |
{ A B } e=4 | |
{ A C } e=4 | |
{ A D } e=2 | |
{ A E } e=4 | |
{ B C } e=4 | |
{ B D } e=4 | |
{ B E } e=4 | |
{ B F } e=2 | |
{ C D } e=3 | |
{ C E } e=5 | |
{ C F } e=3 | |
{ D F } e=2 | |
Level 3 | |
Candidate sets are: | |
{ A B C } | |
{ A B D } | |
{ A B E } | |
{ A C D } | |
{ A C E } | |
{ A D E } | |
{ B C D } | |
{ B C E } | |
{ B C F } | |
{ B D E } | |
{ B D F } | |
{ B E F } | |
{ C D E } | |
{ C D F } | |
{ C E F } | |
Discarding { A D E } for subset { D E } | |
Discarding { B D E } for subset { D E } | |
Discarding { B E F } for subset { E F } | |
Discarding { C D E } for subset { D E } | |
Discarding { C E F } for subset { E F } | |
Discarding { A C D } for e=1 | |
Discarding { B D F } for e=1 | |
Supported sets are: | |
{ A B C } e=2 | |
{ A B D } e=2 | |
{ A B E } e=2 | |
{ A C E } e=3 | |
{ B C D } e=2 | |
{ B C E } e=3 | |
{ B C F } e=2 | |
{ C D F } e=2 | |
Level 4 | |
Candidate sets are: | |
{ A B C D } | |
{ A B C E } | |
{ A B D E } | |
{ B C D E } | |
{ B C D F } | |
{ B C E F } | |
Discarding { A B C D } for subset { A C D } | |
Discarding { A B D E } for subset { B D E } | |
Discarding { B C D E } for subset { C D E } | |
Discarding { B C D F } for subset { B D F } | |
Discarding { B C E F } for subset { C E F } | |
Discarding { A B C E } for e=1 | |
No candidates remain - ending search | |
Final supported sets are: | |
{ A } e=6 | |
{ B } e=7 | |
{ C } e=7 | |
{ D } e=5 | |
{ E } e=6 | |
{ F } e=3 | |
{ A B } e=4 | |
{ A C } e=4 | |
{ A D } e=2 | |
{ A E } e=4 | |
{ B C } e=4 | |
{ B D } e=4 | |
{ B E } e=4 | |
{ B F } e=2 | |
{ C D } e=3 | |
{ C E } e=5 | |
{ C F } e=3 | |
{ D F } e=2 | |
{ A B C } e=2 | |
{ A B D } e=2 | |
{ A B E } e=2 | |
{ A C E } e=3 | |
{ B C D } e=2 | |
{ B C E } e=3 | |
{ B C F } e=2 | |
{ C D F } e=2 | |
{ A } is mfi=false and cfi=true | |
{ B } is mfi=false and cfi=true | |
{ C } is mfi=false and cfi=true | |
{ D } is mfi=false and cfi=true | |
{ E } is mfi=false and cfi=true | |
{ F } is mfi=false and cfi=false | |
{ A B } is mfi=false and cfi=true | |
{ A C } is mfi=false and cfi=true | |
{ A D } is mfi=false and cfi=false | |
{ A E } is mfi=false and cfi=true | |
{ B C } is mfi=false and cfi=true | |
{ B D } is mfi=false and cfi=true | |
{ B E } is mfi=false and cfi=true | |
{ B F } is mfi=false and cfi=false | |
{ C D } is mfi=false and cfi=true | |
{ C E } is mfi=false and cfi=true | |
{ C F } is mfi=false and cfi=true | |
{ D F } is mfi=false and cfi=false | |
{ A B C } is mfi=true and cfi=true | |
{ A B D } is mfi=true and cfi=true | |
{ A B E } is mfi=true and cfi=true | |
{ A C E } is mfi=true and cfi=true | |
{ B C D } is mfi=true and cfi=true | |
{ B C E } is mfi=true and cfi=true | |
{ B C F } is mfi=true and cfi=true | |
{ C D F } is mfi=true and cfi=true |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment