Skip to content

Instantly share code, notes, and snippets.

@lovromazgon lovromazgon/Apriori.java forked from monperrus/Apriori.java
Last active Dec 10, 2015

Embed
What would you like to do?
Java implementation of the Apriori algorithm for mining frequent itemsets - using multiple threads (default 8)
import java.io.*;
import java.util.*;
/** The class encapsulates an implementation of the Apriori algorithm to compute frequent itemsets.
*
* This version uses multiple threads to go through the dataset and is therefore 3x faster than the 1 thread version.
*
* Notice: To enable progress tracking (CLIProgressBar) follow these steps:
* 1. download this class and put it on the classpath: https://gist.github.com/lovromazgon/9c801554ceb56157de30
* 2. uncomment lines 229 and 243
*
* Usage as library: see {@link ExampleOfClientCodeOfApriori}
*
* @author Lovro Mazgon, implemented multithreaded version, 2015
* Original authors:
* @author Martin Monperrus, University of Darmstadt, 2010
* @author Nathan Magnus and Su Yibin, under the supervision of Howard Hamilton, University of Regina, June 2009.
* @copyright GNU General Public License v3
* No reproduction in whole or part without maintaining this copyright notice
* and imposing this condition on any subsequent users.
*/
public class Apriori extends Observable {
private static final int NUMBER_OF_THREADS = 8;
/** all input items */
private final Iterable<int[]> allItems;
/** the list of current itemsets */
private List<int[]> itemsets ;
/** set of distinct items in the dataset */
private List<Integer> distinctItems;
/** number of different items in the dataset */
private int numItems;
/** total number of transactions in transaFile */
private int numTransactions;
/** minimum support for a frequent itemset in percentage, e.g. 0.8 */
private double minSup;
/** This is the main interface to use this class as a library */
public Apriori(Observer ob, Iterable<int[]> itemsets, double minSupport) {
allItems = itemsets;
configure(allItems, minSupport);
this.addObserver(ob);
}
/** computes numItems, numTransactions, and sets minSup */
private void configure(Iterable<int[]> allItems, double minSupport) {
// setting minsupport
minSup = minSupport;
if (minSup>1 || minSup<0) throw new RuntimeException("minSup: bad value");
// going thourgh the itemsets to compute distinctItems and numTransactions
numTransactions = 0;
Set<Integer> itemsSet = new HashSet<>();
for (int[] itemset : allItems) {
numTransactions++;
for (int i : itemset) {
itemsSet.add(i);
}
}
this.distinctItems = new ArrayList<>(itemsSet);
// setting numItems
numItems = distinctItems.size();
outputConfig();
}
/** outputs the current configuration */
private void outputConfig() {
//output config info to the user
log("Input configuration: " + numItems + " items, " + numTransactions + " transactions, ");
log("minsup = " + (minSup*100) + "%");
}
/** starts the algorithm after configuration */
public void run() throws Exception {
//start timer
long start = System.currentTimeMillis();
// first we generate the candidates of size 1
createItemsetsOfSize1();
int itemsetNumber = 1; //the current itemset being looked at
int nbFrequentSets = 0;
while (itemsets.size() > 0) {
calculateFrequentItemsets();
if (itemsets.size()!=0) {
nbFrequentSets += itemsets.size();
log("Found " + itemsets.size() + " frequent itemsets of size " + itemsetNumber + " (with support " + (minSup*100) + "%)");
createNewItemsetsFromPreviousOnes();
}
itemsetNumber++;
}
//display the execution time
long end = System.currentTimeMillis();
log("Execution time is: "+((double)(end-start)/1000) + " seconds.");
log("Found "+nbFrequentSets+ " frequents sets for support "+(minSup*100)+"% (absolute "+Math.round(numTransactions*minSup)+")");
log("Done");
}
/** puts in itemsets all sets of size 1,
* i.e. all possibles items of the datasets
*/
private void createItemsetsOfSize1() {
itemsets = new ArrayList<int[]>();
for(int i=0; i<numItems; i++)
{
int[] cand = {i};
itemsets.add(cand);
}
}
/** triggers actions if a frequent item set has been found */
private void foundFrequentItemSet(int[] itemset, int support) {
int[] convertedItemset = new int[itemset.length];
for (int i = 0; i < itemset.length; i++) {
convertedItemset[i] = distinctItems.get(itemset[i]);
}
this.setChanged();
notifyObservers(convertedItemset);
System.out.println(Arrays.toString(convertedItemset) + " (" + ((support / (double) numTransactions))+" " + support + ")");
}
/** outputs a message in Sys.err if not used as library */
private void log(String message) {
System.err.println(message);
}
/**
* if m is the size of the current itemsets,
* generate all possible itemsets of size n+1 from pairs of current itemsets
* replaces the itemsets of itemsets by the new ones
*/
private void createNewItemsetsFromPreviousOnes()
{
// by construction, all existing itemsets have the same size
int currentSizeOfItemsets = itemsets.get(0).length;
log("Creating itemsets of size " + (currentSizeOfItemsets+1) + " based on " + itemsets.size() + " itemsets of size " + currentSizeOfItemsets);
HashMap<String, int[]> tempCandidates = new HashMap<>(); //temporary candidates
// compare each pair of itemsets of size n-1
for(int i = 0; i < itemsets.size(); i++) {
for(int j = i+1; j < itemsets.size(); j++) {
int[] X = itemsets.get(i);
int[] Y = itemsets.get(j);
assert(X.length == Y.length);
//make a string of the first n-2 tokens of the strings
int[] newCand = new int[currentSizeOfItemsets+1];
for (int s=0; s < newCand.length-1; s++) {
newCand[s] = X[s];
}
int ndifferent = 0;
// then we find the missing value
for (int s1=0; s1 < Y.length; s1++) {
boolean found = false;
// is Y[s1] in X?
for(int s2=0; s2 < X.length; s2++) {
if (X[s2]==Y[s1]) {
found = true;
break;
}
}
if (!found){ // Y[s1] is not in X
ndifferent++;
// we put the missing value at the end of newCand
newCand[newCand.length -1] = Y[s1];
}
}
// we have to find at least 1 different, otherwise it means that we have two times the same set in the existing candidates
assert(ndifferent>0);
if (ndifferent==1) {
// HashMap does not have the correct "equals" for int[] :-(
// I have to create the hash myself using a String :-(
// I use Arrays.toString to reuse equals and hashcode of String
Arrays.sort(newCand);
tempCandidates.put(Arrays.toString(newCand),newCand);
}
}
}
//set the new itemsets
itemsets = new ArrayList<>(tempCandidates.values());
log("Created " + itemsets.size() + " unique itemsets of size "+(currentSizeOfItemsets+1));
}
/** put "true" in trans[i] if the integer i is in line */
private boolean[] itemsetToBooleanArray(int[] itemset) {
boolean[] trans = new boolean[numItems];
for (int i : itemset) {
trans[distinctItems.indexOf(i)] = true;
}
return trans;
}
/** passes through the data to measure the frequency of sets in itemsets,
* then filters thoses who are under the minimum support (minSup)
*/
private void calculateFrequentItemsets() throws Exception
{
log("Passing through the data to compute the frequency of " + itemsets.size() + " itemsets of size " + itemsets.get(0).length);
List<int[]> frequentCandidates = new ArrayList<>(); //the frequent candidates for the current itemset
int count[] = new int[itemsets.size()]; //the number of successful matches, initialized by zeros
// create worker threads
Iterator<int[]> iterator = allItems.iterator();
List<WorkerThread> threads = new ArrayList<>();
for( int i = 0; i < NUMBER_OF_THREADS; i++) {
threads.add(new WorkerThread(count, iterator));
}
// start worker threads
threads.forEach(WorkerThread::start);
// CLIProgressBar progressBar = new CLIProgressBar(numTransactions);
boolean running = true;
int progress;
while(running) {
//check if all are still running
running = false;
progress = 0;
for (WorkerThread wt : threads) {
progress += wt.getNumberOfProcessedItemsets();
if (!running && wt.isAlive()) {
running = true;
}
}
// progressBar.print(progress);
Thread.sleep(5000);
}
for (int i = 0; i < itemsets.size(); i++) {
// if the count% is larger than the minSup%, add to the candidate to
// the frequent candidates
if ((count[i] / (double) (numTransactions)) >= minSup) {
foundFrequentItemSet(itemsets.get(i), count[i]);
frequentCandidates.add(itemsets.get(i));
}
}
//new candidates are only the frequent candidates
itemsets = frequentCandidates;
}
private class WorkerThread extends Thread {
int[] count;
Iterator<int[]> iterator;
int numberOfProcessedItemsets;
public WorkerThread(int[] count, Iterator<int[]> itemsetsIterator) {
this.count = count;
this.iterator = itemsetsIterator;
this.numberOfProcessedItemsets = 0;
}
@Override
public void run() {
try {
while (true) {
int[] itemset = iterator.next();
boolean[] trans = itemsetToBooleanArray(itemset);
// check each candidate
checkCandidate:
for (int c = 0; c < itemsets.size(); c++) {
// tokenize the candidate so that we know what items need to be present for a match
int[] cand = itemsets.get(c);
// check each item in the itemset to see if it is present in the transaction
for (int xx : cand) {
if (trans[xx] == false) {
continue checkCandidate;
}
}
// at this point it is a match, increase the count
count[c]++;
}
numberOfProcessedItemsets++;
}
} catch (NoSuchElementException e) {
//nothing wrong with this, means that we're done with the iteration
}
}
public int getNumberOfProcessedItemsets() {
return numberOfProcessedItemsets;
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.