lovromazgon/Apriori.java

## Apriori.java
import java.io.*;
import java.util.*;

/** The class encapsulates an implementation of the Apriori algorithm to compute frequent itemsets.
 *
 * This version uses multiple threads to go through the dataset and is therefore 3x faster than the 1 thread version.
 *
 * Notice: To enable progress tracking (CLIProgressBar) follow these steps:
 *            1. download this class and put it on the classpath: https://gist.github.com/lovromazgon/9c801554ceb56157de30
 *            2. uncomment lines 229 and 243
 *
 * Usage as library: see {@link ExampleOfClientCodeOfApriori}
 *
 * @author Lovro Mazgon, implemented multithreaded version, 2015
 * Original authors:
 * @author Martin Monperrus, University of Darmstadt, 2010
 * @author Nathan Magnus and Su Yibin, under the supervision of Howard Hamilton, University of Regina, June 2009.
 * @copyright GNU General Public License v3
 * No reproduction in whole or part without maintaining this copyright notice
 * and imposing this condition on any subsequent users.
 */
public class Apriori extends Observable {
	private static final int NUMBER_OF_THREADS = 8;

	/** all input items */
	private final Iterable<int[]> allItems;
	/** the list of current itemsets */
	private List<int[]> itemsets ;
	/** set of distinct items in the dataset */
	private List<Integer> distinctItems;
	/** number of different items in the dataset */
	private int numItems;
	/** total number of transactions in transaFile */
	private int numTransactions;
	/** minimum support for a frequent itemset in percentage, e.g. 0.8 */
	private double minSup;

	/** This is the main interface to use this class as a library */
	public  Apriori(Observer ob, Iterable<int[]> itemsets, double minSupport) {
		allItems = itemsets;
		configure(allItems, minSupport);
		this.addObserver(ob);
	}

	/** computes numItems, numTransactions, and sets minSup */
	private void configure(Iterable<int[]> allItems, double minSupport) {
		// setting minsupport
		minSup = minSupport;
		if (minSup>1 || minSup<0) throw new RuntimeException("minSup: bad value");

		// going thourgh the itemsets to compute distinctItems and numTransactions
		numTransactions = 0;
		Set<Integer> itemsSet = new HashSet<>();
		for (int[] itemset : allItems) {
			numTransactions++;
			for (int i : itemset) {
				itemsSet.add(i);
			}
		}
		this.distinctItems = new ArrayList<>(itemsSet);

		// setting numItems
		numItems = distinctItems.size();

		outputConfig();

	}

	/** outputs the current configuration */
	private void outputConfig() {
		//output config info to the user
		log("Input configuration: " + numItems + " items, " + numTransactions + " transactions, ");
		log("minsup = " + (minSup*100) + "%");
	}

	/** starts the algorithm after configuration */
	public void run() throws Exception {
		//start timer
		long start = System.currentTimeMillis();

		// first we generate the candidates of size 1
		createItemsetsOfSize1();

		int itemsetNumber = 1; //the current itemset being looked at
		int nbFrequentSets = 0;

		while (itemsets.size() > 0) {
			calculateFrequentItemsets();

			if (itemsets.size()!=0) {
				nbFrequentSets += itemsets.size();
				log("Found " + itemsets.size() + " frequent itemsets of size " + itemsetNumber + " (with support " + (minSup*100) + "%)");
				createNewItemsetsFromPreviousOnes();
			}

			itemsetNumber++;
		}

		//display the execution time
		long end = System.currentTimeMillis();
		log("Execution time is: "+((double)(end-start)/1000) + " seconds.");
		log("Found "+nbFrequentSets+ " frequents sets for support "+(minSup*100)+"% (absolute "+Math.round(numTransactions*minSup)+")");
		log("Done");
	}

	/** puts in itemsets all sets of size 1,
	 * i.e. all possibles items of the datasets
	 */
	private void createItemsetsOfSize1() {
		itemsets = new ArrayList<int[]>();
		for(int i=0; i<numItems; i++)
		{
			int[] cand = {i};
			itemsets.add(cand);
		}
	}

	/** triggers actions if a frequent item set has been found  */
	private void foundFrequentItemSet(int[] itemset, int support) {
		int[] convertedItemset = new int[itemset.length];
		for (int i = 0; i < itemset.length; i++) {
			convertedItemset[i] = distinctItems.get(itemset[i]);
		}

		this.setChanged();
		notifyObservers(convertedItemset);
		System.out.println(Arrays.toString(convertedItemset) + "  (" + ((support / (double) numTransactions))+" " + support + ")");
	}

	/** outputs a message in Sys.err if not used as library */
	private void log(String message) {
		System.err.println(message);
	}

	/**
	 * if m is the size of the current itemsets,
	 * generate all possible itemsets of size n+1 from pairs of current itemsets
	 * replaces the itemsets of itemsets by the new ones
	 */
	private void createNewItemsetsFromPreviousOnes()
	{
		// by construction, all existing itemsets have the same size
		int currentSizeOfItemsets = itemsets.get(0).length;
		log("Creating itemsets of size " + (currentSizeOfItemsets+1) + " based on " + itemsets.size() + " itemsets of size " + currentSizeOfItemsets);

		HashMap<String, int[]> tempCandidates = new HashMap<>(); //temporary candidates

		// compare each pair of itemsets of size n-1
		for(int i = 0; i < itemsets.size(); i++) {
			for(int j = i+1; j < itemsets.size(); j++) {
				int[] X = itemsets.get(i);
				int[] Y = itemsets.get(j);

				assert(X.length == Y.length);

				//make a string of the first n-2 tokens of the strings
				int[] newCand = new int[currentSizeOfItemsets+1];
				for (int s=0; s < newCand.length-1; s++) {
					newCand[s] = X[s];
				}

				int ndifferent = 0;
				// then we find the missing value
				for (int s1=0; s1 < Y.length; s1++) {
					boolean found = false;
					// is Y[s1] in X?
					for(int s2=0; s2 < X.length; s2++) {
						if (X[s2]==Y[s1]) {
							found = true;
							break;
						}
					}
					if (!found){ // Y[s1] is not in X
						ndifferent++;
						// we put the missing value at the end of newCand
						newCand[newCand.length -1] = Y[s1];
					}
				}

				// we have to find at least 1 different, otherwise it means that we have two times the same set in the existing candidates
				assert(ndifferent>0);


				if (ndifferent==1) {
					// HashMap does not have the correct "equals" for int[] :-(
					// I have to create the hash myself using a String :-(
					// I use Arrays.toString to reuse equals and hashcode of String
					Arrays.sort(newCand);
					tempCandidates.put(Arrays.toString(newCand),newCand);
				}
			}
		}

		//set the new itemsets
		itemsets = new ArrayList<>(tempCandidates.values());
		log("Created " + itemsets.size() + " unique itemsets of size "+(currentSizeOfItemsets+1));
	}

	/** put "true" in trans[i] if the integer i is in line */
	private boolean[] itemsetToBooleanArray(int[] itemset) {
		boolean[] trans = new boolean[numItems];
		for (int i : itemset) {
			trans[distinctItems.indexOf(i)] = true;
		}
		return trans;
	}

	/** passes through the data to measure the frequency of sets in itemsets,
	 *  then filters thoses who are under the minimum support (minSup)
	 */
	private void calculateFrequentItemsets() throws Exception
	{

		log("Passing through the data to compute the frequency of " + itemsets.size() + " itemsets of size " + itemsets.get(0).length);

		List<int[]> frequentCandidates = new ArrayList<>(); //the frequent candidates for the current itemset

		int count[] = new int[itemsets.size()]; //the number of successful matches, initialized by zeros

		// create worker threads
		Iterator<int[]> iterator = allItems.iterator();
		List<WorkerThread> threads = new ArrayList<>();

		for( int i = 0; i < NUMBER_OF_THREADS; i++) {
			threads.add(new WorkerThread(count, iterator));
		}

		// start worker threads
		threads.forEach(WorkerThread::start);

		// CLIProgressBar progressBar = new CLIProgressBar(numTransactions);
		boolean running = true;
		int progress;
		while(running) {
			//check if all are still running
			running = false;
			progress = 0;
			for (WorkerThread wt : threads) {
				progress += wt.getNumberOfProcessedItemsets();
				if (!running && wt.isAlive()) {
					running = true;
				}
			}

			// progressBar.print(progress);
			Thread.sleep(5000);
		}

		for (int i = 0; i < itemsets.size(); i++) {
			// if the count% is larger than the minSup%, add to the candidate to
			// the frequent candidates
			if ((count[i] / (double) (numTransactions)) >= minSup) {
				foundFrequentItemSet(itemsets.get(i), count[i]);
				frequentCandidates.add(itemsets.get(i));
			}
		}

		//new candidates are only the frequent candidates
		itemsets = frequentCandidates;
	}


	private class WorkerThread extends Thread {
		int[] count;
		Iterator<int[]> iterator;
		int numberOfProcessedItemsets;

		public WorkerThread(int[] count, Iterator<int[]> itemsetsIterator) {
			this.count = count;
			this.iterator = itemsetsIterator;
			this.numberOfProcessedItemsets = 0;
		}

		@Override
		public void run() {
			try {
				while (true) {
					int[] itemset = iterator.next();
					boolean[] trans = itemsetToBooleanArray(itemset);

					// check each candidate
					checkCandidate:
					for (int c = 0; c < itemsets.size(); c++) {
						// tokenize the candidate so that we know what items need to be present for a match
						int[] cand = itemsets.get(c);
						// check each item in the itemset to see if it is present in the transaction
						for (int xx : cand) {
							if (trans[xx] == false) {
								continue checkCandidate;
							}
						}
						// at this point it is a match, increase the count
						count[c]++;
					}
					numberOfProcessedItemsets++;
				}
			} catch (NoSuchElementException e) {
				//nothing wrong with this, means that we're done with the iteration
			}
		}

		public int getNumberOfProcessedItemsets() {
			return numberOfProcessedItemsets;
		}
	}
}
	import java.io.*;
	import java.util.*;

	/** The class encapsulates an implementation of the Apriori algorithm to compute frequent itemsets.
	*
	* This version uses multiple threads to go through the dataset and is therefore 3x faster than the 1 thread version.
	*
	* Notice: To enable progress tracking (CLIProgressBar) follow these steps:
	* 1. download this class and put it on the classpath: https://gist.github.com/lovromazgon/9c801554ceb56157de30
	* 2. uncomment lines 229 and 243
	*
	* Usage as library: see {@link ExampleOfClientCodeOfApriori}
	*
	* @author Lovro Mazgon, implemented multithreaded version, 2015
	* Original authors:
	* @author Martin Monperrus, University of Darmstadt, 2010
	* @author Nathan Magnus and Su Yibin, under the supervision of Howard Hamilton, University of Regina, June 2009.
	* @copyright GNU General Public License v3
	* No reproduction in whole or part without maintaining this copyright notice
	* and imposing this condition on any subsequent users.
	*/
	public class Apriori extends Observable {
	private static final int NUMBER_OF_THREADS = 8;

	/** all input items */
	private final Iterable<int[]> allItems;
	/** the list of current itemsets */
	private List<int[]> itemsets ;
	/** set of distinct items in the dataset */
	private List<Integer> distinctItems;
	/** number of different items in the dataset */
	private int numItems;
	/** total number of transactions in transaFile */
	private int numTransactions;
	/** minimum support for a frequent itemset in percentage, e.g. 0.8 */
	private double minSup;

	/** This is the main interface to use this class as a library */
	public Apriori(Observer ob, Iterable<int[]> itemsets, double minSupport) {
	allItems = itemsets;
	configure(allItems, minSupport);
	this.addObserver(ob);
	}

	/** computes numItems, numTransactions, and sets minSup */
	private void configure(Iterable<int[]> allItems, double minSupport) {
	// setting minsupport
	minSup = minSupport;
	if (minSup>1 \|\| minSup<0) throw new RuntimeException("minSup: bad value");

	// going thourgh the itemsets to compute distinctItems and numTransactions
	numTransactions = 0;
	Set<Integer> itemsSet = new HashSet<>();
	for (int[] itemset : allItems) {
	numTransactions++;
	for (int i : itemset) {
	itemsSet.add(i);
	}
	}
	this.distinctItems = new ArrayList<>(itemsSet);

	// setting numItems
	numItems = distinctItems.size();

	outputConfig();

	}

	/** outputs the current configuration */
	private void outputConfig() {
	//output config info to the user
	log("Input configuration: " + numItems + " items, " + numTransactions + " transactions, ");
	log("minsup = " + (minSup*100) + "%");
	}

	/** starts the algorithm after configuration */
	public void run() throws Exception {
	//start timer
	long start = System.currentTimeMillis();

	// first we generate the candidates of size 1
	createItemsetsOfSize1();

	int itemsetNumber = 1; //the current itemset being looked at
	int nbFrequentSets = 0;

	while (itemsets.size() > 0) {
	calculateFrequentItemsets();

	if (itemsets.size()!=0) {
	nbFrequentSets += itemsets.size();
	log("Found " + itemsets.size() + " frequent itemsets of size " + itemsetNumber + " (with support " + (minSup*100) + "%)");
	createNewItemsetsFromPreviousOnes();
	}

	itemsetNumber++;
	}

	//display the execution time
	long end = System.currentTimeMillis();
	log("Execution time is: "+((double)(end-start)/1000) + " seconds.");
	log("Found "+nbFrequentSets+ " frequents sets for support "+(minSup100)+"% (absolute "+Math.round(numTransactionsminSup)+")");
	log("Done");
	}

	/** puts in itemsets all sets of size 1,
	* i.e. all possibles items of the datasets
	*/
	private void createItemsetsOfSize1() {
	itemsets = new ArrayList<int[]>();
	for(int i=0; i<numItems; i++)
	{
	int[] cand = {i};
	itemsets.add(cand);
	}
	}

	/** triggers actions if a frequent item set has been found */
	private void foundFrequentItemSet(int[] itemset, int support) {
	int[] convertedItemset = new int[itemset.length];
	for (int i = 0; i < itemset.length; i++) {
	convertedItemset[i] = distinctItems.get(itemset[i]);
	}

	this.setChanged();
	notifyObservers(convertedItemset);
	System.out.println(Arrays.toString(convertedItemset) + " (" + ((support / (double) numTransactions))+" " + support + ")");
	}

	/** outputs a message in Sys.err if not used as library */
	private void log(String message) {
	System.err.println(message);
	}

	/**
	* if m is the size of the current itemsets,
	* generate all possible itemsets of size n+1 from pairs of current itemsets
	* replaces the itemsets of itemsets by the new ones
	*/
	private void createNewItemsetsFromPreviousOnes()
	{
	// by construction, all existing itemsets have the same size
	int currentSizeOfItemsets = itemsets.get(0).length;
	log("Creating itemsets of size " + (currentSizeOfItemsets+1) + " based on " + itemsets.size() + " itemsets of size " + currentSizeOfItemsets);

	HashMap<String, int[]> tempCandidates = new HashMap<>(); //temporary candidates

	// compare each pair of itemsets of size n-1
	for(int i = 0; i < itemsets.size(); i++) {
	for(int j = i+1; j < itemsets.size(); j++) {
	int[] X = itemsets.get(i);
	int[] Y = itemsets.get(j);

	assert(X.length == Y.length);

	//make a string of the first n-2 tokens of the strings
	int[] newCand = new int[currentSizeOfItemsets+1];
	for (int s=0; s < newCand.length-1; s++) {
	newCand[s] = X[s];
	}

	int ndifferent = 0;
	// then we find the missing value
	for (int s1=0; s1 < Y.length; s1++) {
	boolean found = false;
	// is Y[s1] in X?
	for(int s2=0; s2 < X.length; s2++) {
	if (X[s2]==Y[s1]) {
	found = true;
	break;
	}
	}
	if (!found){ // Y[s1] is not in X
	ndifferent++;
	// we put the missing value at the end of newCand
	newCand[newCand.length -1] = Y[s1];
	}
	}

	// we have to find at least 1 different, otherwise it means that we have two times the same set in the existing candidates
	assert(ndifferent>0);


	if (ndifferent==1) {
	// HashMap does not have the correct "equals" for int[] :-(
	// I have to create the hash myself using a String :-(
	// I use Arrays.toString to reuse equals and hashcode of String
	Arrays.sort(newCand);
	tempCandidates.put(Arrays.toString(newCand),newCand);
	}
	}
	}

	//set the new itemsets
	itemsets = new ArrayList<>(tempCandidates.values());
	log("Created " + itemsets.size() + " unique itemsets of size "+(currentSizeOfItemsets+1));
	}

	/** put "true" in trans[i] if the integer i is in line */
	private boolean[] itemsetToBooleanArray(int[] itemset) {
	boolean[] trans = new boolean[numItems];
	for (int i : itemset) {
	trans[distinctItems.indexOf(i)] = true;
	}
	return trans;
	}

	/** passes through the data to measure the frequency of sets in itemsets,
	* then filters thoses who are under the minimum support (minSup)
	*/
	private void calculateFrequentItemsets() throws Exception
	{

	log("Passing through the data to compute the frequency of " + itemsets.size() + " itemsets of size " + itemsets.get(0).length);

	List<int[]> frequentCandidates = new ArrayList<>(); //the frequent candidates for the current itemset

	int count[] = new int[itemsets.size()]; //the number of successful matches, initialized by zeros

	// create worker threads
	Iterator<int[]> iterator = allItems.iterator();
	List<WorkerThread> threads = new ArrayList<>();

	for( int i = 0; i < NUMBER_OF_THREADS; i++) {
	threads.add(new WorkerThread(count, iterator));
	}

	// start worker threads
	threads.forEach(WorkerThread::start);

	// CLIProgressBar progressBar = new CLIProgressBar(numTransactions);
	boolean running = true;
	int progress;
	while(running) {
	//check if all are still running
	running = false;
	progress = 0;
	for (WorkerThread wt : threads) {
	progress += wt.getNumberOfProcessedItemsets();
	if (!running && wt.isAlive()) {
	running = true;
	}
	}

	// progressBar.print(progress);
	Thread.sleep(5000);
	}

	for (int i = 0; i < itemsets.size(); i++) {
	// if the count% is larger than the minSup%, add to the candidate to
	// the frequent candidates
	if ((count[i] / (double) (numTransactions)) >= minSup) {
	foundFrequentItemSet(itemsets.get(i), count[i]);
	frequentCandidates.add(itemsets.get(i));
	}
	}

	//new candidates are only the frequent candidates
	itemsets = frequentCandidates;
	}



	private class WorkerThread extends Thread {
	int[] count;
	Iterator<int[]> iterator;
	int numberOfProcessedItemsets;

	public WorkerThread(int[] count, Iterator<int[]> itemsetsIterator) {
	this.count = count;
	this.iterator = itemsetsIterator;
	this.numberOfProcessedItemsets = 0;
	}

	@Override
	public void run() {
	try {
	while (true) {
	int[] itemset = iterator.next();
	boolean[] trans = itemsetToBooleanArray(itemset);

	// check each candidate
	checkCandidate:
	for (int c = 0; c < itemsets.size(); c++) {
	// tokenize the candidate so that we know what items need to be present for a match
	int[] cand = itemsets.get(c);
	// check each item in the itemset to see if it is present in the transaction
	for (int xx : cand) {
	if (trans[xx] == false) {
	continue checkCandidate;
	}
	}
	// at this point it is a match, increase the count
	count[c]++;
	}
	numberOfProcessedItemsets++;
	}
	} catch (NoSuchElementException e) {
	//nothing wrong with this, means that we're done with the iteration
	}
	}

	public int getNumberOfProcessedItemsets() {
	return numberOfProcessedItemsets;
	}
	}
	}