Create a gist now

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Java implementation of the Apriori algorithm for mining frequent itemsets
import java.io.*;
import java.util.*;
/** The class encapsulates an implementation of the Apriori algorithm
* to compute frequent itemsets.
*
* Datasets contains integers (>=0) separated by spaces, one transaction by line, e.g.
* 1 2 3
* 0 9
* 1 9
*
* Usage with the command line :
* $ java mining.Apriori fileName support
* $ java mining.Apriori /tmp/data.dat 0.8
* $ java mining.Apriori /tmp/data.dat 0.8 > frequent-itemsets.txt
*
* Usage as library: see {@link ExampleOfClientCodeOfApriori}
*
* @author Martin Monperrus, University of Darmstadt, 2010
* @author Nathan Magnus and Su Yibin, under the supervision of Howard Hamilton, University of Regina, June 2009.
* @copyright GNU General Public License v3
* No reproduction in whole or part without maintaining this copyright notice
* and imposing this condition on any subsequent users.
*/
public class Apriori extends Observable {
public static void main(String[] args) throws Exception {
Apriori ap = new Apriori(args);
}
/** the list of current itemsets */
private List<int[]> itemsets ;
/** the name of the transcation file */
private String transaFile;
/** number of different items in the dataset */
private int numItems;
/** total number of transactions in transaFile */
private int numTransactions;
/** minimum support for a frequent itemset in percentage, e.g. 0.8 */
private double minSup;
/** by default, Apriori is used with the command line interface */
private boolean usedAsLibrary = false;
/** This is the main interface to use this class as a library */
public Apriori(String[] args, Observer ob) throws Exception
{
usedAsLibrary = true;
configure(args);
this.addObserver(ob);
go();
}
/** generates the apriori itemsets from a file
*
* @param args configuration parameters: args[0] is a filename, args[1] the min support (e.g. 0.8 for 80%)
*/
public Apriori(String[] args) throws Exception
{
configure(args);
go();
}
/** starts the algorithm after configuration */
private void go() throws Exception {
//start timer
long start = System.currentTimeMillis();
// first we generate the candidates of size 1
createItemsetsOfSize1();
int itemsetNumber=1; //the current itemset being looked at
int nbFrequentSets=0;
while (itemsets.size()>0)
{
calculateFrequentItemsets();
if(itemsets.size()!=0)
{
nbFrequentSets+=itemsets.size();
log("Found "+itemsets.size()+" frequent itemsets of size " + itemsetNumber + " (with support "+(minSup*100)+"%)");;
createNewItemsetsFromPreviousOnes();
}
itemsetNumber++;
}
//display the execution time
long end = System.currentTimeMillis();
log("Execution time is: "+((double)(end-start)/1000) + " seconds.");
log("Found "+nbFrequentSets+ " frequents sets for support "+(minSup*100)+"% (absolute "+Math.round(numTransactions*minSup)+")");
log("Done");
}
/** triggers actions if a frequent item set has been found */
private void foundFrequentItemSet(int[] itemset, int support) {
if (usedAsLibrary) {
this.setChanged();
notifyObservers(itemset);
}
else {System.out.println(Arrays.toString(itemset) + " ("+ ((support / (double) numTransactions))+" "+support+")");}
}
/** outputs a message in Sys.err if not used as library */
private void log(String message) {
if (!usedAsLibrary) {
System.err.println(message);
}
}
/** computes numItems, numTransactions, and sets minSup */
private void configure(String[] args) throws Exception
{
// setting transafile
if (args.length!=0) transaFile = args[0];
else transaFile = "chess.dat"; // default
// setting minsupport
if (args.length>=2) minSup=(Double.valueOf(args[1]).doubleValue());
else minSup = .8;// by default
if (minSup>1 || minSup<0) throw new Exception("minSup: bad value");
// going thourgh the file to compute numItems and numTransactions
numItems = 0;
numTransactions=0;
BufferedReader data_in = new BufferedReader(new FileReader(transaFile));
while (data_in.ready()) {
String line=data_in.readLine();
if (line.matches("\\s*")) continue; // be friendly with empty lines
numTransactions++;
StringTokenizer t = new StringTokenizer(line," ");
while (t.hasMoreTokens()) {
int x = Integer.parseInt(t.nextToken());
//log(x);
if (x+1>numItems) numItems=x+1;
}
}
outputConfig();
}
/** outputs the current configuration
*/
private void outputConfig() {
//output config info to the user
log("Input configuration: "+numItems+" items, "+numTransactions+" transactions, ");
log("minsup = "+minSup+"%");
}
/** puts in itemsets all sets of size 1,
* i.e. all possibles items of the datasets
*/
private void createItemsetsOfSize1() {
itemsets = new ArrayList<int[]>();
for(int i=0; i<numItems; i++)
{
int[] cand = {i};
itemsets.add(cand);
}
}
/**
* if m is the size of the current itemsets,
* generate all possible itemsets of size n+1 from pairs of current itemsets
* replaces the itemsets of itemsets by the new ones
*/
private void createNewItemsetsFromPreviousOnes()
{
// by construction, all existing itemsets have the same size
int currentSizeOfItemsets = itemsets.get(0).length;
log("Creating itemsets of size "+(currentSizeOfItemsets+1)+" based on "+itemsets.size()+" itemsets of size "+currentSizeOfItemsets);
HashMap<String, int[]> tempCandidates = new HashMap<String, int[]>(); //temporary candidates
// compare each pair of itemsets of size n-1
for(int i=0; i<itemsets.size(); i++)
{
for(int j=i+1; j<itemsets.size(); j++)
{
int[] X = itemsets.get(i);
int[] Y = itemsets.get(j);
assert (X.length==Y.length);
//make a string of the first n-2 tokens of the strings
int [] newCand = new int[currentSizeOfItemsets+1];
for(int s=0; s<newCand.length-1; s++) {
newCand[s] = X[s];
}
int ndifferent = 0;
// then we find the missing value
for(int s1=0; s1<Y.length; s1++)
{
boolean found = false;
// is Y[s1] in X?
for(int s2=0; s2<X.length; s2++) {
if (X[s2]==Y[s1]) {
found = true;
break;
}
}
if (!found){ // Y[s1] is not in X
ndifferent++;
// we put the missing value at the end of newCand
newCand[newCand.length -1] = Y[s1];
}
}
// we have to find at least 1 different, otherwise it means that we have two times the same set in the existing candidates
assert(ndifferent>0);
if (ndifferent==1) {
// HashMap does not have the correct "equals" for int[] :-(
// I have to create the hash myself using a String :-(
// I use Arrays.toString to reuse equals and hashcode of String
Arrays.sort(newCand);
tempCandidates.put(Arrays.toString(newCand),newCand);
}
}
}
//set the new itemsets
itemsets = new ArrayList<int[]>(tempCandidates.values());
log("Created "+itemsets.size()+" unique itemsets of size "+(currentSizeOfItemsets+1));
}
/** put "true" in trans[i] if the integer i is in line */
private void line2booleanArray(String line, boolean[] trans) {
Arrays.fill(trans, false);
StringTokenizer stFile = new StringTokenizer(line, " "); //read a line from the file to the tokenizer
//put the contents of that line into the transaction array
while (stFile.hasMoreTokens())
{
int parsedVal = Integer.parseInt(stFile.nextToken());
trans[parsedVal]=true; //if it is not a 0, assign the value to true
}
}
/** passes through the data to measure the frequency of sets in {@link itemsets},
* then filters thoses who are under the minimum support (minSup)
*/
private void calculateFrequentItemsets() throws Exception
{
log("Passing through the data to compute the frequency of " + itemsets.size()+ " itemsets of size "+itemsets.get(0).length);
List<int[]> frequentCandidates = new ArrayList<int[]>(); //the frequent candidates for the current itemset
boolean match; //whether the transaction has all the items in an itemset
int count[] = new int[itemsets.size()]; //the number of successful matches, initialized by zeros
// load the transaction file
BufferedReader data_in = new BufferedReader(new InputStreamReader(new FileInputStream(transaFile)));
boolean[] trans = new boolean[numItems];
// for each transaction
for (int i = 0; i < numTransactions; i++) {
// boolean[] trans = extractEncoding1(data_in.readLine());
String line = data_in.readLine();
line2booleanArray(line, trans);
// check each candidate
for (int c = 0; c < itemsets.size(); c++) {
match = true; // reset match to false
// tokenize the candidate so that we know what items need to be
// present for a match
int[] cand = itemsets.get(c);
//int[] cand = candidatesOptimized[c];
// check each item in the itemset to see if it is present in the
// transaction
for (int xx : cand) {
if (trans[xx] == false) {
match = false;
break;
}
}
if (match) { // if at this point it is a match, increase the count
count[c]++;
//log(Arrays.toString(cand)+" is contained in trans "+i+" ("+line+")");
}
}
}
data_in.close();
for (int i = 0; i < itemsets.size(); i++) {
// if the count% is larger than the minSup%, add to the candidate to
// the frequent candidates
if ((count[i] / (double) (numTransactions)) >= minSup) {
foundFrequentItemSet(itemsets.get(i),count[i]);
frequentCandidates.add(itemsets.get(i));
}
//else log("-- Remove candidate: "+ Arrays.toString(candidates.get(i)) + " is: "+ ((count[i] / (double) numTransactions)));
}
//new candidates are only the frequent candidates
itemsets = frequentCandidates;
}
}
@mihairaulea

This comment has been minimized.

Show comment
Hide comment
@mihairaulea

mihairaulea Jan 11, 2014

Usage as library: see {@link ExampleOfClientCodeOfApriori} where is this example?

mihairaulea commented Jan 11, 2014

Usage as library: see {@link ExampleOfClientCodeOfApriori} where is this example?

@SkandaB

This comment has been minimized.

Show comment
Hide comment
@SkandaB

SkandaB Aug 25, 2014

Where is the data set (chess.dat) for running this algorithm.
Please share a sample data-set.

SkandaB commented Aug 25, 2014

Where is the data set (chess.dat) for running this algorithm.
Please share a sample data-set.

@lathasiva

This comment has been minimized.

Show comment
Hide comment
@lathasiva

lathasiva Oct 7, 2014

need chess.dat ....plz help

lathasiva commented Oct 7, 2014

need chess.dat ....plz help

@sankettandulwadkar

This comment has been minimized.

Show comment
Hide comment

sankettandulwadkar commented Nov 12, 2014

http://fimi.ua.ac.be/data/

Here are some datasets.

@ravikirangoud009

This comment has been minimized.

Show comment
Hide comment
@ravikirangoud009

ravikirangoud009 Dec 25, 2014

I have a program for finding frequent itemsets.Does anyone has program for generating association rules from these frequent patterns
my id:ravi66364@gmail.com

ravikirangoud009 commented Dec 25, 2014

I have a program for finding frequent itemsets.Does anyone has program for generating association rules from these frequent patterns
my id:ravi66364@gmail.com

@Ravi52

This comment has been minimized.

Show comment
Hide comment

Ravi52 commented Jan 31, 2015

@Ravi52

This comment has been minimized.

Show comment
Hide comment
@Ravi52

Ravi52 Jan 31, 2015

i want code for infrequent items and have a configurtion value also

Ravi52 commented Jan 31, 2015

i want code for infrequent items and have a configurtion value also

@llbbnn

This comment has been minimized.

Show comment
Hide comment
@llbbnn

llbbnn Mar 19, 2015

I am working in implementing the association rule
I read the arff file and get the data and then I put it in an array list
But I do not know how to check through the items 1 itemsets, two itemsets,three itemsets ....etc
How can I calculate the average of those Itemsets together
And also here in the algorithm when we build the three itemsets it is build above the two item sets
So, how can I hold the last itemsets and then add the new one to them
I mean , how can the program differentiate and be sure that each item is from different column

llbbnn commented Mar 19, 2015

I am working in implementing the association rule
I read the arff file and get the data and then I put it in an array list
But I do not know how to check through the items 1 itemsets, two itemsets,three itemsets ....etc
How can I calculate the average of those Itemsets together
And also here in the algorithm when we build the three itemsets it is build above the two item sets
So, how can I hold the last itemsets and then add the new one to them
I mean , how can the program differentiate and be sure that each item is from different column

@llbbnn

This comment has been minimized.

Show comment
Hide comment
@llbbnn

llbbnn Mar 19, 2015

By taking attention to that
number of columns is not fixed
number of values of each columns is not known b
Maybe there is a value of one column equal to value of other column ,so the values must be characterized by there indexes not their values because they are not unique

llbbnn commented Mar 19, 2015

By taking attention to that
number of columns is not fixed
number of values of each columns is not known b
Maybe there is a value of one column equal to value of other column ,so the values must be characterized by there indexes not their values because they are not unique

@ENGAsmaaSaad

This comment has been minimized.

Show comment
Hide comment
@ENGAsmaaSaad

ENGAsmaaSaad Apr 13, 2015

Please I need help in my code can any one help me
Thanks in advance

ENGAsmaaSaad commented Apr 13, 2015

Please I need help in my code can any one help me
Thanks in advance

@Sadhana2

This comment has been minimized.

Show comment
Hide comment
@Sadhana2

Sadhana2 Jan 23, 2016

.I have given file name.but after executing it is showing file not found exception.

Sadhana2 commented Jan 23, 2016

.I have given file name.but after executing it is showing file not found exception.

@hinal12

This comment has been minimized.

Show comment
Hide comment
@hinal12

hinal12 Mar 19, 2016

where do i have to store my chess,dat file in my computer to run the program
i cant get access to chess.dat file while running the program..plz help

hinal12 commented Mar 19, 2016

where do i have to store my chess,dat file in my computer to run the program
i cant get access to chess.dat file while running the program..plz help

@Atlantis11

This comment has been minimized.

Show comment
Hide comment
@Atlantis11

Atlantis11 Apr 24, 2016

llbbnn did you implement the association rules from this code? can you help..?

Atlantis11 commented Apr 24, 2016

llbbnn did you implement the association rules from this code? can you help..?

@sydul

This comment has been minimized.

Show comment
Hide comment
@sydul

sydul Apr 26, 2016

i need code for fast distributed mining algorithm for association rules.
please help me.

sydul commented Apr 26, 2016

i need code for fast distributed mining algorithm for association rules.
please help me.

@cibergirl92

This comment has been minimized.

Show comment
Hide comment
@cibergirl92

cibergirl92 May 5, 2016

u must put the chess.dat file in the folder of ur project, Im working on NetBeans. I hope that this can help for the ones who are asking about where the chess.dat should go, :D
1

cibergirl92 commented May 5, 2016

u must put the chess.dat file in the folder of ur project, Im working on NetBeans. I hope that this can help for the ones who are asking about where the chess.dat should go, :D
1

@cibergirl92

This comment has been minimized.

Show comment
Hide comment
@cibergirl92

cibergirl92 May 5, 2016

No, the association rule is NOT implemented in this code :(, just the Apriori Algorithm

cibergirl92 commented May 5, 2016

No, the association rule is NOT implemented in this code :(, just the Apriori Algorithm

@MedElfadhelELHACHEMI

This comment has been minimized.

Show comment
Hide comment
@MedElfadhelELHACHEMI

MedElfadhelELHACHEMI Oct 20, 2016

what if i'm getting my data from a database , how do i structure the data for the algorithm to use it

MedElfadhelELHACHEMI commented Oct 20, 2016

what if i'm getting my data from a database , how do i structure the data for the algorithm to use it

@waani

This comment has been minimized.

Show comment
Hide comment
@waani

waani Nov 5, 2016

plz provide me code for partition on apriori algo or divisive apriori algo in java

waani commented Nov 5, 2016

plz provide me code for partition on apriori algo or divisive apriori algo in java

@khachanehetal

This comment has been minimized.

Show comment
Hide comment
@khachanehetal

khachanehetal Mar 21, 2017

plz provide me code of eclat algorithm in c++
khachanehetal@gmail.com this is my mail address

khachanehetal commented Mar 21, 2017

plz provide me code of eclat algorithm in c++
khachanehetal@gmail.com this is my mail address

@sathyaSankar29

This comment has been minimized.

Show comment
Hide comment
@sathyaSankar29

sathyaSankar29 May 1, 2017

Please provide me code for reverse apriori algorithm in R or java
sathyamphil2016@gmail.com this is my mail id

sathyaSankar29 commented May 1, 2017

Please provide me code for reverse apriori algorithm in R or java
sathyamphil2016@gmail.com this is my mail id

@chandu8542

This comment has been minimized.

Show comment
Hide comment
@chandu8542

chandu8542 May 10, 2017

Usage as library: see {@link ExampleOfClientCodeOfApriori} where is this example?

chandu8542 commented May 10, 2017

Usage as library: see {@link ExampleOfClientCodeOfApriori} where is this example?

@chandu8542

This comment has been minimized.

Show comment
Hide comment
@chandu8542

chandu8542 May 10, 2017

@monperrus Everyone, be aware with the usage of the code. The code assumes that your transactions DB contains records all from 0 to n. If your records don't start with 0, e..g [209 212 209 212 212 212; 45 63 89; 89 53 63], above code will not work. The author should make appropriate changes in config function.

chandu8542 commented May 10, 2017

@monperrus Everyone, be aware with the usage of the code. The code assumes that your transactions DB contains records all from 0 to n. If your records don't start with 0, e..g [209 212 209 212 212 212; 45 63 89; 89 53 63], above code will not work. The author should make appropriate changes in config function.

@tomrockdsouza

This comment has been minimized.

Show comment
Hide comment
@tomrockdsouza

tomrockdsouza Jun 1, 2017

I forked this code and added association rules to it enjoy ;)
My Forked Apriori.java

tomrockdsouza commented Jun 1, 2017

I forked this code and added association rules to it enjoy ;)
My Forked Apriori.java

@backtodream

This comment has been minimized.

Show comment
Hide comment
@backtodream

backtodream Nov 23, 2017

@ENGINEER-RC thanks bro

backtodream commented Nov 23, 2017

@ENGINEER-RC thanks bro

@hemalathaPounraj

This comment has been minimized.

Show comment
Hide comment
@hemalathaPounraj

hemalathaPounraj Jan 6, 2018

I need the description of the data set "retail.gz " available in the link "http://fimi.ua.ac.be/data/." please help me.

hemalathaPounraj commented Jan 6, 2018

I need the description of the data set "retail.gz " available in the link "http://fimi.ua.ac.be/data/." please help me.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment