Laisky/Apriori_with_hconfidence.py

## Apriori_with_hconfidence.py
import datetime
from pprint import pprint
from collections import defaultdict

import numpy as np
import pandas as pd


def apriori(dataset, min_support=0.5, min_hconf=0.5, verbose=False):
    """Implements the Apriori algorithm.

    The Apriori algorithm will iteratively generate new candidate
    k-itemsets using the frequent (k-1)-itemsets found in the previous
    iteration.

    Parameters
    ----------
    dataset : list
        The dataset (a list of transactions) from which to generate
        candidate itemsets.

    min_support : float
        The minimum support threshold. Defaults to 0.5.

    Returns
    -------
    F : list
        The list of frequent itemsets.

    support_data : dict
        The support data for all candidate itemsets.

    References
    ----------
    .. [1] R. Agrawal, R. Srikant, "Fast Algorithms for Mining Association
           Rules", 1994.

    """
    C1 = create_candidates(dataset)  # generate 1-item sets
    D = list(map(set, dataset))
    support_data = {}
    # prune candidate 1-itemsets
    F1 = support_prune(D, C1, min_support, min_hconf,
                       support_data, verbose=verbose)
    F = [F1]  # list of frequent itemsets; initialized to frequent 1-itemsets
    k = 2  # the itemset cardinality
    while (len(F[k - 2]) > 0):
        Ck = apriori_gen(F[k - 2], k)  # generate candidate itemsets
        # prune candidate itemsets
        Fk = support_prune(D, Ck, min_support, min_hconf,
                           support_data, verbose=verbose)
        # add the pruned candidate itemsets to the list of frequent itemsets
        F.append(Fk)
        k += 1

    if verbose:
        # Print a list of all the frequent itemsets.
        for kset in F:
            for item in kset:
                print(item, round(support_data[item], 3))

    return F, support_data


def create_candidates(dataset, verbose=False):
    """Creates a list of candidate 1-itemsets from a list of transactions.

    Parameters
    ----------
    dataset : list
        The dataset (a list of transactions) from which to generate candidate
        itemsets.

    Returns
    -------
    The list of candidate itemsets (c1) passed as a frozenset (a set that is
    immutable and hashable).
    """
    c1 = set()  # list of all items in the database of transactions
    for transaction in dataset:
        c1.update([frozenset([i]) for i in transaction])

    if verbose:
        # Print a list of all the candidate items.
        pprint(c1)
    # Map c1 to a frozenset because it will be the key of a dictionary.
    return c1


def support_prune(dataset, candidates, min_support, min_hconf,
                  support_data, verbose=False):
    """Returns all candidate itemsets that meet a minimum support threshold.

    By the apriori principle, if an itemset is frequent, then all of its
    subsets must also be frequent. As a result, we can perform support-based
    pruning to systematically control the exponential growth of candidate
    itemsets. Thus, itemsets that do not meet the minimum support level are
    pruned from the input list of itemsets (dataset).

    Parameters
    ----------
    dataset : list
        The dataset (a list of transactions) from which to generate candidate
        itemsets.

    candidates : frozenset
        The list of candidate itemsets.

    min_support : float
        The minimum support threshold.

    support_data : dict
        The support data for all candidate itemsets.

    Returns
    -------
    retlist : list
        The list of frequent itemsets.

    """

    # updata support data
    sscnt = defaultdict(lambda: 0)  # set for support counts
    for tid in dataset:
        for cand in candidates:
            if cand.issubset(tid):
                sscnt[cand] += 1

    # total number of transactions in the dataset
    n_all_items = len(dataset)
    retlist = []  # array for unpruned itemsets
    for cand, n_supp in sscnt.items():
        # Calculate the support of itemset cand.
        support = n_supp / n_all_items

        if len(cand) > 1:
            # Calculate h-confidence
            hconf = support / max([support_data[frozenset([_])] for _ in cand])
        else:
            hconf = 1

        if support >= min_support and hconf >= min_hconf:
            retlist.insert(0, cand)

        support_data[cand] = support

    # Print a list of the pruned itemsets.
    if verbose:
        pprint(retlist)

        for cand, cnt in sscnt.items():
            print(cand, cnt, support_data[cand])

    return retlist


def apriori_gen(freq_sets, k):
    """Generates candidate itemsets (via the F_k-1 x F_k-1 method).

    This operation generates new candidate k-itemsets based on the frequent
    (k-1)-itemsets found in the previous iteration. The candidate generation
    procedure merges a pair of frequent (k-1)-itemsets only if their first k-2
    items are identical.

    Parameters
    ----------
    freq_sets : list
        The list of frequent (k-1)-itemsets.

    k : integer
        The cardinality of the current itemsets being evaluated.

    Returns
    -------
    retlist : list
        The list of merged frequent itemsets.
    """
    retList = []  # list of merged frequent itemsets
    lenLk = len(freq_sets)  # number of frequent itemsets
    freq_sets = [frozenset(sorted(_)) for _ in freq_sets]
    for i in range(lenLk):
        a = list(freq_sets[i])
        F1 = a[: k - 2]  # first k-2 items of freq_sets[i]
        for j in range(i + 1, lenLk):
            b = list(freq_sets[j])
            F2 = b[: k - 2]  # first k-2 items of freq_sets[j]

            if F1 == F2:  # if the first k-2 items are identical
                # Merge the frequent itemsets.
                retList.append(freq_sets[i] | freq_sets[j])

    return retList


def main():
    dataset = np.random.exponential(scale=10, size=(1000, 10)).astype(np.int64)
    # print(dataset)
    r = apriori(dataset, min_support=0.0, min_hconf=0.5, verbose=False)
    pprint(r[0][1:])
    print(sum([len(_) for _ in r[0]]))


if __name__ == '__main__':
    main()
	import datetime
	from pprint import pprint
	from collections import defaultdict

	import numpy as np
	import pandas as pd


	def apriori(dataset, min_support=0.5, min_hconf=0.5, verbose=False):
	"""Implements the Apriori algorithm.

	The Apriori algorithm will iteratively generate new candidate
	k-itemsets using the frequent (k-1)-itemsets found in the previous
	iteration.

	Parameters
	----------
	dataset : list
	The dataset (a list of transactions) from which to generate
	candidate itemsets.

	min_support : float
	The minimum support threshold. Defaults to 0.5.

	Returns
	-------
	F : list
	The list of frequent itemsets.

	support_data : dict
	The support data for all candidate itemsets.

	References
	----------
	.. [1] R. Agrawal, R. Srikant, "Fast Algorithms for Mining Association
	Rules", 1994.

	"""
	C1 = create_candidates(dataset) # generate 1-item sets
	D = list(map(set, dataset))
	support_data = {}
	# prune candidate 1-itemsets
	F1 = support_prune(D, C1, min_support, min_hconf,
	support_data, verbose=verbose)
	F = [F1] # list of frequent itemsets; initialized to frequent 1-itemsets
	k = 2 # the itemset cardinality
	while (len(F[k - 2]) > 0):
	Ck = apriori_gen(F[k - 2], k) # generate candidate itemsets
	# prune candidate itemsets
	Fk = support_prune(D, Ck, min_support, min_hconf,
	support_data, verbose=verbose)
	# add the pruned candidate itemsets to the list of frequent itemsets
	F.append(Fk)
	k += 1

	if verbose:
	# Print a list of all the frequent itemsets.
	for kset in F:
	for item in kset:
	print(item, round(support_data[item], 3))

	return F, support_data


	def create_candidates(dataset, verbose=False):
	"""Creates a list of candidate 1-itemsets from a list of transactions.

	Parameters
	----------
	dataset : list
	The dataset (a list of transactions) from which to generate candidate
	itemsets.

	Returns
	-------
	The list of candidate itemsets (c1) passed as a frozenset (a set that is
	immutable and hashable).
	"""
	c1 = set() # list of all items in the database of transactions
	for transaction in dataset:
	c1.update([frozenset([i]) for i in transaction])

	if verbose:
	# Print a list of all the candidate items.
	pprint(c1)
	# Map c1 to a frozenset because it will be the key of a dictionary.
	return c1


	def support_prune(dataset, candidates, min_support, min_hconf,
	support_data, verbose=False):
	"""Returns all candidate itemsets that meet a minimum support threshold.

	By the apriori principle, if an itemset is frequent, then all of its
	subsets must also be frequent. As a result, we can perform support-based
	pruning to systematically control the exponential growth of candidate
	itemsets. Thus, itemsets that do not meet the minimum support level are
	pruned from the input list of itemsets (dataset).

	Parameters
	----------
	dataset : list
	The dataset (a list of transactions) from which to generate candidate
	itemsets.

	candidates : frozenset
	The list of candidate itemsets.

	min_support : float
	The minimum support threshold.

	support_data : dict
	The support data for all candidate itemsets.

	Returns
	-------
	retlist : list
	The list of frequent itemsets.

	"""

	# updata support data
	sscnt = defaultdict(lambda: 0) # set for support counts
	for tid in dataset:
	for cand in candidates:
	if cand.issubset(tid):
	sscnt[cand] += 1

	# total number of transactions in the dataset
	n_all_items = len(dataset)
	retlist = [] # array for unpruned itemsets
	for cand, n_supp in sscnt.items():
	# Calculate the support of itemset cand.
	support = n_supp / n_all_items

	if len(cand) > 1:
	# Calculate h-confidence
	hconf = support / max([support_data[frozenset([_])] for _ in cand])
	else:
	hconf = 1

	if support >= min_support and hconf >= min_hconf:
	retlist.insert(0, cand)

	support_data[cand] = support

	# Print a list of the pruned itemsets.
	if verbose:
	pprint(retlist)

	for cand, cnt in sscnt.items():
	print(cand, cnt, support_data[cand])

	return retlist


	def apriori_gen(freq_sets, k):
	"""Generates candidate itemsets (via the F_k-1 x F_k-1 method).

	This operation generates new candidate k-itemsets based on the frequent
	(k-1)-itemsets found in the previous iteration. The candidate generation
	procedure merges a pair of frequent (k-1)-itemsets only if their first k-2
	items are identical.

	Parameters
	----------
	freq_sets : list
	The list of frequent (k-1)-itemsets.

	k : integer
	The cardinality of the current itemsets being evaluated.

	Returns
	-------
	retlist : list
	The list of merged frequent itemsets.
	"""
	retList = [] # list of merged frequent itemsets
	lenLk = len(freq_sets) # number of frequent itemsets
	freq_sets = [frozenset(sorted(_)) for _ in freq_sets]
	for i in range(lenLk):
	a = list(freq_sets[i])
	F1 = a[: k - 2] # first k-2 items of freq_sets[i]
	for j in range(i + 1, lenLk):
	b = list(freq_sets[j])
	F2 = b[: k - 2] # first k-2 items of freq_sets[j]

	if F1 == F2: # if the first k-2 items are identical
	# Merge the frequent itemsets.
	retList.append(freq_sets[i] \| freq_sets[j])

	return retList


	def main():
	dataset = np.random.exponential(scale=10, size=(1000, 10)).astype(np.int64)
	# print(dataset)
	r = apriori(dataset, min_support=0.0, min_hconf=0.5, verbose=False)
	pprint(r[0][1:])
	print(sum([len(_) for _ in r[0]]))


	if __name__ == '__main__':
	main()