public
Last active

Apriori.py

  • Download Gist
Apriori.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
# -*- coding: utf-8 -*-
 
 
def load_dataset():
    """Return the small sample dataset used to demonstrate Apriori.

    Returns:
        A list of four transactions, each a list of integer item ids.
    """
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
 
 
def createC1(dataset):
    """Create the list of candidate item sets of size one.

    Args:
        dataset: An iterable of transactions, each an iterable of hashable,
            mutually sortable items.

    Returns:
        A list of single-item frozensets, one per distinct item, ordered by
        ascending item value.
    """
    # A set gives O(1) de-duplication instead of scanning a list per item.
    unique_items = {item for transaction in dataset for item in transaction}
    # frozenset because each candidate will be used as a key of a dictionary.
    # A real list (not a lazy ``map`` object) is returned because the
    # candidates are iterated once per transaction by the caller.
    return [frozenset([item]) for item in sorted(unique_items)]
 
 
def scanD(dataset, candidates, min_support):
    """Return all candidates that meet a minimum support level.

    Args:
        dataset: An iterable of transactions; each transaction must work as
            the argument of ``frozenset.issubset`` (e.g. a set).
        candidates: An iterable of frozensets to count.
        min_support: Minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        A tuple ``(retlist, support_data)``. ``retlist`` is the list of
        candidates whose support is at least ``min_support``.
        ``support_data`` maps every candidate that occurred in at least one
        transaction to its support value.
    """
    # Materialise both inputs: the dataset is measured with len() and the
    # candidates are walked once per transaction, so one-shot iterators
    # (such as ``map`` objects on Python 3) would silently be exhausted.
    dataset = list(dataset)
    candidates = list(candidates)

    sscnt = {}
    for tid in dataset:
        for can in candidates:
            if can.issubset(tid):
                sscnt[can] = sscnt.get(can, 0) + 1

    num_items = float(len(dataset))
    retlist = []
    support_data = {}
    for key, count in sscnt.items():
        support = count / num_items
        if support >= min_support:
            retlist.append(key)
        support_data[key] = support
    # The original prepended each hit; reversing once yields the same order.
    retlist.reverse()
    return retlist, support_data
 
 
def aprioriGen(freq_sets, k):
    """Generate candidate item sets of size ``k`` by joining frequent sets.

    Two sets are joined (unioned) when their first ``k - 2`` items, taken in
    sorted order, are identical.

    Args:
        freq_sets: A sequence of frozensets, expected to be of size ``k - 1``.
        k: Size of the item sets to generate.

    Returns:
        A list of frozensets, one per joined pair.
    """
    freq_sets = list(freq_sets)
    # Sort BEFORE slicing: set iteration order is arbitrary, so slicing first
    # could compare unrelated items and miss valid joins.
    prefixes = [sorted(items)[:k - 2] for items in freq_sets]

    retList = []
    lenLk = len(freq_sets)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            if prefixes[i] == prefixes[j]:
                retList.append(freq_sets[i] | freq_sets[j])
    return retList
 
 
def apriori(dataset, minsupport=0.5):
    """Find the frequent item sets of ``dataset`` level by level.

    Args:
        dataset: An iterable of transactions, each an iterable of items.
        minsupport: Minimum support passed to ``scanD`` at every level.

    Returns:
        A tuple ``(L, support_data)``. ``L[i]`` is the list of item sets kept
        by ``scanD`` at level ``i + 1``; the final entry of ``L`` is the empty
        list that ended the loop. ``support_data`` accumulates the support
        dictionaries returned by ``scanD`` for every level.
    """
    # Both collections are scanned repeatedly, so they must be real lists:
    # on Python 3 a ``map`` object is exhausted after the first pass.
    D = [set(transaction) for transaction in dataset]
    C1 = list(createC1(dataset))
    L1, support_data = scanD(D, C1, minsupport)
    L = [L1]
    k = 2
    # L[k - 2] holds the sets of size k - 1; stop once a level comes back empty.
    while L[k - 2]:
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minsupport)
        support_data.update(supK)
        L.append(Lk)
        k += 1

    return L, support_data

I think line 27 should be moved between line 25 and line 26, because each item's value in sscnt should be initialised to 0 with setdefault even if it does not appear in any transaction of the dataset.

sscnt = defaultdict(int), which yields 0, or defaultdict(lambda: 0), would be a little cleaner and more idiomatic.
Overall though, awesome article and gist

Why is it necessary to set k := 2 (line 62) and then use k - 2 everywhere (lines 63, 64, 47, 48)? Why not just set k := 0?

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.