@cs
Created June 11, 2012 08:34
Implementation of the Apriori algorithm (as presented in the lecture "Knowledge Discovery in Databases" at LMU Munich) in Haskell, including HUnit tests for frequent itemset mining, association rule mining, closed frequent itemsets, and maximal frequent itemsets.
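A possible way to run it (assuming GHC with the HUnit package installed; the file name apriori.hs is just a placeholder):

  $ runghc apriori.hs   # main runs all HUnit tests
  $ ghci apriori.hs     # then call e.g. frequentItemsetMining 0.3 interactively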
module Main where
import qualified Data.List as List
import qualified Data.Set as Set
import Test.HUnit
main :: IO ()
main = do { runTestTT allTests ; return () }
data Item = Item Char deriving (Eq, Ord)
instance Show Item where
  show (Item c) = [c]
data Itemset = Itemset (Set.Set Item) deriving (Eq, Ord)
instance Show Itemset where
  show (Itemset is) = let helper = foldl (++) "" $ map show $ Set.toList is in
    "{" ++ (List.intersperse ',' helper) ++ "}"
type Support = Double
type Confidence = Double
data AssociationRule = AssociationRule Itemset Itemset Support Confidence deriving (Eq, Ord)
instance Show AssociationRule where
  show (AssociationRule x y s c) = (show x) ++ " => " ++ (show y) ++ " (" ++ (show s) ++ ", " ++ (show c) ++ ")"
  showList [] s = "[]" ++ s
  showList (x:xs) s = "[\n " ++ (show x) ++ (showl xs)
    where
      showl [] = "\n]" ++ s
      showl (y:ys) = ",\n " ++ show y ++ showl ys
itemsetFromString :: String -> Itemset
itemsetFromString s = Itemset $ Set.fromList $ map Item s
itemsetsFromStrings :: [String] -> [Itemset]
itemsetsFromStrings ss = map itemsetFromString ss
allItems :: Itemset
allItems = itemsetFromString "ABCDEFGHIKLM"
allItemsCount :: Itemset -> Int
allItemsCount (Itemset is) = Set.size is
oneItemsets :: [Itemset]
oneItemsets = let asList (Itemset is) = Set.toList is in
  map (\i -> Itemset $ Set.fromList [i]) $ asList allItems
transactions :: [Itemset]
transactions = itemsetsFromStrings ["BEGH", "ABCEGH", "ABCEFH", "BCDEFGHL", "ABEKH", "BEFGHIK", "ABDGH", "ABDG", "BDFG", "CEF", "ACEFH", "ABEG"]
transactionCount = length transactions
supportCount :: Itemset -> Int
supportCount (Itemset is) = length $ filter (Set.isSubsetOf is) $ map (\(Itemset x) -> x) transactions
support :: Itemset -> Support
support is = (fromIntegral $ supportCount is) / (fromIntegral $ transactionCount)
confidence :: Itemset -> Itemset -> Confidence
confidence lhsis@(Itemset lhs) (Itemset rhs) = (support $ Itemset $ lhs `Set.union` rhs) / (support lhsis)
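-- A quick sanity check in GHCi (the values follow from the 12 transactions above
-- and agree with the unit tests below): confidence(X => Y) = support(X ∪ Y) / support(X);
-- 4 of the 12 transactions contain all of B, E, G and H, and every transaction
-- containing E, G and H also contains B.
-- >>> support $ itemsetFromString "BEGH"
-- 0.3333333333333333
-- >>> confidence (itemsetFromString "EGH") (itemsetFromString "B")
-- 1.0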
------------------------------------------------------------------------
-- Utilities
------------------------------------------------------------------------
allSubsets :: (Ord a) => Set.Set a -> [Set.Set a]
allSubsets s = map Set.fromList $ List.subsequences $ Set.toList s
allMaximalProperSubsets :: (Ord a) => Set.Set a -> [Set.Set a]
allMaximalProperSubsets s = filter (\x -> (Set.size x) == (Set.size s) - 1) $ allSubsets s
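-- A small illustration in GHCi: for a 3-element set, the maximal proper subsets
-- are exactly its three 2-element subsets.
-- >>> allMaximalProperSubsets $ Set.fromList "ABC"
-- [fromList "AB",fromList "AC",fromList "BC"]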
------------------------------------------------------------------------
-- Frequent Itemset Mining (FIM)
-- Usage: frequentItemsetMining 0.3
------------------------------------------------------------------------
frequentItemsetMining :: Support -> [Itemset]
frequentItemsetMining ms = fim oneItemsets
  where
    fim :: [Itemset] -> [Itemset]
    fim [] = []
    fim iss = let piss = prune iss in piss ++ (fim $ frequentItemsetMiningCandidates piss)
    -- prune is correct but not efficient: it scans all transactions once for each
    -- candidate. The same counts could be obtained with a single pass over the
    -- transactions (see the sketch below this definition); that optimisation is not
    -- implemented here.
    prune :: [Itemset] -> [Itemset]
    prune iss = filter checkMinSupport iss
    checkMinSupport is = support is >= ms
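-- A minimal sketch (not used above; the name supportCountsSinglePass is made up
-- here) of the single-pass counting mentioned in the comment on prune: fold over
-- the transactions exactly once and, for each transaction, bump the count of every
-- candidate it contains.
supportCountsSinglePass :: [Itemset] -> [(Itemset, Int)]
supportCountsSinglePass candidates = foldl countIn [(c, 0) | c <- candidates] transactions
  where
    -- one step of the fold: update all candidate counts for a single transaction
    countIn counts (Itemset t) =
      [ (c, if cs `Set.isSubsetOf` t then n + 1 else n) | (c@(Itemset cs), n) <- counts ]
-- e.g. supportCountsSinglePass oneItemsets yields the absolute support count of
-- every 1-itemset after a single pass over the transaction list.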
frequentItemsetMiningCandidates :: [Itemset] -> [Itemset]
frequentItemsetMiningCandidates iss =
  -- the two (k-1)-itemsets should differ in exactly 1 element before joining:
  let validateCandidate a b = (Set.size $ a `Set.difference` b) == 1 in
  let selfJoin = [Itemset (a `Set.union` b) | (Itemset a) <- iss, (Itemset b) <- iss, validateCandidate a b] in
  -- Apriori pruning: keep a candidate only if all of its maximal proper subsets are frequent
  let allSubsetsFrequent (Itemset s) = all (\x -> (Itemset x) `elem` iss) (allMaximalProperSubsets s) in
  List.nub $ filter allSubsetsFrequent selfJoin
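-- A quick check in GHCi (a hand-picked example; the full case is covered by
-- fimCandidatesTest below): joining the frequent 2-itemsets {A,B}, {A,E} and {B,E}
-- yields the single candidate {A,B,E}, which survives pruning because all of its
-- 2-element subsets are frequent.
-- >>> frequentItemsetMiningCandidates $ itemsetsFromStrings ["AB", "AE", "BE"]
-- [{A,B,E}]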
------------------------------------------------------------------------
-- Association Rule Mining (ARM)
-- Usage: associationRuleMining 0.6 $ frequentItemsetMining 0.3
------------------------------------------------------------------------
associationRuleMining :: Confidence -> [Itemset] -> [AssociationRule]
associationRuleMining mc iss = filter checkMinConfidence $ allRules iss
  where
    allRules :: [Itemset] -> [AssociationRule]
    allRules [] = []
    allRules ((Itemset is):iss) =
      let validSubsets = filter (\x -> (not $ Set.null x) && x /= is) $ allSubsets is in
      let lhss = map (\subset -> Itemset subset) validSubsets in
      let rhss = map (\(Itemset lhs) -> Itemset $ Set.difference is lhs) lhss in
      let newRule (lhs, rhs) = AssociationRule lhs rhs (support lhs) (confidence lhs rhs) in
      (map newRule $ zip lhss rhss) ++ (allRules iss)
    checkMinConfidence (AssociationRule _ _ _ c) = c >= mc
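-- A quick check in GHCi (numbers taken from armCompareWithOfficialSolution below):
-- the frequent itemset {B,E,G,H} has 14 candidate rules, of which 6 reach a minimum
-- confidence of 0.6.
-- >>> length $ associationRuleMining 0.6 [itemsetFromString "BEGH"]
-- 6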
------------------------------------------------------------------------
-- Closed Frequent Itemsets (CFI) - A frequent itemset X is called
-- closed if there exists no frequent proper superset Y ⊃ X
-- with support(X) = support(Y).
-- Usage: closedFrequentItemsets $ frequentItemsetMining 0.3
------------------------------------------------------------------------
closedFrequentItemsets :: [Itemset] -> [Itemset]
closedFrequentItemsets iss = filter (not . hasSupersetWithEqualSupport) iss
  where
    hasSupersetWithEqualSupport :: Itemset -> Bool
    hasSupersetWithEqualSupport is@(Itemset s) =
      let equalSupport x y = (supportCount x) == (supportCount y) in
      any (\ix@(Itemset x) -> s `Set.isProperSubsetOf` x && (equalSupport ix is)) iss
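-- For example (consistent with cfiCompareWithOfficialSolution below): {E,G,H} is
-- frequent at minimum support 0.3 but not closed, because its frequent proper
-- superset {B,E,G,H} has the same support (4 of the 12 transactions).
-- >>> itemsetFromString "EGH" `elem` (closedFrequentItemsets $ frequentItemsetMining 0.3)
-- False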
------------------------------------------------------------------------
-- Maximal Frequent Itemsets (MFI) - A frequent itemset is called
-- maximal if it is not a subset of any other frequent itemset.
-- Usage: maximalFrequentItemsets $ frequentItemsetMining 0.3
------------------------------------------------------------------------
maximalFrequentItemsets :: [Itemset] -> [Itemset]
maximalFrequentItemsets iss = filter (not . isSubset) iss
  where
    isSubset :: Itemset -> Bool
    isSubset (Itemset is) = any (\(Itemset x) -> is `Set.isProperSubsetOf` x) iss
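-- For example (consistent with mfiCompareWithOfficialSolution below): {B,E,G} is
-- frequent but not maximal, since {B,E,G,H} is also frequent; {B,E,G,H} itself has
-- no frequent superset and is therefore maximal.
-- >>> itemsetFromString "BEGH" `elem` (maximalFrequentItemsets $ frequentItemsetMining 0.3)
-- True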
------------------------------------------------------------------------
-- Unit Tests
------------------------------------------------------------------------
allTests = TestList [fimTests, armTests, cfiTests, mfiTests]
------------------------------------------------------------------------
-- Unit Tests for FIM
------------------------------------------------------------------------
fimTests = TestLabel "Frequent Itemset Mining" $ TestList [fimCompareWithOfficialSolution, fimCandidatesTest]
fimCompareWithOfficialSolution = (List.sort expected) ~=? (List.sort actual)
  where
    expected = itemsetsFromStrings $ expected1 ++ expected2 ++ expected3 ++ expected4
    expected1 = ["A", "B", "C", "D", "E", "F", "G", "H"]
    expected2 = ["AB", "AE", "AG", "AH", "BD", "BE", "BF", "BG", "BH", "CE", "CF", "CH", "DG", "EF", "EG", "EH", "FH", "GH"]
    expected3 = ["ABE", "ABG", "ABH", "AEH", "BDG", "BEG", "BEH", "BGH", "CEF", "CEH", "EFH", "EGH"]
    expected4 = ["BEGH"]
    actual = frequentItemsetMining 0.3
fimCandidatesTest = (List.sort expected) ~=? (List.sort actual)
  where
    expected = itemsetsFromStrings ["ABE", "ABG", "ABH", "AEG", "AEH", "AGH", "BDG", "BEF", "BEG", "BEH", "BFH", "BGH", "CEF", "CEH", "CFH", "EFH", "EGH"]
    actual = frequentItemsetMiningCandidates $ itemsetsFromStrings ["AB", "AE", "AG", "AH", "BD", "BE", "BF", "BG", "BH", "CE", "CF", "CH", "DG", "EF", "EG", "EH", "FH", "GH"]
------------------------------------------------------------------------
-- Unit Tests for ARM
------------------------------------------------------------------------
armTests = TestLabel "Association Rule Mining" $ TestList [armCompareWithOfficialSolution, armAllRules]
armCompareWithOfficialSolution = (List.sort expected) ~=? (List.sort actual)
  where
    expected = [newRule "BEG" "H",
                newRule "EG" "BH",
                newRule "BEH" "G",
                newRule "BGH" "E",
                newRule "GH" "BE",
                newRule "EGH" "B"]
    newRule x y = let xs = itemsetFromString x in
                  let ys = itemsetFromString y in
                  AssociationRule xs ys (support xs) (confidence xs ys)
    actual = associationRuleMining 0.6 $ [itemsetFromString "BEGH"]
armAllRules = (List.sort expected) ~=? (List.sort actual)
  where
    expected = [newRule "BEG" "H",
                newRule "BE" "GH",
                newRule "BG" "EH",
                newRule "B" "EGH", -- this rule is missing from the official solution
                newRule "EG" "BH",
                newRule "E" "BGH",
                newRule "G" "BEH",
                newRule "BEH" "G",
                newRule "BH" "EG",
                newRule "EH" "BG",
                newRule "BGH" "E",
                newRule "GH" "BE",
                newRule "H" "BEG",
                newRule "EGH" "B"]
    newRule x y = let xs = itemsetFromString x in
                  let ys = itemsetFromString y in
                  AssociationRule xs ys (support xs) (confidence xs ys)
    actual = associationRuleMining 0.0 $ [itemsetFromString "BEGH"]
------------------------------------------------------------------------
-- Unit Tests for CFI
------------------------------------------------------------------------
cfiTests = TestLabel "Closed Frequent Itemsets" $ TestList [cfiCompareWithOfficialSolution]
cfiCompareWithOfficialSolution = (List.sort expected) ~=? (List.sort actual)
  where
    expected = itemsetsFromStrings $ expected1 ++ expected2 ++ expected3 ++ expected4
    expected1 = ["A", "B", "E", "F", "H"]
    expected2 = ["AB", "AE", "AH", "BE", "BF", "BG", "BH", "CE", "EF", "EH"]
    expected3 = ["ABE", "ABG", "ABH", "AEH", "BDG", "BEG", "BEH", "BGH", "CEF", "CEH", "EFH"]
    expected4 = ["BEGH"]
    actual = closedFrequentItemsets $ frequentItemsetMining 0.3
------------------------------------------------------------------------
-- Unit Tests for MFI
------------------------------------------------------------------------
mfiTests = TestLabel "Maximal Frequent Itemsets" $ TestList [mfiCompareWithOfficialSolution]
mfiCompareWithOfficialSolution = (List.sort expected) ~=? (List.sort actual)
  where
    expected = itemsetsFromStrings $ expected2 ++ expected3 ++ expected4
    expected2 = ["BF"]
    expected3 = ["ABE", "ABG", "ABH", "AEH", "BDG", "CEF", "CEH", "EFH"]
    expected4 = ["BEGH"]
    actual = maximalFrequentItemsets $ frequentItemsetMining 0.3