@cs
Created June 11, 2012 08:34
Implementation of the Apriori algorithm (as presented in the lecture "Knowledge Discovery in Databases" at LMU Munich) in Haskell, including HUnit tests for frequent itemset mining, association rule mining, closed frequent itemsets, and maximal frequent itemsets.
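A possible way to run it (assuming GHC with the HUnit package installed; the file name apriori.hs is just a placeholder):

  $ runghc apriori.hs   # main runs all HUnit tests
  $ ghci apriori.hs     # then call e.g. frequentItemsetMining 0.3 interactively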
module Main where
import qualified Data.List as List
import qualified Data.Set as Set
import Test.HUnit
main :: IO ()
main = do { runTestTT allTests ; return () }
data Item = Item Char deriving (Eq, Ord)
instance Show Item where
  show (Item c) = [c]
data Itemset = Itemset (Set.Set Item) deriving (Eq, Ord)
instance Show Itemset where
  show (Itemset is) = let helper = foldl (++) "" $ map show $ Set.toList is in
    "{" ++ (List.intersperse ',' helper) ++ "}"
type Support = Double
type Confidence = Double
data AssociationRule = AssociationRule Itemset Itemset Support Confidence deriving (Eq, Ord)
instance Show AssociationRule where
  show (AssociationRule x y s c) = (show x) ++ " => " ++ (show y) ++ " (" ++ (show s) ++ ", " ++ (show c) ++ ")"
  showList [] s = "[]" ++ s
  showList (x:xs) s = "[\n " ++ (show x) ++ (showl xs)
    where
      showl [] = "\n]" ++ s
      showl (y:ys) = ",\n " ++ show y ++ showl ys
itemsetFromString :: String -> Itemset
itemsetFromString s = Itemset $ Set.fromList $ map Item s
itemsetsFromStrings :: [String] -> [Itemset]
itemsetsFromStrings ss = map itemsetFromString ss
allItems :: Itemset
allItems = itemsetFromString "ABCDEFGHIKLM"
allItemsCount :: Itemset -> Int
allItemsCount (Itemset is) = Set.size is
oneItemsets :: [Itemset]
oneItemsets = let asList (Itemset is) = Set.toList is in
  map (\i -> Itemset $ Set.fromList [i]) $ asList allItems
transactions :: [Itemset]
transactions = itemsetsFromStrings ["BEGH", "ABCEGH", "ABCEFH", "BCDEFGHL", "ABEKH", "BEFGHIK", "ABDGH", "ABDG", "BDFG", "CEF", "ACEFH", "ABEG"]
transactionCount = length transactions
supportCount :: Itemset -> Int
supportCount (Itemset is) = length $ filter (Set.isSubsetOf is) $ map (\(Itemset x) -> x) transactions
support :: Itemset -> Support
support is = (fromIntegral $ supportCount is) / (fromIntegral $ transactionCount)
confidence :: Itemset -> Itemset -> Confidence
confidence lhsis@(Itemset lhs) (Itemset rhs) = (support $ Itemset $ lhs `Set.union` rhs) / (support lhsis)
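-- A quick sanity check in GHCi (the values follow from the 12 transactions above
-- and agree with the unit tests below): confidence(X => Y) = support(X ∪ Y) / support(X);
-- 4 of the 12 transactions contain all of B, E, G and H, and every transaction
-- containing E, G and H also contains B.
-- >>> support $ itemsetFromString "BEGH"
-- 0.3333333333333333
-- >>> confidence (itemsetFromString "EGH") (itemsetFromString "B")
-- 1.0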
------------------------------------------------------------------------
-- Utilities
------------------------------------------------------------------------
allSubsets :: (Ord a) => Set.Set a -> [Set.Set a]
allSubsets s = map Set.fromList $ List.subsequences $ Set.toList s
allMaximalProperSubsets :: (Ord a) => Set.Set a -> [Set.Set a]
allMaximalProperSubsets s = filter (\x -> (Set.size x) == (Set.size s) - 1) $ allSubsets s
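-- A small illustration in GHCi: for a 3-element set, the maximal proper subsets
-- are exactly its three 2-element subsets.
-- >>> allMaximalProperSubsets $ Set.fromList "ABC"
-- [fromList "AB",fromList "AC",fromList "BC"]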
------------------------------------------------------------------------
-- Frequent Itemset Mining (FIM)
-- Usage: frequentItemsetMining 0.3
------------------------------------------------------------------------
frequentItemsetMining :: Support -> [Itemset]
frequentItemsetMining ms = fim oneItemsets
  where
    fim :: [Itemset] -> [Itemset]
    fim [] = []
    fim iss = let piss = prune iss in piss ++ (fim $ frequentItemsetMiningCandidates piss)
    -- prune is correct but not efficient: it scans all transactions once for each
    -- candidate. The same counts could be obtained with a single pass over the
    -- transactions (see the sketch below this definition); that optimisation is not
    -- implemented here.
    prune :: [Itemset] -> [Itemset]
    prune iss = filter checkMinSupport iss
    checkMinSupport is = support is >= ms
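-- A minimal sketch (not used above; the name supportCountsSinglePass is made up
-- here) of the single-pass counting mentioned in the comment on prune: fold over
-- the transactions exactly once and, for each transaction, bump the count of every
-- candidate it contains.
supportCountsSinglePass :: [Itemset] -> [(Itemset, Int)]
supportCountsSinglePass candidates = foldl countIn [(c, 0) | c <- candidates] transactions
  where
    -- one step of the fold: update all candidate counts for a single transaction
    countIn counts (Itemset t) =
      [ (c, if cs `Set.isSubsetOf` t then n + 1 else n) | (c@(Itemset cs), n) <- counts ]
-- e.g. supportCountsSinglePass oneItemsets yields the absolute support count of
-- every 1-itemset after a single pass over the transaction list.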
frequentItemsetMiningCandidates :: [Itemset] -> [Itemset]
frequentItemsetMiningCandidates iss =
  -- the two (k-1)-itemsets should differ in exactly 1 element before joining:
  let validateCandidate a b = (Set.size $ a `Set.difference` b) == 1 in
  let selfJoin = [Itemset (a `Set.union` b) | (Itemset a) <- iss, (Itemset b) <- iss, validateCandidate a b] in
  -- Apriori pruning: keep a candidate only if all of its maximal proper subsets are frequent
  let allSubsetsFrequent (Itemset s) = all (\x -> (Itemset x) `elem` iss) (allMaximalProperSubsets s) in
  List.nub $ filter allSubsetsFrequent selfJoin
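-- A quick check in GHCi (a hand-picked example; the full case is covered by
-- fimCandidatesTest below): joining the frequent 2-itemsets {A,B}, {A,E} and {B,E}
-- yields the single candidate {A,B,E}, which survives pruning because all of its
-- 2-element subsets are frequent.
-- >>> frequentItemsetMiningCandidates $ itemsetsFromStrings ["AB", "AE", "BE"]
-- [{A,B,E}]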
------------------------------------------------------------------------
-- Association Rule Mining (ARM)
-- Usage: associationRuleMining 0.6 $ frequentItemsetMining 0.3
------------------------------------------------------------------------
associationRuleMining :: Confidence -> [Itemset] -> [AssociationRule]
associationRuleMining mc iss = filter checkMinConfidence $ allRules iss
  where
    allRules :: [Itemset] -> [AssociationRule]
    allRules [] = []
    allRules ((Itemset is):iss) =
      let validSubsets = filter (\x -> (not $ Set.null x) && x /= is) $ allSubsets is in
      let lhss = map (\subset -> Itemset subset) validSubsets in
      let rhss = map (\(Itemset lhs) -> Itemset $ Set.difference is lhs) lhss in
      let newRule (lhs, rhs) = AssociationRule lhs rhs (support lhs) (confidence lhs rhs) in
      (map newRule $ zip lhss rhss) ++ (allRules iss)
    checkMinConfidence (AssociationRule _ _ _ c) = c >= mc
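-- A quick check in GHCi (numbers taken from armCompareWithOfficialSolution below):
-- the frequent itemset {B,E,G,H} has 14 candidate rules, of which 6 reach a minimum
-- confidence of 0.6.
-- >>> length $ associationRuleMining 0.6 [itemsetFromString "BEGH"]
-- 6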
------------------------------------------------------------------------
-- Closed Frequent Itemsets (CFI) - A frequent itemset X is called
-- closed if there exists no frequent proper superset Y ⊃ X
-- with support(X) = support(Y).
-- Usage: closedFrequentItemsets $ frequentItemsetMining 0.3
------------------------------------------------------------------------
closedFrequentItemsets :: [Itemset] -> [Itemset]
closedFrequentItemsets iss = filter (not . hasSupersetWithEqualSupport) iss
  where
    hasSupersetWithEqualSupport :: Itemset -> Bool
    hasSupersetWithEqualSupport is@(Itemset s) =
      let equalSupport x y = (supportCount x) == (supportCount y) in
      any (\ix@(Itemset x) -> s `Set.isProperSubsetOf` x && (equalSupport ix is)) iss
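-- For example (consistent with cfiCompareWithOfficialSolution below): {E,G,H} is
-- frequent at minimum support 0.3 but not closed, because its frequent proper
-- superset {B,E,G,H} has the same support (4 of the 12 transactions).
-- >>> itemsetFromString "EGH" `elem` (closedFrequentItemsets $ frequentItemsetMining 0.3)
-- False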
------------------------------------------------------------------------
-- Maximal Frequent Itemsets (MFI) - A frequent itemset is called
-- maximal if it is not a subset of any other frequent itemset.
-- Usage: maximalFrequentItemsets $ frequentItemsetMining 0.3
------------------------------------------------------------------------
maximalFrequentItemsets :: [Itemset] -> [Itemset]
maximalFrequentItemsets iss = filter (not . isSubset) iss
  where
    isSubset :: Itemset -> Bool
    isSubset (Itemset is) = any (\(Itemset x) -> is `Set.isProperSubsetOf` x) iss
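-- For example (consistent with mfiCompareWithOfficialSolution below): {B,E,G} is
-- frequent but not maximal, since {B,E,G,H} is also frequent; {B,E,G,H} itself has
-- no frequent superset and is therefore maximal.
-- >>> itemsetFromString "BEGH" `elem` (maximalFrequentItemsets $ frequentItemsetMining 0.3)
-- True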
------------------------------------------------------------------------
-- Unit Tests
------------------------------------------------------------------------
allTests = TestList [fimTests, armTests, cfiTests, mfiTests]
------------------------------------------------------------------------
-- Unit Tests for FIM
------------------------------------------------------------------------
fimTests = TestLabel "Frequent Itemset Mining" $ TestList [fimCompareWithOfficialSolution, fimCandidatesTest]
fimCompareWithOfficialSolution = (List.sort expected) ~=? (List.sort actual)
  where
    expected = itemsetsFromStrings $ expected1 ++ expected2 ++ expected3 ++ expected4
    expected1 = ["A", "B", "C", "D", "E", "F", "G", "H"]
    expected2 = ["AB", "AE", "AG", "AH", "BD", "BE", "BF", "BG", "BH", "CE", "CF", "CH", "DG", "EF", "EG", "EH", "FH", "GH"]
    expected3 = ["ABE", "ABG", "ABH", "AEH", "BDG", "BEG", "BEH", "BGH", "CEF", "CEH", "EFH", "EGH"]
    expected4 = ["BEGH"]
    actual = frequentItemsetMining 0.3
fimCandidatesTest = (List.sort expected) ~=? (List.sort actual)
  where
    expected = itemsetsFromStrings ["ABE", "ABG", "ABH", "AEG", "AEH", "AGH", "BDG", "BEF", "BEG", "BEH", "BFH", "BGH", "CEF", "CEH", "CFH", "EFH", "EGH"]
    actual = frequentItemsetMiningCandidates $ itemsetsFromStrings ["AB", "AE", "AG", "AH", "BD", "BE", "BF", "BG", "BH", "CE", "CF", "CH", "DG", "EF", "EG", "EH", "FH", "GH"]
------------------------------------------------------------------------
-- Unit Tests for ARM
------------------------------------------------------------------------
armTests = TestLabel "Association Rule Mining" $ TestList [armCompareWithOfficialSolution, armAllRules]
armCompareWithOfficialSolution = (List.sort expected) ~=? (List.sort actual)
  where
    expected = [newRule "BEG" "H",
                newRule "EG" "BH",
                newRule "BEH" "G",
                newRule "BGH" "E",
                newRule "GH" "BE",
                newRule "EGH" "B"]
    newRule x y = let xs = itemsetFromString x in
                  let ys = itemsetFromString y in
                  AssociationRule xs ys (support xs) (confidence xs ys)
    actual = associationRuleMining 0.6 $ [itemsetFromString "BEGH"]
armAllRules = (List.sort expected) ~=? (List.sort actual)
  where
    expected = [newRule "BEG" "H",
                newRule "BE" "GH",
                newRule "BG" "EH",
                newRule "B" "EGH", -- this rule is missing from the official solution
                newRule "EG" "BH",
                newRule "E" "BGH",
                newRule "G" "BEH",
                newRule "BEH" "G",
                newRule "BH" "EG",
                newRule "EH" "BG",
                newRule "BGH" "E",
                newRule "GH" "BE",
                newRule "H" "BEG",
                newRule "EGH" "B"]
    newRule x y = let xs = itemsetFromString x in
                  let ys = itemsetFromString y in
                  AssociationRule xs ys (support xs) (confidence xs ys)
    actual = associationRuleMining 0.0 $ [itemsetFromString "BEGH"]
------------------------------------------------------------------------
-- Unit Tests for CFI
------------------------------------------------------------------------
cfiTests = TestLabel "Closed Frequent Itemsets" $ TestList [cfiCompareWithOfficialSolution]
cfiCompareWithOfficialSolution = (List.sort expected) ~=? (List.sort actual)
  where
    expected = itemsetsFromStrings $ expected1 ++ expected2 ++ expected3 ++ expected4
    expected1 = ["A", "B", "E", "F", "H"]
    expected2 = ["AB", "AE", "AH", "BE", "BF", "BG", "BH", "CE", "EF", "EH"]
    expected3 = ["ABE", "ABG", "ABH", "AEH", "BDG", "BEG", "BEH", "BGH", "CEF", "CEH", "EFH"]
    expected4 = ["BEGH"]
    actual = closedFrequentItemsets $ frequentItemsetMining 0.3
------------------------------------------------------------------------
-- Unit Tests for MFI
------------------------------------------------------------------------
mfiTests = TestLabel "Maximal Frequent Itemsets" $ TestList [mfiCompareWithOfficialSolution]
mfiCompareWithOfficialSolution = (List.sort expected) ~=? (List.sort actual)
  where
    expected = itemsetsFromStrings $ expected2 ++ expected3 ++ expected4
    expected2 = ["BF"]
    expected3 = ["ABE", "ABG", "ABH", "AEH", "BDG", "CEF", "CEH", "EFH"]
    expected4 = ["BEGH"]
    actual = maximalFrequentItemsets $ frequentItemsetMining 0.3