Skip to content

Instantly share code, notes, and snippets.

@mattpolicastro
Last active January 18, 2017 05:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mattpolicastro/7f4db08545dbe41d354debf997dfd15c to your computer and use it in GitHub Desktop.
Save mattpolicastro/7f4db08545dbe41d354debf997dfd15c to your computer and use it in GitHub Desktop.
################################################################
# This script is a rough hack at the Apriori algorithm for IS 7036, Homework 1, Problem 1.
# The first section uses no validity checking or subsetting which effectively ignores Apriori
# but brute-forces an answer. Following that, there is a semi-validated section which confirms
# the results of the first approach.
#
# Matt Policastro, 2017-01-18 00:21 EST
################################################################
################
# Setup
################
# Project settings/defaults
project <- list(
# Add empty itemsets to store results later
supported_itemsets = list(),
unsupported_itemsets = list(),
# Support and confidence thresholds
params = list(
minsup = .6,
minconf = .8
),
# Define shopping baskets
baskets = list(
T100 = c("M","O","N","K","E","Y"),
T200 = c("D","O","N","K","E","Y"),
T300 = c("M","A","K","E"),
T400 = c("M","U","C","K","Y"),
T500 = c("C","O","O","K","I","E")
)
)
# Remove duplicates items from baskets (really only applies to "COOKIES")
project$baskets <- sapply(project$baskets, unique)
# Collapse baskets to find set of possible, unique items
project$original_items <- as.list(sort(unique(unlist(project$baskets))))
################
# Helper Functions
################
# Count support for each itemset
count_itemsets_support <- function(itemsets, project) {
counts <- sapply(itemsets, function(itemset) {
return(sum(
sapply(project$baskets, function(basket) {
return(all(itemset %in% basket))
})
))
})
names(itemsets) <- counts
return(itemsets)
}
# Generate higher-n itemsets and remove those that contain unsupported subsets
generate_itemsets <- function(new_length, project, check_itemset_validity = T) {
new_itemsets_raw <- t(combn(project$original_items, new_length))
new_itemsets <- list(rep(NA, nrow(new_itemsets_raw)))
for (i in 1:nrow(new_itemsets_raw)) {
new_itemsets[[i]] <- unlist(new_itemsets_raw[i,])
}
# If there are registered, unsupported itemsets in the project
if (length(project$unsupported_itemsets) > 0 && check_itemset_validity) {
# supported <- sapply(new_itemsets, function(new_itemset) {
contains_unsupported <- sapply(new_itemsets, function(new_itemset) {
# Create vector of false values to insert results into
contains_no_unsupported <- rep(F, length(project$unsupported_itemsets))
i <- 1
# Use while loop so it can be short-circuited as soon as an
# unsupported itemset is found
while (i <= length(project$unsupported_itemsets)) {
if (project$unsupported_itemsets[i] %in% new_itemset) {
break
} else {
contains_no_unsupported[i] <- T
i <- i + 1
}
}
# Only return true if no unsupported
if (all(contains_no_unsupported)) {
return(F)
} else {
return(T)
}
})
new_itemsets <- new_itemsets[!contains_unsupported]
}
return(new_itemsets)
}
# Register unsupported itemsets
find_unsupported_itemsets <- function(itemsets, project) {
# Select only itemsets with supports smaller than the necessary threshold
new_unsupported_itemsets <- itemsets[names(itemsets) < length(project$baskets) * project$params$minsup]
# Create list of all unique, unsupported itemsets
all_unsupported_itemsets <- unique(append(
new_unsupported_itemsets,
project$unsupported_itemsets
))
# Return the project object
return(all_unsupported_itemsets)
}
find_supported_itemsets <- function(itemsets, project) {
new_supported_itemsets <- itemsets[!itemsets %in% project$unsupported_itemsets]
new_supported_itemsets_names <- names(new_supported_itemsets)
old_supported_itemsets_names <- names(project$supported_itemsets)
all_supported_itemsets <- unique(append(
new_supported_itemsets,
project$supported_itemsets
))
names(all_supported_itemsets) <- append(new_supported_itemsets_names, old_supported_itemsets_names)
return(all_supported_itemsets)
}
################################
# With no validity checks, computationally "intensive" (not really, these are small sets)
################################
itemsets1 <- generate_itemsets(1, project, check_itemset_validity = F)
itemsets1 <- count_itemsets_support(itemsets1, project)
itemsets <- append(itemsets1, c())
remove(itemsets1)
itemsets2 <- generate_itemsets(2, project, check_itemset_validity = F)
itemsets2 <- count_itemsets_support(itemsets2, project)
itemsets <- append(itemsets2, itemsets)
remove(itemsets2)
itemsets3 <- generate_itemsets(3, project, check_itemset_validity = F)
itemsets3 <- count_itemsets_support(itemsets3, project)
itemsets <- append(itemsets3, itemsets)
remove(itemsets3)
itemsets4 <- generate_itemsets(4, project, check_itemset_validity = F)
itemsets4 <- count_itemsets_support(itemsets4, project)
itemsets <- append(itemsets4, itemsets)
remove(itemsets4)
itemsets_brute <- lapply(itemsets, function(itemset) {paste(itemset, collapse = "") })
itemsets_brute_ordered <- itemsets[order(names(itemsets), decreasing = T)]
# Itemsets at or above minsup
itemsets_brute <- itemsets_brute_ordered[names(itemsets_brute_ordered) >= 3]
# Clean up
remove(itemsets, itemsets_brute_ordered)
################################
# With some validity checks
################################
################
# 1-itemsets
################
# Create 1-itemsets (technically redundant but doing it for clarity)
itemsets1 <- generate_itemsets(1, project)
# Count support for itemsets1
itemsets1 <- count_itemsets_support(itemsets1, project)
# Register all unsupported itemsets into the project settings
project$unsupported_itemsets <- find_unsupported_itemsets(itemsets1, project)
project$supported_itemsets <- find_supported_itemsets(itemsets1, project)
# Clean up itemsets1
remove(itemsets1)
################
# 2-itemsets
################
# Create 2-itemsets
itemsets2 <- generate_itemsets(2, project)
# Count support for itemsets2
itemsets2 <- count_itemsets_support(itemsets2, project)
# Register all unsupported itemsets
project$unsupported_itemsets <- find_unsupported_itemsets(itemsets2, project)
# Filter unsupported itemsets
project$supported_itemsets <- find_supported_itemsets(itemsets2, project)
# Clean up itemsets2
remove(itemsets2)
################
# 3-itemsets
################
# Create 3-itemsets
itemsets3 <- generate_itemsets(3, project)
# Count support for itemsets2
itemsets3 <- count_itemsets_support(itemsets3, project)
# Register all unsupported itemsets
project$unsupported_itemsets <- find_unsupported_itemsets(itemsets3, project)
# Filter unsupported itemsets
project$supported_itemsets <- find_supported_itemsets(itemsets3, project)
# Clean up itemsets3
remove(itemsets3)
################
# 4-itemsets
################
# Create 4-itemsets
itemsets4 <- generate_itemsets(4, project)
# Count support for itemsets2
itemsets4 <- count_itemsets_support(itemsets4, project)
# Register all unsupported itemsets
project$unsupported_itemsets <- find_unsupported_itemsets(itemsets4, project)
# Filter unsupported itemsets
project$supported_itemsets <- find_supported_itemsets(itemsets4, project)
# Clean up itemsets4
remove(itemsets4)
# Comparison
project$supported_itemsets[order(names(project$supported_itemsets), decreasing = T)]
itemsets_brute
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment