Last active
January 18, 2017 05:57
-
-
Save mattpolicastro/7f4db08545dbe41d354debf997dfd15c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################################ | |
# This script is a rough hack at the Apriori algorithm for IS 7036, Homework 1, Problem 1. | |
# The first section uses no validity checking or subsetting which effectively ignores Apriori | |
# but brute-forces an answer. Following that, there is a semi-validated section which confirms | |
# the results of the first approach. | |
# | |
# Matt Policastro, 2017-01-18 00:21 EST | |
################################################################ | |
################ | |
# Setup | |
################ | |
# Project settings/defaults | |
project <- list( | |
# Add empty itemsets to store results later | |
supported_itemsets = list(), | |
unsupported_itemsets = list(), | |
# Support and confidence thresholds | |
params = list( | |
minsup = .6, | |
minconf = .8 | |
), | |
# Define shopping baskets | |
baskets = list( | |
T100 = c("M","O","N","K","E","Y"), | |
T200 = c("D","O","N","K","E","Y"), | |
T300 = c("M","A","K","E"), | |
T400 = c("M","U","C","K","Y"), | |
T500 = c("C","O","O","K","I","E") | |
) | |
) | |
# Remove duplicates items from baskets (really only applies to "COOKIES") | |
project$baskets <- sapply(project$baskets, unique) | |
# Collapse baskets to find set of possible, unique items | |
project$original_items <- as.list(sort(unique(unlist(project$baskets)))) | |
################ | |
# Helper Functions | |
################ | |
# Count support for each itemset | |
count_itemsets_support <- function(itemsets, project) { | |
counts <- sapply(itemsets, function(itemset) { | |
return(sum( | |
sapply(project$baskets, function(basket) { | |
return(all(itemset %in% basket)) | |
}) | |
)) | |
}) | |
names(itemsets) <- counts | |
return(itemsets) | |
} | |
# Generate higher-n itemsets and remove those that contain unsupported subsets | |
generate_itemsets <- function(new_length, project, check_itemset_validity = T) { | |
new_itemsets_raw <- t(combn(project$original_items, new_length)) | |
new_itemsets <- list(rep(NA, nrow(new_itemsets_raw))) | |
for (i in 1:nrow(new_itemsets_raw)) { | |
new_itemsets[[i]] <- unlist(new_itemsets_raw[i,]) | |
} | |
# If there are registered, unsupported itemsets in the project | |
if (length(project$unsupported_itemsets) > 0 && check_itemset_validity) { | |
# supported <- sapply(new_itemsets, function(new_itemset) { | |
contains_unsupported <- sapply(new_itemsets, function(new_itemset) { | |
# Create vector of false values to insert results into | |
contains_no_unsupported <- rep(F, length(project$unsupported_itemsets)) | |
i <- 1 | |
# Use while loop so it can be short-circuited as soon as an | |
# unsupported itemset is found | |
while (i <= length(project$unsupported_itemsets)) { | |
if (project$unsupported_itemsets[i] %in% new_itemset) { | |
break | |
} else { | |
contains_no_unsupported[i] <- T | |
i <- i + 1 | |
} | |
} | |
# Only return true if no unsupported | |
if (all(contains_no_unsupported)) { | |
return(F) | |
} else { | |
return(T) | |
} | |
}) | |
new_itemsets <- new_itemsets[!contains_unsupported] | |
} | |
return(new_itemsets) | |
} | |
# Register unsupported itemsets | |
find_unsupported_itemsets <- function(itemsets, project) { | |
# Select only itemsets with supports smaller than the necessary threshold | |
new_unsupported_itemsets <- itemsets[names(itemsets) < length(project$baskets) * project$params$minsup] | |
# Create list of all unique, unsupported itemsets | |
all_unsupported_itemsets <- unique(append( | |
new_unsupported_itemsets, | |
project$unsupported_itemsets | |
)) | |
# Return the project object | |
return(all_unsupported_itemsets) | |
} | |
find_supported_itemsets <- function(itemsets, project) { | |
new_supported_itemsets <- itemsets[!itemsets %in% project$unsupported_itemsets] | |
new_supported_itemsets_names <- names(new_supported_itemsets) | |
old_supported_itemsets_names <- names(project$supported_itemsets) | |
all_supported_itemsets <- unique(append( | |
new_supported_itemsets, | |
project$supported_itemsets | |
)) | |
names(all_supported_itemsets) <- append(new_supported_itemsets_names, old_supported_itemsets_names) | |
return(all_supported_itemsets) | |
} | |
################################ | |
# With no validity checks, computationally "intensive" (not really, these are small sets) | |
################################ | |
itemsets1 <- generate_itemsets(1, project, check_itemset_validity = F) | |
itemsets1 <- count_itemsets_support(itemsets1, project) | |
itemsets <- append(itemsets1, c()) | |
remove(itemsets1) | |
itemsets2 <- generate_itemsets(2, project, check_itemset_validity = F) | |
itemsets2 <- count_itemsets_support(itemsets2, project) | |
itemsets <- append(itemsets2, itemsets) | |
remove(itemsets2) | |
itemsets3 <- generate_itemsets(3, project, check_itemset_validity = F) | |
itemsets3 <- count_itemsets_support(itemsets3, project) | |
itemsets <- append(itemsets3, itemsets) | |
remove(itemsets3) | |
itemsets4 <- generate_itemsets(4, project, check_itemset_validity = F) | |
itemsets4 <- count_itemsets_support(itemsets4, project) | |
itemsets <- append(itemsets4, itemsets) | |
remove(itemsets4) | |
itemsets_brute <- lapply(itemsets, function(itemset) {paste(itemset, collapse = "") }) | |
itemsets_brute_ordered <- itemsets[order(names(itemsets), decreasing = T)] | |
# Itemsets at or above minsup | |
itemsets_brute <- itemsets_brute_ordered[names(itemsets_brute_ordered) >= 3] | |
# Clean up | |
remove(itemsets, itemsets_brute_ordered) | |
################################ | |
# With some validity checks | |
################################ | |
################ | |
# 1-itemsets | |
################ | |
# Create 1-itemsets (technically redundant but doing it for clarity) | |
itemsets1 <- generate_itemsets(1, project) | |
# Count support for itemsets1 | |
itemsets1 <- count_itemsets_support(itemsets1, project) | |
# Register all unsupported itemsets into the project settings | |
project$unsupported_itemsets <- find_unsupported_itemsets(itemsets1, project) | |
project$supported_itemsets <- find_supported_itemsets(itemsets1, project) | |
# Clean up itemsets1 | |
remove(itemsets1) | |
################ | |
# 2-itemsets | |
################ | |
# Create 2-itemsets | |
itemsets2 <- generate_itemsets(2, project) | |
# Count support for itemsets2 | |
itemsets2 <- count_itemsets_support(itemsets2, project) | |
# Register all unsupported itemsets | |
project$unsupported_itemsets <- find_unsupported_itemsets(itemsets2, project) | |
# Filter unsupported itemsets | |
project$supported_itemsets <- find_supported_itemsets(itemsets2, project) | |
# Clean up itemsets2 | |
remove(itemsets2) | |
################ | |
# 3-itemsets | |
################ | |
# Create 3-itemsets | |
itemsets3 <- generate_itemsets(3, project) | |
# Count support for itemsets2 | |
itemsets3 <- count_itemsets_support(itemsets3, project) | |
# Register all unsupported itemsets | |
project$unsupported_itemsets <- find_unsupported_itemsets(itemsets3, project) | |
# Filter unsupported itemsets | |
project$supported_itemsets <- find_supported_itemsets(itemsets3, project) | |
# Clean up itemsets3 | |
remove(itemsets3) | |
################ | |
# 4-itemsets | |
################ | |
# Create 4-itemsets | |
itemsets4 <- generate_itemsets(4, project) | |
# Count support for itemsets2 | |
itemsets4 <- count_itemsets_support(itemsets4, project) | |
# Register all unsupported itemsets | |
project$unsupported_itemsets <- find_unsupported_itemsets(itemsets4, project) | |
# Filter unsupported itemsets | |
project$supported_itemsets <- find_supported_itemsets(itemsets4, project) | |
# Clean up itemsets4 | |
remove(itemsets4) | |
# Comparison | |
project$supported_itemsets[order(names(project$supported_itemsets), decreasing = T)] | |
itemsets_brute | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment