Skip to content

Instantly share code, notes, and snippets.

@meefen
Last active February 7, 2018 07:31
Show Gist options
  • Save meefen/7454d2fc8b9bc821c12a to your computer and use it in GitHub Desktop.
Save meefen/7454d2fc8b9bc821c12a to your computer and use it in GitHub Desktop.
Frequent Sequence Mining
========================================================
```{r knitr-options}
library(knitr)
options(width=200, digits=2)
opts_chunk$set(comment = "", warning = FALSE, message = FALSE, echo = TRUE, tidy = FALSE, size="small")
```
## Data preparation
Convert data into FSM format
```{r}
# Read data
dft <- read.csv("threads.csv")
dfc <- read.csv("coding.csv")
dft$grade <- factor(dft$grade)
dfc$coding <- factor(dfc$coding, levels=c("Q","T","OE","WE","SY","S"))
# Reformat dfc
dfc$sequenceID <- as.numeric(dfc$thread)
dfc$eventID <- dfc$sequenceID * 100000 + dfc$id
dfc$size <- 1
dfc$items <- dfc$coding
# Partition by thread type
t_e <- dft$thread[which(dft$type == "e")]
t_i <- dft$thread[which(dft$type == "i")]
dfc_e <- subset(dfc, thread %in% t_e, select=c(sequenceID, eventID, size, items))
dfc_i <- subset(dfc, thread %in% t_i, select=c(sequenceID, eventID, size, items))
dfc_e <- dfc_e[with(dfc_e, order(eventID)), ]
dfc_i <- dfc_i[with(dfc_i, order(eventID)), ]
# Save files
basket_files <- c("effective", "improvable")
write.table(dfc_e, file=basket_files[1], quote=FALSE, row.names=FALSE, col.names=FALSE)
write.table(dfc_i, file=basket_files[2], quote=FALSE, row.names=FALSE, col.names=FALSE)
```
## Frequent sequence mining
```{r}
library(arulesSequences)
source("utils.R") # Load functions
# mine frequent sequences
fs_all <- data.frame(group=character(), # empty df for frequent sequences
sequence=character(), support=character())
r_all <- data.frame(group=character(), # empty df for rules
rule=character(), support=character(),
confidence=character(), lift=character())
for(file in basket_files) {
bskt <- read_baskets(file, info=c("sequenceID","eventID","SIZE")) # read baskets
# mine frequent sequences
fs <- cspade(bskt, parameter = list(support = 0.1, maxlen=4, maxgap=2),
control = list(verbose = TRUE, bfstype=TRUE))
fs_df <- cbind(group=file, as(fs, "data.frame"))
fs_all <- rbind(fs_all, fs_df)
# select interesting sequences / rules
rules <- ruleInduction(fs, confidence = 0.6, # induct rules >= a confidence level
control = list(verbose = TRUE))
rules.df <- subset(as(rules, "data.frame"), lift > 1.2) # filter by lift
if(nrow(rules.df) > 0)
r_all <- rbind(r_all, cbind(group=file, rules.df))
}
## Check results
str(fs_all)
## Get single-event sequences
fs_single <- fs_all[!grepl("\\},\\{", fs_all$sequence), ]
fs_single_pretty <- Prettify(fs_single)
write.csv(fs_single_pretty, file="fs_single_pretty.csv", row.names=FALSE)
# Get multi-event sequences
fs_multi <- fs_all[grep("\\},\\{", fs_all$sequence), ]
fs_multi_pretty <- Prettify(fs_multi)
write.csv(fs_multi_pretty, file="fs_multi_pretty.csv", row.names=FALSE)
```
## Interpret results
```{r}
Differentiate(fs_single_pretty, basket_files, 0.05) # compare single
Differentiate(fs_multi_pretty, basket_files, 0.2) # compare multiple
Differentiate(fs_multi_pretty, basket_files, 0.1, 0.2) # compare multiple
```
Induct rules
```{r}
r_all[with(r_all, order(group, -lift)), ]
```
### Utils for FSM
Prettify <- function(df) {
## Function to prettify fs df
sequences <- as.character(unique(df$sequence))
len <- length(sequences)
df_pretty <- data.frame(sequence = sequences)
for(file in unique(df$group)) {
sub <- df[df$group == file, ]
vec <- rep(NA, len)
vec[match(sub$sequence, sequences)] <- sub$support
df_pretty <- cbind(df_pretty, vec)
names(df_pretty)[ncol(df_pretty)] <- file
}
return(df_pretty)
}
ClearStandloneEvents <- function(df) {
## Function to clear standalone video events
df[!grepl("\\{(pause|out-of-sequence|seeked|short-event|
immediate-review|ratechange|seen.before)\\}", df$sequence), ]
}
FilterSequences <- function(df, threshold = 0.0) {
## Function to filter the frequence/support of identified sequences
df[df$support >= threshold, ]
}
Differentiate <- function(fs, groups=NULL, diff_min=0.05, diff_max=NULL) {
### Function to find differences among specified groups,
### in terms of frequent sequences
###
### Parameters:
### fs: data.frame (prettifed) containing data -- row: sequences; col: groups
### groups: character vector specifying groups to compare
### diff: threshold of difference
sub <- subset(fs, select=c("sequence", groups))
choose <- apply(sub[, groups], 1, function(x) {
if(sum(!is.na(x)) == 0) return(FALSE)
if(is.null(diff_max))
diff(range(x, na.rm=TRUE)) > diff_min
else
diff(range(x, na.rm=TRUE)) > diff_min & diff(range(x, na.rm=TRUE)) < diff_max
})
fs.d <- sub[choose, ]
fs.d$diff <- fs.d[, 2] - fs.d[, 3]
fs.d[order(-fs.d[ , 4]),]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment