Create a gist now

Instantly share code, notes, and snippets.

Embed
library(lubridate)
library(rvest)
library(readr)
library(stringr)
library(data.table)
library(dplyr)
library(arules)
############################################################
################# basic utility function ###################
############################################################
get_path <- function(..., base = getwd()) {
stopifnot(dir.exists(base))
file.path(base, paste(list(...), collapse = '/'))
}
get_files <- function(path, extension = '.csv.gz', min_date = NULL, max_date = NULL) {
files <- data.frame(filename = list.files(path), stringsAsFactors = FALSE) %>%
mutate(date = as.Date(sub(extension, '', filename))) %>%
filter(!is.na(date))
if(!is.null(min_date)) files <- files %>% filter(date >= min_date)
if(!is.null(max_date)) files <- files %>% filter(date <= max_date)
files %>% select(filename) %>% unlist() %>% unname()
}
concat_items <- function(data, idcol = 'id') {
df <- do.call(rbind, lapply(as.data.frame(t(data)), function(e) {
splt_fac <- ifelse(names(e) == idcol, 'id', 'items')
splt <- split(e, splt_fac)
id <- unname(splt[['id']])
items <- unname(splt[['items']][splt[['items']] != ''])
data.frame(items = paste0('{', paste(items, collapse = ","), '}'),
id = id, stringsAsFactors = FALSE)
}))
rownames(df) <- NULL
df
}
############################################################
####### functions related to data collection ###############
############################################################
get_args <- function(args, what, ifnot) {
if (!is.null(args[[what]])) {
args[[what]]
} else {
ifnot
}
}
## download data
# url pattern - http://cran-logs.rstudio.com/2017/2017-04-26.csv.gz
# items <- seq(as.Date('2017-01-01'), as.Date('2017-01-02'), by = 'day')
# out1 <- process(f = download_log, items = items, cores = 1, download_folder = get_path('raw'))
# out2 <- process(f = download_log, items = items, cores = 2, download_folder = get_path('raw'))
download_log <- function(items, ...) {
args <- list(...)
download_folder <- get_args(args, 'download_folder', getwd())
download_log <- function(item, download_folder) {
base_url <- 'http://cran-logs.rstudio.com'
year <- lubridate::year(lubridate::ymd(item))
file_name <- paste0(item, '.csv.gz')
url <- paste(base_url, year, file_name, sep = '/')
download.file(url, file.path(download_folder, file_name))
}
dir.create(download_folder, showWarnings = FALSE)
do.call(rbind, lapply(items, function(itm) {
tryCatch({
message('msg: start to download data for ', itm)
download_log(itm, download_folder = download_folder)
data.frame(date = itm, is_downloaded = TRUE, stringsAsFactors = FALSE)
}, error = function(e) {
message('error: fails to download data for ', itm)
data.frame(date = itm, is_downloaded = FALSE, stringsAsFactors = FALSE)
})
}))
}
## read files
# items <- file.path(get_path('raw'), list.files(get_path('raw'))[1:2])
# out1 <- process(f = read_files, items = items, cores = 1)
# init_str <- '{ library(readr); NULL }'
# out2 <- process(f = read_files, items = items, cores = 2, init_str = init_str, combine = rbind)
read_files <- function(items, ...) {
do.call(rbind, lapply(items, function(itm) {
read_csv(itm, ...)
}))
}
## package name
get_pkg_names <- function(base_url = 'https://cran.r-project.org/web/packages/') {
by_name_url <- paste0(base_url, 'available_packages_by_name.html')
read_html(by_name_url) %>% html_nodes('td a') %>% html_text()
}
## package information
# items <- get_pkg_names()
# out1 <- process(f = get_package_info, items = items[1:2], cores = 1, attribute = c('version', 'depends', 'imports', 'suggests', 'published'))
# init_str <- '{ library(rvest); NULL }'
# out2 <- process(f = get_package_info, items = items[1:4], cores = 2, init_str = init_str, combine = rbind)
get_package_info <- function(items, ...) {
args <- list(...)
attribute <- get_args(args, 'attribute', c('version', 'depends', 'imports', 'suggests', 'published'))
base_url <- get_args(args, 'base_url', 'https://cran.r-project.org/web/packages/')
info <- do.call(rbind, lapply(items, function(itm) {
message('msg: get information of ', itm)
htm <- read_html(paste0(base_url, itm))
title <- tryCatch({
htm %>% html_nodes('h2') %>% html_text()
}, error = function(e) {
NA
})
desc <- tryCatch({
ext <- htm %>% html_nodes('p') %>% html_text()
if (length(ext) > 1) {
ext <- ext[1]
}
}, error = function(e) {
NA
})
att <- tryCatch({
htm %>% html_nodes('td') %>% html_text()
}, error = function(e) {
NULL
})
atts <- if (!is.null(att)) {
do.call(c, lapply(attribute, function(a) {
# grep beginning of string
ind <- grep(paste0('^', a), att, ignore.case = TRUE)
if (length(ind) > 0) ind <- ind[1]
att_values <- if (length(ind) > 0 && length(att) > ind) {
att[ind + 1]
} else {
NA
}
}))
} else {
rep(NA, length(attribute))
}
data.frame(itm, title, desc, t(atts), stringsAsFactors = FALSE)
}))
names(info) <- c('package', 'title', 'desc', attribute)
rownames(info) <- NULL
info
}
############################################################
############## parallel processing function ################
############################################################
process <- function(f, items, cores = detectCores() - 1, init_str = NULL, combine = rbind, ...) {
stopifnot(cores > 0)
set_env <- function(init_str) {
e <- new.env()
e$init_str <- init_str
e
}
if (cores == 1) {
results <- f(items = items, ...)
} else {
message('msg: create cluster of ', cores, ' nodes')
splits <- split(items, 1:cores)
cl <- makeCluster(cores)
if (!is.null(init_str)) {
clusterExport(cl, 'init_str', envir = set_env(init_str))
init <- clusterEvalQ(cl, eval(parse(text = init_str)))
}
results <- parLapplyLB(cl, splits, f, ...)
stopCluster(cl)
message('msg: combine results')
results <- do.call(combine, results)
}
results
}
############################################################
###### functions related to constructing transactions ######
############################################################
filter_log <- function(log, max_download = NULL) {
log <- log %>% filter(size > 1024, !is.na(r_version), !is.na(r_arch), !is.na(r_os)) %>%
select(date, ip_id, r_version, r_arch, r_os, package) %>%
distinct(date, ip_id, r_version, r_arch, r_os, package) %>%
group_by(date, ip_id, r_version, r_arch, r_os) %>% mutate(count = n())
if (!is.null(max_download)) {
log %>% filter(count <= max_download)
} else {
log
}
}
add_group_idx <- function(log) {
group_idx <- log %>% group_indices()
bind_cols(log, data.frame(id = group_idx)) %>% ungroup()
}
keep_trans_cols <- function(log) {
bind_rows(log) %>% mutate(trans_id = paste(gsub('-', '', date), id, sep = '_')) %>%
select(trans_id, package, count)
}
set_split_map <- function() {
data.frame(count = 1:20, splt = c(1, rep(2, 2), rep(3, 3), rep(4:5, each = 4), rep(6, 6)))
}
split_log <- function(log, max_download, split_map = set_split_map()) {
log <- log %>% filter_log(max_download = max_download) %>% add_group_idx() %>% keep_trans_cols() %>%
inner_join(split_map, by = 'count') %>% setDT() %>% setkey(trans_id)
split(log, by = 'splt')
}
construct_trans <- function(log) {
log_cast <- log %>% select(trans_id, package) %>%
dcast(formula = trans_id ~ package, fun.aggregate = length, value.var = 'package')
ids <- log_cast$trans_id
log_cast <- log_cast[, -1] %>% as.matrix()
rownames(log_cast) <- ids
as(log_cast, 'transactions')
}
# files <- get_files(path = get_path('raw'), extension = '.csv.gz', min_date = as.Date('2017-04-01'))
# items <- file.path(get_path('raw'), files)
# init_str <- "{ source(file.path(getwd(), 'src', 'utils.R')); NULL}"
# trans_save <- process(f = save_trans, items = items[1:2], cores = 2, init_str = init_str, combine = rbind)
save_trans <- function(items, ...) {
args <- list(...)
max_download <- get_args(args, 'max_download', 20)
split_map <- get_args(args, 'split_map', set_split_map())
trans_folder <- get_args(args, 'trans_folder', get_path('trans'))
dir.create(trans_folder, showWarnings = FALSE)
do.call(rbind, lapply(items, function(itm) {
gc()
date <- str_extract(itm, '[0-9]{4}-[0-9]{2}-[0-9]{2}')
message('msg: current date ', date)
tryCatch({
logs <- read_csv(itm) %>%
split_log(max_download = max_download, split_map = split_map)
trans <- lapply(names(logs), function(nm) {
message('msg: contructing logs of split group ', nm)
construct_trans(logs[[nm]])
})
names(trans) <- names(logs)
trans_name <- paste0(date, '.rds')
write_rds(trans, file.path(trans_folder, trans_name), compress = 'gz')
data.frame(date = date, is_saved = TRUE, stringsAsFactors = FALSE)
}, error = function(err) {
warning('err: fails to create transactions')
data.frame(date = date, is_saved = FALSE, stringsAsFactors = FALSE)
})
}))
}
############################################################
######### functions related to merging transactions ########
############################################################
get_trans_info <- function(trans) {
m <- trans@data
items <- trans@itemInfo %>% unlist() %>% unname()
itemsets <- trans@itemsetInfo %>% unlist() %>% unname()
dimnames(m) <- list(items, itemsets)
list(m = m, items = items, itemsets = itemsets)
}
get_sm <- function(info, items_all) {
m <- info$m
items <- info$items
itemsets <- info$itemsets
sm <- sparseMatrix(
i = (length(items_all) - length(items)),
j = ncol(m),
x = 0,
dimnames = list(items_all[!items_all %in% items], colnames(m))
) %>% as('ngCMatrix')
sm[nrow(sm), ncol(sm)] <- FALSE
sm
}
# do.call(bind_trans, list(trans1, trans2, trans3))
bind_trans <- function(...) {
trans <- list(...)
infos <- lapply(trans, get_trans_info)
ms <- lapply(infos, function(info) info[['m']])
items <- lapply(infos, function(info) info[['items']])
itemsets <- lapply(infos, function(info) info[['itemsets']])
items_all <- sort(unique(unlist(items)))
sms <- lapply(infos, get_sm, items_all = items_all)
m <- do.call(cBind, lapply(1:length(ms), function(i) {
mb <- rBind(ms[[i]], sms[[i]])
mb[sort(rownames(mb)), ]
}))
as(m, 'transactions')
}
# bind_trans_pair(trans1, trans2)
bind_trans_pair <- function(trans1, trans2) {
info1 <- get_trans_info(trans1)
m1 <- info1$m
items1 <- info1$items
itemsets1 <- info1$itemsets
info2 <- get_trans_info(trans2)
m2 <- info2$m
items2 <- info2$items
itemsets2 <- info2$itemsets
# create 0 ngCMatrix for items not existing
items_all <- sort(unique(c(items1, items2)))
sm1 <- get_sm(info1, items_all)
sm2 <- get_sm(info2, items_all)
m1 <- rBind(m1, sm1)
m2 <- rBind(m2, sm2)
m <- cBind(m1[sort(rownames(m1)),], m2[sort(rownames(m2)), ])
as(m, 'transactions')
}
# files <- get_files(path = get_path('trans'), extension = '.rds')
# items <- file.path(get_path('trans'), files)
# init_str <- "{ source(file.path(getwd(), 'src', 'utils.R')); NULL}"
# trans_all <- process(f = read_trans, items = items, cores = detectCores() - 1,
# init_str = init_str, combine = bind_trans, excl_group = NULL)
read_trans <- function(items, ...) {
args <- list(...)
excl_group <- get_args(args, 'excl_group', NULL)
if (!is.null(excl_group) && all(sort(excl_group) == unique(set_split_map()$splt))) {
stop('exclusion group should be NULL or smaller than all exclusion groups')
}
inner <- lapply(items, function(itm) {
date <- str_extract(itm, '[0-9]{4}-[0-9]{2}-[0-9]{2}')
message('msg: processing transactions for ', date)
trans <- read_rds(itm)
if (!is.null(excl_group)) {
trans <- trans[!names(trans) %in% excl_group]
}
if (length(trans) < 2) {
trans[[1]]
} else {
do.call(bind_trans, trans)
}
})
if (length(inner) < 2) {
inner
} else {
do.call(bind_trans, inner)
}
}
############################################################
###### functions related to executing HITS algorithm #######
############################################################
get_adj <- function(trans) {
itemM <- trans@data
item_info <- if ('levels' %in% names(trans@itemInfo)) {
trans@itemInfo[, 'levels']
} else {
trans@itemInfo[[1]]
}
item_no <- length(item_info)
itemset_info <- trans@itemsetInfo[[1]]
itemset_no <- length(itemset_info)
leftM <- sparseMatrix(i = itemset_no, j = itemset_no, x = 0)
bottomM <- sparseMatrix(i = item_no, j = (itemset_no + item_no), x = 0)
rBind(cBind(leftM, t(itemM)), bottomM)
}
run_hits <- function(A, k = 100, tol = 1e-8, verbose = FALSE){
# mostly from Ch5 of Practical Graph Mining With R
# https://www.csc2.ncsu.edu/faculty/nfsamato/practical-graph-mining-with-R/PracticalGraphMiningWithR.html
#Get number of nodes(rows) in adjacency matrix
nodes <- dim(A)[1]
#Initialize authority and hub vector to 1 for each node
auth <- c(rep(1, nodes))
hub <- c(rep(1, nodes))
for (i in 1:k) {
auth_last <- auth
#Authority and Hub scores are calculated
auth <- t(A) %*% hub
hub <- A %*% auth
#Normalize Hub and Authority scores
auth <- auth/sqrt(sum(auth * auth))
hub <- hub/sqrt(sum(hub * hub))
err <- sum(abs(auth - auth_last))
if (verbose) message('msg: iteration ', i, ' error - ', err)
if (err < nodes * tol) {
break
}
}
if (err > nodes * tol) {
warning('power iteration failed to converge in ', (i+1), ' iterations')
}
return (list(auth = auth, hub = hub))
}
get_hits <- function(A, itemsets, items, k = 100, tol = 1e-6, verbose = FALSE) {
hits <- run_hits(A, k, tol, verbose)
hub <- hits$hub[1:length(itemsets)]
names(hub) <- itemsets
auth <- hits$auth[(length(itemsets)+1):length(hits$auth)]
names(auth) <- items
list(auth = auth, hub = hub)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment