Last active Oct 4, 2017
Function for calculating the Jaccard similarity and distance coefficients
 jaccard <- function(x, m) { if (m == 1 | m == 2) { M_00 <- apply(x, m, sum) == 0 M_11 <- apply(x, m, sum) == 2 if (m == 1) { x <- x[!M_00, ] JSim <- sum(M_11) / nrow(x) } else { x <- x[, !M_00] JSim <- sum(M_11) / length(x)
Last active Oct 5, 2017
k-means algorithm
 k_means <- function(x, k, iter.max = 10) { random_index <- sample(1:k, nrow(x), replace = TRUE) data_w_cluster <- cbind(x, clusterID = random_index) iterations <- 1 plot(data_w_cluster[, 1:2], xaxt = "n", yaxt = "n") legend("topright", paste0("i = ", 0), bg = NULL) while(TRUE) { centroids <- matrix(rep(0, times = k * ncol(x)), nrow = k, ncol = ncol(x)) for(i in 1:k) { obs_of_cluster_i <- data_w_cluster\$clusterID == i
Last active Jan 30, 2018
Boyer–Moore Majority Vote Algorithm Generalization
 #include #include using namespace std; struct Element { int value; int count; };
Created Jan 30, 2018
Boyer–Moore Majority Vote Algorithm
 # https://www.cs.utexas.edu/~moore/best-ideas/mjrty/index.html x <- c("A", "A", "A", "C", "C", "B", "B", "C", "C", "C", "B", "C", "C") # 7 C's out of 13 bv <- function(x) { v <- c() i <- 0 for (j in 1:length(x)) { if (i == 0) { v <- x[j] i <- 1
Last active May 8, 2018
 # Compare configurations h_configs <- dtwclust::compare_clusterings_configs( types = "hierarchical", k = 2L:30L, controls = list( hierarchical = hierarchical_control( method = "all" # distmat = d # Optional precomputed cross-distance matrix ) ),
Last active Sep 18, 2019
RStudio IDE Keyboard Shortcuts in JupyterLab
 { "shortcuts": [ { "command": "application:activate-next-tab", "keys": [ "Ctrl Shift ]" ], "selector": "body", "disabled": true },
Created Mar 5, 2020
SparkR versus sparklyr
 # Load SparkR from the R library bundled under the SPARK_HOME directory.
# file.path() assembles "<SPARK_HOME>/R/lib" without manual separator pasting.
library(SparkR, lib.loc = file.path(Sys.getenv("SPARK_HOME"), "R", "lib"))

# Start a Spark session against a local master.
sc <- sparkR.session(master = "local")

# Read the CSV into a Spark DataFrame; the first row is treated as the header
# and column types are inferred from the data.
df1 <- read.df(
  "nycflights13.csv",
  source = "csv",
  header = "true",
  inferSchema = "true"
)

### SUMMARY TABLE WITH SQL
# Register df1 as the temporary view "tbl1" so it can be queried with SQL.
createOrReplaceTempView(df1, "tbl1")
summ <- sql(
  "select month, avg(dep_time) as avg_dep, avg(arr_time) as avg_arr from tbl1 where month in (1, 3, 5) group by month"
)
head(summ)
#   month  avg_dep  avg_arr
# 1     1 1347.210 1523.155