Samuel samuel-bohman

## SparkR_vs_sparklyr.R
# Locate the SparkR package that ships inside the Spark installation.
# Fail early with a clear message when SPARK_HOME is unset; otherwise
# library() would be pointed at the meaningless path "/R/lib".
spark_home <- Sys.getenv("SPARK_HOME")
if (!nzchar(spark_home)) {
  stop("SPARK_HOME is not set; cannot locate the SparkR library.", call. = FALSE)
}
library(SparkR, lib.loc = file.path(spark_home, "R", "lib"))

# Obtain a Spark session running against a local master.
sc <- sparkR.session(master = "local")

# Read the CSV file into a SparkDataFrame; `header` and `inferSchema` are
# passed through as string options to the csv data source.
df1 <- read.df("nycflights13.csv", source = "csv", header = "true", inferSchema = "true")

### SUMMARY TABLE WITH SQL
# Register df1 under the name "tbl1" so the SQL query below can refer to it.
createOrReplaceTempView(df1, "tbl1")
# Average departure and arrival time per month, restricted to months 1, 3, 5.
summ <- sql("select month, avg(dep_time) as avg_dep, avg(arr_time) as avg_arr from tbl1 where month in (1, 3, 5) group by month")
head(summ)
#   month  avg_dep  avg_arr
# 1     1 1347.210 1523.155

## user_preferences.json
{
    "shortcuts": [
        {
            "command": "application:activate-next-tab",
            "keys": [
                "Ctrl Shift ]"
            ],
            "selector": "body",
            "disabled": true
        },

## hier-clust.R
# Compare configurations
h_configs <- dtwclust::compare_clusterings_configs(
  types = "hierarchical",
  k = 2L:30L,
  controls = list(
    hierarchical = hierarchical_control(
      method = "all"
      # distmat = d  # Optional precomputed cross-distance matrix
    )
  ),

## boyer_moore.R
# Example input for the majority-vote function defined below.
# Algorithm description:
# https://www.cs.utexas.edu/~moore/best-ideas/mjrty/index.html
# "C" occurs 7 times in 13 elements, i.e. it is a strict majority.
x <- c("A", "A", "A", "C", "C", "B", "B", "C", "C", "C", "B", "C", "C")  # 7 C's out of 13

bv <- function(x) {
  v <- c()
  i <- 0
  for (j in 1:length(x)) {
    if (i == 0) {
      v <- x[j]
      i <- 1

## boyer_moore_generalization.cpp
#include <iostream>
#include <bits/stdc++.h>

using namespace std;

// Pairs an integer value with an integer counter.
// NOTE(review): presumably a candidate element and its running tally in the
// generalized Boyer-Moore vote; the code that uses it is not visible here —
// confirm against the rest of the file.
struct Element {
    int value;  // the stored element
    int count;  // counter associated with `value`
};

## k_means.R
k_means <- function(x, k, iter.max = 10) {
  random_index <- sample(1:k, nrow(x), replace = TRUE)
  data_w_cluster <- cbind(x, clusterID = random_index)
  iterations <- 1
  plot(data_w_cluster[, 1:2], xaxt = "n", yaxt = "n")
  legend("topright", paste0("i = ", 0), bg = NULL)
  while(TRUE) {
    centroids <- matrix(rep(0, times = k * ncol(x)), nrow = k, ncol = ncol(x))
    for(i in 1:k) {
      obs_of_cluster_i <- data_w_cluster$clusterID == i

## jaccard.R
jaccard <- function(x, m) {
  if (m == 1 | m == 2) {
    M_00 <- apply(x, m, sum) == 0
    M_11 <- apply(x, m, sum) == 2
    if (m == 1) {
      x <- x[!M_00, ]
      JSim <- sum(M_11) / nrow(x)
    } else {
      x <- x[, !M_00]
      JSim <- sum(M_11) / length(x)
	# Locate the SparkR package that ships inside the Spark installation.
	# Fail early with a clear message when SPARK_HOME is unset; otherwise
	# library() would be pointed at the meaningless path "/R/lib".
	spark_home <- Sys.getenv("SPARK_HOME")
	if (!nzchar(spark_home)) {
	  stop("SPARK_HOME is not set; cannot locate the SparkR library.", call. = FALSE)
	}
	library(SparkR, lib.loc = file.path(spark_home, "R", "lib"))

	# Obtain a Spark session running against a local master.
	sc <- sparkR.session(master = "local")

	# Read the CSV file into a SparkDataFrame; `header` and `inferSchema` are
	# passed through as string options to the csv data source.
	df1 <- read.df("nycflights13.csv", source = "csv", header = "true", inferSchema = "true")

	### SUMMARY TABLE WITH SQL
	# Register df1 under the name "tbl1" so the SQL query below can refer to it.
	createOrReplaceTempView(df1, "tbl1")
	# Average departure and arrival time per month, restricted to months 1, 3, 5.
	summ <- sql("select month, avg(dep_time) as avg_dep, avg(arr_time) as avg_arr from tbl1 where month in (1, 3, 5) group by month")
	head(summ)
	# month avg_dep avg_arr
	# 1 1 1347.210 1523.155
	{
	"shortcuts": [
	{
	"command": "application:activate-next-tab",
	"keys": [
	"Ctrl Shift ]"
	],
	"selector": "body",
	"disabled": true
	},
	# Compare configurations
	h_configs <- dtwclust::compare_clusterings_configs(
	types = "hierarchical",
	k = 2L:30L,
	controls = list(
	hierarchical = hierarchical_control(
	method = "all"
	# distmat = d # Optional precomputed cross-distance matrix
	)
	),
	# Example input for the majority-vote function defined below.
	# Algorithm description:
	# https://www.cs.utexas.edu/~moore/best-ideas/mjrty/index.html
	# "C" occurs 7 times in 13 elements, i.e. it is a strict majority.
	x <- c("A", "A", "A", "C", "C", "B", "B", "C", "C", "C", "B", "C", "C") # 7 C's out of 13

	bv <- function(x) {
	v <- c()
	i <- 0
	for (j in 1:length(x)) {
	if (i == 0) {
	v <- x[j]
	i <- 1
	#include <iostream>
	#include <bits/stdc++.h>

	using namespace std;

	// Pairs an integer value with an integer counter.
	// NOTE(review): presumably a candidate element and its running tally in the
	// generalized Boyer-Moore vote; the code that uses it is not visible here —
	// confirm against the rest of the file.
	struct Element {
	int value;  // the stored element
	int count;  // counter associated with `value`
	};
	k_means <- function(x, k, iter.max = 10) {
	random_index <- sample(1:k, nrow(x), replace = TRUE)
	data_w_cluster <- cbind(x, clusterID = random_index)
	iterations <- 1
	plot(data_w_cluster[, 1:2], xaxt = "n", yaxt = "n")
	legend("topright", paste0("i = ", 0), bg = NULL)
	while(TRUE) {
	centroids <- matrix(rep(0, times = k * ncol(x)), nrow = k, ncol = ncol(x))
	for(i in 1:k) {
	obs_of_cluster_i <- data_w_cluster$clusterID == i
	jaccard <- function(x, m) {
	if (m == 1 \| m == 2) {
	M_00 <- apply(x, m, sum) == 0
	M_11 <- apply(x, m, sum) == 2
	if (m == 1) {
	x <- x[!M_00, ]
	JSim <- sum(M_11) / nrow(x)
	} else {
	x <- x[, !M_00]
	JSim <- sum(M_11) / length(x)