# mrgloom/CUR4FIC

## CUR4FIC
# NOTE: the workspace is deliberately NOT cleared here. `rm(list = ls())`
# deletes the caller's own objects whenever this script is source()d, yet it
# leaves loaded packages, options() and par() settings untouched, so it does
# not provide a clean session anyway. Restart R to run from a clean state.

# load the relevant libraries

# install.packages("rCUR")
library(rCUR)  # for CUR decomposition
# install.packages("irlba")
library(irlba)  # for fast svd

# ~~~~~~~~~~~~~~~~~
# generate data

# fix the RNG seed so the simulated data (and any later step that draws random
# numbers) is the same on every run; change or remove it for a fresh draw
set.seed(1)

# number of records
n <- 2000

# number of clusters/inducing points (20% of the records), stored as a whole
# number because it is used further down as a count and as an index bound
ncl <- as.integer(round(n * 0.2))

# dataframe of datapoints (in covariate/feature space):
# two independent standard-normal coordinates per record
df <- data.frame(x = rnorm(n),
                 y = rnorm(n))


# Every step below is assigned inside system.time(), so the result object and
# the time taken to produce it are both kept.

# k-means: fit `ncl` cluster centres to the points
kmn_time <- system.time({
  kmn <- kmeans(df, centers = ncl)
})

# CUR - pre-calculations

# full matrix of pairwise distances between the points
dist_time <- system.time({
  d <- as.matrix(dist(df))
})

# fast SVD of the distance matrix via irlba; the result is handed to both
# CUR() calls below through their `sv` argument
# (note: the object is called `svd`, the same name as the base R function)
svd_time <- system.time({
  svd <- irlba(d)
})

# first CUR: method "ortho.top.scores" (optimal points)
cur1_time <- system.time({
  cur1 <- CUR(d, ncl, method = "ortho.top.scores", sv = svd, alpha = 2)
})

# second CUR: method "exact.num.random" (random weighted points)
cur2_time <- system.time({
  cur2 <- CUR(d, ncl, method = "exact.num.random", sv = svd, alpha = 2)
})

# pivot order of a pivoted Cholesky factorisation of exp(-0.5 * d);
# only the "pivot" attribute of the factor is kept
chol_time <- system.time({
  ch <- attr(chol(exp(-0.5 * d), pivot = TRUE), "pivot")
})

# set up plotting: a 2 x 2 grid of square panels with narrow margins.
# par() returns the previous values of the settings it changes; they are kept
# so they can be restored once all four panels have been drawn
old_par <- par(mfrow = c(2, 2),
               mar = c(1, 1, 4, 1),
               pty = "s")

# panel 1: the k-means solution
# plot all data
plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.6), cex = 0.5)
# plot the centres
points(kmn$centers, pch = 16, col = "blue", cex = 0.5)
title(main = "kmeans")

# panel 2: the CUR solution with optimal selection of columns/rows
# plot all data
plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.4), cex = 0.5)
# plot inducing points
points(y ~ x, data = df[cur1@C.index, ], pch = 16, col = "blue", cex = 0.5)
title(main = "CUR, ortho.top.scores")

# panel 3: the CUR solution with weighted random selection of columns/rows
# plot all data
plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.4), cex = 0.5)
# plot inducing points
points(y ~ x, data = df[cur2@C.index, ], pch = 16, col = "blue", cex = 0.5)
title(main = "CUR, exact.num.random")

# panel 4: the points picked by the first `ncl` Cholesky pivots
# plot all data
plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.4), cex = 0.5)
# plot inducing points; seq_len() is used instead of 1:ncl because 1:ncl
# would count down (1, 0) if `ncl` were ever 0
points(y ~ x, data = df[ch[seq_len(ncl)], ], pch = 16, col = "blue", cex = 0.5)
title(main = "Cholesky pivots")

# put the graphics settings back the way they were before this section
par(old_par)


# ~~~~~~~~~~
# timings
# each timing is print()ed explicitly so that it is also shown when the script
# is run through source(), which does not auto-print top-level values

# k-means computation time
print(kmn_time)

# distance matrix computation time
print(dist_time)

# svd computation time
print(svd_time)

# cur1 (optimal) computation time
print(cur1_time)

# cur2 (random) computation time
print(cur2_time)

# pivoted cholesky decomposition time
print(chol_time)
	# NOTE: the workspace is deliberately NOT cleared here. `rm(list = ls())`
	# deletes the caller's own objects whenever this script is source()d, yet it
	# leaves loaded packages, options() and par() settings untouched, so it does
	# not provide a clean session anyway. Restart R to run from a clean state.

	# load the relevant libraries

	# install.packages("rCUR")
	library(rCUR) # for CUR decomposition
	# install.packages("irlba")
	library(irlba) # for fast svd

	# ~~~~~~~~~~~~~~~~~
	# generate data

	# fix the RNG seed so the simulated data (and any later step that draws random
	# numbers) is the same on every run; change or remove it for a fresh draw
	set.seed(1)

	# number of records
	n <- 2000

	# number of clusters/inducing points (20% of the records), stored as a whole
	# number because it is used further down as a count and as an index bound
	ncl <- as.integer(round(n * 0.2))

	# dataframe of datapoints (in covariate/feature space):
	# two independent standard-normal coordinates per record
	df <- data.frame(x = rnorm(n),
	                 y = rnorm(n))



	# Every step below is assigned inside system.time(), so the result object and
	# the time taken to produce it are both kept.

	# k-means: fit `ncl` cluster centres to the points
	kmn_time <- system.time({
	  kmn <- kmeans(df, centers = ncl)
	})

	# CUR - pre-calculations

	# full matrix of pairwise distances between the points
	dist_time <- system.time({
	  d <- as.matrix(dist(df))
	})

	# fast SVD of the distance matrix via irlba; the result is handed to both
	# CUR() calls below through their `sv` argument
	# (note: the object is called `svd`, the same name as the base R function)
	svd_time <- system.time({
	  svd <- irlba(d)
	})

	# first CUR: method "ortho.top.scores" (optimal points)
	cur1_time <- system.time({
	  cur1 <- CUR(d, ncl, method = "ortho.top.scores", sv = svd, alpha = 2)
	})

	# second CUR: method "exact.num.random" (random weighted points)
	cur2_time <- system.time({
	  cur2 <- CUR(d, ncl, method = "exact.num.random", sv = svd, alpha = 2)
	})

	# pivot order of a pivoted Cholesky factorisation of exp(-0.5 * d);
	# only the "pivot" attribute of the factor is kept
	chol_time <- system.time({
	  ch <- attr(chol(exp(-0.5 * d), pivot = TRUE), "pivot")
	})

	# set up plotting: a 2 x 2 grid of square panels with narrow margins.
	# par() returns the previous values of the settings it changes; they are kept
	# so they can be restored once all four panels have been drawn
	old_par <- par(mfrow = c(2, 2),
	               mar = c(1, 1, 4, 1),
	               pty = "s")

	# panel 1: the k-means solution
	# plot all data
	plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.6), cex = 0.5)
	# plot the centres
	points(kmn$centers, pch = 16, col = "blue", cex = 0.5)
	title(main = "kmeans")

	# panel 2: the CUR solution with optimal selection of columns/rows
	# plot all data
	plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.4), cex = 0.5)
	# plot inducing points
	points(y ~ x, data = df[cur1@C.index, ], pch = 16, col = "blue", cex = 0.5)
	title(main = "CUR, ortho.top.scores")

	# panel 3: the CUR solution with weighted random selection of columns/rows
	# plot all data
	plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.4), cex = 0.5)
	# plot inducing points
	points(y ~ x, data = df[cur2@C.index, ], pch = 16, col = "blue", cex = 0.5)
	title(main = "CUR, exact.num.random")

	# panel 4: the points picked by the first `ncl` Cholesky pivots
	# plot all data
	plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.4), cex = 0.5)
	# plot inducing points; seq_len() is used instead of 1:ncl because 1:ncl
	# would count down (1, 0) if `ncl` were ever 0
	points(y ~ x, data = df[ch[seq_len(ncl)], ], pch = 16, col = "blue", cex = 0.5)
	title(main = "Cholesky pivots")

	# put the graphics settings back the way they were before this section
	par(old_par)


	# ~~~~~~~~~~
	# timings
	# each timing is print()ed explicitly so that it is also shown when the script
	# is run through source(), which does not auto-print top-level values

	# k-means computation time
	print(kmn_time)

	# distance matrix computation time
	print(dist_time)

	# svd computation time
	print(svd_time)

	# cur1 (optimal) computation time
	print(cur1_time)

	# cur2 (random) computation time
	print(cur2_time)

	# pivoted cholesky decomposition time
	print(chol_time)