khakieconomics/very_large_classification.R

## very_large_classification.R
library(dplyr)

# Let's make some fake data!
# These will contain true labels and 500 binary features
# We'll follow the general approach here: http://modernstatisticalworkflow.blogspot.com/2018/03/1000-labels-and-4500-observations-have.html
# but rather than using a penalized likelihood estimator for the label-activity probabilities, we'll approximate
# it by taking the average of the frequency of activity within label and 0.5.

N <- 100000
I <- 30000
A <- 500

# User are the labels
user <- rep(NA, I)
# Activities are the features
activities <- matrix(NA, N, A)

# Draw some generative values of alpha
alpha <- matrix(rnorm(I*A, 0, 2), I, A)

# Draw some 0-1 activities with probabilities given by logit^-1(alpha)
for(n in 1:N) {
  # Draw the user
  user[n] <- sample(1:I, 1)
  # Given her alphas, draw an activity profile
  activities[n,] <- as.numeric(rbinom(A, 1, arm::invlogit(alpha[user[n],])))
}

full_data <- bind_cols(user = user, as_data_frame(activities))

# Create 10% holdout, 90% training
full_data_training <- full_data[1:(N - round(N/10)),]

testing <- full_data[(N - round(N/10) + 1):N,-1] %>% as.matrix()
test_labels <- full_data[(N - round(N/10) + 1):N,1] %>% as.matrix


# full_data_training contains binary features and a column "user" which contains the categorical label
# testing is a feature matrix for observations to be labelled
mean_2 <- function(x) mean(c(x, .5))
probs <- full_data_training %>% group_by(user) %>% summarise_all(.funs = funs(mean_2))
# Create a labels vector and remove that column from the probs matrix
probs_labels <- probs$user; probs$user <- NULL
# log probabilities of 1s and 0s
t_probs <- log(probs) %>% as.matrix()
n_probs <- log(1 - probs) %>% as.matrix()
# Approximate log likelihood
log_likelihoods <- testing %*% t(t_probs) + (1 - testing) %*% t(n_probs)
predictions <- probs_labels[apply(log_likelihoods, 1, which.max)]
mean(as.numeric(predictions) == as.numeric(test_labels))
	library(dplyr)

	# Let's make some fake data!
	# These will contain true labels and 500 binary features
	# We'll follow the general approach here: http://modernstatisticalworkflow.blogspot.com/2018/03/1000-labels-and-4500-observations-have.html
	# but rather than using a penalized likelihood estimator for the label-activity probabilities, we'll approximate
	# it by taking the average of the frequency of activity within label and 0.5.

	N <- 100000
	I <- 30000
	A <- 500

	# User are the labels
	user <- rep(NA, I)
	# Activities are the features
	activities <- matrix(NA, N, A)

	# Draw some generative values of alpha
	alpha <- matrix(rnorm(I*A, 0, 2), I, A)

	# Draw some 0-1 activities with probabilities given by logit^-1(alpha)
	for(n in 1:N) {
	# Draw the user
	user[n] <- sample(1:I, 1)
	# Given her alphas, draw an activity profile
	activities[n,] <- as.numeric(rbinom(A, 1, arm::invlogit(alpha[user[n],])))
	}

	full_data <- bind_cols(user = user, as_data_frame(activities))

	# Create 10% holdout, 90% training
	full_data_training <- full_data[1:(N - round(N/10)),]

	testing <- full_data[(N - round(N/10) + 1):N,-1] %>% as.matrix()
	test_labels <- full_data[(N - round(N/10) + 1):N,1] %>% as.matrix


	# full_data_training contains binary features and a column "user" which contains the categorical label
	# testing is a feature matrix for observations to be labelled
	mean_2 <- function(x) mean(c(x, .5))
	probs <- full_data_training %>% group_by(user) %>% summarise_all(.funs = funs(mean_2))
	# Create a labels vector and remove that column from the probs matrix
	probs_labels <- probs$user; probs$user <- NULL
	# log probabilities of 1s and 0s
	t_probs <- log(probs) %>% as.matrix()
	n_probs <- log(1 - probs) %>% as.matrix()
	# Approximate log likelihood
	log_likelihoods <- testing %% t(t_probs) + (1 - testing) %% t(n_probs)
	predictions <- probs_labels[apply(log_likelihoods, 1, which.max)]
	mean(as.numeric(predictions) == as.numeric(test_labels))