Created
March 16, 2018 11:31
-
-
Save khakieconomics/0325c054b1499d5037a1de5d1014645a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
# Let's make some fake data! | |
# These will contain true labels and 500 binary features | |
# We'll follow the general approach here: http://modernstatisticalworkflow.blogspot.com/2018/03/1000-labels-and-4500-observations-have.html | |
# but rather than using a penalized likelihood estimator for the label-activity probabilities, we'll approximate | |
# it by taking the average of the frequency of activity within label and 0.5. | |
N <- 100000 | |
I <- 30000 | |
A <- 500 | |
# User are the labels | |
user <- rep(NA, I) | |
# Activities are the features | |
activities <- matrix(NA, N, A) | |
# Draw some generative values of alpha | |
alpha <- matrix(rnorm(I*A, 0, 2), I, A) | |
# Draw some 0-1 activities with probabilities given by logit^-1(alpha) | |
for(n in 1:N) { | |
# Draw the user | |
user[n] <- sample(1:I, 1) | |
# Given her alphas, draw an activity profile | |
activities[n,] <- as.numeric(rbinom(A, 1, arm::invlogit(alpha[user[n],]))) | |
} | |
full_data <- bind_cols(user = user, as_data_frame(activities)) | |
# Create 10% holdout, 90% training | |
full_data_training <- full_data[1:(N - round(N/10)),] | |
testing <- full_data[(N - round(N/10) + 1):N,-1] %>% as.matrix() | |
test_labels <- full_data[(N - round(N/10) + 1):N,1] %>% as.matrix | |
# full_data_training contains binary features and a column "user" which contains the categorical label | |
# testing is a feature matrix for observations to be labelled | |
mean_2 <- function(x) mean(c(x, .5)) | |
probs <- full_data_training %>% group_by(user) %>% summarise_all(.funs = funs(mean_2)) | |
# Create a labels vector and remove that column from the probs matrix | |
probs_labels <- probs$user; probs$user <- NULL | |
# log probabilities of 1s and 0s | |
t_probs <- log(probs) %>% as.matrix() | |
n_probs <- log(1 - probs) %>% as.matrix() | |
# Approximate log likelihood | |
log_likelihoods <- testing %*% t(t_probs) + (1 - testing) %*% t(n_probs) | |
predictions <- probs_labels[apply(log_likelihoods, 1, which.max)] | |
mean(as.numeric(predictions) == as.numeric(test_labels)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment