Skip to content

Instantly share code, notes, and snippets.

@statwonk
Created August 30, 2020 19:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save statwonk/4a36f6383281ca8a5d2256fdd3dc50bb to your computer and use it in GitHub Desktop.
Save statwonk/4a36f6383281ca8a5d2256fdd3dc50bb to your computer and use it in GitHub Desktop.
A tool to simulate multilevel logistic data.
library(tidyverse)
library(brms)
library(tidybayes)
1e7 -> N # obs
1 -> J # groups of members
10 -> K # members
0.5 -> base_p # base rate, this is logistic regression
# sample member coefficients
tibble(groups = LETTERS[1:J]) %>%
mutate(coefs = list(tibble(member = paste0("member_", seq_len(K)), coef = rnorm(K, sd = 0.5)))) %>%
unnest(cols = c(coefs)) -> d
# suppose this is unbalanced data, where multilevel / regularization shines.
rbeta(K, 1, 1) -> appearance_probabilities
appearance_probabilities/sum(appearance_probabilities) -> appearance_probabilities
# generate samples
d %>%
mutate(appearance_probabilities) %>%
mutate(appearances = rmultinom(1, N, appearance_probabilities)[, 1]) %>%
mutate(data = parallel::mcmapply(function(x, y) {
if(x == 0) {
stop("Increase N, 0 appearances by a group, unhandled exception.")
}
tibble(p = plogis(qlogis(base_p) + y)) %>%
rerun(x, .) %>%
bind_rows() %>%
mutate(y = rbinom(n(), 1, p)) %>% # linear predictor
select(-p)
}, appearances, coef, SIMPLIFY = FALSE, mc.cores = 8)) %>% # 8 cores
select(groups, member, data) %>%
unnest(cols = data) %>%
select(y, A = member) -> d2
d2 %>% count(A) # check frequency of group member obs
bind_cols(
y = d2$y,
d2 %>%
model.matrix(~ A, data = .) %>%
as_tibble()
) %>%
mutate_if(is.numeric, as.integer) -> d3
# d3 %>% skimr::skim()
d3 %>%
write_csv("data.csv")
d %>% distinct() %>% arrange(member) %>% write_csv("params.csv")
import sklearn
from sklearn import linear_model
import pandas as pd
import numpy as np
d = pd.read_csv("data.csv")
y = d.iloc[:, 0]
X = d.iloc[:,list(range(1, d.shape[1]))]
fit = linear_model.SGDClassifier(loss='log', fit_intercept=False)
fit.partial_fit(X, y, (0, 1))
fit.predict_log_proba(X[2:3])
params = pd.read_csv("params.csv").sort_values('member')
pd.concat([
pd.DataFrame(np.transpose(fit.coef_) - fit.intercept_, X.columns, columns=['Coefficients']).reset_index(drop=True),
pd.DataFrame({'actual': params['coef']}),],
axis=1
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment