Jim khakieconomics
functions {
  // Inverse Mills ratio: phi(z) / Phi(z). Computed as exp(log pdf - log cdf)
  // so the ratio stays finite even for large negative z, where Phi underflows.
  vector inverse_mills(vector z) {
    vector[rows(z)] out;
    for(i in 1:rows(z)) {
      out[i] = exp(normal_lpdf(z[i] | 0, 1) - normal_lcdf(z[i] | 0, 1));
    }
    return out;
  }
}
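A quick sanity check against the closed form in R (a sketch; it assumes the block above is saved as inverse_mills.stan):

library(rstan)
expose_stan_functions("inverse_mills.stan")
z <- seq(-3, 3, by = 0.5)
all.equal(inverse_mills(z), dnorm(z) / pnorm(z))  # same ratio, computed directly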
@khakieconomics
khakieconomics / b_spline_function.stan
Created July 13, 2018 15:04
Produce a matrix of b-splines for use internally within a Stan program
functions {
  // B function
  vector B(vector x, vector t, int i, int j_p1);
  vector B(vector x, vector t, int i, int k) {
    vector[rows(x)] out;
    vector[rows(x)] a_i1j1;
    vector[rows(x)] a_ij1;
# This script checks that the b-spline function in gist https://gist.github.com/khakieconomics/2272cd7f0d61950852622198b26a2d02
# produces something approximating the function in the splines package.
library(rstan)
expose_stan_functions("b_spline_function.stan")
probs <- runif(1000)
probs <- probs[order(probs)]
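For intuition about what B() computes, the Cox-de Boor recursion can be written directly in R (a reference sketch only; the gist's argument conventions, e.g. whether k is the order or the degree, may differ):

b_spline <- function(x, t, i, k) {
  # Order-1 basis: indicator that x falls in [t_i, t_{i+1})
  if (k == 1) return(as.numeric(t[i] <= x & x < t[i + 1]))
  d1 <- t[i + k - 1] - t[i]
  d2 <- t[i + k] - t[i + 1]
  a1 <- if (d1 > 0) (x - t[i]) / d1 else 0
  a2 <- if (d2 > 0) (t[i + k] - x) / d2 else 0
  a1 * b_spline(x, t, i, k - 1) + a2 * b_spline(x, t, i + 1, k - 1)
}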
@khakieconomics
khakieconomics / GP_IV.R
Last active July 6, 2018 21:43
File to run/visualise GP_IV.stan
# First load the libraries and define a function to create an ARD covariance
# matrix (squared-exponential kernel with one length-scale per feature,
# plus a small diagonal jitter).
library(tidyverse); library(rstan)
set.seed(78)
ard_Sigma <- function(features, rho, alpha) {
  noise <- 1e-8
  features <- as.data.frame(features)
  P <- ncol(features)
  if(length(rho) != P) stop("need one length-scale per feature column")
  # Everything below is an assumed continuation; the preview truncates here.
  scaled <- sweep(as.matrix(features), 2, rho, "/")  # scale each column by its rho
  d2 <- as.matrix(dist(scaled))^2                    # squared Euclidean distances
  alpha^2 * exp(-0.5 * d2) + diag(noise, nrow(features))
}
functions {
  matrix L_cov_exp_quad_ARD(matrix x,
                            real alpha,
                            vector rho,
                            real delta) {
    int N = rows(x);
    matrix[N, N] K;
    real sq_alpha = square(alpha);
    for (i in 1:(N-1)) {
      K[i, i] = sq_alpha + delta;
      // Assumed continuation of the truncated preview (standard ARD kernel):
      for (j in (i + 1):N) {
        K[i, j] = sq_alpha * exp(-0.5 * dot_self((x[i] - x[j])' ./ rho));
        K[j, i] = K[i, j];
      }
    }
    K[N, N] = sq_alpha + delta;
    return cholesky_decompose(K);
  }
}
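Exposed to R, this should agree with ard_Sigma above up to the delta jitter; a hedged check (the file name gp_iv_functions.stan is an assumption):

expose_stan_functions("gp_iv_functions.stan")
X <- matrix(rnorm(20), 10, 2)
L <- L_cov_exp_quad_ARD(X, alpha = 1, rho = c(1, 2), delta = 1e-8)
# tcrossprod(L) rebuilds K, which should match ard_Sigma()'s output
max(abs(tcrossprod(L) - ard_Sigma(X, rho = c(1, 2), alpha = 1)))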
library(rstanarm); library(tidyverse)
options(mc.cores = parallel::detectCores())
set.seed(42)
data("radon")
head(radon)
# Some levels have no variance in the outcomes, making likelihood estimates impossible.
# Adding a tiny bit of noise fixes the problem.
radon$log_uranium <- rnorm(nrow(radon), radon$log_uranium, 0.05)
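To see which counties are degenerate before jittering, a check along these lines works (a sketch; the county column name is an assumption):

radon %>%
  group_by(county) %>%
  summarise(sd_u = sd(log_uranium)) %>%
  filter(is.na(sd_u) | sd_u == 0)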
# This gist uses the classifier defined in this post: http://modernstatisticalworkflow.blogspot.com/2018/03/1000-labels-and-4500-observations-have.html
# applied with this approximation: https://gist.github.com/khakieconomics/0325c054b1499d5037a1de5d1014645a
# to Kaggle's credit card fraud data--a fun rare-case problem. In a cross-validation exercise with 50 randomly selected hold-out
# sets, it appears to perform similarly to (or perhaps better than) others' attempts using random forests and neural networks.
# The upside of course is that it estimates/generates predictions in a couple of seconds.
library(tidyverse); library(reticulate); library(pROC)
# Download some data from Kaggle
system("kaggle datasets download -d mlg-ulb/creditcardfraud -p ~/Documents/kagglestuff")
# This illustrates why Neal's funnel really messes with us when we're trying
# to estimate random-effects models using maximum likelihood, i.e. why we need
# the approximations in lme4 or INLA, or need to go full Bayes: the joint mode
# is an awful one.
# Let's simulate some data from a really simple model with 10
# random intercepts whose variance you want to estimate
N <- 1000
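A minimal sketch of that simulation (everything beyond N is an assumption):

set.seed(123)
J <- 10
tau <- 1                          # true sd of the random intercepts
a <- rnorm(J, 0, tau)             # the 10 intercepts
group <- sample(J, N, replace = TRUE)
y <- rnorm(N, a[group], 1)
# The joint likelihood in (a, tau) is unbounded as tau -> 0 and a -> 0, so the
# joint mode is useless; lme4 and INLA (approximately) integrate the intercepts
# out rather than maximising over them.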
library(dplyr)
# Let's make some fake data!
# These will contain true labels and 500 binary features
# We'll follow the general approach here: http://modernstatisticalworkflow.blogspot.com/2018/03/1000-labels-and-4500-observations-have.html
# but rather than using a penalized likelihood estimator for the label-activity probabilities, we'll approximate
# it by averaging each label's within-label activity frequency with 0.5.
N <- 100000
I <- 30000
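That shrinkage step can be sketched directly (stand-in labels and a handful of features, not the gist's actual data):

labels <- sample(LETTERS[1:10], N, replace = TRUE)
features <- matrix(rbinom(N * 5, 1, 0.3), N, 5)              # 5 binary features for brevity
freq <- rowsum(features, labels) / as.vector(table(labels))  # within-label activity frequencies
p_shrunk <- (freq + 0.5) / 2                                 # average each frequency with 0.5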