Jim khakieconomics
functions {
  // Inverse Mills ratio: phi(z) / Phi(z). Computed as exp(log pdf - log cdf)
  // so the ratio stays finite even for large negative z, where Phi underflows.
  vector inverse_mills(vector z) {
    vector[rows(z)] out;
    for(i in 1:rows(z)) {
      out[i] = exp(normal_lpdf(z[i] | 0, 1) - normal_lcdf(z[i] | 0, 1));
    }
    return out;
  }
}
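A quick sanity check against the closed form in R (a sketch; it assumes the block above is saved as inverse_mills.stan):

library(rstan)
expose_stan_functions("inverse_mills.stan")
z <- seq(-3, 3, by = 0.5)
all.equal(inverse_mills(z), dnorm(z) / pnorm(z))  # same ratio, computed directly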
@khakieconomics
khakieconomics / b_spline_function.stan
Created July 13, 2018 15:04
Produce a matrix of b-splines for use internally within a Stan program
functions {
  // B function
  vector B(vector x, vector t, int i, int j_p1);
  vector B(vector x, vector t, int i, int k) {
    vector[rows(x)] out;
    vector[rows(x)] a_i1j1;
    vector[rows(x)] a_ij1;
# This script checks that the b-spline function in gist https://gist.github.com/khakieconomics/2272cd7f0d61950852622198b26a2d02
# produces something approximating the function in the splines package.
library(rstan)
expose_stan_functions("b_spline_function.stan")
probs <- runif(1000)
probs <- probs[order(probs)]
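For intuition about what B() computes, the Cox-de Boor recursion can be written directly in R (a reference sketch only; the gist's argument conventions, e.g. whether k is the order or the degree, may differ):

b_spline <- function(x, t, i, k) {
  # Order-1 basis: indicator that x falls in [t_i, t_{i+1})
  if (k == 1) return(as.numeric(t[i] <= x & x < t[i + 1]))
  d1 <- t[i + k - 1] - t[i]
  d2 <- t[i + k] - t[i + 1]
  a1 <- if (d1 > 0) (x - t[i]) / d1 else 0
  a2 <- if (d2 > 0) (t[i + k] - x) / d2 else 0
  a1 * b_spline(x, t, i, k - 1) + a2 * b_spline(x, t, i + 1, k - 1)
}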
@khakieconomics
khakieconomics / GP_IV.R
Last active July 6, 2018 21:43
File to run/visualise GP_IV.stan
# First load the libraries and define a function to create an ARD covariance
# matrix (squared-exponential kernel with one length-scale per feature,
# plus a small diagonal jitter).
library(tidyverse); library(rstan)
set.seed(78)
ard_Sigma <- function(features, rho, alpha) {
  noise <- 1e-8
  features <- as.data.frame(features)
  P <- ncol(features)
  if(length(rho) != P) stop("need one length-scale per feature column")
  # Everything below is an assumed continuation; the preview truncates here.
  scaled <- sweep(as.matrix(features), 2, rho, "/")  # scale each column by its rho
  d2 <- as.matrix(dist(scaled))^2                    # squared Euclidean distances
  alpha^2 * exp(-0.5 * d2) + diag(noise, nrow(features))
}
functions {
  matrix L_cov_exp_quad_ARD(matrix x,
                            real alpha,
                            vector rho,
                            real delta) {
    int N = rows(x);
    matrix[N, N] K;
    real sq_alpha = square(alpha);
    for (i in 1:(N-1)) {
      K[i, i] = sq_alpha + delta;
      // Assumed continuation of the truncated preview (standard ARD kernel):
      for (j in (i + 1):N) {
        K[i, j] = sq_alpha * exp(-0.5 * dot_self((x[i] - x[j])' ./ rho));
        K[j, i] = K[i, j];
      }
    }
    K[N, N] = sq_alpha + delta;
    return cholesky_decompose(K);
  }
}
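Exposed to R, this should agree with ard_Sigma above up to the delta jitter; a hedged check (the file name gp_iv_functions.stan is an assumption):

expose_stan_functions("gp_iv_functions.stan")
X <- matrix(rnorm(20), 10, 2)
L <- L_cov_exp_quad_ARD(X, alpha = 1, rho = c(1, 2), delta = 1e-8)
# tcrossprod(L) rebuilds K, which should match ard_Sigma()'s output
max(abs(tcrossprod(L) - ard_Sigma(X, rho = c(1, 2), alpha = 1)))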
library(rstanarm); library(tidyverse)
options(mc.cores = parallel::detectCores())
set.seed(42)
data("radon")
head(radon)
# Some levels have no variance in the outcomes, making likelihood estimates impossible.
# Adding a tiny bit of noise fixes the problem.
radon$log_uranium <- rnorm(nrow(radon), radon$log_uranium, 0.05)
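To see which counties are degenerate before jittering, a check along these lines works (a sketch; the county column name is an assumption):

radon %>%
  group_by(county) %>%
  summarise(sd_u = sd(log_uranium)) %>%
  filter(is.na(sd_u) | sd_u == 0)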
# This gist uses the classifier defined in this post: http://modernstatisticalworkflow.blogspot.com/2018/03/1000-labels-and-4500-observations-have.html
# applied with this approximation: https://gist.github.com/khakieconomics/0325c054b1499d5037a1de5d1014645a
# to Kaggle's credit card fraud data--a fun rare-case problem. In a cross-validation exercise with 50 randomly selected hold-out
# sets, it appears to perform similarly to (or perhaps better than) others' attempts using random forests and neural networks.
# The upside of course is that it estimates/generates predictions in a couple of seconds.
library(tidyverse); library(reticulate); library(pROC)
# Download some data from Kaggle
system("kaggle datasets download -d mlg-ulb/creditcardfraud -p ~/Documents/kagglestuff")
# This illustrates why Neal's funnel really messes with us when we're trying
# to estimate random-effects models using maximum likelihood, i.e. why we need
# the approximations in lme4 or INLA, or need to go full Bayes: the joint mode
# is an awful one.
# Let's simulate some data from a really simple model with 10
# random intercepts whose variance you want to estimate
N <- 1000
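A minimal sketch of that simulation (everything beyond N is an assumption):

set.seed(123)
J <- 10
tau <- 1                          # true sd of the random intercepts
a <- rnorm(J, 0, tau)             # the 10 intercepts
group <- sample(J, N, replace = TRUE)
y <- rnorm(N, a[group], 1)
# The joint likelihood in (a, tau) is unbounded as tau -> 0 and a -> 0, so the
# joint mode is useless; lme4 and INLA (approximately) integrate the intercepts
# out rather than maximising over them.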
library(dplyr)
# Let's make some fake data!
# These will contain true labels and 500 binary features
# We'll follow the general approach here: http://modernstatisticalworkflow.blogspot.com/2018/03/1000-labels-and-4500-observations-have.html
# but rather than using a penalized likelihood estimator for the label-activity probabilities, we'll approximate
# it by averaging each label's within-label activity frequency with 0.5.
N <- 100000
I <- 30000
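That shrinkage step can be sketched directly (stand-in labels and a handful of features, not the gist's actual data):

labels <- sample(LETTERS[1:10], N, replace = TRUE)
features <- matrix(rbinom(N * 5, 1, 0.3), N, 5)              # 5 binary features for brevity
freq <- rowsum(features, labels) / as.vector(table(labels))  # within-label activity frequencies
p_shrunk <- (freq + 0.5) / 2                                 # average each frequency with 0.5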