Christopher Peters statwonk

## gist:6283c5b01e5896b94c46edfdd9ff490a
library(tidyverse)
library(rvest)
library(gamlss)
library(brms)
library(tidybayes)
select <- dplyr::select

####################################################################################
# Model the market capitalizations of members of the S&P 500.
####################################################################################

## fattails.R
library(tidyverse)
library(quantmod)
library(gamlss)
select <- dplyr::select
# Convert a numeric count of seconds since the Unix epoch into POSIXct.
#
# x: numeric vector of seconds since 1970-01-01.
# Returns a POSIXct vector of the same length as x.
posix <- function(x) {
  unix_epoch <- "1970-01-01"
  as.POSIXct(x, origin = unix_epoch)
}

## "Fat tails"
## Here we compare the residuals from the normal and t distributional models.
## Notice standardized error is worse in the normal model. This happens
## because returns are leptokurtic (large surprises should be expected, there's risk in stock returns),

## risk_adds_up2.R
library(tidyverse)
library(ggthemes)
expand.grid(
  risk = seq(0.1/5e3, 1/5e3, 1e-05), # average daily risk e.g. - 1,000 infected per day in Alabama / 5,000,000 AL population
  units_of_exposure = seq_len(31) # days of exposure (up to 31 days)
) %>% as_tibble() %>%
  mutate(total_risk = map2_dbl(risk, units_of_exposure, ~ 1 - (1 - .x)^(.y)),
         total_odds = 1/total_risk,
         risk_threshold = case_when(total_odds <= 5e2 ~ "Worse than 1 in 500",
                                    total_odds <= 1e3 ~ "Worse than 1 in 1k chance",

## sk_learn_logistic.Rmd
---
title: "Testing sklearn's Stochastic Gradient Descent Algo"
author: "Statwonk"
date: "2/07/2021"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(reticulate)

## generate_multilevel_logistic_data.R
library(tidyverse)
library(brms)
library(tidybayes)

# Simulation settings. Written with left assignment (`<-`) so the name being
# defined leads each line, instead of the right-assignment form `value -> name`.
N <- 1e7 # obs
J <- 1 # groups of members
K <- 10 # members
base_p <- 0.5 # base rate, this is logistic regression

# sample member coefficients

## out_of_core.py
import sklearn
from sklearn import naive_bayes
import pandas as pd
import numpy as np

# Read the whole CSV into a DataFrame.
d = pd.read_csv("data.csv")

# Positional column split: column 1 becomes y, columns 2..end become X
# (presumably response and predictors -- confirm against the data layout).
feature_positions = list(range(2, d.shape[1]))
X = d.iloc[:, feature_positions]
y = d.iloc[:, 1]

## variational_logistic.R
library(tidyverse)
library(brms)
library(tidybayes)

# Simulation settings, using left assignment (`<-`) rather than `value -> name`.
N <- 3e4
K <- 40
# One standard-normal coefficient per group (K draws).
group_coefs <- rnorm(K)

tibble(K = factor(rep(paste0("group_", seq_len(K)), length.out = N))) %>%
  mutate(coef = rep(group_coefs, N/40)) %>%

## massive_logistic.R
library(tidyverse)
# N and p are passed to rbinom(N, 1, p) below: N Bernoulli draws per
# replicate with success probability p. Left assignment (`<-`) replaces the
# right-assignment form `value -> name`.
N <- 1e4
p <- 0.03
# author: twitter.com/statwonk
# showing how cases can be discarded in logistic regression while preserving an unbiased estimator
seq_len(1e3) %>%
  map_dbl(function(x) {
    rbinom(N, 1, p) -> y
    tibble(
      all_data = tibble(y = y) %>% glm(y ~ 1, "binomial", .) %>% coef() %>% plogis(),

## coronavirus.R
set.seed(1)
N <- 1e7 # sims

# Summarise a numeric sample at five fixed probabilities: the two extreme
# tails (0.01% and 99.99%) plus the quartiles and the median.
#
# x: numeric vector.
# Returns the named numeric vector produced by quantile().
quantiles_of_interest <- function(x) {
  probs <- c(0.0001, 0.25, 0.5, 0.75, 0.9999)
  quantile(x, probs = probs)
}

# Simulated death-rate draws: log-normal samples clamped to a fixed range.
#
# n:           number of draws; defaults to the global simulation count N, so
#              existing zero-argument calls behave exactly as before.
# median_rate: median of the log-normal (meanlog is log(median_rate)).
# sdlog:       standard deviation on the log scale.
# lower/upper: draws outside [lower, upper] are clamped to the nearest bound.
# Returns a numeric vector of length n.
death_rate <- function(n = N, median_rate = 0.02, sdlog = 0.35,
                       lower = 0.001, upper = 0.06) {
  draws <- rlnorm(n, meanlog = log(median_rate), sdlog = sdlog)
  pmin(pmax(draws, lower), upper)
}
quantiles_of_interest(death_rate())

# Very robust / ignorant belief about the range of susceptible cases.
# NOTE(review): the original note read "2.4k to 60M cases", but the default
# upper bound here is 150 (presumably in millions, i.e. 150M) -- confirm which
# figure is intended.
#
# n:           number of draws; defaults to the global simulation count N, so
#              existing zero-argument calls behave exactly as before.
# lower/upper: bounds of the uniform distribution.
# Returns a numeric vector of n uniform draws.
susceptible_cases <- function(n = N, lower = 0.0024, upper = 150) {
  runif(n, min = lower, max = upper)
}

## coronavirus.R
set.seed(0)
N <- 1e7 # sims

# Report the 0.01%, 25%, 50%, 75% and 99.99% quantiles of a numeric sample.
#
# x: numeric vector.
# Returns the named numeric vector produced by quantile().
quantiles_of_interest <- function(x) {
  tail_and_quartile_probs <- c(0.0001, 0.25, 0.5, 0.75, 0.9999)
  quantile(x, tail_and_quartile_probs)
}

# Simulated death-rate draws: log-normal samples clamped to a fixed range.
#
# n:           number of draws; defaults to the global simulation count N, so
#              existing zero-argument calls behave exactly as before.
# median_rate: median of the log-normal (meanlog is log(median_rate)).
# sdlog:       standard deviation on the log scale.
# lower/upper: draws outside [lower, upper] are clamped to the nearest bound.
# Returns a numeric vector of length n.
death_rate <- function(n = N, median_rate = 0.02, sdlog = 0.35,
                       lower = 0.001, upper = 0.06) {
  draws <- rlnorm(n, meanlog = log(median_rate), sdlog = sdlog)
  pmin(pmax(draws, lower), upper)
}
# Statwonk's belief of final US death rate
quantiles_of_interest(death_rate())

# 0.01%         25%         50%         75%      99.99%
	library(tidyverse)
	library(rvest)
	library(gamlss)
	library(brms)
	library(tidybayes)
	select <- dplyr::select

	####################################################################################
	# Model the market capitalizations of members of the S&P 500.
	####################################################################################
	library(tidyverse)
	library(quantmod)
	library(gamlss)
	select <- dplyr::select
	# Convert a numeric count of seconds since the Unix epoch into POSIXct.
	#
	# x: numeric vector of seconds since 1970-01-01.
	# Returns a POSIXct vector of the same length as x.
	posix <- function(x) {
	  unix_epoch <- "1970-01-01"
	  as.POSIXct(x, origin = unix_epoch)
	}

	## "Fat tails"
	## Here we compare the residuals from the normal and t distributional models.
	## Notice standardized error is worse in the normal model. This happens
	## because returns are leptokurtic (large surprises should be expected, there's risk in stock returns),
	library(tidyverse)
	library(ggthemes)
	expand.grid(
	risk = seq(0.1/5e3, 1/5e3, 1e-05), # average daily risk e.g. - 1,000 infected per day in Alabama / 5,000,000 AL population
	units_of_exposure = seq_len(31) # days of exposure (up to 31 days)
	) %>% as_tibble() %>%
	mutate(total_risk = map2_dbl(risk, units_of_exposure, ~ 1 - (1 - .x)^(.y)),
	total_odds = 1/total_risk,
	risk_threshold = case_when(total_odds <= 5e2 ~ "Worse than 1 in 500",
	total_odds <= 1e3 ~ "Worse than 1 in 1k chance",
	---
	title: "Testing sklearn's Stochastic Gradient Descent Algo"
	author: "Statwonk"
	date: "2/07/2021"
	output: html_document
	---

	```{r setup, include=FALSE}
	knitr::opts_chunk$set(echo = TRUE)
	library(reticulate)
	import sklearn
	from sklearn import naive_bayes
	import pandas as pd
	import numpy as np

	# Read the whole CSV into a DataFrame.
	d = pd.read_csv("data.csv")

	# Positional column split: column 1 becomes y, columns 2..end become X
	# (presumably response and predictors -- confirm against the data layout).
	feature_positions = list(range(2, d.shape[1]))
	X = d.iloc[:, feature_positions]
	y = d.iloc[:, 1]
	library(tidyverse)
	# N and p are passed to rbinom(N, 1, p) below: N Bernoulli draws per
	# replicate with success probability p. Left assignment (`<-`) replaces the
	# right-assignment form `value -> name`.
	N <- 1e4
	p <- 0.03
	# author: twitter.com/statwonk
	# showing how cases can be discarded in logistic regression while preserving an unbiased estimator
	seq_len(1e3) %>%
	map_dbl(function(x) {
	rbinom(N, 1, p) -> y
	tibble(
	all_data = tibble(y = y) %>% glm(y ~ 1, "binomial", .) %>% coef() %>% plogis(),
	set.seed(1)
	N <- 1e7 # sims

	# Summarise a numeric sample at five fixed probabilities: the two extreme
	# tails (0.01% and 99.99%) plus the quartiles and the median.
	#
	# x: numeric vector.
	# Returns the named numeric vector produced by quantile().
	quantiles_of_interest <- function(x) {
	  probs <- c(0.0001, 0.25, 0.5, 0.75, 0.9999)
	  quantile(x, probs = probs)
	}

	# Simulated death-rate draws: log-normal samples clamped to a fixed range.
	#
	# n:           number of draws; defaults to the global simulation count N, so
	#              existing zero-argument calls behave exactly as before.
	# median_rate: median of the log-normal (meanlog is log(median_rate)).
	# sdlog:       standard deviation on the log scale.
	# lower/upper: draws outside [lower, upper] are clamped to the nearest bound.
	# Returns a numeric vector of length n.
	death_rate <- function(n = N, median_rate = 0.02, sdlog = 0.35,
	                       lower = 0.001, upper = 0.06) {
	  draws <- rlnorm(n, meanlog = log(median_rate), sdlog = sdlog)
	  pmin(pmax(draws, lower), upper)
	}
	quantiles_of_interest(death_rate())

	# Very robust / ignorant belief about the range of susceptible cases.
	# NOTE(review): the original note read "2.4k to 60M cases", but the default
	# upper bound here is 150 (presumably in millions, i.e. 150M) -- confirm which
	# figure is intended.
	#
	# n:           number of draws; defaults to the global simulation count N, so
	#              existing zero-argument calls behave exactly as before.
	# lower/upper: bounds of the uniform distribution.
	# Returns a numeric vector of n uniform draws.
	susceptible_cases <- function(n = N, lower = 0.0024, upper = 150) {
	  runif(n, min = lower, max = upper)
	}
	set.seed(0)
	N <- 1e7 # sims

	# Report the 0.01%, 25%, 50%, 75% and 99.99% quantiles of a numeric sample.
	#
	# x: numeric vector.
	# Returns the named numeric vector produced by quantile().
	quantiles_of_interest <- function(x) {
	  tail_and_quartile_probs <- c(0.0001, 0.25, 0.5, 0.75, 0.9999)
	  quantile(x, tail_and_quartile_probs)
	}

	# Simulated death-rate draws: log-normal samples clamped to a fixed range.
	#
	# n:           number of draws; defaults to the global simulation count N, so
	#              existing zero-argument calls behave exactly as before.
	# median_rate: median of the log-normal (meanlog is log(median_rate)).
	# sdlog:       standard deviation on the log scale.
	# lower/upper: draws outside [lower, upper] are clamped to the nearest bound.
	# Returns a numeric vector of length n.
	death_rate <- function(n = N, median_rate = 0.02, sdlog = 0.35,
	                       lower = 0.001, upper = 0.06) {
	  draws <- rlnorm(n, meanlog = log(median_rate), sdlog = sdlog)
	  pmin(pmax(draws, lower), upper)
	}
	# Statwonk's belief of final US death rate
	quantiles_of_interest(death_rate())

	# 0.01% 25% 50% 75% 99.99%