Workshop date: 10/17/2019
Home directory: 50 Mb storage
Scratch: 10 TB (90 day limit)
Can purchase storage; requires a PTAO
# Table 2.8, Agresti (2002) p. 57 | |
M <- as.table(rbind(c(1, 3, 10, 6), | |
c(2, 3, 10, 7), | |
c(1, 6, 14, 12), | |
c(0, 1, 9, 11))) | |
dimnames(M) <- list(`income` = c("<15", "15-25", "25-40", ">40"), | |
`job satisfaction` = c("VD", "LD", "MS","VS")) | |
# count concordant pairs | |
n <- nrow(M) |
# replicate Fig 2 of | |
# Pocock, S.J., Assmann, S.E., Enos, L.E. and Kasten, L.E. (2002), | |
# Subgroup analysis, covariate adjustment and baseline comparisons in clinical trial reporting: current practiceand problems. | |
# Statist. Med., 21: 2917-2930. https://doi.org/10.1002/sim.1296 | |
Z_x <- seq(-2, 2, 0.1) | |
rho <- c(0, 0.1, 0.3, 0.5, 0.7, 0.9) | |
d <- expand.grid(Z_x = Z_x, rho = rho) | |
Z_a <- qnorm(0.025, lower.tail = F) | |
d$size <- pnorm((Z_a - d$Z_x*d$rho)/sqrt(1 - d$rho^2), |
# simulate data similar to that used in Aiken & West | |
# Aiken, L. S. and West, S.G. (1991). Multiple Regression: Testing and Interpreting Interactions. Newbury Park, Calif: Sage Publications. | |
# The authors do not provide data but give summary stats of data in Table 2.1 (p. 11) | |
# The only parameter I had to guess at was the residual standard error. I assumed N(0,5) | |
library(MASS) | |
covar <- 0.42*2.20*0.95 # convert correlation to covariance | |
set.seed(1) | |
xz <- mvrnorm(n = 400, mu = c(5, 10), |
# generate population with collinearity | |
n <- 10000 | |
x1 <- runif(n, -3, 3) | |
x2 <- x1 * 2 + rnorm(n, sd = 0.3) | |
plot(x1, x2) # collinearity! | |
x3 <- rnorm(n, 10, 2) | |
x4 <- x3 * -4 + rnorm(n, sd = 0.5) | |
plot(x3, x4) # collinearity! | |
x5 <- rexp(n = n) | |
x6 <- runif(n = n) |
# https://www.sciencedirect.com/science/article/pii/S0022480420305023?via%3Dihub | |
# illustrates that tests with small P-values always have high observed power (on the left side, where the P-value is close to 0 and observed power close to 1). | |
x1 <- rnorm(30, mean = 2, sd = 2) | |
x2 <- rnorm(30, mean = 3, sd = 2) | |
tout <- t.test(x1, x2) | |
obs_eff <- diff(tout$estimate) | |
pout <- power.t.test(n = 30, delta = abs(obs_eff), sd = 2, sig.level = 0.05) | |
pout$power |
# R version of load_planar_dataset() on https://datascience-enthusiast.com/DL/Planar-data-classification-with-one-hidden-layer.html | |
# Could use python code to export result of load_planar_dataset() as CSV: | |
# import pandas as pd | |
# df1 = pd.DataFrame(np.transpose(X), columns = ['X1','X2']) | |
# df2 = pd.DataFrame(np.transpose(Y), columns = ['Y']) | |
# df = pd.concat([df1, df2], axis = 1) | |
# df.to_csv('planar_flower.csv', index = False) | |
# Or create data in R as a data frame |
# Clay Ford | |
# 2020-06-22 | |
# Simulate data from a proportional odds model with proportional odds assumption | |
# satisfied. | |
# 300 observations and a grouping variable (example: democratic/republican) | |
n <- 300 | |
set.seed(1) | |
grp <- sample(0:1, size = n, replace = TRUE) |
library(tidyverse) | |
d1 <- tibble(name = c("Clay", "Laura"), | |
score_1 = c(88, 99), | |
score_2 = c(77, 88), | |
score_3 = c(55, 66), | |
survey_1 = c(4, 5), | |
survey_2 = c(3, 3), | |
survey_3 = c(2, 5)) | |
d1 | |
# A tibble: 2 x 7 |
library(car) | |
library(MASS) | |
# generate data with slight non-constant variance | |
x1 <- gl(n = 3, k = 400, labels = c("A","B","C")) | |
x2 <- gl(n = 2, k = 600, labels = c("1","2")) | |
set.seed(1) | |
y <- 1 + 1.2*(x1 == "B") + 1.3*(x1 == "C") -0.5*(x2 == "2") + |