
Martin Papenberg m-Py

@m-Py
m-Py / PCA_Variance_Explained.R
Created February 22, 2024 11:54
Get % of variance explained by Principal Component Analysis
# Get % of variance explained by Principal Component Analysis
library(psych)
pca_variance_explained <- function(data, n_components) {
  pca <- psych::principal(data, n_components, rotate = "none")
  list(
    by_variable = colSums(cor(pca$scores, data)^2),
    # scale. = TRUE so that prcomp(), like psych::principal(), works on the correlation matrix
    total = summary(prcomp(data, scale. = TRUE))$importance["Cumulative Proportion", paste0("PC", n_components)]
  )
}
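A quick usage sketch for the function above (an illustration, not part of the gist; `mtcars` is just a stand-in data set):

```r
# Hypothetical usage of pca_variance_explained() on a built-in data set
res <- pca_variance_explained(mtcars, n_components = 2)
res$by_variable  # variance of each variable accounted for by the 2 components
res$total        # cumulative proportion of total variance explained by PC1 + PC2
```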
# Compare Cohen's h and the phi coefficient as effect sizes for comparing proportions
library(effectsize)
# Test data: 2x2 contingency tables; the second column is the same in every table
matrices <- lapply(1:999, function(i) matrix(c(i, 1000 - i, 999, 1), ncol = 2))
phis <- sapply(matrices, function(x) effectsize::phi(x)$phi)
hs <- sapply(matrices, function(x) effectsize::cohens_h(x)$Cohens_h)
plot(abs(hs), phis, type = "l")
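For any one of these tables, both effect sizes can also be computed from first principles (a sketch, not part of the gist), using the textbook formulas phi = sqrt(chi-squared / n) and Cohen's h = 2*asin(sqrt(p1)) - 2*asin(sqrt(p2)):

```r
m <- matrix(c(300, 700, 999, 1), ncol = 2)
p1 <- m[1, 1] / sum(m[, 1])                    # proportion in column 1: 300/1000
p2 <- m[1, 2] / sum(m[, 2])                    # proportion in column 2: 999/1000
h   <- 2 * asin(sqrt(p1)) - 2 * asin(sqrt(p2)) # Cohen's h
phi <- sqrt(chisq.test(m, correct = FALSE)$statistic / sum(m))
```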
@m-Py
m-Py / small_anticlust_simulation.R
Last active October 21, 2020 17:39
Small anticlust simulation
# Test whether splitting data via anticlustering yields group means closer to the *true*
# population means than a random split does (e.g., for cross-validation)
library(anticlust)
simulate <- function(N = 100, split = c(1, 3) / 4) { # default: split 75/25
data <- rnorm(N)
groups <- anticlustering(
data,
K = round(N * split),
objective = "variance"
)
c(
@m-Py
m-Py / test_anticlust.R
Last active October 13, 2020 15:24
Test out the most recent version (v0.5.4) of anticlust
## 1. Load - and, if required, install - package `anticlust`
if (!requireNamespace("remotes", quietly = TRUE)) {
install.packages("remotes")
}
remotes::install_github("m-Py/anticlust")
library(anticlust)
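Once installed, a minimal call might look like this (a sketch, not from the gist; the `anticlustering()` arguments follow the same pattern as in the simulation gist above):

```r
library(anticlust)
features <- matrix(rnorm(200), ncol = 2)
# Partition 100 cases into 2 groups that are as similar as possible
groups <- anticlustering(features, K = 2, objective = "variance")
table(groups)  # equal-sized groups
```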
# Show that including an interaction in glm() changes the nature of a main effect
# (only if the categorical predictor is dummy coded - not contrast coded)
# Returns the p-value associated with a predictor main effect, once
# with and once without interaction with a (non-predictive) categorical
# independent variable
simulate_glm <- function(N = 100, contrast_coding = FALSE) {
iv1 <- rnorm(N) # related to DV
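The coding issue can be illustrated directly in R (a sketch with assumed variable names, not the gist's code): under the default treatment (dummy) coding, the `iv1` coefficient in a model with an interaction is the simple effect of `iv1` at the factor's reference level; under sum (contrast) coding it is the effect averaged over the factor levels.

```r
set.seed(1)
N   <- 100
iv1 <- rnorm(N)                                        # related to DV
iv2 <- factor(sample(c("a", "b"), N, replace = TRUE))  # non-predictive factor
dv  <- iv1 + rnorm(N)
m_dummy <- glm(dv ~ iv1 * iv2)       # default: dummy (treatment) coding
contrasts(iv2) <- contr.sum(2)       # switch to sum (contrast) coding
m_sum   <- glm(dv ~ iv1 * iv2)
coef(summary(m_dummy))["iv1", ]      # simple effect at the reference level
coef(summary(m_sum))["iv1", ]        # effect averaged over the factor levels
```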
@m-Py
m-Py / KNN_RANN.R
Last active February 25, 2020 19:16
# Author: Martin Papenberg
# Year: 2019
# Fast KNN classification using RANN for the nearest neighbour search
library("RANN")
library("data.table")
# param data: The numeric data matrix used
# param labels: the labels to predict
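The core of a RANN-based KNN classifier can be sketched as follows (assumed variable names, not the gist's code): `RANN::nn2()` does the fast neighbour search, and each prediction is a majority vote among the k neighbours' labels.

```r
library(RANN)
set.seed(1)
train  <- matrix(rnorm(200), ncol = 2)
labels <- sample(c("a", "b"), 100, replace = TRUE)
test   <- matrix(rnorm(20), ncol = 2)
nn <- RANN::nn2(train, test, k = 5)  # indices of the 5 nearest training points
pred <- apply(nn$nn.idx, 1, function(idx) names(which.max(table(labels[idx]))))
```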
## This document illustrates that Type I sums of squares lead to inflated alpha
## error rates when a predictive covariate is included in the regression model.
# Estimate p-value for treatment (null) effect via linear regression,
# including a covariate that is predictive of the outcome
#
# param N: sample size, default 100
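The Type I (sequential) sum-of-squares issue is easy to demonstrate, because `anova()` on an `lm` fit in R tests predictors sequentially, so predictor order matters (a sketch, not the gist's code):

```r
set.seed(1)
N <- 100
covariate <- rnorm(N)
treatment <- sample(rep(0:1, N / 2))     # null effect by construction
dv <- 0.5 * covariate + rnorm(N)         # covariate predicts the outcome
anova(lm(dv ~ treatment + covariate))    # treatment tested ignoring the covariate
anova(lm(dv ~ covariate + treatment))    # treatment tested after the covariate
```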
@m-Py
m-Py / correlated_data.R
Last active May 13, 2020 08:05
Function to generate bivariate normal data with specified correlation
## Year 2019 - 2020
## Author: Martin Papenberg
## This code is in the public domain, do with it whatever you like.
# Generate bivariate normal data with specified correlation
# param n: how many data points
# param mx: the mean of the first variable
# param my: the mean of the second variable
# param sdx: the standard deviation of the first variable
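The gist's preview is cut off here, but the standard construction behind such a function is short (a sketch, not necessarily the gist's implementation): mix one standard normal variable with independent noise.

```r
set.seed(1)
n <- 10000
r <- 0.5
x <- rnorm(n)
y <- r * x + sqrt(1 - r^2) * rnorm(n)  # y is standard normal with cor(x, y) ~ r
round(cor(x, y), 2)
# Shift and rescale x and y afterwards to get the desired means and SDs
```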
@m-Py
m-Py / SIX_OUT_OF_THIRTY.R
Created January 16, 2019 13:24
How the p value in a t test can be minimized by data removal
## Warning: This code is just for fun / educational purposes; the file contains functions
## to find out how far the p value in a t-test can be driven down by systematic removal of data points.
## SIX OUT OF THIRTY - Martin's approach
## Based on @juli_tkotz's (https://twitter.com/juli_tkotz/status/1085446224117985281)
## idea that removing the most extreme values is the best approach.
#' Simulate t-tests and store best p values
#'
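A greedy version of the idea can be sketched like this (an illustration, not the gist's actual algorithm): repeatedly drop the observation whose removal yields the smallest p value.

```r
set.seed(1)
x <- rnorm(30, mean = 0.2)
y <- rnorm(30)
minimize_p <- function(x, y, n_remove = 6) {
  for (i in seq_len(n_remove)) {
    p <- sapply(seq_along(x), function(j) t.test(x[-j], y)$p.value)
    x <- x[-which.min(p)]  # drop the point whose removal helps most
  }
  t.test(x, y)$p.value
}
minimize_p(x, y)  # far smaller than t.test(x, y)$p.value
```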
@m-Py
m-Py / ordinal_scores.R
Last active February 20, 2018 10:34
Compute ordinal scores from continuous data
## Author Martin Papenberg
## Year 2018
## This code is released into the public domain. Anybody may use, alter
## and distribute the code without restriction. The author makes no
## guarantees, and takes no liability of any kind for use of this code.
#' Compute ordinal scores from continuous data
#'
#' Might be useful for data exploration with highly skewed data
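In base R, ordinal scores of this kind are essentially what `rank()` computes (a sketch; the gist's function, whose preview ends here, may handle ties or options differently):

```r
x <- c(10, 2, 500, 2)             # highly skewed toy data
rank(x, ties.method = "average")  # 3.0 1.5 4.0 1.5
```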