# NickCH-K

## collider_sim.R

library(tidyverse)
library(broom)
library(purrr)

create_data = function(n, trt) {
  tibble(donut = rnorm(n),
         sleep = rnorm(n)) %>%
    mutate(lifting = rnorm(n) + .1*sleep + .3*donut,
           coffee = rnorm(n) + .2*sleep,

## regression_using_other_powers
library(tidyverse)
library(purrr)

#' Linear prediction from an intercept/slope pair
#'
#' @param x Numeric vector; `x[1]` is used as the intercept and `x[2]` as the slope.
#' @param xdat Numeric vector of predictor values.
#' @return `x[1] + x[2] * xdat`, evaluated elementwise over `xdat`.
predfunc <- function(x, xdat) {
  intercept <- x[1]
  slope <- x[2]
  intercept + slope * xdat
}

#' Sum of absolute residuals raised to a power
#'
#' @param x Numeric vector of coefficients passed on to `predfunc()`.
#' @param xdat Numeric vector of predictor values.
#' @param ydat Numeric vector of observed outcomes.
#' @param pwr Exponent applied to each absolute residual (2 gives the sum of
#'   squared residuals, 1 the sum of absolute residuals).
#' @return A single number: `sum(|predfunc(x, xdat) - ydat|^pwr)`.
residsum <- function(x, xdat, ydat, pwr) {
  abs_resid <- abs(predfunc(x, xdat) - ydat)
  sum(abs_resid^pwr)
}


set.seed(1000)
exdat = bind_rows(
  tibble(label = 'Outlier X',

## language_switch.R
library(data.table); library(ggplot2)
# Read the poll workbook and convert it to a data.table.
dat = as.data.table(readxl::read_excel("Polls-per-user-CEU_R_Python_Stata_2022.xlsx"))
# Drop the first row and keep only columns 7 and 8. Negative indexing is used
# instead of 2:nrow(dat) because 2:nrow(dat) counts backwards (2, 1) when the
# table has fewer than two rows.
dat = dat[-1, 7:8]
# Column 7 becomes 'After', column 8 becomes 'Before'.
setnames(dat, c('After','Before'))
# Keep rows with a non-missing 'Before' value where neither answer is 'Other'.
dat = dat[!is.na(Before) & !(Before == 'Other') & !(After == 'Other')]
# Net change per language: count of 'After' matches minus count of 'Before' matches.
# vapply() guarantees exactly one integer per language, whereas sapply()'s
# return type depends on its input.
results = data.table(Language = c('Python','R','Stata'), Change = vapply(c('Python','R','Stata'), \(x) dat[, sum(After == x) - sum(Before == x)], integer(1)))
ggplot(results, aes(x = Change, y = 0, label = Language)) +
  geom_hline(yintercept = 0, size = 1) + geom_point(size = 3, color = 'darkblue') + geom_text(vjust = -1) +
  scale_x_continuous(breaks = c(-5, 0, 5, 10), limits = c(-7, 12)) +
  ggthemes::theme_economist() +

## evolutionary_lasso.R
library(data.table)
library(glmnet)
library(ggplot2)

# Mutation rate for the evolutionary procedure.
# NOTE(review): the code that consumes this constant is not visible here - confirm how it is applied.
MUTATION_RATE = .5

generate_random_data = function(N = 1000, truth = c(.5, .5, -.5, 0, 0, 0, 0, 0, .1)) {
  dat = data.table(x = rnorm(N))
  dat = dat[, y := generate_predictions(x, truth) + rnorm(N)]
  return(dat)

## a_basic_py_cem.py
import pandas as pd
import statsmodels.formula.api as sm
# There is a cem package but it doesn't seem to work that well
# So we will do this by hand

br = pd.read_csv("broockman2013.csv")

# Create bins for our continuous matching variables
# cut creates evenly spaced bins
# while qcut cuts based on quantiles

## safegraph_aws.R
#' Download SafeGraph data from AWS COVID Response
#'
#' This is a thin wrapper for \code{aws.s3::s3sync} that will aim you at the right directory to synchronize.
#'
#' This function doesn't add too much, but it does make the default behavior you probably want a bit easier. If you plan to specify the \code{aws.s3::s3sync} "bucket" option yourself, this function is largely useless.
#'
#' See catalog.safegraph.io for more description of the various buckets.
#'
#' @param path The local directory to synchronize.
#' @param dataset The SafeGraph bucket to get from. Can be "weekly" (OLD VERSION), "weekly-new" (new method since December 2020), "weekly-backfill" (the new method for times before December 2020), "monthly" (OLD VERSION, but also includes the backfill data for the new version), "monthly-backfill" (method since Dec 2020), "distancing", "transactions", "core" (before Nov 2020), "core-new" (Nov 2020 or later), "geo-supplement" or, to get the baseline bucket, "none". v2 versions always selected.

## safegraph_file_locate
########## THIS FILE REQUIRES data.table TO BE LOADED TO FUNCTION. USE library(data.table)

#' Patterns File Lookup
#'
#' This function, given a date or range of dates, will return a character vector of folder paths you will need to read in with \code{list.files()} (or just set \code{list_files = TRUE} to return the full set of filepaths), which must be run through \code{list.files(pattern = '.csv.gz', full.names = TRUE)} after downloading files. This is done because the subfolder after this is based on the hour the data is released, which can't be predicted ahead of time for future weeks.
#'
#' For the period from mid-June to early December 2020, data is available in both "old" (\code{patterns_backfill}) and "new" (\code{patterns}) formats. This function will generate filepaths to the "new" format.
#'
#' @param dates A vector of \code{Date} objects (perhaps taking a single \code{Date} object and adding \code{+lubridate::days(0:finish)}) to find the associated files for.
#' @param dir If specified, will append \code{di

## Basic within estimation
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm

# Read in data (gapminder.csv is looked up relative to the current working directory)
gm = pd.read_csv('gapminder.csv')

# Put GDP per capita in log format since it's very skewed
# np.log is the natural logarithm; the result is stored in the logGDPpercap column
gm['logGDPpercap'] = np.log(gm['gdpPercap'])

## .mean vs. .transform example
import pandas as pd
import numpy as np

# Read in data (gapminder.csv is looked up relative to the current working directory)
gm = pd.read_csv('gapminder.csv')

# Put GDP per capita in log format since it's very skewed
# np.log is the natural logarithm; the result is stored in the logGDPpercap column
gm['logGDPpercap'] = np.log(gm['gdpPercap'])

# Transform gives me one row per original row

## factortable.R
# Necessary functions
# (Never worked with functions in R before? Just run these lines; the functions will be stored in memory,
# sort of like when you load a package.)

cpct <- function(df, var, name, append) {
  # Limit to nonmissings and see how many nonmissings there are
  df <- df %>% filter_at(var, any_vars(!is.na(.)))
  N <- nrow(df)

  df <- df %>%

	library(tidyverse)
	library(broom)
	library(purrr)

	create_data = function(n, trt) {
	tibble(donut = rnorm(n),
	sleep = rnorm(n)) %>%
	mutate(lifting = rnorm(n) + .1sleep + .3donut,
	coffee = rnorm(n) + .2*sleep,
	library(tidyverse)
	library(purrr)

	#' Linear prediction from an intercept/slope pair
	#'
	#' @param x Numeric vector; `x[1]` is used as the intercept and `x[2]` as the slope.
	#' @param xdat Numeric vector of predictor values.
	#' @return `x[1] + x[2] * xdat`, evaluated elementwise over `xdat`.
	predfunc <- function(x, xdat) {
	  intercept <- x[1]
	  slope <- x[2]
	  intercept + slope * xdat
	}

	#' Sum of absolute residuals raised to a power
	#'
	#' @param x Numeric vector of coefficients passed on to `predfunc()`.
	#' @param xdat Numeric vector of predictor values.
	#' @param ydat Numeric vector of observed outcomes.
	#' @param pwr Exponent applied to each absolute residual (2 gives the sum of
	#'   squared residuals, 1 the sum of absolute residuals).
	#' @return A single number: `sum(|predfunc(x, xdat) - ydat|^pwr)`.
	residsum <- function(x, xdat, ydat, pwr) {
	  abs_resid <- abs(predfunc(x, xdat) - ydat)
	  sum(abs_resid^pwr)
	}


	set.seed(1000)
	exdat = bind_rows(
	tibble(label = 'Outlier X',
	library(data.table); library(ggplot2)
	# Read the poll workbook and convert it to a data.table.
	dat = as.data.table(readxl::read_excel("Polls-per-user-CEU_R_Python_Stata_2022.xlsx"))
	# Drop the first row and keep only columns 7 and 8. Negative indexing is used
	# instead of 2:nrow(dat) because 2:nrow(dat) counts backwards (2, 1) when the
	# table has fewer than two rows.
	dat = dat[-1, 7:8]
	# Column 7 becomes 'After', column 8 becomes 'Before'.
	setnames(dat, c('After','Before'))
	# Keep rows with a non-missing 'Before' value where neither answer is 'Other'.
	dat = dat[!is.na(Before) & !(Before == 'Other') & !(After == 'Other')]
	# Net change per language: count of 'After' matches minus count of 'Before' matches.
	# vapply() guarantees exactly one integer per language, whereas sapply()'s
	# return type depends on its input.
	results = data.table(Language = c('Python','R','Stata'), Change = vapply(c('Python','R','Stata'), \(x) dat[, sum(After == x) - sum(Before == x)], integer(1)))
	ggplot(results, aes(x = Change, y = 0, label = Language)) +
	geom_hline(yintercept = 0, size = 1) + geom_point(size = 3, color = 'darkblue') + geom_text(vjust = -1) +
	scale_x_continuous(breaks = c(-5, 0, 5, 10), limits = c(-7, 12)) +
	ggthemes::theme_economist() +
	library(data.table)
	library(glmnet)
	library(ggplot2)

	# Mutation rate for the evolutionary procedure.
	# NOTE(review): the code that consumes this constant is not visible here - confirm how it is applied.
	MUTATION_RATE = .5

	generate_random_data = function(N = 1000, truth = c(.5, .5, -.5, 0, 0, 0, 0, 0, .1)) {
	dat = data.table(x = rnorm(N))
	dat = dat[, y := generate_predictions(x, truth) + rnorm(N)]
	return(dat)
	import pandas as pd
	import statsmodels.formula.api as sm
	# There is a cem package but it doesn't seem to work that well
	# So we will do this by hand

	br = pd.read_csv("broockman2013.csv")

	# Create bins for our continuous matching variables
	# cut creates evenly spaced bins
	# while qcut cuts based on quantiles
	#' Download SafeGraph data from AWS COVID Response
	#'
	#' This is a thin wrapper for \code{aws.s3::s3sync} that will aim you at the right directory to synchronize.
	#'
	#' This function doesn't add too much, but it does make the default behavior you probably want a bit easier. If you plan to specify the \code{aws.s3::s3sync} "bucket" option yourself, this function is largely useless.
	#'
	#' See catalog.safegraph.io for more description of the various buckets.
	#'
	#' @param path The local directory to synchronize.
	#' @param dataset The SafeGraph bucket to get from. Can be "weekly" (OLD VERSION), "weekly-new" (new method since December 2020), "weekly-backfill" (the new method for times before December 2020), "monthly" (OLD VERSION, but also includes the backfill data for the new version), "monthly-backfill" (method since Dec 2020), "distancing", "transactions", "core" (before Nov 2020), "core-new" (Nov 2020 or later), "geo-supplement" or, to get the baseline bucket, "none". v2 versions always selected.
	########## THIS FILE REQUIRES data.table TO BE LOADED TO FUNCTION. USE library(data.table)

	#' Patterns File Lookup
	#'
	#' This function, given a date or range of dates, will return a character vector of folder paths you will need to read in with \code{list.files()} (or just set \code{list_files = TRUE} to return the full set of filepaths), which must be run through \code{list.files(pattern = '.csv.gz', full.names = TRUE)} after downloading files. This is done because the subfolder after this is based on the hour the data is released, which can't be predicted ahead of time for future weeks.
	#'
	#' For the period from mid-June to early December 2020, data is available in both "old" (\code{patterns_backfill}) and "new" (\code{patterns}) formats. This function will generate filepaths to the "new" format.
	#'
	#' @param dates A vector of \code{Date} objects (perhaps taking a single \code{Date} object and adding \code{+lubridate::days(0:finish)}) to find the associated files for.
	#' @param dir If specified, will append \code{di
	import pandas as pd
	import numpy as np
	import statsmodels.formula.api as sm

	# Read in data (gapminder.csv is looked up relative to the current working directory)
	gm = pd.read_csv('gapminder.csv')

	# Put GDP per capita in log format since it's very skewed
	# np.log is the natural logarithm; the result is stored in the logGDPpercap column
	gm['logGDPpercap'] = np.log(gm['gdpPercap'])
	# Necessary functions
	# (Never worked with functions in R before? Just run these lines; the functions will be stored in memory,
	# sort of like when you load a package.)

	cpct <- function(df, var, name, append) {
	# Limit to nonmissings and see how many nonmissings there are
	df <- df %>% filter_at(var, any_vars(!is.na(.)))
	N <- nrow(df)

	df <- df %>%