Skip to content

Instantly share code, notes, and snippets.

@dougspencer
Last active November 18, 2025 05:56
Show Gist options
  • Select an option

  • Save dougspencer/ce597584de03238c38a0d68996329828 to your computer and use it in GitHub Desktop.

Select an option

Save dougspencer/ce597584de03238c38a0d68996329828 to your computer and use it in GitHub Desktop.
######################
# GAPMINDER Exercise
######################
install.packages("gapminder") # you only have to do this once
library(gapminder)
# read in data
data(gapminder)
head(gapminder)
# rename data to something shorter, then drop the original object
x <- gapminder; rm(gapminder)
# create log GDP to address massive right skew
hist(x$gdpPercap, breaks = 50)
x$logGDP <- log(x$gdpPercap)
# plot life expectancy against log(GDP)
plot(x$logGDP, x$lifeExp,
     pch = 19,
     col = "dodgerblue4",
     main = "Life Expectancy v. GDP",
     xlab = "GDP per capita (log scale)",
     ylab = "Life Expectancy (years)",
     cex = 2)
# overlay a lowess smoother to show the (nonlinear) trend
lines(lowess(x$logGDP, x$lifeExp),
      col = "red",
      lwd = 3)
# now color code for continent
table(x$continent) # note there are 5 continents
# as.numeric() on the continent factor yields the level codes 1-5,
# which base graphics interprets as the default palette colors
x$color <- as.numeric(x$continent)
plot(x$logGDP, x$lifeExp,
     pch = 19,
     col = x$color,
     main = "Life Expectancy v. GDP",
     xlab = "GDP per capita (log scale)",
     ylab = "Life Expectancy (years)")
lines(lowess(x$logGDP, x$lifeExp),
      col = "red",
      lwd = 3)
# legend colors 1:5 correspond to the factor level codes used above
legend("bottomright",
       legend = c("Africa", "Americas", "Asia", "Europe", "Oceania"),
       pch = 19,
       col = c(1:5),
       title = "Continents",
       cex = 0.5)
######################
# PENGUINS Exercise
######################
# download the penguins data from the course website and take a first look
x <- read.csv("http://www.dougspencer.org/data/penguins.csv")
head(x)
table(x$year)
# average flipper length for each species
aggregate(flipper_length_mm ~ species, data = x, FUN = mean)
# scatterplot of flipper length against body mass
plot(x$body_mass_g, x$flipper_length_mm,
     pch = 19,
     col = "black",
     xlab = "Body mass (g)",
     ylab = "Flipper length (mm)",
     main = "Are Penguins Proportional?",
     xlim = c(2500, 6500))
# dotted lowess smoother summarizing the trend
lines(lowess(x$body_mass_g, x$flipper_length_mm),
      col = "red",
      lwd = 3,
      lty = 3)
# one color per species: start everyone at blue (Adelie),
# then overwrite the other two species
x$color <- "dodgerblue4"
x$color[x$species == "Chinstrap"] <- "burlywood"
x$color[x$species == "Gentoo"] <- "violetred4"
# redraw the scatterplot with species-coded colors
plot(x$body_mass_g, x$flipper_length_mm,
     xlab = "Body mass (g)",
     ylab = "Flipper length (mm)",
     xlim = c(2500, 6500),
     pch = 16,
     col = x$color)
######################
# MTCARS Exercise
######################
# read in data
x <- read.csv("http://www.dougspencer.org/data/mtcars.csv")
# how many cars have 8 cylinders?
table(x$cyl)
# median horsepower
median(x$hp)
# how many cars have more than 100 hp?
sum(x$hp > 100) # count directly instead of printing all matching rows
# correlation between horsepower and mpg?
cor(x$hp, x$mpg)
# Relationship between horsepower and mpg
plot(x$hp, x$mpg,
     xlab = "Horsepower",
     ylab = "MPG",
     pch = 18,
     main = "Powerful Cars Guzzle Gas")
# create colors keyed to cylinder count
x$color <- NA
x$color[x$cyl == 4] <- "burlywood"
x$color[x$cyl == 6] <- "violetred4"
x$color[x$cyl == 8] <- "dodgerblue4"
# new plot, colored by cylinders
plot(x$hp, x$mpg,
     xlab = "Horsepower",
     ylab = "MPG",
     pch = 18,
     cex = 2,
     main = "Powerful Cars Guzzle Gas",
     col = x$color,
     xlim = c(0, 350))
# add legend
# BUG FIX: legend colors must match the point colors assigned above
# (4 cyl = burlywood, 6 cyl = violetred4, 8 cyl = dodgerblue4);
# the original listed them in reverse order
legend(275, 33,
       pch = 18,
       pt.cex = 2,
       col = c("burlywood", "violetred4", "dodgerblue4"),
       legend = c(4, 6, 8),
       title = "Cylinders",
       bty = "n")
# linear trend line
abline(lm(mpg ~ hp, data = x))
# lowess line
lines(lowess(x$hp, x$mpg),
      col = "red",
      lty = 3,
      lwd = 3)
# add correlation to plot (computed, not hard-coded, so it stays in sync with the data)
text(50, 12, paste0("Cor = ", round(cor(x$hp, x$mpg), 2)))
# estimate bivariate relationship mpg ~ hp
lm1 <- lm(mpg ~ hp, data = x)
summary(lm1)
######################
# GALTON Exercise
######################
# read in data
x <- read.csv("http://www.dougspencer.org/data/galton.csv")
# number of distinct families in the sample
length(unique(x$family))
# how many kids of each sex?
table(x$sex)
# create adjusted average parent height:
# scale mothers' heights up by 8%, then average with fathers'
x$adj_height <- x$mother * 1.08
x$avg_parent <- (x$father + x$adj_height) / 2
# scatterplot of children's height against average parent height
plot(x$avg_parent, x$height,
     pch = 16,
     col = "dodgerblue4",
     xlab = "Avg parent height",
     ylab = "Children's height")
# OLS fit line in red
abline(lm(height ~ avg_parent, data = x),
       col = "red",
       lwd = 3)
# lowess smoother for comparison with the linear fit
lines(lowess(x$avg_parent, x$height),
      col = "burlywood",
      lwd = 2)
# estimate the linear model and inspect the coefficients
m <- lm(height ~ avg_parent, data = x)
summary(m)
# correlation between the two height measures
cor(x$height, x$avg_parent)
######################
# CAMPAIGN FINANCE & CORRUPTION Exercise
######################
# read in data (loads object 'cf_conjoint' into the workspace)
load(url("http://www.dougspencer.org/data/cf_conjoint.RData"))
# rename object
x <- cf_conjoint; rm(cf_conjoint)
# review data
table(x$AGT_Corrupt)
table(x$AGT_Finance)
# crosstab
table(x$AGT_Corrupt, x$AGT_Finance)
# now plot (numeric) -- responses stack on a 5x5 grid of points
plot(as.numeric(x$AGT_Corrupt), as.numeric(x$AGT_Finance))
# plot (jitter to un-stack the overlapping points)
plot(jitter(as.numeric(x$AGT_Corrupt)),
     jitter(as.numeric(x$AGT_Finance)),
     pch = 16,
     col = "dodgerblue4",
     xlab = "How corrupt is American politics?",
     ylab = "Campaign finance rules are ___",
     axes = FALSE) # spell out FALSE; T/F are reassignable
axis(1, at = c(1:5), labels = c("Extremely", "", "Somewhat", "", "Not at all"))
axis(2, at = c(1:5), labels = c("Way too strict", "", "About right", "", "Way too loose"))
box()
## IF TIME -- plot by Ideology
table(x$CC334A)
lib <- x[x$CC334A == "Very Liberal", ]
cons <- x[x$CC334A == "Very Conservative", ]
## copy and paste plot code and replace objects
par(mfrow = c(1, 2)) # multiple figures by row: 1 row and 2 columns
# plot 1 (object lib)
plot(jitter(as.numeric(lib$AGT_Corrupt)),
     jitter(as.numeric(lib$AGT_Finance)),
     pch = 16,
     col = "dodgerblue4",
     xlab = "How corrupt is American politics?",
     ylab = "Campaign finance rules are ___",
     axes = FALSE,
     main = "Very liberal")
axis(1, at = c(1:5), labels = c("Extremely", "", "Somewhat", "", "Not at all"))
axis(2, at = c(1:5), labels = c("Way too strict", "", "About right", "", "Way too loose"))
# plot 2 (object cons)
plot(jitter(as.numeric(cons$AGT_Corrupt)),
     jitter(as.numeric(cons$AGT_Finance)),
     pch = 16,
     col = "dodgerblue4",
     xlab = "How corrupt is American politics?",
     ylab = "Campaign finance rules are ___",
     axes = FALSE,
     main = "Very conservative")
axis(1, at = c(1:5), labels = c("Extremely", "", "Somewhat", "", "Not at all"))
axis(2, at = c(1:5), labels = c("Way too strict", "", "About right", "", "Way too loose"))
# restore the default single-panel layout so later plots are not
# accidentally drawn into the leftover 1x2 grid
par(mfrow = c(1, 1))
######################
# NYC FLIGHT DELAYS
######################
# load data (loads object 'nycflights13' into the workspace)
load(url("http://www.dougspencer.org/data/nycflights13.RData"))
# rename object
x <- nycflights13; rm(nycflights13)
# average distance of flight
mean(x$distance)
median(x$distance)
# how many carriers
length(table(x$carrier))
# who is most common carrier? (UA) -- sort counts largest-first so the
# answer is at the top of the output
sort(table(x$carrier), decreasing = TRUE)
# histogram of departure times (dep_time is clock time, e.g. 1745 = 5:45pm)
hist(x$dep_time,
     xlab = "Time of Day",
     ylab = "Number of flights",
     main = "Departures from NYC Area Airports",
     axes = FALSE)
# add custom axes
axis(1, at = seq(0, 2400, 300), labels = c("12a", "3a", "6a", "9a", "12p", "3p", "6p", "9p", "12a"))
axis(2, at = seq(0, 25000, 5000))
# what percent of flights had delay? (delay = departed later than scheduled)
x$delay <- ifelse(x$dep_delay > 0, 1, 0)
prop.table(table(x$delay))
# highest rate of delays by carrier
avg <- aggregate(delay ~ carrier, data = x, FUN = mean)
avg[order(avg$delay), ]
# longest average delay by carrier
aggregate(dep_delay ~ carrier, data = x, FUN = mean)
# OPTIONAL: zero-out flights that leave early (and that might skew average)
# pmax() is the vectorized idiom for clamping at zero
x$delay_time <- pmax(x$dep_delay, 0)
adj_avg <- aggregate(delay_time ~ carrier, data = x, FUN = mean)
adj_avg[order(adj_avg$delay_time), ]
######################
# BIASED BALLOTS
######################
# load data (loads object 'ballot.titles' into the workspace)
load(url("http://www.dougspencer.org/data/biased_ballots.RData"))
# rename object
x <- ballot.titles; rm(ballot.titles)
# which universities participated in study?
table(x$r.univ)
# plot party ID by university
table(x$r.univ, x$r.party)
plot(jitter(x$r.party),
     jitter(as.numeric(as.factor(x$r.univ))),
     pch = 16,
     axes = FALSE,
     xlab = "Party ID",
     ylab = "")
axis(1, at = c(-3:3))
axis(2, at = c(1, 2), labels = c("BYU", "Berkeley"))
# variation in summary and title lengths
plot(density(x$bal.sum.len),
     ylim = c(0, 0.018),
     main = "Length of Ballot Titles & Summaries")
lines(density(x$bal.title.len),
      lwd = 3,
      col = "red")
# legend
legend("topright",
       col = c("red", "black"),
       lwd = c(2, 1),
       legend = c("Title", "Summary"),
       title = "Ballot Measures")
# compare ideology and party ID
plot(jitter(x$r.ideol), jitter(x$r.party),
     xlab = "Political Ideology",
     ylab = "Party ID",
     cex.lab = 1.1,
     main = "Self-reported political preferences\n(Survey respondents, N=996)",
     pch = 16,
     col = "firebrick4",
     axes = FALSE,
     xlim = c(-3, 3))
axis(1, at = c(-3:3),
     labels = c("Very\nliberal", "Liberal", "Somewhat\nliberal", "Moderate", "Somewhat\nconservative", "Conservative", "Very\nconservative"),
     padj = 0.5)
axis(2, at = c(-3:3),
     labels = c("Strong\nDem", "Democratic", "Lean\nDem", "Indep", "Lean\nRepub", "Republican", "Strong\nRepub"))
# annotate with the computed correlation rather than a hard-coded value
# (outer parentheses print the value as the original line did)
(r <- cor(x$r.ideol, x$r.party, use = "complete.obs"))
text(2, -2, paste0("Corr = ", round(r, 2)))
# add OLS line
lm1 <- lm(r.party ~ r.ideol, data = x)
summary(lm1)
abline(lm1)
# add 45 deg line for reference
abline(0, 1, lty = 2, col = "gray45")
######################
# UNIV TEXAS STUDENT EVALS
######################
# load data (loads object 'utaustin_evals' into the workspace)
load(url("http://www.dougspencer.org/data/utaustin_evals.RData"))
# rename object
x <- utaustin_evals; rm(utaustin_evals)
# which gender/ethnicity of profs gets lowest evals?
aggregate(score ~ gender + ethnicity, data = x, FUN = mean)
# how many "bad" profs are there?
prof <- aggregate(score ~ prof_ID, data = x, FUN = mean)
## identify cut-off
plot(density(prof$score))
## maybe mean minus 2*sd?
# compute the cutoff once instead of hard-coding its rounded value
cutoff <- mean(prof$score) - 2 * sd(prof$score)
cutoff
prof[prof$score < cutoff, ] # 3 profs
prof <- prof[order(prof$score), ] # sort and then manually select cutoff
# tenure vs. non-tenure
aggregate(score ~ rank, data = x, FUN = mean)
## response rate a problem?
x$rate <- x$cls_did_eval / x$cls_students
plot(x$rate, x$score,
     xlab = "Response rate",
     ylab = "Mean eval score",
     main = "Do Response Rates Affect Scores?",
     pch = 16,
     col = "dodgerblue4")
lm1 <- lm(score ~ rate, data = x)
summary(lm1)
abline(lm1, lty = 2, col = "firebrick4", lwd = 3)
cor(x$score, x$rate) # cor = 0.18
######################
# SPENCER'S PELOTON DATA
######################
# Personal Peloton workout export; each row appears to be one workout.
x <- read.csv("http://www.dougspencer.org/data/dork_horse_workouts.csv")
# change date to date (so plots sequentially)
x$date <- as.Date(x$Workout.Timestamp)
x$year <- substr(x$date, 1, 4)
# plot total output over time
plot(x$date, x$Total.Output)
# subset to just bike rides
x <- x[x$Fitness.Discipline == "Cycling", ]
## drops from 526 obs to 226
# keep only the 30-minute rides so total outputs are comparable
x30 <- x[x$Length..minutes. == 30, ]
plot(x30$date, x30$Total.Output)
# subset out Arms
# (preview which row indices have "Arms" in the class title)
grep("Arms", x30$Title)
# drops to 182 obs
x30 <- x30[-grep("Arms", x30$Title), ]
plot(x30$date, x30$Total.Output)
# drop outliers (2 obs)
# NOTE(review): rows with NA Total.Output would survive this filter as NA rows
# -- assumes the column has no missing values; confirm against the data
x30 <- x30[x30$Total.Output > 200, ]
plot(x30$date, x30$Total.Output,
pch = 16,
col = "gray",
xlab = "Date",
ylab = "Total Output",
main = "Spencer Peloton Over Time")
# horiz line at mean
abline(h = mean(x30$Total.Output), lty = 2)
# lowess line
lines(lowess(x30$date, x30$Total.Output),
col = "dodgerblue4",
lwd = 2)
## 45 and 60 sec rides?
# average total output for each ride length, then per-minute output
agg <- aggregate(Total.Output ~ Length..minutes., data = x, FUN = mean)
agg$avg <- agg$Total.Output/agg$Length..minutes.
agg
##
# counts of rides per instructor (data.frame with columns Var1 = name, Freq = count)
inst <- data.frame(sort(table(x$Instructor.Name)))
# subset to just inst > 5
inst <- inst[inst$Freq > 5, ]
# now hard part -- subset 'x' to just these instructors
df <- x[x$Instructor.Name %in% inst$Var1, ]
## subset to just instructors > 5
# mean output per instructor among those with > 5 rides
# NOTE(review): 'coach' is computed but never printed or plotted afterwards
coach <- aggregate(Total.Output ~ Instructor.Name, data = df, FUN = mean)
######################
# FBI CRIME STATISTICS
######################
## arrest rates
# https://cde.ucr.cjis.gov/LATEST/webapp/#
# scroll down to "Additional Datasets" and download
# (1) Summary Reporting Survey (SRS) ("estimated_crimes_1979_2023.csv")
# (2) Arrest Data - Reported Number of Arrests by Crime ("arrests_national.csv")
# read in estimated crimes data
# NOTE(review): reads from a local Downloads folder -- this file must be
# downloaded manually first (filename here says 2024; the note above says 2023)
ec <- read.csv("~/Downloads/estimated_crimes_1979_2024.csv")
# add USA to national crime stats line
# (national-total rows have a blank state abbreviation in the raw file)
ec$state_abbr[ec$state_abbr==""] <- "USA"
# subset to just USA numbers
ec.usa <- ec[ec$state_abbr == "USA", ]
# side by side plots (2x2 grid; only the first two panels are filled below)
par(mfrow = c(2,2))
# now plot
plot(ec.usa$year, ec.usa$violent_crime,
type = "b",
main = "Estimated Violent Crime\n in USA",
ylab = "Crimes divided by 100,000",
xlab = "Year",
xlim = c(1960,2021))
# NOTE(review): property_crime is divided by 100,000 here but violent_crime
# above is not -- confirm which scale each raw column is in; the two y-axis
# labels also describe different units
plot(ec.usa$year, as.numeric(ec.usa$property_crime)/100000,
type = "b",
main = "Estimated Property Crime \n in USA",
ylab = "Crime rate per 100,000",
xlab = "Year",
xlim = c(1960,2021))
## read in arrests data
ar <- read.csv("http://www.dougspencer.org/data/arrests_national.csv")
# create plot space with side-by-side plots
par(mfcol = c(1,2))
# now make the plots
plot(ar$year, ar$tot_clr_index_violent, type = "o",
main = "Violent Crime")
plot(ar$year, ar$tot_clr_index_property, type = "l", main = "Property Crime")
## read in nat'l crime victimization survey (we did not do this in class)
# URL: https://www.icpsr.umich.edu/web/NACJD/series/95
# Ctrl+F "ICPSR 38963" which should be the survey for 1992-2023
# Uncompress zip file and there will be three folders.
# Choose "DS0003" folder and the RData file is saved as ".rda"
# NOTE(review): another local path that must exist on the user's machine
load("~/Downloads/ICPSR_38963/DS0003/38963-0003-Data.rda")
# rename unwieldy object name
vs <- da38963.0003; rm(da38963.0003)
# now sum the number of crimes respondents claimed they were victims each year
# variable for reported crimes is "V2073"
v <- aggregate(V2073 ~ YEAR, data = vs, FUN = sum)
# Now plot these survey responses over time
plot(v$YEAR, v$V2073, type = "b",
xlab = "Year",
ylab = "Total reported crimes",
main = "Total Number of Reported Crimes \n (Nat'l Crime Victimization Survey)")
######################
# SOCIAL SECURITY BABY NAMES
######################
# load data (loads object 'babynames' into the workspace)
load(url("http://www.dougspencer.org/data/babynames.RData"))
# rename
x <- babynames; rm(babynames)
# explore meaning/organization of data
aggregate(prop ~ year + sex, data = x, FUN = sum)
# what is the range of years included in the dataset?
range(x$year)
summary(x$year)
table(x$year)
min(x$year)
max(x$year)
# plot unique names by year
head(x) # to note that each row is a unique name by year
plot(table(x$year))
# alternative way to do the same thing
year <- as.data.frame(table(x$year))
plot(year$Var1, year$Freq)
# which year had most unique names
sort(table(x$year))
# top 5 baby names in birth year
# (note that rows already sorted from most to least)
head(x[x$year == 1979 & x$sex == "F", ])
head(x[x$year == 1979 & x$sex == "M", ])
# how many babies given your name in your birth year
x[x$name == "Douglas" & x$year == 1979 & x$sex == "M", ]
# my name's rank
y79 <- x[x$year == 1979 & x$sex == "M", ]
# seq_len() is safe for an empty subset; 1:length() would yield c(1, 0)
y79$rank <- seq_len(nrow(y79))
head(y79)
y79[y79$name == "Douglas",]
# my name in entire sample (any name containing "Doug")
doug <- x[grep("Doug", x$name), ]
# how many?
sum(doug$n)
# how timeless? (range)
range(doug$year)
# my name over time: counts and proportions side by side
par(mfrow = c(1, 2))
plot(doug$year, doug$n,
     type = "h")
abline(v = 1979, lty = 2)
plot(doug$year, doug$prop,
     type = "h")
abline(v = 1979, lty = 2)
# restore the single-panel layout so later plots are unaffected
par(mfrow = c(1, 1))
# what year was Douglas most popular?
doug[doug$n == max(doug$n),]
######################
# TITANIC PASSENGER MANIFEST
######################
x <- read.csv("http://www.dougspencer.org/data/titanic.csv")
head(x)
# age distribution (Age has missing values, so drop them)
plot(density(x$Age, na.rm = TRUE))
hist(x$Age,
     breaks = 20,
     col = "burlywood",
     main = "Age of Titanic Passengers",
     xlab = "Age")
# average survival rate by ticket class
aggregate(Survived ~ Pclass, data = x, FUN = mean)
# create fare category
x$fare.cat <- cut(x$Fare, breaks = 4)
table(x$fare.cat) # nope, not quite right -- equal-width bins are badly unbalanced
# use quartile breaks instead so each bin holds about 25% of passengers
x$fare.cat <- cut(x$Fare, breaks = quantile(x$Fare, c(0, 0.25, 0.5, 0.75, 1)), include.lowest = TRUE)
x$age.cat <- cut(x$Age, breaks = quantile(x$Age, c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE), include.lowest = TRUE)
table(x$fare.cat)
# plot survival rate by fare
fs <- aggregate(Survived ~ fare.cat, data = x, FUN = mean)
# spike ("histogram"-style) plot
plot(as.numeric(fs$fare.cat), fs$Survived, type = "h")
# scatterplot
plot(fs, axes = FALSE, ylim = c(0, 0.6))
axis(1, at = c(1:4), labels = c("Lowest 25% fare", "", "", "Highest 25% fare"))
axis(2, at = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6))
box()
# survival rate by age category within fare category
aggregate(Survived ~ age.cat + fare.cat, data = x, FUN = mean)
######################
# DIFF-IN-DIFF: KENTUCKY WORKER'S COMP
######################
# read in data
x <- read.csv("http://www.dougspencer.org/data/injury.csv")
# subset to just kentucky (where new law was enacted)
x <- x[x$ky == 1, ]
# create diff-in-diff vars
x$log_duration <- log(x$durat)
x$post <- ifelse(x$year > 1980, 1, 0)    # after the law change
x$treat <- ifelse(x$prewage > 350, 1, 0) # high earners = treated group
# aggregate outcome by diff-diff vars (4 group means)
k <- aggregate(log_duration ~ post + treat, data = x, FUN = mean)
# plot the points
plot(k$post, k$log_duration,
     ylim = c(0.9, 1.7),
     xlim = c(-0.1, 1.3),
     pch = 19,
     xlab = "",
     ylab = "log(Weeks on unemployment)",
     axes = FALSE)
axis(1, at = c(0, 1), labels = c("Pre", "Post"))
axis(2, at = c(0.9, 1.1, 1.3, 1.5, 1.7))
# add segments
# control
segments(x0 = 0, x1 = 1,
         y0 = k$log_duration[k$post == 0 & k$treat == 0],
         y1 = k$log_duration[k$post == 1 & k$treat == 0],
         lwd = 3,
         col = "black")
# treat
segments(x0 = 0, x1 = 1,
         y0 = k$log_duration[k$post == 0 & k$treat == 1],
         y1 = k$log_duration[k$post == 1 & k$treat == 1],
         lwd = 3,
         col = "red")
# counterfactual: the control group's pre/post change, selected by the
# post/treat indicators rather than relying on aggregate()'s row order
control.change <- k$log_duration[k$post == 1 & k$treat == 0] -
  k$log_duration[k$post == 0 & k$treat == 0]
segments(x0 = 0, x1 = 1,
         y0 = k$log_duration[k$post == 0 & k$treat == 1],
         y1 = k$log_duration[k$post == 0 & k$treat == 1] + control.change,
         lty = 2)
## OPTIONAL
text(1.07, 1.49, "}", cex = 5.5, family = "Helvetica Neue Ultralight")
text(1.23, 1.49, "Treatment\neffect", cex = 0.75)
# OPTIONAL legend
legend("topleft", legend = c("High earners", "Low earners"),
       col = c("red", "black"),
       lty = c(1, 1),
       bty = "n",
       cex = 0.7)
# now run diff-in-diff regression without and with controls
lm1 <- lm(log_duration ~ treat + post + treat*post,
          data = x)
lm2 <- lm(log_duration ~ treat + post + treat*post +
            male + married + age + hosp +
            as.factor(indust) + as.factor(injtype),
          data = x)
# print regression output for analysis
summary(lm1)
summary(lm2)
## OPTIONAL ##
# library in package for creating side-by-side regression tables
#install.packages("modelsummary") # you only have to do this once
library(modelsummary)
# print regression results
modelsummary(list("Simple" = lm1, "With Controls" = lm2),
             stars = TRUE,
             fmt = 2) # default is 3 digits after decimal, this sets to 2
# print regression results: HIDE CONTROLS (omit coefficients 5-17)
modelsummary(list("Simple" = lm1, "With Controls" = lm2),
             stars = TRUE,
             fmt = 2,
             coef_omit = c(5:17))
# save regression results (add output line)
# library package to save files to Excel
#install.packages("openxlsx") # you only have to do this once
library(openxlsx)
modelsummary(list("Simple" = lm1, "With Controls" = lm2),
             stars = TRUE,
             coef_omit = c(5:17),
             fmt = 2,
             output = "~/Downloads/diff-diff-regression.xlsx")
##########################################
## Nov 18: Regression Discontinuity EXAMPLE
##########################################
# library in relevant packages and their dependencies
library(rdrobust)
library(rddensity)
library(modelsummary)
# read in data
x <- read.csv("http://www.dougspencer.org/data/tutoring_program.csv")
# plot discontinuity of running variable and treatment:
# tutoring assignment switches sharply at an entrance score of 70
plot(x$entrance_exam,
     jitter(as.numeric(as.factor(x$tutoring))),
     cex = 0.75,
     xlab = "Entrance Exam",
     ylab = "Tutoring",
     xlim = c(20, 100),
     axes = FALSE)
axis(1, at = seq(20, 100, 10))
axis(2, at = c(1, 2), labels = c("No", "Yes"))
box()
abline(v = 70, lty = 2)
# check continuity of running variable
# (spell out the full column name; "x$entrance" only worked via $'s partial matching)
hist(x$entrance_exam, breaks = 30)
abline(v = 70) # note a tiny cluster just below 70
## ADVANCED: statistical test whether density at cutoff is statistically significant
test <- rddensity(x$entrance_exam, c = 70)
summary(test) # look at p-value of "Robust" (here 0.58 so NOT significant)
## ADVANCED: plot of statistical test showing gap at cutoff
## is not statistically different (from rddensity package)
rdplotdensity(rdd = test,
              X = x$entrance_exam,
              type = "both") # This adds both points and lines
# Now we build our money plot showing effect of tutoring on exit exam
plot(x$entrance_exam, x$exit_exam,
     pch = 19, col = "gray", cex = 0.5,
     xlab = "Entrance Exam Score",
     ylab = "Final Exam Score",
     main = "Effect of Tutoring on Final Exam")
# add color to treatment points (tutored students scored <= 70)
points(x$entrance_exam[x$entrance_exam <= 70],
       x$exit_exam[x$entrance_exam <= 70],
       col = "blue", pch = 19, cex = 0.6)
# now add lowess lines for treatment and control
lines(lowess(x = x$entrance_exam[x$entrance_exam <= 70],
             y = x$exit_exam[x$entrance_exam <= 70]),
      lwd = 5)
lines(lowess(x = x$entrance_exam[x$entrance_exam > 70],
             y = x$exit_exam[x$entrance_exam > 70]),
      lwd = 5)
# add vertical line to denote discontinuity
abline(v = 70, lty = 2)
# for illustrative purposes, add a rectangle to highlight focus of analysis
rect(xleft = 68, xright = 72, ybottom = 60, ytop = 71, lwd = 2,
     col = NA, border = "red")
## ADVANCED: from "rdrobust" package a single line to create outcome plot
rdplot(y = x$exit_exam, x = x$entrance_exam, c = 70)
## EXTRA ADVANCED: generate regressions at 3 bandwidths
# all data
raw <- lm(exit_exam ~ entrance_exam + tutoring, data = x)
# approx. 5% bandwidth
bw5 <- lm(exit_exam ~ entrance_exam + tutoring,
          data = x[x$entrance_exam >= 66 & x$entrance_exam <= 74, ])
# approx. 10% bandwidth
bw10 <- lm(exit_exam ~ entrance_exam + tutoring,
           data = x[x$entrance_exam >= 62 & x$entrance_exam <= 78, ])
# from "modelsummary" package -- generate table comparing all regression output
modelsummary(list("Full data" = raw,
                  "Bandwidth = 5%" = bw5,
                  "Bandwidth = 10%" = bw10))
# Interpretation: coefficient where "Tutor = TRUE" shows
# that causal effect of tutoring is about 9-10%
# note the difference from just comparing final exam scores
# among those with and without a tutor (not an apples-to-apples comparison)
aggregate(exit_exam ~ tutoring, data = x, FUN = mean)
# difference of the second column of the aggregation (use diff wrapper)
diff(aggregate(exit_exam ~ tutoring, data = x, FUN = mean)[, 2])
#####
# Second RDD Example
#####
# required libraries
library(rdrobust)
library(rddensity)
# read in data
x <- read.csv("http://www.dougspencer.org/data/incumbent_rdd.csv")
### DISCO PLOTS ###
## DISCONTINUITY: winning is deterministic once the margin crosses 0
plot(x$margin_t, jitter(as.numeric(as.factor(x$win_t))),
     xlab = "Margin of Victory at time = t",
     ylab = "Election Outcome",
     axes = FALSE)
axis(1, at = seq(-100, 100, 20))
axis(2, at = c(1, 2), labels = c("Lose", "Win"))
abline(v = 0, lty = 2)
## CONTINUITY: look for bunching of the running variable at the cutoff
hist(x$margin_t, breaks = 50,
     main = "Continuity of Running Variable",
     xlab = "Vote Share")
abline(v = 0, col = "red", lwd = 4)
# statistical test of continuity
dens.test <- rddensity(x$margin_t, c = 0, massPoints = FALSE)
# look at p-value for "Robust" (if bigger than 0.05 then continuity confirmed)
summary(dens.test)
# plot of this test
rdplotdensity(rdd = dens.test,
              X = x$margin_t,
              type = "both")
## OUTCOME PLOT
plot(x$margin_t, x$voteshare_t1,
     xlab = "Margin of Victory at time t",
     ylab = "Vote share at time t+1",
     main = "Effect of Incumbency on Future Vote Shares\n(Regression Discontinuity Example)",
     pch = 19,
     col = "gray",
     cex = 0.8)
# optional: change color of "treated" group (winners, margin > 0)
points(x$margin_t[x$margin_t > 0], x$voteshare_t1[x$margin_t > 0],
       pch = 19,
       col = "dodgerblue4",
       cex = 0.9)
# add lowess lines on each side of the cutoff
# (use full column names; "x$margin"/"x$vote" only worked via $'s partial matching)
lines(lowess(x$margin_t[x$margin_t < 0], x$voteshare_t1[x$margin_t < 0]),
      lwd = 3, col = "black")
lines(lowess(x$margin_t[x$margin_t >= 0], x$voteshare_t1[x$margin_t >= 0]),
      lwd = 3, col = "red")
# add text
text(x = -2, y = 49, "{", cex = 1.5)
text(x = -4, y = 49, "Treatment effect", adj = 1, cex = 0.7)
# ggplot version (again just including fyi) -- ggplot2 must be loaded first
library(ggplot2)
ggplot(data = x, aes(x = margin_t, y = voteshare_t1)) +
  geom_point(col = "gray") +
  geom_point(data = x[x$margin_t > 0, ], aes(x = margin_t, y = voteshare_t1), col = "dodgerblue4") +
  ggtitle("Effect of Incumbency on Future Vote Shares (RDD)") +
  theme_bw() +
  # linewidth replaces the deprecated size aesthetic for lines (ggplot2 >= 3.4,
  # already assumed by the annotate() call below)
  geom_smooth(se = FALSE, aes(col = win_t), linewidth = 1.5) +
  labs(x = "Margin of victory at time t",
       y = "Vote share at time t+1") +
  scale_color_manual("", values = c("black", "red"), guide = "none") +
  annotate(geom = "pointrange", x = 0, y = 49, ymin = 45, ymax = 52,
           linewidth = 1.5) +
  annotate(geom = "text", x = -3, y = 49,
           label = "Treatment effect",
           hjust = 1, size = 3)
# built in plot in rdrobust
rdplot(x = x$margin_t, y = x$voteshare_t1,
       title = "Measure of Incumbency Advantage (RDD)",
       y.label = "Vote share at time t+1",
       x.label = "Margin of victory at time t")
# experiment with binwidths (binselect = "es" means "equally spaced")
# choose different binwidths (try scale = 5 / scale = 15 / scale = 35)
rdplot(x = x$margin_t, y = x$voteshare_t1,
       title = "Measure of Incumbency Advantage (RDD)",
       y.label = "Vote share at time t+1",
       x.label = "Margin of victory at time t",
       binselect = "es",
       scale = 17)
# what is the right binwidth?
# this test shows that best bw = 17.754
rdrobust(x = x$margin_t, y = x$voteshare_t1)
######################
## STARBUCKS NUTRITION
######################
## FOOD
f <- read.csv("http://www.dougspencer.org/data/starbucks_food.csv")
# which food group has highest average caloric content?
aggregate(calories ~ type, data = f, FUN = mean)
# which singular food item has the highest caloric content?
f[f$calories == max(f$calories), ]
# which of the following have the strongest correlation to calories: fat, carb, fiber, protein
# (drop the stray third index from f[, c(2:6), ]; in data-frame indexing the
# third argument is silently matched to 'drop')
cor(f[, 2:6])
#
plot(f$calories, f$fat, pch = 19, col = "dodgerblue4",
     main = "Starbucks Food Menu Items",
     xlab = "Calories",
     ylab = "Fat (g)")
lines(lowess(f$calories, f$fat),
      col = "firebrick4",
      lwd = 3,
      lty = 3)
## DRINKS
d <- read.csv("http://www.dougspencer.org/data/starbucks_drinks.csv")
# highest caffeine
subset(d, d$caffeine_mg == max(d$caffeine_mg, na.rm = TRUE))
# subset to nonfat-milk preparations
nonfat <- d[grep("Nonfat", d$bev_prep), ]
# summarize by drink type and size
aggregate(sugar_g ~ beverage + bev_prep, data = nonfat, FUN = mean)
# density plot of sugar content among grande nonfat drinks
nfg <- nonfat[grep("Grande", nonfat$bev_prep), ]
plot(density(nfg$sugar_g),
     main = "Sugar content of Starbucks drinks\n(density in grams)",
     lwd = 3)
abline(v = c(25, 36), lty = 2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment