Last active
November 18, 2025 05:56
-
-
Save dougspencer/ce597584de03238c38a0d68996329828 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
######################
# GAPMINDER Exercise
######################
# install.packages("gapminder") # you only have to do this once
library(gapminder)
# read in data
data(gapminder)
head(gapminder)
# rename data
x <- gapminder; rm(gapminder)
# create log GDP to address massive right skew
hist(x$gdpPercap, breaks = 50)
x$logGDP <- log(x$gdpPercap)
# plot life expectancy against log(GDP)
plot(x$logGDP, x$lifeExp,
     pch = 19,
     col = "dodgerblue4",
     main = "Life Expectancy v. GDP",
     xlab = "GDP per capita (log scale)",
     ylab = "Life Expectancy (years)",
     cex = 2)
lines(lowess(x$logGDP, x$lifeExp),
      col = "red",
      lwd = 3)
# now color code for continent
table(x$continent) # note there are 5 continents
# as.numeric() on the factor gives color codes 1-5, one per continent level
x$color <- as.numeric(x$continent)
plot(x$logGDP, x$lifeExp,
     pch = 19,
     col = x$color,
     main = "Life Expectancy v. GDP",
     xlab = "GDP per capita (log scale)",
     ylab = "Life Expectancy (years)")
lines(lowess(x$logGDP, x$lifeExp),
      col = "red",
      lwd = 3)
# legend labels taken from the factor levels so they always match the colors
legend("bottomright",
       legend = levels(x$continent),
       pch = 19,
       col = 1:5,
       title = "Continents",
       cex = 0.5)
######################
# PENGUINS Exercise
######################
# read in the penguin data from the course website
x <- read.csv("http://www.dougspencer.org/data/penguins.csv")
head(x)
# how many observations per year?
table(x$year)
# mean flipper length by species
aggregate(flipper_length_mm ~ species, data = x, FUN = mean)
# scatterplot: body mass vs. flipper length
plot(x$body_mass_g, x$flipper_length_mm,
     pch = 19,
     col = "black",
     xlab = "Body mass (g)",
     ylab = "Flipper length (mm)",
     main = "Are Penguins Proportional?",
     xlim = c(2500, 6500))
# dotted lowess curve to show the trend
lines(lowess(x$body_mass_g, x$flipper_length_mm),
      col = "red",
      lwd = 3,
      lty = 3)
# one color per species (everything that isn't Chinstrap/Gentoo, i.e. Adelie,
# gets the default blue -- same result as the original per-species assignments)
x$color <- ifelse(x$species == "Chinstrap", "burlywood",
                  ifelse(x$species == "Gentoo", "violetred4", "dodgerblue4"))
# redraw the scatterplot, colored by species
plot(x$body_mass_g, x$flipper_length_mm,
     xlab = "Body mass (g)",
     ylab = "Flipper length (mm)",
     xlim = c(2500, 6500),
     pch = 16,
     col = x$color)
######################
# MTCARS Exercise
######################
# read in data
x <- read.csv("http://www.dougspencer.org/data/mtcars.csv")
# how many cars have 8 cylinders?
table(x$cyl)
# median horsepower
median(x$hp)
# how many cars have more than 100 hp?
sum(x$hp > 100)
x[x$hp > 100, ] # inspect those rows
# correlation between horsepower and mpg?
cor(x$hp, x$mpg)
# Relationship between horsepower and mpg
plot(x$hp, x$mpg,
     xlab = "Horsepower",
     ylab = "MPG",
     pch = 18,
     main = "Powerful Cars Guzzle Gas")
# create colors, one per cylinder count
x$color <- NA
x$color[x$cyl == 4] <- "burlywood"
x$color[x$cyl == 6] <- "violetred4"
x$color[x$cyl == 8] <- "dodgerblue4"
# new plot
plot(x$hp, x$mpg,
     xlab = "Horsepower",
     ylab = "MPG",
     pch = 18,
     cex = 2,
     main = "Powerful Cars Guzzle Gas",
     col = x$color,
     xlim = c(0, 350))
# add legend
# NOTE: labels reordered to match the colors above
# (dodgerblue4 = 8 cyl, violetred4 = 6 cyl, burlywood = 4 cyl)
legend(275, 33,
       pch = 18,
       pt.cex = 2,
       col = c("dodgerblue4", "violetred4", "burlywood"),
       legend = c(8, 6, 4),
       title = "Cylinders",
       bty = "n")
# linear trend line
abline(lm(mpg ~ hp, data = x))
# lowess line
lines(lowess(x$hp, x$mpg),
      col = "red",
      lty = 3,
      lwd = 3)
# add correlation to plot (computed from the data rather than hard-coded)
text(50, 12, paste0("Cor = ", round(cor(x$hp, x$mpg), 2)))
# estimate bivariate relationship mpg ~ hp
lm1 <- lm(mpg ~ hp, data = x)
summary(lm1)
######################
# GALTON Exercise
######################
# read in data
x <- read.csv("http://www.dougspencer.org/data/galton.csv")
# how many families?
length(unique(x$family))
# how many kids of each sex?
table(x$sex)
# create adjusted average parent height
# (mothers scaled up 8% to put them on the same scale as fathers)
x$adj_height <- x$mother*1.08
x$avg_parent <- (x$father + x$adj_height)/2
# estimate the linear model once and reuse it (the original fit it twice:
# once inside abline() and again for summary())
m <- lm(height ~ avg_parent, data = x)
# plot
plot(x$avg_parent, x$height,
     pch = 16,
     col = "dodgerblue4",
     xlab = "Avg parent height",
     ylab = "Children's height")
abline(m,
       col = "red",
       lwd = 3)
# lowess line
lines(lowess(x$avg_parent, x$height),
      col = "burlywood",
      lwd = 2)
# model output
summary(m)
# find correlation metric
cor(x$height, x$avg_parent)
######################
# CAMPAIGN FINANCE & CORRUPTION Exercise
######################
# read in data
load(url("http://www.dougspencer.org/data/cf_conjoint.RData"))
# rename object
x <- cf_conjoint; rm(cf_conjoint)
# review data
table(x$AGT_Corrupt)
table(x$AGT_Finance)
# crosstab
table(x$AGT_Corrupt, x$AGT_Finance)
# now plot (numeric)
plot(as.numeric(x$AGT_Corrupt), as.numeric(x$AGT_Finance))
# plot again with jitter so overlapping responses are visible
plot(jitter(as.numeric(x$AGT_Corrupt)),
     jitter(as.numeric(x$AGT_Finance)),
     pch = 16,
     col = "dodgerblue4",
     xlab = "How corrupt is American politics?",
     ylab = "Campaign finance rules are ___",
     axes = FALSE)
axis(1, at = 1:5, labels = c("Extremely", "", "Somewhat", "", "Not at all"))
axis(2, at = 1:5, labels = c("Way too strict", "", "About right", "", "Way too loose"))
box()
## IF TIME -- plot by Ideology
table(x$CC334A)
lib <- x[x$CC334A == "Very Liberal", ]
cons <- x[x$CC334A == "Very Conservative", ]
## copy and paste plot code and replace objects
par(mfrow = c(1, 2)) # multiple figures by row: 1 row and 2 columns
# plot 1 (object lib)
plot(jitter(as.numeric(lib$AGT_Corrupt)),
     jitter(as.numeric(lib$AGT_Finance)),
     pch = 16,
     col = "dodgerblue4",
     xlab = "How corrupt is American politics?",
     ylab = "Campaign finance rules are ___",
     axes = FALSE,
     main = "Very liberal")
axis(1, at = 1:5, labels = c("Extremely", "", "Somewhat", "", "Not at all"))
axis(2, at = 1:5, labels = c("Way too strict", "", "About right", "", "Way too loose"))
# plot 2 (object cons)
plot(jitter(as.numeric(cons$AGT_Corrupt)),
     jitter(as.numeric(cons$AGT_Finance)),
     pch = 16,
     col = "dodgerblue4",
     xlab = "How corrupt is American politics?",
     ylab = "Campaign finance rules are ___",
     axes = FALSE,
     main = "Very conservative")
axis(1, at = 1:5, labels = c("Extremely", "", "Somewhat", "", "Not at all"))
axis(2, at = 1:5, labels = c("Way too strict", "", "About right", "", "Way too loose"))
# reset the layout so later figures draw as a single panel again
par(mfrow = c(1, 1))
######################
# NYC FLIGHT DELAYS
######################
# load data
load(url("http://www.dougspencer.org/data/nycflights13.RData"))
# rename object
x <- nycflights13; rm(nycflights13)
# typical flight distance
mean(x$distance)
median(x$distance)
# number of distinct carriers
length(table(x$carrier))
# most common carrier (UA)
sort(table(x$carrier))
# histogram of departure times
hist(x$dep_time,
     xlab = "Time of Day",
     ylab = "Number of flights",
     main = "Departures from NYC Area Airports",
     axes = FALSE)
# custom axes: ticks every 3 hours, labeled as clock times
axis(1, at = seq(0, 2400, 300),
     labels = c("12a", "3a", "6a", "9a", "12p", "3p", "6p", "9p", "12a"))
axis(2, at = seq(0, 25000, 5000))
# what percent of flights had a delay?
x$delay <- ifelse(x$dep_delay > 0, 1, 0)
prop.table(table(x$delay))
# delay *rate* by carrier, sorted lowest to highest
delay_rate <- aggregate(delay ~ carrier, data = x, FUN = mean)
delay_rate[order(delay_rate$delay), ]
# average delay *length* by carrier
aggregate(dep_delay ~ carrier, data = x, FUN = mean)
# OPTIONAL: zero-out early departures so they don't pull the average down
# (pmax clamps negatives to 0, same as the original ifelse)
x$delay_time <- pmax(x$dep_delay, 0)
adj_avg <- aggregate(delay_time ~ carrier, data = x, FUN = mean)
adj_avg[order(adj_avg$delay_time), ]
######################
# BIASED BALLOTS
######################
# load data
load(url("http://www.dougspencer.org/data/biased_ballots.RData"))
# rename object
x <- ballot.titles; rm(ballot.titles)
# which universities participated in study?
table(x$r.univ)
# plot party ID by university
table(x$r.univ, x$r.party)
plot(jitter(x$r.party),
     jitter(as.numeric(as.factor(x$r.univ))),
     pch = 16,
     axes = FALSE,
     xlab = "Party ID",
     ylab = "")
axis(1, at = -3:3)
axis(2, at = c(1, 2), labels = c("BYU", "Berkeley"))
# variation in summary and title length
plot(density(x$bal.sum.len),
     ylim = c(0, 0.018),
     main = "Length of Ballot Titles & Summaries")
lines(density(x$bal.title.len),
      lwd = 3,
      col = "red")
# legend -- line widths now match the curves as drawn
# (title curve is red lwd 3; summary curve is black at the default lwd 1)
legend("topright",
       col = c("red", "black"),
       lwd = c(3, 1),
       legend = c("Title", "Summary"),
       title = "Ballot Measures")
# compare ideology and party ID
plot(jitter(x$r.ideol), jitter(x$r.party),
     xlab = "Political Ideology",
     ylab = "Party ID",
     cex.lab = 1.1,
     main = "Self-reported political preferences\n(Survey respondents, N=996)",
     pch = 16,
     col = "firebrick4",
     axes = FALSE,
     xlim = c(-3, 3))
axis(1, at = -3:3,
     labels = c("Very\nliberal", "Liberal", "Somewhat\nliberal", "Moderate",
                "Somewhat\nconservative", "Conservative", "Very\nconservative"),
     padj = 0.5)
axis(2, at = -3:3,
     labels = c("Strong\nDem", "Democratic", "Lean\nDem", "Indep",
                "Lean\nRepub", "Republican", "Strong\nRepub"))
# annotate with the correlation computed from the data (not hard-coded)
r <- cor(x$r.ideol, x$r.party, use = "complete.obs")
r
text(2, -2, paste0("Corr = ", round(r, 2)))
# add OLS line
lm1 <- lm(r.party ~ r.ideol, data = x)
summary(lm1)
abline(lm1)
# add 45 deg line for reference
abline(0, 1, lty = 2, col = "gray45")
######################
# UNIV TEXAS STUDENT EVALS
######################
load(url("http://www.dougspencer.org/data/utaustin_evals.RData"))
# rename object
x <- utaustin_evals; rm(utaustin_evals)
# which gender/ethnicity of profs gets lowest evals?
aggregate(score ~ gender + ethnicity, data = x, FUN = mean)
# how many "bad" profs are there?
prof <- aggregate(score ~ prof_ID, data = x, FUN = mean)
## identify cut-off
plot(density(prof$score))
## maybe 2*sd below the mean?
cutoff <- mean(prof$score) - 2*sd(prof$score)
cutoff
# use the computed cutoff directly rather than a hard-coded approximation
prof[prof$score < cutoff, ] # 3 profs
prof <- prof[order(prof$score), ] # sort and then manually select cutoff
# tenure vs. non-tenure
aggregate(score ~ rank, data = x, FUN = mean)
## response rate a problem?
x$rate <- x$cls_did_eval/x$cls_students
plot(x$rate, x$score,
     xlab = "Response rate",
     ylab = "Mean eval score",
     main = "Do Response Rates Affect Scores?",
     pch = 16,
     col = "dodgerblue4")
lm1 <- lm(score ~ rate, data = x)
summary(lm1)
abline(lm1, lty = 2, col = "firebrick4", lwd = 3)
cor(x$score, x$rate) # weak positive correlation (~0.18 in these data)
######################
# SPENCER'S PELOTON DATA
######################
x <- read.csv("http://www.dougspencer.org/data/dork_horse_workouts.csv")
# convert timestamp to a Date so workouts plot in calendar order
x$date <- as.Date(x$Workout.Timestamp)
x$year <- substr(x$date, 1, 4)
# total output over time, all workout types
plot(x$date, x$Total.Output)
# keep only cycling workouts (drops from 526 obs to 226)
x <- x[x$Fitness.Discipline == "Cycling", ]
# 30-minute rides only
rides30 <- x[x$Length..minutes. == 30, ]
plot(rides30$date, rides30$Total.Output)
# which titles are arm workouts?
grep("Arms", rides30$Title)
# remove the arm workouts (drops to 182 obs)
rides30 <- rides30[-grep("Arms", rides30$Title), ]
plot(rides30$date, rides30$Total.Output)
# drop two low-output outliers
rides30 <- rides30[rides30$Total.Output > 200, ]
plot(rides30$date, rides30$Total.Output,
     pch = 16,
     col = "gray",
     xlab = "Date",
     ylab = "Total Output",
     main = "Spencer Peloton Over Time")
# dashed horizontal line at the overall mean
abline(h = mean(rides30$Total.Output), lty = 2)
# smoothed trend over time
lines(lowess(rides30$date, rides30$Total.Output),
      col = "dodgerblue4",
      lwd = 2)
## how do 45- and 60-minute rides compare? (output per minute by ride length)
per_length <- aggregate(Total.Output ~ Length..minutes., data = x, FUN = mean)
per_length$avg <- per_length$Total.Output/per_length$Length..minutes.
per_length
## instructor counts
inst <- data.frame(sort(table(x$Instructor.Name)))
# keep instructors ridden with more than 5 times
inst <- inst[inst$Freq > 5, ]
# now hard part -- subset 'x' to rides with just those instructors
frequent <- x[x$Instructor.Name %in% inst$Var1, ]
## average output by (frequent) instructor
coach <- aggregate(Total.Output ~ Instructor.Name, data = frequent, FUN = mean)
######################
# FBI CRIME STATISTICS
######################
## arrest rates
# Data source: https://cde.ucr.cjis.gov/LATEST/webapp/#
# scroll down to "Additional Datasets" and download
# (1) Summary Reporting Survey (SRS) -- the file read below is
#     "estimated_crimes_1979_2024.csv" (an earlier vintage was ..._2023.csv)
# (2) Arrest Data - Reported Number of Arrests by Crime ("arrests_national.csv")
# read in estimated crimes data (assumes the download sits in ~/Downloads)
ec <- read.csv("~/Downloads/estimated_crimes_1979_2024.csv")
# national totals have a blank state abbreviation; label them "USA"
ec$state_abbr[ec$state_abbr==""] <- "USA"
# subset to just USA numbers
ec.usa <- ec[ec$state_abbr == "USA", ]
# side by side plots: 2x2 grid, filled row by row
par(mfrow = c(2,2))
# now plot (violent crime counts over time)
plot(ec.usa$year, ec.usa$violent_crime,
     type = "b",
     main = "Estimated Violent Crime\n in USA",
     ylab = "Crimes divided by 100,000",
     xlab = "Year",
     xlim = c(1960,2021))
# property crime, scaled to per-100,000 here in the call
plot(ec.usa$year, as.numeric(ec.usa$property_crime)/100000,
     type = "b",
     main = "Estimated Property Crime \n in USA",
     ylab = "Crime rate per 100,000",
     xlab = "Year",
     xlim = c(1960,2021))
## read in arrests data (clearance counts by crime type)
ar <- read.csv("http://www.dougspencer.org/data/arrests_national.csv")
# create plot space with side-by-side plots (1 row, 2 cols, filled by column)
par(mfcol = c(1,2))
# now make the plots
plot(ar$year, ar$tot_clr_index_violent, type = "o",
     main = "Violent Crime")
plot(ar$year, ar$tot_clr_index_property, type = "l", main = "Property Crime")
## read in nat'l crime victimization survey (we did not do this in class)
# URL: https://www.icpsr.umich.edu/web/NACJD/series/95
# Ctrl+F "ICPSR 38963" which should be the survey for 1992-2023
# Uncompress the zip file and there will be three folders.
# Choose the "DS0003" folder; the RData file is saved as ".rda"
load("~/Downloads/ICPSR_38963/DS0003/38963-0003-Data.rda")
# rename unwieldy object name
vs <- da38963.0003; rm(da38963.0003)
# sum the number of crimes respondents said they were victims of, each year
# (variable for reported crimes is "V2073")
v <- aggregate(V2073 ~ YEAR, data = vs, FUN = sum)
# Now plot these survey responses over time
plot(v$YEAR, v$V2073, type = "b",
     xlab = "Year",
     ylab = "Total reported crimes",
     main = "Total Number of Reported Crimes \n (Nat'l Crime Victimization Survey)")
######################
# SOCIAL SECURITY BABY NAMES
######################
load(url("http://www.dougspencer.org/data/babynames.RData"))
# rename
x <- babynames; rm(babynames)
# sanity check: proportions should sum to ~1 within each year/sex
aggregate(prop ~ year + sex, data = x, FUN = sum)
# what is the range of years included in the dataset?
range(x$year)
summary(x$year)
table(x$year)
min(x$year)
max(x$year)
# plot number of unique names by year
head(x) # note that each row is one name in one year
plot(table(x$year))
# an alternative route to the same figure
yr_counts <- as.data.frame(table(x$year))
plot(yr_counts$Var1, yr_counts$Freq)
# which year had the most unique names?
sort(table(x$year))
# top 5 baby names in a given birth year
# (rows are already sorted from most to least common)
head(x[x$year == 1979 & x$sex == "F", ])
head(x[x$year == 1979 & x$sex == "M", ])
# how many babies were given your name in your birth year?
x[x$name == "Douglas" & x$year == 1979 & x$sex == "M", ]
# my name's rank within that year
boys79 <- x[x$year == 1979 & x$sex == "M", ]
boys79$rank <- seq_len(nrow(boys79))
head(boys79)
boys79[boys79$name == "Douglas", ]
# every Doug-variant in the entire sample
dougs <- x[grep("Doug", x$name), ]
# how many babies in total?
sum(dougs$n)
# how timeless? (range of years)
range(dougs$year)
# popularity over time: raw counts and proportions side by side
par(mfrow = c(1,2))
plot(dougs$year, dougs$n,
     type = "h")
abline(v = 1979, lty = 2)
plot(dougs$year, dougs$prop,
     type = "h")
abline(v = 1979, lty = 2)
# what year was Douglas most popular?
dougs[dougs$n == max(dougs$n), ]
######################
# TITANIC PASSENGER MANIFEST
######################
x <- read.csv("http://www.dougspencer.org/data/titanic.csv")
head(x)
plot(density(x$Age, na.rm = TRUE))
hist(x$Age,
     breaks = 20,
     col = "burlywood",
     main = "Age of Titanic Passengers",
     xlab = "Age")
# average survival rate by ticket class
aggregate(Survived ~ Pclass, data = x, FUN = mean)
# create fare category
x$fare.cat <- cut(x$Fare, breaks = 4)
table(x$fare.cat) # nope, not quite right: equal-width bins, not quartiles!
# quartile breakpoints instead, so each bin holds ~25% of passengers
x$fare.cat <- cut(x$Fare, breaks = quantile(x$Fare, c(0, 0.25, 0.5, 0.75, 1)), include.lowest = TRUE)
x$age.cat <- cut(x$Age, breaks = quantile(x$Age, c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE), include.lowest = TRUE)
table(x$fare.cat)
# plot survival rate by fare
fs <- aggregate(Survived ~ fare.cat, data = x, FUN = mean)
# spike plot
plot(as.numeric(fs$fare.cat), fs$Survived, type = "h")
# scatterplot (label typo "HIghest" fixed)
plot(fs, axes = FALSE, ylim = c(0, 0.6))
axis(1, at = 1:4, labels = c("Lowest 25% fare", "", "", "Highest 25% fare"))
axis(2, at = seq(0, 0.6, 0.1))
box()
# survival rate by age and fare category
aggregate(Survived ~ age.cat + fare.cat, data = x, FUN = mean)
######################
# DIFF-IN-DIFF: KENTUCKY WORKER'S COMP
######################
# read in data
x <- read.csv("http://www.dougspencer.org/data/injury.csv")
# subset to just kentucky (where new law was enacted)
x <- x[x$ky == 1, ]
# create diff-in-diff vars
x$log_duration <- log(x$durat)
x$post <- ifelse(x$year > 1980, 1, 0)    # after the law change
x$treat <- ifelse(x$prewage > 350, 1, 0) # high earners were affected
# aggregate outcome into the four diff-in-diff cells
k <- aggregate(log_duration ~ post + treat, data = x, FUN = mean)
# plot the four cell means
plot(k$post, k$log_duration,
     ylim = c(0.9, 1.7),
     xlim = c(-0.1, 1.3),
     pch = 19,
     xlab = "",
     ylab = "log(Weeks on unemployment)",
     axes = FALSE)
axis(1, at = c(0, 1), labels = c("Pre", "Post"))
axis(2, at = c(0.9, 1.1, 1.3, 1.5, 1.7))
# add segments
# control group (low earners)
segments(x0 = 0, x1 = 1,
         y0 = k$log_duration[k$post == 0 & k$treat == 0],
         y1 = k$log_duration[k$post == 1 & k$treat == 0],
         lwd = 3,
         col = "black")
# treatment group (high earners)
segments(x0 = 0, x1 = 1,
         y0 = k$log_duration[k$post == 0 & k$treat == 1],
         y1 = k$log_duration[k$post == 1 & k$treat == 1],
         lwd = 3,
         col = "red")
# counterfactual line: the control group's pre/post change, selected
# explicitly by cell (clearer and safer than positional indexing)
control.change <- k$log_duration[k$post == 1 & k$treat == 0] -
  k$log_duration[k$post == 0 & k$treat == 0]
segments(x0 = 0, x1 = 1,
         y0 = k$log_duration[k$post == 0 & k$treat == 1],
         y1 = k$log_duration[k$post == 0 & k$treat == 1] + control.change,
         lty = 2)
## OPTIONAL annotation of the treatment-effect gap
text(1.07, 1.49, "}", cex = 5.5, family = "Helvetica Neue Ultralight")
text(1.23, 1.49, "Treatment\neffect", cex = 0.75)
# OPTIONAL legend
legend("topleft", legend = c("High earners", "Low earners"),
       col = c("red", "black"),
       lty = c(1, 1),
       bty = "n",
       cex = 0.7)
# diff-in-diff regressions without and with controls
# (treat * post expands to treat + post + treat:post -- same model,
# same coefficient order as the redundant original formula)
lm1 <- lm(log_duration ~ treat * post,
          data = x)
lm2 <- lm(log_duration ~ treat * post +
            male + married + age + hosp +
            as.factor(indust) + as.factor(injtype),
          data = x)
# print regression output for analysis
summary(lm1)
summary(lm2)
## OPTIONAL ##
# side-by-side regression tables via the modelsummary package
# install.packages("modelsummary") # you only have to do this once
library(modelsummary)
# print regression results
modelsummary(list("Simple" = lm1, "With Controls" = lm2),
             stars = TRUE,
             fmt = 2) # default is 3 digits after decimal, this sets to 2
# print regression results: HIDE CONTROLS (omit coefficients 5-17)
modelsummary(list("Simple" = lm1, "With Controls" = lm2),
             stars = TRUE,
             fmt = 2,
             coef_omit = c(5:17))
# save regression results to Excel (add output line)
# install.packages("openxlsx") # you only have to do this once
library(openxlsx)
modelsummary(list("Simple" = lm1, "With Controls" = lm2),
             stars = TRUE,
             coef_omit = c(5:17),
             fmt = 2,
             output = "~/Downloads/diff-diff-regression.xlsx")
##########################################
## Nov 18: Regression Discontinuity EXAMPLE
##########################################
# library in relevant packages and their dependencies
library(rdrobust)
library(rddensity)
library(modelsummary)
# read in data
x <- read.csv("http://www.dougspencer.org/data/tutoring_program.csv")
# plot discontinuity of running variable and treatment
plot(x$entrance_exam,
     jitter(as.numeric(as.factor(x$tutoring))),
     cex = 0.75,
     xlab = "Entrance Exam",
     ylab = "Tutoring",
     xlim = c(20, 100),
     axes = FALSE)
axis(1, at = seq(20, 100, 10))
axis(2, at = c(1, 2), labels = c("No", "Yes"))
box()
abline(v = 70, lty = 2)
# check continuity of running variable
# (full column name -- x$entrance relied on $ partial matching)
hist(x$entrance_exam, breaks = 30)
abline(v = 70) # note a tiny cluster just below 70
## ADVANCED: statistical test whether density at cutoff is statistically significant
test <- rddensity(x$entrance_exam, c = 70)
summary(test) # look at p-value of "Robust" (here 0.58 so NOT significant)
## ADVANCED: plot of statistical test showing gap at cutoff
## is not statistically different (from rddensity package)
rdplotdensity(rdd = test,
              X = x$entrance_exam,
              type = "both") # This adds both points and lines
# Now we build our money plot showing effect of tutoring on exit exam
plot(x$entrance_exam, x$exit_exam,
     pch = 19, col = "gray", cex = 0.5,
     xlab = "Entrance Exam Score",
     ylab = "Final Exam Score",
     main = "Effect of Tutoring on Final Exam")
# add color to treatment points (scores at or below the 70 cutoff)
points(x$entrance_exam[x$entrance_exam <= 70],
       x$exit_exam[x$entrance_exam <= 70],
       col = "blue", pch = 19, cex = 0.6)
# now add lowess lines for treatment and control
lines(lowess(x = x$entrance_exam[x$entrance_exam <= 70],
             y = x$exit_exam[x$entrance_exam <= 70]),
      lwd = 5)
lines(lowess(x = x$entrance_exam[x$entrance_exam > 70],
             y = x$exit_exam[x$entrance_exam > 70]),
      lwd = 5)
# add vertical line to denote discontinuity
abline(v = 70, lty = 2)
# for illustrative purposes, add a rectangle to highlight focus of analysis
rect(xleft = 68, xright = 72, ybottom = 60, ytop = 71, lwd = 2,
     col = NA, border = "red")
## ADVANCED: from "rdrobust" package a single line to create outcome plot
rdplot(y = x$exit_exam, x = x$entrance_exam, c = 70)
## EXTRA ADVANCED: generate regressions at 3 bandwidths
# all data
raw <- lm(exit_exam ~ entrance_exam + tutoring, data = x)
# approx. 5% bandwidth
bw5 <- lm(exit_exam ~ entrance_exam + tutoring,
          data = x[x$entrance_exam >= 66 & x$entrance_exam <= 74, ])
# approx. 10% bandwidth
bw10 <- lm(exit_exam ~ entrance_exam + tutoring,
           data = x[x$entrance_exam >= 62 & x$entrance_exam <= 78, ])
# from "modelsummary" package -- generate table comparing all regression output
modelsummary(list("Full data" = raw,
                  "Bandwidth = 5%" = bw5,
                  "Bandwidth = 10%" = bw10))
# Interpretation: coefficient where "Tutor = TRUE" shows
# that causal effect of tutoring is about 9-10 points
# note the difference from just comparing final exam scores
# among those with and without a tutor (not an apples-to-apples comparison)
aggregate(exit_exam ~ tutoring, data = x, FUN = mean)
# difference of the second column of the aggregation (use diff wrapper)
diff(aggregate(exit_exam ~ tutoring, data = x, FUN = mean)[, 2])
#####
# Second RDD Example
#####
# required libraries
library(rdrobust)
library(rddensity)
# read in data
x <- read.csv("http://www.dougspencer.org/data/incumbent_rdd.csv")
### DISCO PLOTS ###
## DISCONTINUITY
plot(x$margin_t, jitter(as.numeric(as.factor(x$win_t))),
     xlab = "Margin of Victory at time = t",
     ylab = "Election Outcome",
     axes = FALSE)
axis(1, at = seq(-100, 100, 20))
axis(2, at = c(1, 2), labels = c("Lose", "Win"))
abline(v = 0, lty = 2)
## CONTINUITY
hist(x$margin_t, breaks = 50,
     main = "Continuity of Running Variable",
     xlab = "Vote Share")
abline(v = 0, col = "red", lwd = 4)
# statistical test of continuity
dens.test <- rddensity(x$margin_t, c = 0, massPoints = FALSE)
# look at p-value for "Robust" (if bigger than 0.05 then continuity confirmed)
summary(dens.test)
# plot of this test
rdplotdensity(rdd = dens.test,
              X = x$margin_t,
              type = "both")
## OUTCOME PLOT
plot(x$margin_t, x$voteshare_t1,
     xlab = "Margin of Victory at time t",
     ylab = "Vote share at time t+1",
     main = "Effect of Incumbency on Future Vote Shares\n(Regression Discontinuity Example)",
     pch = 19,
     col = "gray",
     cex = 0.8)
# optional: change color of "treated" group (winners at time t)
points(x$margin_t[x$margin_t > 0], x$voteshare_t1[x$margin_t > 0],
       pch = 19,
       col = "dodgerblue4",
       cex = 0.9)
# add lowess lines
# (full column names -- x$margin / x$vote relied on $ partial matching)
lines(lowess(x$margin_t[x$margin_t < 0], x$voteshare_t1[x$margin_t < 0]),
      lwd = 3, col = "black")
lines(lowess(x$margin_t[x$margin_t >= 0], x$voteshare_t1[x$margin_t >= 0]),
      lwd = 3, col = "red")
# add text
text(x = -2, y = 49, "{", cex = 1.5)
text(x = -4, y = 49, "Treatment effect", adj = 1, cex = 0.7)
# ggplot version (again just including fyi)
library(ggplot2) # required for the ggplot code below
ggplot(data = x, aes(x = margin_t, y = voteshare_t1)) +
  geom_point(col = "gray") +
  geom_point(data = x[x$margin_t > 0, ], aes(x = margin_t, y = voteshare_t1), col = "dodgerblue4") +
  ggtitle("Effect of Incumbency on Future Vote Shares (RDD)") +
  theme_bw() +
  # linewidth replaces the deprecated size aesthetic for lines (ggplot2 >= 3.4,
  # which this code already assumes via linewidth in annotate() below)
  geom_smooth(se = FALSE, aes(col = win_t), linewidth = 1.5) +
  labs(x = "Margin of victory at time t",
       y = "Vote share at time t+1") +
  scale_color_manual("", values = c("black", "red"), guide = "none") +
  annotate(geom = "pointrange", x = 0, y = 49, ymin = 45, ymax = 52,
           linewidth = 1.5) +
  annotate(geom = "text", x = -3, y = 49,
           label = "Treatment effect",
           hjust = 1, size = 3)
# built in plot in rdrobust
rdplot(x = x$margin_t, y = x$voteshare_t1,
       title = "Measure of Incumbency Advantage (RDD)",
       y.label = "Vote share at time t+1",
       x.label = "Margin of victory at time t")
# experiment with binwidths (binselect = "es" means "equally spaced")
# choose different binwidths (try scale = 5 / scale = 15 / scale = 35)
rdplot(x = x$margin_t, y = x$voteshare_t1,
       title = "Measure of Incumbency Advantage (RDD)",
       y.label = "Vote share at time t+1",
       x.label = "Margin of victory at time t",
       binselect = "es",
       scale = 17)
# what is the right binwidth?
# this test reports an optimal bandwidth (here about 17.75)
rdrobust(x = x$margin_t, y = x$voteshare_t1)
######################
## STARBUCKS NUTRITION
######################
## FOOD
f <- read.csv("http://www.dougspencer.org/data/starbucks_food.csv")
# which food group has highest average caloric content?
aggregate(calories ~ type, data = f, FUN = mean)
# which singular food item has the highest caloric content?
f[f$calories == max(f$calories), ]
# which of the following have the strongest correlation to calories:
# fat, carb, fiber, protein
cor(f[, 2:6])
# scatterplot of fat against calories
plot(f$calories, f$fat, pch = 19, col = "dodgerblue4",
     main = "Starbucks Food Menu Items",
     xlab = "Calories",
     ylab = "Fat (g)")
lines(lowess(f$calories, f$fat),
      col = "firebrick4",
      lwd = 3,
      lty = 3)
## DRINKS
d <- read.csv("http://www.dougspencer.org/data/starbucks_drinks.csv")
# which drink has the most caffeine?
subset(d, d$caffeine_mg == max(d$caffeine_mg, na.rm = TRUE))
# subset to nonfat-milk preparations
nonfat <- d[grep("Nonfat", d$bev_prep), ]
# summarize sugar by drink type and size
aggregate(sugar_g ~ beverage + bev_prep, data = nonfat, FUN = mean)
# density of sugar content for grande nonfat drinks
nfg <- nonfat[grep("Grande", nonfat$bev_prep), ]
plot(density(nfg$sugar_g),
     main = "Sugar content of Starbucks drinks\n(density in grams)",
     lwd = 3)
# reference lines at 25 and 36 grams of sugar
abline(v = c(25, 36), lty = 2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment