Last active
August 29, 2015 14:11
-
-
Save andrewheiss/0ca726bfb93623ef4f8b to your computer and use it in GitHub Desktop.
Rachel's reading report, 2014
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# See report here: http://www.heissatopia.com/2014/12/rachels-2014-reading-report.html | |
# Libraries | |
library(dplyr) | |
library(ggplot2) | |
library(tidyr) | |
library(lubridate) | |
library(stringr) | |
library(httr) | |
library(grid) | |
library(scales) | |
library(Cairo) | |
#------------- | |
# Break data | |
#------------- | |
# Data frame of actual breaks | |
break.dates <- data.frame(start=ymd(c('2014-04-02', '2014-06-14', | |
'2014-09-27', '2014-12-18')), | |
end=ymd(c('2014-04-20', '2014-07-20', | |
'2014-10-19', '2014-12-31'))) | |
# lubridate intervals (for use with %within%) | |
breaks.intervals <- (break.dates %>% mutate(x=new_interval(start, end)))$x | |
# Bands to add to time series plots | |
breaks.rect <- geom_rect(data=break.dates, aes(x=NULL, y=NULL, xmin=start, | |
xmax=end, ymin=0, ymax=+Inf), | |
fill='#ECD078', alpha=0.4) | |
# Data frame of full year, marking if day was a break | |
full.year <- data.frame(day = seq(ymd('2014-01-01'), ymd('2014-12-31'), by='days')) %>% | |
group_by(day) %>% | |
mutate(in.break = any(day %within% breaks.intervals)) | |
# Summarize proporition of each month on break | |
breaks.month <- full.year %>% | |
group_by(month(day, label=TRUE, abbr=FALSE)) %>% | |
summarize(perc.break = sum(in.break) / n(), | |
school.break = ifelse(perc.break > 0.3, TRUE, FALSE)) | |
# Summarize proporition of each week on break | |
breaks.week <- full.year %>% | |
group_by(week(day)) %>% | |
summarize(perc.break = sum(in.break) / n(), | |
school.break = ifelse(perc.break > 0.3, TRUE, FALSE)) | |
#------------ | |
# Book data | |
#------------ | |
# Path to CSV | |
csv.url <- "https://docs.google.com/spreadsheet/pub?key=0Akj2LM2_BuMFdEdpWDcwb2Ytd0s3T1JweWoyMHZVZlE&single=true&gid=0&output=csv" | |
# Clean up data | |
trim <- function (x) gsub("^\\s+|\\s+$", "", x) | |
books <- content(GET(csv.url)) %>% | |
mutate(Timestamp = mdy_hms(Timestamp), | |
Finished = mdy(Finished), | |
Imputed = factor(ifelse(Imputed == "Yes", "Yes", "No")), | |
Title = trim(tolower(Title)), | |
Title.factor = factor(Title), | |
Author = trim(tolower(Author)), | |
Author.factor = factor(Author)) %>% | |
rename(pages = Number.of.pages) %>% | |
mutate(day = day(Finished), | |
day.year = yday(Finished), | |
week = week(Finished), | |
day.factor = factor(day, ordered=TRUE), | |
day.year.factor = factor(day.year, ordered=TRUE), | |
week.factor = factor(week, ordered=TRUE), | |
weekday = wday(Finished, label=TRUE, abbr=FALSE), | |
month = month(Finished, label=TRUE, abbr=FALSE)) %>% | |
arrange(day.year) %>% | |
mutate(total.pages = cumsum(pages), | |
total.books = 1:n(), | |
harry.potter = ifelse(str_detect(Title, ignore.case("Harry Potter")), | |
TRUE, FALSE)) | |
# Summary by month | |
books.month <- books %>% | |
group_by(month) %>% | |
summarize(num.books = n(), | |
num.pages = sum(pages), | |
avg.pages = mean(pages)) %>% | |
mutate(perc.break = breaks.month$perc.break[1:n()], | |
school.break = breaks.month$school.break[1:n()]) | |
# Summary by week | |
books.week <- books %>% | |
group_by(week) %>% | |
summarize(num.books = n(), | |
num.pages = sum(pages), | |
avg.pages = mean(pages)) %>% | |
mutate(week = as.POSIXct(paste("2014", week-1, "7"), format="%Y %U %u", tz="GMT"), | |
perc.break = breaks.week$perc.break[1:n()], | |
school.break = breaks.week$school.break[1:n()]) | |
books.week[1,]$week <- ymd('2013-12-29', tz="GMT") # Adjust first week | |
#--------------------- | |
# Summary statistics | |
#--------------------- | |
# Pages read | |
nrow(books) | |
sum(books$pages) | |
mean(books$pages) | |
summary(books$pages) | |
# Lexile score | |
summary(books$Lexile) | |
#------------ | |
# Plot data | |
#------------ | |
# Main palette: http://www.colourlovers.com/palette/694737/Thought_Provoking | |
# Tufte-esque theme (add white hlines as needed) | |
theme_clean <- function(base_size=12, base_family="Source Sans Pro Light") { | |
ret <- theme_bw(base_size, base_family) + | |
theme(panel.background = element_rect(fill="#ffffff", colour=NA), | |
axis.title.x=element_text(vjust=-0.2), axis.title.y=element_text(vjust=1.5), | |
title=element_text(vjust=1.2, family="Source Sans Pro Semibold"), | |
panel.border = element_blank(), axis.line=element_blank(), | |
panel.grid=element_blank(), axis.ticks=element_blank(), | |
legend.position="none", | |
axis.title=element_text(size=rel(0.8), family="Source Sans Pro Semibold"), | |
strip.text=element_text(size=rel(1), family="Source Sans Pro Semibold"), | |
strip.background=element_rect(fill="#ffffff", colour=NA), | |
panel.margin.y=unit(1.5, "lines")) | |
ret | |
} | |
# Save using Cairo! Both PDF and PNG (though I don't know if PNG is necessary... maybe) | |
# See https://gist.github.com/dsparks/3777731 for PNG and | |
# https://github.com/wch/extrafont/issues/8#issuecomment-50245466 for PDF | |
# ggsave(plot.month, filename="plot_monthly.png", width=10, height=5, | |
# units="in") | |
# ggsave(plot.month, filename="plot_monthly_cairo.png", width=10, height=5, | |
# units="in", type = "cairo-png") | |
# Page distribution | |
page.distribution <- ggplot(books, aes(x=pages)) + | |
geom_histogram(binwidth=25, fill="#D95B43", colour="white") + | |
scale_x_continuous(breaks=seq(0, 900, 100), expand=c(0, 0)) + | |
scale_y_continuous(expand=c(0, 0)) + | |
geom_hline(yintercept=seq(20, 60, by=20), size=0.25, colour="white") + | |
labs(x="Pages", y="Frequency", title="Length of books read in 2014") + | |
theme_clean() | |
ggsave(page.distribution, filename="plot_pages_hist.pdf", width=6, height=3.5, | |
units="in", device=cairo_pdf) | |
ggsave(page.distribution, filename="plot_pages_hist.png", width=6, height=3.5, | |
units="in") | |
# Top authors | |
authors.ordered <- sort(table(books$Author.factor), decreasing=TRUE) | |
authors.reversed <- sort(table(books$Author.factor)) | |
top.authors <- authors.ordered[1:15] | |
author.plot.data <- books %>% | |
filter(Author %in% names(top.authors)) %>% | |
arrange(desc(Author.factor)) %>% | |
mutate(Author.factor = factor(Author.factor, ordered=TRUE, | |
levels=names(authors.reversed))) | |
author.plot <- ggplot(author.plot.data, aes(x=Author.factor)) + | |
geom_bar(fill="#D95B43") + | |
labs(x=NULL, y="Books by author", title="How many times Rachel read a book by each author ") + | |
scale_y_continuous(expand=c(0, 0)) + | |
scale_x_discrete(expand=c(0, 0)) + | |
geom_hline(yintercept=seq(5, 20, by=5), size=0.25, colour="white") + | |
coord_flip() + | |
theme_clean() | |
ggsave(author.plot, filename="plot_authors.pdf", width=6, height=3.5, | |
units="in", device=cairo_pdf) | |
ggsave(author.plot, filename="plot_authors.png", width=6, height=3.5, units="in") | |
# Top books | |
books.ordered <- sort(table(books$Title.factor), decreasing=TRUE) | |
books.reversed <- sort(table(books$Title.factor)) | |
more.than.one <- books.ordered[books.ordered > 1] | |
book.plot.data <- books %>% | |
filter(Title %in% names(more.than.one)) %>% | |
arrange(desc(Title.factor)) %>% | |
mutate(Title.factor = factor(Title.factor, ordered=TRUE, | |
levels=names(books.reversed))) | |
book.plot <- ggplot(book.plot.data, aes(x=Title.factor)) + | |
geom_bar(fill="#D95B43") + | |
labs(x=NULL, y="Times read", title="Most repeated books") + | |
scale_y_continuous(expand=c(0, 0)) + | |
scale_x_discrete(expand=c(0, 0)) + | |
coord_flip() + | |
theme_clean() | |
ggsave(book.plot, filename="plot_books.pdf", width=6, height=5, | |
units="in", device=cairo_pdf) | |
ggsave(book.plot, filename="plot_books.png", width=6, height=5, units="in") | |
# Lexile distribution | |
grade.levels <- data.frame(start=c(140, 565), end=c(500, 910), | |
grade=c("2nd", "5th")) | |
lexile.distribution <- ggplot(books, aes(x=Lexile)) + | |
scale_fill_manual(values=c("#53777A", "#D95B43")) + | |
geom_histogram(binwidth=25, fill="#542437", colour="white") + | |
scale_x_continuous(breaks=seq(300, 1300, by=250), expand=c(0, 0)) + | |
scale_y_continuous(expand=c(0, 0)) + | |
geom_hline(yintercept=seq(10, 20, by=10), size=0.25, colour="white") + | |
geom_rect(data=grade.levels, aes(x=NULL, y=NULL, xmin=start, xmax=end, | |
ymin=0, ymax=+Inf, fill=grade), alpha=0.2) + | |
labs(x="Lexile score", y="Frequency") + | |
ggtitle(expression(atop("Lexile score of books read in 2014", | |
atop("2nd grade IQR in blue; 5th grade IQR in red"), ""))) + | |
theme_clean() | |
ggsave(lexile.distribution, filename="plot_lexile_hist.pdf", width=6, height=3.5, | |
units="in", device=cairo_pdf) | |
ggsave(lexile.distribution, filename="plot_lexile_hist.png", width=6, height=3.5, | |
units="in") | |
# Lexile score and book length | |
plot.lexile.pages <- ggplot(books, aes(x=pages, y=Lexile)) + | |
geom_point(colour="grey40") + | |
geom_smooth(method="lm", colour="#53777A", fill="#53777A") + | |
labs(x="Pages in book", y="Lexile score", title="Book length and Lexile score") + | |
scale_x_continuous(expand=c(0, 15)) + | |
scale_y_continuous(expand=c(0, 15)) + | |
theme_clean() | |
ggsave(plot.lexile.pages, filename="plot_lexile_pages.pdf", width=6, height=3.5, | |
units="in", device=cairo_pdf) | |
ggsave(plot.lexile.pages, filename="plot_lexile_pages.png", width=6, height=3.5, | |
units="in") | |
# Books and pages by month | |
# Melt to long | |
books.month.long <- books.month %>% | |
gather(variable, number, c(num.books, num.pages)) %>% | |
mutate(variable = factor(variable, labels=c("Books", "Pages"))) | |
# Create data frame for horizontal lines | |
month.hline <- data.frame(lines=seq(10, 50, by=10), | |
variable="Books") %>% | |
rbind(., data.frame(lines=seq(2500, 7500, by=2500), variable="Pages")) | |
# Plot | |
plot.month <- ggplot(books.month.long, aes(x=month, y=number)) + | |
geom_bar(aes(fill=school.break), stat="identity") + | |
labs(x=NULL, y=NULL) + | |
ggtitle(expression(atop("Books and pages read in 2014 (by month)", | |
atop("Months with more than 30% time off school colored blue"), ""))) + | |
scale_fill_manual(values=c("grey20", "#53777A")) + | |
scale_y_continuous(labels=comma, expand=c(0, 0)) + | |
scale_x_discrete(expand=c(0, 0)) + | |
facet_wrap(~ variable, ncol=1, scales="free_y") + | |
geom_hline(data=month.hline, aes(yintercept=lines), size=0.25, colour="white") + | |
theme_clean() | |
# Save | |
ggsave(plot.month, filename="plot_monthly.pdf", width=11, height=6, | |
units="in", device=cairo_pdf) | |
ggsave(plot.month, filename="plot_monthly.png", width=11, height=6, | |
units="in") | |
# Books and pages by week | |
# Melt to long | |
books.week.long <- books.week %>% | |
gather(variable, number, c(num.books, num.pages)) %>% | |
mutate(variable = factor(variable, labels=c("Books", "Pages"))) | |
# Create data frame for horizontal lines | |
week.hline <- data.frame(lines=seq(5, 20, by=5), | |
variable="Books") %>% | |
rbind(., data.frame(lines=seq(1000, 2000, by=1000), variable="Pages")) | |
# Plot | |
plot.week <- ggplot(books.week.long, aes(x=week, y=number)) + | |
geom_bar(aes(fill=school.break), stat="identity") + | |
labs(x=NULL, y=NULL) + | |
ggtitle(expression(atop("Books and pages read in 2014 (by week)", | |
atop("Weeks with more than 30% time off school colored blue"), ""))) + | |
scale_fill_manual(values=c("grey20", "#53777A")) + | |
scale_y_continuous(labels=comma, expand=c(0, 0)) + | |
scale_x_datetime(breaks=date_breaks("1 month"), | |
labels=date_format("%B"), expand=c(0, 0)) + | |
facet_wrap(~ variable, ncol=1, scales="free_y") + | |
geom_hline(data=week.hline, aes(yintercept=lines), size=0.25, colour="white") + | |
theme_clean() | |
ggsave(plot.week, filename="plot_weekly.pdf", width=11, height=6, | |
units="in", device=cairo_pdf) | |
ggsave(plot.week, filename="plot_weekly.png", width=11, height=6, | |
units="in") | |
# Scatterplot with averages | |
monthly.avg <- ggplot(books, aes(x=Finished, y=pages)) + | |
breaks.rect + | |
geom_point(aes(colour=harry.potter), size=2, alpha=0.6) + | |
geom_smooth(method="loess", colour="#53777A", fill="#53777A") + | |
geom_line(aes(group=month), stat="hline", yintercept="mean", | |
size=1, colour="#C02942") + | |
scale_colour_manual(values=c("grey40", "#D95B43")) + | |
scale_x_datetime(labels=date_format("%B")) + | |
scale_y_continuous(expand=c(0, 15)) + | |
labs(x=NULL, y="Pages") + | |
ggtitle(expression(atop("Books read in 2014 by length", | |
atop("School breaks shaded in yellow; Harry Potter points marked in orange"), ""))) + | |
theme_clean() | |
ggsave(monthly.avg, filename="plot_scatter.pdf", width=7, height=4.5, | |
units="in", device=cairo_pdf) | |
ggsave(monthly.avg, filename="plot_scatter.png", width=7, height=4.5, | |
units="in") | |
# Lexile over time | |
lexile.time <- ggplot(books, aes(x=Finished, y=Lexile)) + | |
breaks.rect + | |
geom_point(aes(colour=Imputed), size=2, alpha=0.8) + | |
geom_smooth(method="loess", colour="#53777A", fill="#53777A") + | |
geom_line(aes(group=month), stat="hline", yintercept="mean", | |
size=1, colour="#C02942") + | |
scale_colour_manual(values=c("grey40", "#D95B43")) + | |
scale_x_datetime(labels=date_format("%B")) + | |
scale_y_continuous(expand=c(0, 15)) + | |
coord_cartesian(ylim=c(250, 1250)) + | |
labs(x=NULL, y="Lexile score") + | |
ggtitle(expression(atop("Books read in 2014 by Lexile score", | |
atop("School breaks shaded in yellow; imputed Lexile scores in orange"), ""))) + | |
theme_clean() | |
ggsave(lexile.time, filename="plot_lexile.pdf", width=7, height=4.5, | |
units="in", device=cairo_pdf) | |
ggsave(lexile.time, filename="plot_lexile.png", width=7, height=4.5, | |
units="in") | |
# Cumulative books read | |
reference.line <- data.frame(Finished=ymd(c('2014-01-01', '2014-12-31')), | |
total.books=c(0, 365)) | |
plot.cumulative <- ggplot(books, aes(x=Finished, y=total.books)) + | |
breaks.rect + | |
stat_smooth(method="lm", se=FALSE, colour="#BF99A1", | |
size=0.5, alpha=0.5, linetype=2) + # Trend | |
geom_line(data=reference.line, colour="#53777A", size=0.5) + # Reference line | |
geom_line(color="#C02942", size=1) + # Actual cumulative books | |
geom_hline(yintercept=365, linetype=3, size=0.5, colour="#53777A") + # Goal ceiling | |
geom_rug(sides="b", alpha=0.5, colour="grey40") + | |
scale_x_datetime(labels=date_format("%B")) + | |
labs(x=NULL, y="Cumulative number of books") + | |
ggtitle(expression(atop("Progress toward 2014 goal", | |
atop("School breaks shaded in yellow"), ""))) + | |
theme_clean() | |
ggsave(plot.cumulative, filename="plot_cumulative.pdf", width=7, height=4.5, | |
units="in", device=cairo_pdf) | |
ggsave(plot.cumulative, filename="plot_cumulative.png", width=7, height=4.5, | |
units="in") | |
#--------- | |
# Models | |
#--------- | |
# Predict end of year total | |
trend <- lm(total.books ~ Finished, data=books) | |
predict(trend, newdata=data.frame(Finished=ymd(c('2014-01-01', '2014-12-31')))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment