Skip to content

Instantly share code, notes, and snippets.

@andrewheiss
Last active August 29, 2015 14:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andrewheiss/0ca726bfb93623ef4f8b to your computer and use it in GitHub Desktop.
Save andrewheiss/0ca726bfb93623ef4f8b to your computer and use it in GitHub Desktop.
Rachel's reading report, 2014
# See report here: http://www.heissatopia.com/2014/12/rachels-2014-reading-report.html
# Libraries
library(dplyr)
library(ggplot2)
library(tidyr)
library(lubridate)
library(stringr)
library(httr)
library(grid)
library(scales)
library(Cairo)
#-------------
# Break data
#-------------
# Data frame of actual breaks
break.dates <- data.frame(start=ymd(c('2014-04-02', '2014-06-14',
'2014-09-27', '2014-12-18')),
end=ymd(c('2014-04-20', '2014-07-20',
'2014-10-19', '2014-12-31')))
# lubridate intervals (for use with %within%)
breaks.intervals <- (break.dates %>% mutate(x=new_interval(start, end)))$x
# Bands to add to time series plots
breaks.rect <- geom_rect(data=break.dates, aes(x=NULL, y=NULL, xmin=start,
xmax=end, ymin=0, ymax=+Inf),
fill='#ECD078', alpha=0.4)
# Data frame of full year, marking if day was a break
full.year <- data.frame(day = seq(ymd('2014-01-01'), ymd('2014-12-31'), by='days')) %>%
group_by(day) %>%
mutate(in.break = any(day %within% breaks.intervals))
# Summarize proporition of each month on break
breaks.month <- full.year %>%
group_by(month(day, label=TRUE, abbr=FALSE)) %>%
summarize(perc.break = sum(in.break) / n(),
school.break = ifelse(perc.break > 0.3, TRUE, FALSE))
# Summarize proporition of each week on break
breaks.week <- full.year %>%
group_by(week(day)) %>%
summarize(perc.break = sum(in.break) / n(),
school.break = ifelse(perc.break > 0.3, TRUE, FALSE))
#------------
# Book data
#------------
# Path to CSV
csv.url <- "https://docs.google.com/spreadsheet/pub?key=0Akj2LM2_BuMFdEdpWDcwb2Ytd0s3T1JweWoyMHZVZlE&single=true&gid=0&output=csv"
# Clean up data
trim <- function (x) gsub("^\\s+|\\s+$", "", x)
books <- content(GET(csv.url)) %>%
mutate(Timestamp = mdy_hms(Timestamp),
Finished = mdy(Finished),
Imputed = factor(ifelse(Imputed == "Yes", "Yes", "No")),
Title = trim(tolower(Title)),
Title.factor = factor(Title),
Author = trim(tolower(Author)),
Author.factor = factor(Author)) %>%
rename(pages = Number.of.pages) %>%
mutate(day = day(Finished),
day.year = yday(Finished),
week = week(Finished),
day.factor = factor(day, ordered=TRUE),
day.year.factor = factor(day.year, ordered=TRUE),
week.factor = factor(week, ordered=TRUE),
weekday = wday(Finished, label=TRUE, abbr=FALSE),
month = month(Finished, label=TRUE, abbr=FALSE)) %>%
arrange(day.year) %>%
mutate(total.pages = cumsum(pages),
total.books = 1:n(),
harry.potter = ifelse(str_detect(Title, ignore.case("Harry Potter")),
TRUE, FALSE))
# Summary by month
books.month <- books %>%
group_by(month) %>%
summarize(num.books = n(),
num.pages = sum(pages),
avg.pages = mean(pages)) %>%
mutate(perc.break = breaks.month$perc.break[1:n()],
school.break = breaks.month$school.break[1:n()])
# Summary by week
books.week <- books %>%
group_by(week) %>%
summarize(num.books = n(),
num.pages = sum(pages),
avg.pages = mean(pages)) %>%
mutate(week = as.POSIXct(paste("2014", week-1, "7"), format="%Y %U %u", tz="GMT"),
perc.break = breaks.week$perc.break[1:n()],
school.break = breaks.week$school.break[1:n()])
books.week[1,]$week <- ymd('2013-12-29', tz="GMT") # Adjust first week
#---------------------
# Summary statistics
#---------------------
# Pages read
nrow(books)
sum(books$pages)
mean(books$pages)
summary(books$pages)
# Lexile score
summary(books$Lexile)
#------------
# Plot data
#------------
# Main palette: http://www.colourlovers.com/palette/694737/Thought_Provoking
# Tufte-esque theme (add white hlines as needed)
theme_clean <- function(base_size=12, base_family="Source Sans Pro Light") {
ret <- theme_bw(base_size, base_family) +
theme(panel.background = element_rect(fill="#ffffff", colour=NA),
axis.title.x=element_text(vjust=-0.2), axis.title.y=element_text(vjust=1.5),
title=element_text(vjust=1.2, family="Source Sans Pro Semibold"),
panel.border = element_blank(), axis.line=element_blank(),
panel.grid=element_blank(), axis.ticks=element_blank(),
legend.position="none",
axis.title=element_text(size=rel(0.8), family="Source Sans Pro Semibold"),
strip.text=element_text(size=rel(1), family="Source Sans Pro Semibold"),
strip.background=element_rect(fill="#ffffff", colour=NA),
panel.margin.y=unit(1.5, "lines"))
ret
}
# Save using Cairo! Both PDF and PNG (though I don't know if PNG is necessary... maybe)
# See https://gist.github.com/dsparks/3777731 for PNG and
# https://github.com/wch/extrafont/issues/8#issuecomment-50245466 for PDF
# ggsave(plot.month, filename="plot_monthly.png", width=10, height=5,
# units="in")
# ggsave(plot.month, filename="plot_monthly_cairo.png", width=10, height=5,
# units="in", type = "cairo-png")
# Page distribution
page.distribution <- ggplot(books, aes(x=pages)) +
geom_histogram(binwidth=25, fill="#D95B43", colour="white") +
scale_x_continuous(breaks=seq(0, 900, 100), expand=c(0, 0)) +
scale_y_continuous(expand=c(0, 0)) +
geom_hline(yintercept=seq(20, 60, by=20), size=0.25, colour="white") +
labs(x="Pages", y="Frequency", title="Length of books read in 2014") +
theme_clean()
ggsave(page.distribution, filename="plot_pages_hist.pdf", width=6, height=3.5,
units="in", device=cairo_pdf)
ggsave(page.distribution, filename="plot_pages_hist.png", width=6, height=3.5,
units="in")
# Top authors
authors.ordered <- sort(table(books$Author.factor), decreasing=TRUE)
authors.reversed <- sort(table(books$Author.factor))
top.authors <- authors.ordered[1:15]
author.plot.data <- books %>%
filter(Author %in% names(top.authors)) %>%
arrange(desc(Author.factor)) %>%
mutate(Author.factor = factor(Author.factor, ordered=TRUE,
levels=names(authors.reversed)))
author.plot <- ggplot(author.plot.data, aes(x=Author.factor)) +
geom_bar(fill="#D95B43") +
labs(x=NULL, y="Books by author", title="How many times Rachel read a book by each author ") +
scale_y_continuous(expand=c(0, 0)) +
scale_x_discrete(expand=c(0, 0)) +
geom_hline(yintercept=seq(5, 20, by=5), size=0.25, colour="white") +
coord_flip() +
theme_clean()
ggsave(author.plot, filename="plot_authors.pdf", width=6, height=3.5,
units="in", device=cairo_pdf)
ggsave(author.plot, filename="plot_authors.png", width=6, height=3.5, units="in")
# Top books
books.ordered <- sort(table(books$Title.factor), decreasing=TRUE)
books.reversed <- sort(table(books$Title.factor))
more.than.one <- books.ordered[books.ordered > 1]
book.plot.data <- books %>%
filter(Title %in% names(more.than.one)) %>%
arrange(desc(Title.factor)) %>%
mutate(Title.factor = factor(Title.factor, ordered=TRUE,
levels=names(books.reversed)))
book.plot <- ggplot(book.plot.data, aes(x=Title.factor)) +
geom_bar(fill="#D95B43") +
labs(x=NULL, y="Times read", title="Most repeated books") +
scale_y_continuous(expand=c(0, 0)) +
scale_x_discrete(expand=c(0, 0)) +
coord_flip() +
theme_clean()
ggsave(book.plot, filename="plot_books.pdf", width=6, height=5,
units="in", device=cairo_pdf)
ggsave(book.plot, filename="plot_books.png", width=6, height=5, units="in")
# Lexile distribution
grade.levels <- data.frame(start=c(140, 565), end=c(500, 910),
grade=c("2nd", "5th"))
lexile.distribution <- ggplot(books, aes(x=Lexile)) +
scale_fill_manual(values=c("#53777A", "#D95B43")) +
geom_histogram(binwidth=25, fill="#542437", colour="white") +
scale_x_continuous(breaks=seq(300, 1300, by=250), expand=c(0, 0)) +
scale_y_continuous(expand=c(0, 0)) +
geom_hline(yintercept=seq(10, 20, by=10), size=0.25, colour="white") +
geom_rect(data=grade.levels, aes(x=NULL, y=NULL, xmin=start, xmax=end,
ymin=0, ymax=+Inf, fill=grade), alpha=0.2) +
labs(x="Lexile score", y="Frequency") +
ggtitle(expression(atop("Lexile score of books read in 2014",
atop("2nd grade IQR in blue; 5th grade IQR in red"), ""))) +
theme_clean()
ggsave(lexile.distribution, filename="plot_lexile_hist.pdf", width=6, height=3.5,
units="in", device=cairo_pdf)
ggsave(lexile.distribution, filename="plot_lexile_hist.png", width=6, height=3.5,
units="in")
# Lexile score and book length
plot.lexile.pages <- ggplot(books, aes(x=pages, y=Lexile)) +
geom_point(colour="grey40") +
geom_smooth(method="lm", colour="#53777A", fill="#53777A") +
labs(x="Pages in book", y="Lexile score", title="Book length and Lexile score") +
scale_x_continuous(expand=c(0, 15)) +
scale_y_continuous(expand=c(0, 15)) +
theme_clean()
ggsave(plot.lexile.pages, filename="plot_lexile_pages.pdf", width=6, height=3.5,
units="in", device=cairo_pdf)
ggsave(plot.lexile.pages, filename="plot_lexile_pages.png", width=6, height=3.5,
units="in")
# Books and pages by month
# Melt to long
books.month.long <- books.month %>%
gather(variable, number, c(num.books, num.pages)) %>%
mutate(variable = factor(variable, labels=c("Books", "Pages")))
# Create data frame for horizontal lines
month.hline <- data.frame(lines=seq(10, 50, by=10),
variable="Books") %>%
rbind(., data.frame(lines=seq(2500, 7500, by=2500), variable="Pages"))
# Plot
plot.month <- ggplot(books.month.long, aes(x=month, y=number)) +
geom_bar(aes(fill=school.break), stat="identity") +
labs(x=NULL, y=NULL) +
ggtitle(expression(atop("Books and pages read in 2014 (by month)",
atop("Months with more than 30% time off school colored blue"), ""))) +
scale_fill_manual(values=c("grey20", "#53777A")) +
scale_y_continuous(labels=comma, expand=c(0, 0)) +
scale_x_discrete(expand=c(0, 0)) +
facet_wrap(~ variable, ncol=1, scales="free_y") +
geom_hline(data=month.hline, aes(yintercept=lines), size=0.25, colour="white") +
theme_clean()
# Save
ggsave(plot.month, filename="plot_monthly.pdf", width=11, height=6,
units="in", device=cairo_pdf)
ggsave(plot.month, filename="plot_monthly.png", width=11, height=6,
units="in")
# Books and pages by week
# Melt to long
books.week.long <- books.week %>%
gather(variable, number, c(num.books, num.pages)) %>%
mutate(variable = factor(variable, labels=c("Books", "Pages")))
# Create data frame for horizontal lines
week.hline <- data.frame(lines=seq(5, 20, by=5),
variable="Books") %>%
rbind(., data.frame(lines=seq(1000, 2000, by=1000), variable="Pages"))
# Plot
plot.week <- ggplot(books.week.long, aes(x=week, y=number)) +
geom_bar(aes(fill=school.break), stat="identity") +
labs(x=NULL, y=NULL) +
ggtitle(expression(atop("Books and pages read in 2014 (by week)",
atop("Weeks with more than 30% time off school colored blue"), ""))) +
scale_fill_manual(values=c("grey20", "#53777A")) +
scale_y_continuous(labels=comma, expand=c(0, 0)) +
scale_x_datetime(breaks=date_breaks("1 month"),
labels=date_format("%B"), expand=c(0, 0)) +
facet_wrap(~ variable, ncol=1, scales="free_y") +
geom_hline(data=week.hline, aes(yintercept=lines), size=0.25, colour="white") +
theme_clean()
ggsave(plot.week, filename="plot_weekly.pdf", width=11, height=6,
units="in", device=cairo_pdf)
ggsave(plot.week, filename="plot_weekly.png", width=11, height=6,
units="in")
# Scatterplot with averages
monthly.avg <- ggplot(books, aes(x=Finished, y=pages)) +
breaks.rect +
geom_point(aes(colour=harry.potter), size=2, alpha=0.6) +
geom_smooth(method="loess", colour="#53777A", fill="#53777A") +
geom_line(aes(group=month), stat="hline", yintercept="mean",
size=1, colour="#C02942") +
scale_colour_manual(values=c("grey40", "#D95B43")) +
scale_x_datetime(labels=date_format("%B")) +
scale_y_continuous(expand=c(0, 15)) +
labs(x=NULL, y="Pages") +
ggtitle(expression(atop("Books read in 2014 by length",
atop("School breaks shaded in yellow; Harry Potter points marked in orange"), ""))) +
theme_clean()
ggsave(monthly.avg, filename="plot_scatter.pdf", width=7, height=4.5,
units="in", device=cairo_pdf)
ggsave(monthly.avg, filename="plot_scatter.png", width=7, height=4.5,
units="in")
# Lexile over time
lexile.time <- ggplot(books, aes(x=Finished, y=Lexile)) +
breaks.rect +
geom_point(aes(colour=Imputed), size=2, alpha=0.8) +
geom_smooth(method="loess", colour="#53777A", fill="#53777A") +
geom_line(aes(group=month), stat="hline", yintercept="mean",
size=1, colour="#C02942") +
scale_colour_manual(values=c("grey40", "#D95B43")) +
scale_x_datetime(labels=date_format("%B")) +
scale_y_continuous(expand=c(0, 15)) +
coord_cartesian(ylim=c(250, 1250)) +
labs(x=NULL, y="Lexile score") +
ggtitle(expression(atop("Books read in 2014 by Lexile score",
atop("School breaks shaded in yellow; imputed Lexile scores in orange"), ""))) +
theme_clean()
ggsave(lexile.time, filename="plot_lexile.pdf", width=7, height=4.5,
units="in", device=cairo_pdf)
ggsave(lexile.time, filename="plot_lexile.png", width=7, height=4.5,
units="in")
# Cumulative books read
reference.line <- data.frame(Finished=ymd(c('2014-01-01', '2014-12-31')),
total.books=c(0, 365))
plot.cumulative <- ggplot(books, aes(x=Finished, y=total.books)) +
breaks.rect +
stat_smooth(method="lm", se=FALSE, colour="#BF99A1",
size=0.5, alpha=0.5, linetype=2) + # Trend
geom_line(data=reference.line, colour="#53777A", size=0.5) + # Reference line
geom_line(color="#C02942", size=1) + # Actual cumulative books
geom_hline(yintercept=365, linetype=3, size=0.5, colour="#53777A") + # Goal ceiling
geom_rug(sides="b", alpha=0.5, colour="grey40") +
scale_x_datetime(labels=date_format("%B")) +
labs(x=NULL, y="Cumulative number of books") +
ggtitle(expression(atop("Progress toward 2014 goal",
atop("School breaks shaded in yellow"), ""))) +
theme_clean()
ggsave(plot.cumulative, filename="plot_cumulative.pdf", width=7, height=4.5,
units="in", device=cairo_pdf)
ggsave(plot.cumulative, filename="plot_cumulative.png", width=7, height=4.5,
units="in")
#---------
# Models
#---------
# Predict end of year total
trend <- lm(total.books ~ Finished, data=books)
predict(trend, newdata=data.frame(Finished=ymd(c('2014-01-01', '2014-12-31'))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment