Skip to content

Instantly share code, notes, and snippets.

@agisga
Last active January 11, 2017 05:40
Show Gist options
  • Save agisga/ef8ef9264c2b3114175eb1d419045036 to your computer and use it in GitHub Desktop.
Save agisga/ef8ef9264c2b3114175eb1d419045036 to your computer and use it in GitHub Desktop.
# This code was used for the following blog post:
#
# http://www.alexejgossmann.com/email_data_analysis/
#
# All private information was replaced with **********
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(lubridate)
# Read the two mailbox exports: messages received from the advisors and
# messages sent to them.
emails_from <- read.csv("emails_from_advisors.csv")
emails_to <- read.csv("emails_to_advisors.csv")

# Drop the last six characters of each date string. This removes the time zone
# indicator, so that the time of day of each email is relative to the time
# zone it was sent from rather than to some common time zone.
# NOTE(review): assumes every date ends in a six-character zone suffix --
# confirm against the CSV export.
strip_zone_suffix <- function(stamps) {
  gsub(".{6}$", "", as.character(stamps))
}
emails_from$date <- strip_zone_suffix(emails_from$date)
emails_to$date <- strip_zone_suffix(emails_to$date)

# Advisor addresses (real names and emails are concealed).
advisor1 <- "*******@tulane.edu"
advisor2 <- "*******@tulane.edu"
# Advisor3 uses two email addresses.
advisor3a <- "********@gmail.com"
advisor3b <- "********@tulane.edu"
#--- prepare a data frame of emails that I have received from the three advisors
# According to the email history, Advisor1 and Advisor2 co-advised me from
# 9/11/2014 until 12/19/2016, so only emails exchanged inside that window are
# considered. Advisor3 was my advisor before that period and we had no email
# interaction afterwards, so emails from Advisor3 are kept regardless of date.
advisor_duration <- interval(ymd("2014-9-11"), ymd("2016-12-20"))

emails_from <- tbl_df(emails_from)
emails_from <- mutate(emails_from, date = ymd_hms(date))
emails_from <- mutate(emails_from, weekday = wday(date, label = TRUE))
emails_from <- filter(
  emails_from,
  grepl(advisor3a, from, ignore.case = TRUE) |
    grepl(advisor3b, from, ignore.case = TRUE) |
    date %within% advisor_duration
)

# One logical flag per advisor: TRUE when the "from" field matches the
# advisor's address (either of the two addresses for Advisor3).
emails_from <- mutate(
  emails_from,
  advisor1 = grepl(advisor1, from, ignore.case = TRUE),
  advisor2 = grepl(advisor2, from, ignore.case = TRUE),
  advisor3 = grepl(advisor3a, from, ignore.case = TRUE) |
    grepl(advisor3b, from, ignore.case = TRUE)
)

# sanity check:
#table(emails_from$advisor1 | emails_from$advisor2 | emails_from$advisor3) # all TRUE
#table(emails_from$advisor1 & emails_from$advisor2 & emails_from$advisor3) # all FALSE

# Collapse the three flags into a single label column. The flags are applied
# in order, so a later advisor overrides an earlier one if several flags are
# TRUE for the same row; rows matching no advisor stay NA.
advisor_labels <- c(
  advisor1 = "Advisor1",
  advisor2 = "Advisor2",
  advisor3 = "Advisor3"
)
emails_from$advisor <- rep(NA_character_, nrow(emails_from))
for (flag_column in names(advisor_labels)) {
  is_sender <- emails_from[[flag_column]]
  emails_from$advisor[is_sender] <- advisor_labels[[flag_column]]
}

# Keep only emails that list my own address in "to", "cc", or "bcc"
# (i.e., exclude announcement emails that go to mailing lists for the entire
# department etc.)
emails_from <- emails_from %>%
  filter(
    grepl("**********", to, ignore.case = TRUE) |
      grepl("**********", cc, ignore.case = TRUE) |
      grepl("**********", bcc, ignore.case = TRUE)
  )
#--- prepare a data frame of emails that I have sent to the three advisors

# TRUE for every email whose "to", "cc", or "bcc" field matches `address`
# (case-insensitive regular-expression match, one result per email).
sent_to_address <- function(address, to, cc, bcc) {
  grepl(address, to, ignore.case = TRUE) |
    grepl(address, cc, ignore.case = TRUE) |
    grepl(address, bcc, ignore.case = TRUE)
}

emails_to <- emails_to %>%
  tbl_df %>%
  mutate(
    date = ymd_hms(date),
    weekday = wday(date, label = TRUE)
  ) %>%
  # Keep emails sent inside the co-advising window, plus every email addressed
  # to either of Advisor3's two addresses regardless of date.
  filter(
    date %within% advisor_duration |
      sent_to_address(advisor3a, to, cc, bcc) |
      sent_to_address(advisor3b, to, cc, bcc)
  ) %>%
  # One logical flag per advisor; an email can be flagged for several advisors.
  mutate(
    advisor1 = sent_to_address(advisor1, to, cc, bcc),
    advisor2 = sent_to_address(advisor2, to, cc, bcc),
    advisor3 = sent_to_address(advisor3a, to, cc, bcc) |
      sent_to_address(advisor3b, to, cc, bcc)
  )
# sanity check:
#table(emails_to$advisor1 | emails_to$advisor2 | emails_to$advisor3) # all TRUE
#table(emails_to$advisor1 & emails_to$advisor2 & emails_to$advisor3) # all FALSE
#table(emails_to$advisor1 & emails_to$advisor2) # many TRUE
#table(emails_to$advisor2 & emails_to$advisor3) # all FALSE
#--- Bar graphs of the numbers of emails sent and received
# bargraph: Advisor1 emails sent and received
# Tag each email with its direction, stack both directions into one table,
# and let geom_bar() count the rows per direction.
advisor1_received <- emails_from %>%
  filter(advisor1) %>%
  mutate(fromto = "from")
advisor1_sent <- emails_to %>%
  filter(advisor1) %>%
  mutate(fromto = "to")
bind_rows(advisor1_received, advisor1_sent) %>%
  ggplot(aes(x = fromto)) +
  geom_bar() +
  scale_x_discrete(
    "emails",
    labels = c("from" = "Received from Advisor1", "to" = "Sent to Advisor1")
  )
# ggsave() without a plot argument writes the most recently created plot.
ggsave("advisor1_received_sent.png")
# bargraph: Advisor2&3 emails sent and received
# Facetting works here because no email is addressed to both Advisor2 and
# Advisor3 simultaneously, so every sent email belongs to exactly one facet.
emails_from_2_and_3 <- emails_from %>%
  filter(advisor2 | advisor3)
# The sent emails have no `advisor` label column yet: label every row
# "Advisor2" unless its advisor3 flag is set (FALSE -> 1st entry, TRUE -> 2nd).
emails_to_2_and_3 <- emails_to %>%
  filter(advisor2 | advisor3) %>%
  mutate(advisor = c("Advisor2", "Advisor3")[advisor3 + 1L])

received_2_and_3 <- mutate(emails_from_2_and_3, fromto = "from")
sent_2_and_3 <- mutate(emails_to_2_and_3, fromto = "to")
bind_rows(received_2_and_3, sent_2_and_3) %>%
  ggplot(aes(x = fromto)) +
  geom_bar() +
  facet_wrap(~advisor, scales = "free") +
  scale_x_discrete(
    "emails",
    labels = c("from" = "Received from", "to" = "Sent to")
  )
# ggsave() without a plot argument writes the most recently created plot.
ggsave("advisors_2_and_3_received_sent.png")
#--- Estimate the probability of reply from each advisor
# I think that the following calculations overestimate the probabilities,
# because they count multiple replies to the same email as replies to
# different emails, and count replies to other people as replies to me if I am
# in the CC. However, since the point is to show how low the reply rate of
# Advisor1 is, an overestimate does not defeat the conclusion.
# More on that in the blog post.

# For each advisor: (received emails with a non-empty "inreplyto" field)
# divided by (emails I sent to that advisor). The names of this vector are the
# logical flag columns shared by `emails_from` and `emails_to`.
reply_labels <- c(
  advisor1 = "P(Advisor1 replies) =",
  advisor2 = "P(Advisor2 replies) =",
  advisor3 = "P(Advisor3 replies) ="
)
for (flag_column in names(reply_labels)) {
  is_reply <- emails_from[[flag_column]] & emails_from$inreplyto != ""
  # Count TRUE entries only; rows where the condition is NA are not counted.
  n_replies <- sum(is_reply, na.rm = TRUE)
  n_sent <- sum(emails_to[[flag_column]], na.rm = TRUE)
  print(paste(reply_labels[[flag_column]], n_replies / n_sent))
}
#--- Bar graph: Advisors' emails by weekday
# For every advisor, compute the share of their emails written on each day of
# the week and draw the shares as dodged bars.
#
# The day of the week is derived from `date` with base R's POSIXlt `wday`
# field (0 = Sunday, ..., 6 = Saturday) instead of comparing against the text
# labels produced by lubridate::wday(label = TRUE). Those labels depend on the
# lubridate version and the locale (e.g. "Tue"/"Thu" instead of
# "Tues"/"Thurs"), and a label mismatch silently yields a count of zero.
emails_by_wday <- emails_from %>%
  mutate(day_index = as.POSIXlt(date)$wday) %>%
  group_by(advisor) %>%
  summarize(Sun = sum(day_index == 0),
            Mon = sum(day_index == 1),
            Tues = sum(day_index == 2),
            Wed = sum(day_index == 3),
            Thurs = sum(day_index == 4),
            Fri = sum(day_index == 5),
            Sat = sum(day_index == 6)) %>%
  # Long format: one row per (advisor, day) holding the raw email count.
  gather(day, n_emails, -advisor) %>%
  # Normalise within each advisor. A grouped mutate works for any number of
  # advisors, unlike a loop over a fixed set of row and column indices.
  group_by(advisor) %>%
  mutate(proportion = n_emails / sum(n_emails)) %>%
  ungroup() %>%
  select(advisor, day, proportion) %>%
  # Order the days Monday through Sunday on the x axis.
  mutate(day = factor(day,
                      levels = c("Mon", "Tues", "Wed", "Thurs",
                                 "Fri", "Sat", "Sun"),
                      ordered = TRUE))

ggplot(emails_by_wday, aes(day, proportion, fill = advisor)) +
  geom_bar(stat = "identity", position = "dodge") +
  ylab("Relative frequency")
# ggsave() without a plot argument writes the most recently created plot.
ggsave("wday.png")
#--- Advisors' emails by time of day
# Reduce every timestamp to its clock time ("HH:MM:SS") and parse that string
# back into a date-time. Parsing a time without a date puts all emails on one
# common day, so the x axis spans a single 24-hour range.
emails_from %>%
  mutate(
    clock = format(date, format = "%H:%M:%S"),
    time = as.POSIXct(clock, format = "%H:%M:%S")
  ) %>%
  # One density-scaled frequency polygon per advisor.
  ggplot(aes(x = time, y = ..density.., color = advisor)) +
  geom_freqpoly(bins = 18, lwd = 1.3) +
  scale_x_datetime(date_labels = "%R")
# ggsave() without a plot argument writes the most recently created plot.
ggsave("time.png")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment