Last active
January 11, 2017 05:40
-
-
Save agisga/ef8ef9264c2b3114175eb1d419045036 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This code was used for the following blog post:
#
# http://www.alexejgossmann.com/email_data_analysis/
#
# All private information was replaced with **********
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(lubridate)
# Load the two exported mailboxes: messages received from the advisors and
# messages sent to them.
emails_from <- read.csv(file = "emails_from_advisors.csv")
emails_to <- read.csv(file = "emails_to_advisors.csv")

# Drop the last six characters of every date string, i.e. the time zone
# indicator. Each email is then timed relative to the time zone it was sent
# from, rather than being converted to some common time zone.
# NOTE(review): assumes every date ends in a six-character zone suffix -- confirm
# against the CSV export.
emails_from[["date"]] <- sub(".{6}$", "", as.character(emails_from[["date"]]))
emails_to[["date"]] <- sub(".{6}$", "", as.character(emails_to[["date"]]))
# Email addresses of the three advisors (real names and addresses concealed).
advisor1 <- "*******@tulane.edu"
advisor2 <- "*******@tulane.edu"
# Advisor3 uses two email addresses
advisor3a <- "********@gmail.com"
advisor3b <- "********@tulane.edu"

#--- prepare a data frame of emails that I have received from the three advisors
# From the email history: advisor1 and advisor2 were my co-advisors from
# 9/11/2014 until 12/19/2016, so for them only emails exchanged within that
# period are considered. Before that I was working with advisor3, and we did not
# have email interaction afterwards.
# The interval ends at 2016-12-20, presumably so that all of 12/19/2016 falls
# inside it.
advisor_duration <- interval(start = ymd("2014-9-11"), end = ymd("2016-12-20"))
# Case-insensitive test of whether `address` occurs in any of the header fields
# passed via `...` (e.g. `from`, or `to`, `cc`, `bcc`). Returns one logical
# value per email.
# The address is matched as a literal string (fixed = TRUE) rather than as a
# regular expression, so characters such as "." or "+" in an email address
# cannot act as regex operators. Both sides are lower-cased explicitly because
# `ignore.case` cannot be combined with `fixed = TRUE`.
mentions_address <- function(address, ...) {
  needle <- tolower(address)
  hits <- lapply(list(...), function(field) {
    grepl(needle, tolower(as.character(field)), fixed = TRUE)
  })
  Reduce(`|`, hits)
}

# my own email address (concealed)
my_address <- "**********"

emails_from <- emails_from %>%
  as_tibble() %>%
  mutate(date = ymd_hms(date),
         weekday = wday(date, label = TRUE)) %>%
  # one logical flag per advisor: was the email sent by that advisor?
  mutate(advisor1 = mentions_address(advisor1, from),
         advisor2 = mentions_address(advisor2, from),
         advisor3 = mentions_address(advisor3a, from) |
           mentions_address(advisor3b, from)) %>%
  # keep emails from the co-advising period, plus every email from advisor3
  filter(date %within% advisor_duration | advisor3) %>%
  # single label per email; if several flags are set, the highest-numbered
  # advisor wins
  mutate(advisor = ifelse(advisor3, "Advisor3",
                          ifelse(advisor2, "Advisor2",
                                 ifelse(advisor1, "Advisor1", NA_character_)))) %>%
  # use only emails with my email address included in "to", "cc", or "bcc"
  # (i.e., exclude announcement emails that go to mailing lists for the entire
  # department etc.)
  filter(mentions_address(my_address, to, cc, bcc))
# sanity check:
#table(emails_from$advisor1 | emails_from$advisor2 | emails_from$advisor3) # all TRUE
#table(emails_from$advisor1 & emails_from$advisor2 & emails_from$advisor3) # all FALSE

#--- prepare a data frame of emails that I have sent to the three advisors
emails_to <- emails_to %>%
  as_tibble() %>%
  mutate(date = ymd_hms(date),
         weekday = wday(date, label = TRUE)) %>%
  # one logical flag per advisor: is that advisor among the recipients
  # ("to", "cc", or "bcc")?
  mutate(advisor1 = mentions_address(advisor1, to, cc, bcc),
         advisor2 = mentions_address(advisor2, to, cc, bcc),
         advisor3 = mentions_address(advisor3a, to, cc, bcc) |
           mentions_address(advisor3b, to, cc, bcc)) %>%
  # keep emails from the co-advising period, plus every email to advisor3
  filter(date %within% advisor_duration | advisor3)
# sanity check:
#table(emails_to$advisor1 | emails_to$advisor2 | emails_to$advisor3) # all TRUE
#table(emails_to$advisor1 & emails_to$advisor2 & emails_to$advisor3) # all FALSE
#table(emails_to$advisor1 & emails_to$advisor2) # many TRUE
#table(emails_to$advisor2 & emails_to$advisor3) # all FALSE
#--- Bar graphs of the numbers of emails sent and received
# bargraph: Advisor1 emails sent and received
# Stack the emails received from Advisor1 on top of the emails sent to Advisor1,
# tagging each row with its direction, and count the rows per direction.
bind_rows(
  emails_from %>% filter(advisor1) %>% mutate(fromto = "from"),
  emails_to %>% filter(advisor1) %>% mutate(fromto = "to")
) %>%
  ggplot(aes(x = fromto)) +
  geom_bar() +
  scale_x_discrete("emails",
                   labels = c("from" = "Received from Advisor1",
                              "to" = "Sent to Advisor1"))
# ggsave() writes the most recently created plot
ggsave("advisor1_received_sent.png")
# bargraph: Advisor2&3 emails sent and received
# Faceting by advisor works here, because no email is addressed to both
# Advisor2 and Advisor3 simultaneously.
emails_from_2_and_3 <- emails_from %>%
  filter(advisor2 | advisor3)
# Sent emails carry no advisor label yet: Advisor3 where flagged, otherwise
# Advisor2.
emails_to_2_and_3 <- emails_to %>%
  filter(advisor2 | advisor3) %>%
  mutate(advisor = if_else(advisor3, "Advisor3", "Advisor2"))
bind_rows(
  emails_from_2_and_3 %>% mutate(fromto = "from"),
  emails_to_2_and_3 %>% mutate(fromto = "to")
) %>%
  ggplot(aes(x = fromto)) +
  geom_bar() +
  facet_wrap(~advisor, scales = "free") +
  scale_x_discrete("emails",
                   labels = c("from" = "Received from", "to" = "Sent to"))
ggsave("advisors_2_and_3_received_sent.png")
#--- Estimate the probability of reply from each advisor
# The estimate is (number of received emails with a non-empty "inreplyto"
# field) / (number of emails sent to that advisor).
# I think that these calculations overestimate the probabilities, because they
# count multiple replies to the same email as replies to different emails, and
# count replies to other people as replies to me if I am in the CC.
# However, since the point is to show how low the reply rate of Advisor1 is, an
# overestimate does not defeat the conclusion. More on that in the blog post.

# estimate the probability of reply from Advisor1
n_replies <- with(emails_from, sum(advisor1 & inreplyto != "", na.rm = TRUE))
n_sent <- sum(emails_to$advisor1, na.rm = TRUE)
print(paste("P(Advisor1 replies) =", n_replies / n_sent))

# estimate the probability of reply from Advisor2
n_replies <- with(emails_from, sum(advisor2 & inreplyto != "", na.rm = TRUE))
n_sent <- sum(emails_to$advisor2, na.rm = TRUE)
print(paste("P(Advisor2 replies) =", n_replies / n_sent))

# estimate the probability of reply from Advisor3
n_replies <- with(emails_from, sum(advisor3 & inreplyto != "", na.rm = TRUE))
n_sent <- sum(emails_to$advisor3, na.rm = TRUE)
print(paste("P(Advisor3 replies) =", n_replies / n_sent))
#--- Bar graph: Advisors' emails by weekday
# For every advisor, the share of their emails that was written on each day of
# the week.
#
# The weekday labels are taken from lubridate itself instead of being
# hard-coded: older lubridate releases abbreviate as "Tues"/"Thurs", newer ones
# as "Tue"/"Thu" (and the labels depend on the locale), so comparing against
# fixed strings can silently count zero emails for some days.
# 2017-01-02 was a Monday, so this yields the labels in Monday-first order.
monday_first <- as.character(wday(ymd("2017-01-02") + 0:6, label = TRUE))

# Cross-tabulate advisor x weekday. table() keeps combinations with zero
# emails, so every advisor gets a bar for every day, and it works for any
# number of advisors present in the data.
wday_counts <- table(
  advisor = emails_from$advisor,
  day = factor(as.character(emails_from$weekday), levels = monday_first)
)

# margin = 1 normalizes within each advisor, so the proportions of one advisor
# sum to 1. The result is in long format: advisor, day, proportion.
emails_by_wday <- as.data.frame(prop.table(wday_counts, margin = 1),
                                responseName = "proportion",
                                stringsAsFactors = FALSE) %>%
  mutate(day = factor(day, levels = monday_first, ordered = TRUE))

ggplot(emails_by_wday, aes(day, proportion, fill = advisor)) +
  geom_bar(stat = "identity", position = "dodge") +
  ylab("Relative frequency")
ggsave("wday.png")
#--- Advisors' emails by time of day
# Frequency polygon (density scale, 18 bins) of the time of day at which each
# advisor's emails were written.
#
# Only the clock time of `date` is kept; it is attached to one fixed reference
# date in UTC. Parsing a bare "%H:%M:%S" string would instead use the current
# date in the local time zone, which makes the result depend on when and where
# the script is run (e.g. clock times that do not exist on a daylight-saving
# switch day). The axis shows hours and minutes only, so the reference date
# itself never appears in the plot.
emails_from %>%
  mutate(time = as.POSIXct(paste("1970-01-01",
                                 format(date, format = "%H:%M:%S")),
                           format = "%Y-%m-%d %H:%M:%S", tz = "UTC")) %>%
  ggplot(aes(time, y = ..density.., color = advisor)) +
  geom_freqpoly(lwd = 1.3, bins = 18) +
  scale_x_datetime(date_labels = "%R")
ggsave("time.png")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment