# agisga/email_analysis.R

## email_analysis.R
# This code was used for the following blog post:
#
# http://www.alexejgossmann.com/email_data_analysis/
#
# All private information was replaced with **********

library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(lubridate)

#--- load the exported email data and define the advisors' address patterns

# Emails received from / sent to the advisors, one CSV file each.
emails_to <- read.csv("emails_to_advisors.csv")
emails_from <- read.csv("emails_from_advisors.csv")

# Cut the trailing six characters (the time zone indicator) off every
# timestamp. The time of day is then read relative to the time zone the email
# was sent from, rather than being converted to some common time zone.
emails_to[["date"]] <- sub(".{6}$", "", as.character(emails_to[["date"]]))
emails_from[["date"]] <- sub(".{6}$", "", as.character(emails_from[["date"]]))

# Address patterns used to recognise each advisor (real names and addresses
# are concealed). Advisor3 uses two email addresses, hence the a/b pair.
advisor3b <- "********@tulane.edu"
advisor3a <- "********@gmail.com"
advisor2 <- "*******@tulane.edu"
advisor1 <- "*******@tulane.edu"

#--- prepare a data frame of emails that I have received from the three advisors

# From the email history: advisor1 and advisor2 were my co-advisors from
# 9/11/2014 until 12/19/2016, so only emails exchanged with them during that
# period are considered. Before that I was working with advisor3, and we did
# not have email interaction afterwards.
# The interval ends on 2016-12-20 so that all of 12/19/2016 is covered.
advisor_duration <- interval(ymd("2014-9-11"), ymd("2016-12-20"))

emails_from <- emails_from %>%
  as_tibble() %>%  # as_tibble() replaces the deprecated tbl_df()
  mutate(date = ymd_hms(date),
         weekday = wday(date, label = TRUE)) %>%
  # Keep every email dated within the co-advising period, plus every email
  # sent from one of Advisor3's two addresses regardless of its date.
  filter(date %within% advisor_duration |
           grepl(advisor3a, from, ignore.case = TRUE) |
           grepl(advisor3b, from, ignore.case = TRUE)) %>%
  # One logical flag per advisor, based on the sender field. The pattern
  # variables share their names with the new columns; each pattern is still
  # looked up as the global string because the column of the same name does
  # not exist yet when its expression is evaluated.
  mutate(advisor1 = grepl(advisor1, from, ignore.case = TRUE),
         advisor2 = grepl(advisor2, from, ignore.case = TRUE),
         advisor3 = grepl(advisor3a, from, ignore.case = TRUE) |
           grepl(advisor3b, from, ignore.case = TRUE))

# sanity check:
#table(emails_from$advisor1 | emails_from$advisor2 | emails_from$advisor3) # all TRUE
#table(emails_from$advisor1 & emails_from$advisor2 & emails_from$advisor3) # all FALSE

emails_from <- emails_from %>%
  # Collapse the three flags into a single label column. If several flags are
  # set, the highest-numbered advisor wins; rows without any flag get NA.
  mutate(advisor = if_else(advisor3, "Advisor3",
                           if_else(advisor2, "Advisor2",
                                   if_else(advisor1, "Advisor1",
                                           NA_character_)))) %>%
  # Use only emails with my email address included in "to", "cc", or "bcc"
  # (i.e., exclude announcement emails that go to mailing lists for the
  # entire department etc.).
  filter(grepl("**********", to, ignore.case = TRUE) |
           grepl("**********", cc, ignore.case = TRUE) |
           grepl("**********", bcc, ignore.case = TRUE))

#--- prepare a data frame of emails that I have sent to the three advisors

# An advisor counts as a recipient when one of their address patterns matches
# any of the "to", "cc" or "bcc" fields.
emails_to <- emails_to %>%
  as_tibble() %>%  # as_tibble() replaces the deprecated tbl_df()
  mutate(date = ymd_hms(date),
         weekday = wday(date, label = TRUE)) %>%
  # Keep every email dated within the co-advising period, plus every email
  # addressed to one of Advisor3's two addresses regardless of its date.
  filter(date %within% advisor_duration |
           grepl(advisor3a, to, ignore.case = TRUE) |
           grepl(advisor3b, to, ignore.case = TRUE) |
           grepl(advisor3a, cc, ignore.case = TRUE) |
           grepl(advisor3b, cc, ignore.case = TRUE) |
           grepl(advisor3a, bcc, ignore.case = TRUE) |
           grepl(advisor3b, bcc, ignore.case = TRUE)) %>%
  # One logical flag per advisor. The pattern variables share their names with
  # the new columns; each pattern is still looked up as the global string
  # because the column of the same name does not exist yet when its
  # expression is evaluated.
  mutate(advisor1 = grepl(advisor1, to, ignore.case = TRUE) |
           grepl(advisor1, cc, ignore.case = TRUE) |
           grepl(advisor1, bcc, ignore.case = TRUE),
         advisor2 = grepl(advisor2, to, ignore.case = TRUE) |
           grepl(advisor2, cc, ignore.case = TRUE) |
           grepl(advisor2, bcc, ignore.case = TRUE),
         advisor3 = grepl(advisor3a, to, ignore.case = TRUE) |
           grepl(advisor3a, cc, ignore.case = TRUE) |
           grepl(advisor3a, bcc, ignore.case = TRUE) |
           grepl(advisor3b, to, ignore.case = TRUE) |
           grepl(advisor3b, cc, ignore.case = TRUE) |
           grepl(advisor3b, bcc, ignore.case = TRUE))

# sanity check:
#table(emails_to$advisor1 | emails_to$advisor2 | emails_to$advisor3) # all TRUE
#table(emails_to$advisor1 & emails_to$advisor2 & emails_to$advisor3) # all FALSE
#table(emails_to$advisor1 & emails_to$advisor2) # many TRUE
#table(emails_to$advisor2 & emails_to$advisor3) # all FALSE

#--- Bar graphs of the numbers of emails sent and received

# Bar graph for Advisor1: stack the received and the sent emails into one data
# frame, tagged by direction in `fromto`, and draw one bar per direction.
bind_rows(
  emails_from %>% filter(advisor1) %>% mutate(fromto = "from"),
  emails_to %>% filter(advisor1) %>% mutate(fromto = "to")
) %>%
  ggplot(aes(x = fromto)) +
  geom_bar() +
  scale_x_discrete("emails",
                   labels = c("from" = "Received from Advisor1",
                              "to" = "Sent to Advisor1"))

# ggsave() without a plot argument writes the most recent plot.
ggsave("advisor1_received_sent.png")

# Bar graph for Advisor2 and Advisor3: emails sent and received.
# Faceting by advisor works here, because no email is addressed to both
# Advisor2 and Advisor3 simultaneously.

emails_from_2_and_3 <- emails_from %>% filter(advisor2 | advisor3)

# The sent emails carry no advisor label yet: label them Advisor2 unless the
# Advisor3 flag is set.
emails_to_2_and_3 <- emails_to %>%
  filter(advisor2 | advisor3) %>%
  mutate(advisor = if_else(advisor3, "Advisor3", "Advisor2"))

bind_rows(
  emails_from_2_and_3 %>% mutate(fromto = "from"),
  emails_to_2_and_3 %>% mutate(fromto = "to")
) %>%
  ggplot(aes(x = fromto)) +
  geom_bar() +
  facet_wrap(~advisor, scales = "free") +
  scale_x_discrete("emails",
                   labels = c("from" = "Received from", "to" = "Sent to"))

# ggsave() without a plot argument writes the most recent plot.
ggsave("advisors_2_and_3_received_sent.png")

#--- Estimate the probability of reply from each advisor

# Each estimate is (number of received emails with a non-empty "inreplyto"
# field) / (number of emails I sent to that advisor).
# I think that these calculations overestimate the probabilities, because they
# count multiple replies to the same email as replies to different emails, and
# count replies to other people as replies to me if I am in the CC. However,
# since the point is to show how low the reply rate of Advisor1 is, an
# overestimate does not defeat the conclusion. More on that in the blog post.

# Advisor1
n_replies <- emails_from %>% filter(advisor1, inreplyto != "") %>% nrow()
n_sent <- emails_to %>% filter(advisor1) %>% nrow()
print(paste("P(Advisor1 replies) =", n_replies / n_sent))

# Advisor2
n_replies <- emails_from %>% filter(advisor2, inreplyto != "") %>% nrow()
n_sent <- emails_to %>% filter(advisor2) %>% nrow()
print(paste("P(Advisor2 replies) =", n_replies / n_sent))

# Advisor3
n_replies <- emails_from %>% filter(advisor3, inreplyto != "") %>% nrow()
n_sent <- emails_to %>% filter(advisor3) %>% nrow()
print(paste("P(Advisor3 replies) =", n_replies / n_sent))

#--- Bar graph: Advisors' emails by weekday

# Relative frequency of received emails per advisor and weekday.
# The weekday is taken from `date` as a number (as.POSIXlt()$wday: 0 = Sunday,
# ..., 6 = Saturday) instead of comparing the `weekday` labels with strings:
# the abbreviations returned by wday(label = TRUE) depend on the lubridate
# version and locale (e.g. "Tue" vs. "Tues"), and a label that never matches
# would silently produce a count of zero.
emails_by_wday <- emails_from %>%
  mutate(day_index = as.POSIXlt(date)$wday) %>%
  group_by(advisor) %>%
  summarize(Sun = sum(day_index == 0),
            Mon = sum(day_index == 1),
            Tues = sum(day_index == 2),
            Wed = sum(day_index == 3),
            Thurs = sum(day_index == 4),
            Fri = sum(day_index == 5),
            Sat = sum(day_index == 6)) %>%
  # Long format: one row per advisor and day.
  gather(day, n_emails, -advisor) %>%
  # Turn the counts into proportions within each advisor. Grouping handles
  # any number of advisors, so no row count is hard-coded.
  group_by(advisor) %>%
  mutate(proportion = n_emails / sum(n_emails)) %>%
  ungroup() %>%
  select(advisor, day, proportion) %>%
  # Order the days Monday to Sunday on the x axis.
  mutate(day = factor(day,
                      levels = c("Mon", "Tues", "Wed", "Thurs", "Fri", "Sat", "Sun"),
                      ordered = TRUE))

ggplot(emails_by_wday, aes(day, proportion, fill = advisor)) +
  geom_bar(stat = "identity", position = "dodge") +
  ylab("Relative frequency")

# ggsave() without a plot argument writes the most recent plot.
ggsave("wday.png")

#--- Advisors' emails by time of day

# Reduce every timestamp to its clock time ("%H:%M:%S") and parse that string
# back into a date-time, so that all emails share one time axis. The parsed
# string carries no date, so as.POSIXct() fills in the date itself.
emails_from %>%
  mutate(time = format(date, format = "%H:%M:%S"),
         time = as.POSIXct(time, format = "%H:%M:%S")) %>%
  ggplot(aes(time, y = ..density.., color = advisor)) +
  geom_freqpoly(lwd = 1.3, bins = 18) +
  scale_x_datetime(date_labels = "%R")

# ggsave() without a plot argument writes the most recent plot.
ggsave("time.png")
	# This code was used for the following blog post:
	#
	# http://www.alexejgossmann.com/email_data_analysis/
	#
	# All private information was replaced with **********

	library(readr)
	library(dplyr)
	library(tidyr)
	library(ggplot2)
	library(lubridate)

	#--- load the exported email data and define the advisors' address patterns

	# Emails received from / sent to the advisors, one CSV file each.
	emails_to <- read.csv("emails_to_advisors.csv")
	emails_from <- read.csv("emails_from_advisors.csv")

	# Cut the trailing six characters (the time zone indicator) off every
	# timestamp. The time of day is then read relative to the time zone the email
	# was sent from, rather than being converted to some common time zone.
	emails_to[["date"]] <- sub(".{6}$", "", as.character(emails_to[["date"]]))
	emails_from[["date"]] <- sub(".{6}$", "", as.character(emails_from[["date"]]))

	# Address patterns used to recognise each advisor (real names and addresses
	# are concealed). Advisor3 uses two email addresses, hence the a/b pair.
	advisor3b <- "********@tulane.edu"
	advisor3a <- "********@gmail.com"
	advisor2 <- "*******@tulane.edu"
	advisor1 <- "*******@tulane.edu"

	#--- prepare a data frame of emails that I have received from the three advisors

	# From the email history: advisor1 and advisor2 were my co-advisors from
	# 9/11/2014 until 12/19/2016, so only emails exchanged with them during that
	# period are considered. Before that I was working with advisor3, and we did
	# not have email interaction afterwards.
	# The interval ends on 2016-12-20 so that all of 12/19/2016 is covered.
	advisor_duration <- interval(ymd("2014-9-11"), ymd("2016-12-20"))

	emails_from <- emails_from %>%
	  as_tibble() %>%  # as_tibble() replaces the deprecated tbl_df()
	  mutate(date = ymd_hms(date),
	         weekday = wday(date, label = TRUE)) %>%
	  # Keep every email dated within the co-advising period, plus every email
	  # sent from one of Advisor3's two addresses regardless of its date.
	  filter(date %within% advisor_duration |
	           grepl(advisor3a, from, ignore.case = TRUE) |
	           grepl(advisor3b, from, ignore.case = TRUE)) %>%
	  # One logical flag per advisor, based on the sender field. The pattern
	  # variables share their names with the new columns; each pattern is still
	  # looked up as the global string because the column of the same name does
	  # not exist yet when its expression is evaluated.
	  mutate(advisor1 = grepl(advisor1, from, ignore.case = TRUE),
	         advisor2 = grepl(advisor2, from, ignore.case = TRUE),
	         advisor3 = grepl(advisor3a, from, ignore.case = TRUE) |
	           grepl(advisor3b, from, ignore.case = TRUE))

	# sanity check:
	#table(emails_from$advisor1 | emails_from$advisor2 | emails_from$advisor3) # all TRUE
	#table(emails_from$advisor1 & emails_from$advisor2 & emails_from$advisor3) # all FALSE

	emails_from <- emails_from %>%
	  # Collapse the three flags into a single label column. If several flags are
	  # set, the highest-numbered advisor wins; rows without any flag get NA.
	  mutate(advisor = if_else(advisor3, "Advisor3",
	                           if_else(advisor2, "Advisor2",
	                                   if_else(advisor1, "Advisor1",
	                                           NA_character_)))) %>%
	  # Use only emails with my email address included in "to", "cc", or "bcc"
	  # (i.e., exclude announcement emails that go to mailing lists for the
	  # entire department etc.).
	  filter(grepl("**********", to, ignore.case = TRUE) |
	           grepl("**********", cc, ignore.case = TRUE) |
	           grepl("**********", bcc, ignore.case = TRUE))

	#--- prepare a data frame of emails that I have sent to the three advisors

	# An advisor counts as a recipient when one of their address patterns matches
	# any of the "to", "cc" or "bcc" fields.
	emails_to <- emails_to %>%
	  as_tibble() %>%  # as_tibble() replaces the deprecated tbl_df()
	  mutate(date = ymd_hms(date),
	         weekday = wday(date, label = TRUE)) %>%
	  # Keep every email dated within the co-advising period, plus every email
	  # addressed to one of Advisor3's two addresses regardless of its date.
	  filter(date %within% advisor_duration |
	           grepl(advisor3a, to, ignore.case = TRUE) |
	           grepl(advisor3b, to, ignore.case = TRUE) |
	           grepl(advisor3a, cc, ignore.case = TRUE) |
	           grepl(advisor3b, cc, ignore.case = TRUE) |
	           grepl(advisor3a, bcc, ignore.case = TRUE) |
	           grepl(advisor3b, bcc, ignore.case = TRUE)) %>%
	  # One logical flag per advisor. The pattern variables share their names with
	  # the new columns; each pattern is still looked up as the global string
	  # because the column of the same name does not exist yet when its
	  # expression is evaluated.
	  mutate(advisor1 = grepl(advisor1, to, ignore.case = TRUE) |
	           grepl(advisor1, cc, ignore.case = TRUE) |
	           grepl(advisor1, bcc, ignore.case = TRUE),
	         advisor2 = grepl(advisor2, to, ignore.case = TRUE) |
	           grepl(advisor2, cc, ignore.case = TRUE) |
	           grepl(advisor2, bcc, ignore.case = TRUE),
	         advisor3 = grepl(advisor3a, to, ignore.case = TRUE) |
	           grepl(advisor3a, cc, ignore.case = TRUE) |
	           grepl(advisor3a, bcc, ignore.case = TRUE) |
	           grepl(advisor3b, to, ignore.case = TRUE) |
	           grepl(advisor3b, cc, ignore.case = TRUE) |
	           grepl(advisor3b, bcc, ignore.case = TRUE))

	# sanity check:
	#table(emails_to$advisor1 | emails_to$advisor2 | emails_to$advisor3) # all TRUE
	#table(emails_to$advisor1 & emails_to$advisor2 & emails_to$advisor3) # all FALSE
	#table(emails_to$advisor1 & emails_to$advisor2) # many TRUE
	#table(emails_to$advisor2 & emails_to$advisor3) # all FALSE

	#--- Bar graphs of the numbers of emails sent and received

	# Bar graph for Advisor1: stack the received and the sent emails into one data
	# frame, tagged by direction in `fromto`, and draw one bar per direction.
	bind_rows(
	  emails_from %>% filter(advisor1) %>% mutate(fromto = "from"),
	  emails_to %>% filter(advisor1) %>% mutate(fromto = "to")
	) %>%
	  ggplot(aes(x = fromto)) +
	  geom_bar() +
	  scale_x_discrete("emails",
	                   labels = c("from" = "Received from Advisor1",
	                              "to" = "Sent to Advisor1"))

	# ggsave() without a plot argument writes the most recent plot.
	ggsave("advisor1_received_sent.png")

	# Bar graph for Advisor2 and Advisor3: emails sent and received.
	# Faceting by advisor works here, because no email is addressed to both
	# Advisor2 and Advisor3 simultaneously.

	emails_from_2_and_3 <- emails_from %>% filter(advisor2 | advisor3)

	# The sent emails carry no advisor label yet: label them Advisor2 unless the
	# Advisor3 flag is set.
	emails_to_2_and_3 <- emails_to %>%
	  filter(advisor2 | advisor3) %>%
	  mutate(advisor = if_else(advisor3, "Advisor3", "Advisor2"))

	bind_rows(
	  emails_from_2_and_3 %>% mutate(fromto = "from"),
	  emails_to_2_and_3 %>% mutate(fromto = "to")
	) %>%
	  ggplot(aes(x = fromto)) +
	  geom_bar() +
	  facet_wrap(~advisor, scales = "free") +
	  scale_x_discrete("emails",
	                   labels = c("from" = "Received from", "to" = "Sent to"))

	# ggsave() without a plot argument writes the most recent plot.
	ggsave("advisors_2_and_3_received_sent.png")

	#--- Estimate the probability of reply from each advisor

	# Each estimate is (number of received emails with a non-empty "inreplyto"
	# field) / (number of emails I sent to that advisor).
	# I think that these calculations overestimate the probabilities, because they
	# count multiple replies to the same email as replies to different emails, and
	# count replies to other people as replies to me if I am in the CC. However,
	# since the point is to show how low the reply rate of Advisor1 is, an
	# overestimate does not defeat the conclusion. More on that in the blog post.

	# Advisor1
	n_replies <- emails_from %>% filter(advisor1, inreplyto != "") %>% nrow()
	n_sent <- emails_to %>% filter(advisor1) %>% nrow()
	print(paste("P(Advisor1 replies) =", n_replies / n_sent))

	# Advisor2
	n_replies <- emails_from %>% filter(advisor2, inreplyto != "") %>% nrow()
	n_sent <- emails_to %>% filter(advisor2) %>% nrow()
	print(paste("P(Advisor2 replies) =", n_replies / n_sent))

	# Advisor3
	n_replies <- emails_from %>% filter(advisor3, inreplyto != "") %>% nrow()
	n_sent <- emails_to %>% filter(advisor3) %>% nrow()
	print(paste("P(Advisor3 replies) =", n_replies / n_sent))

	#--- Bar graph: Advisors' emails by weekday

	# Relative frequency of received emails per advisor and weekday.
	# The weekday is taken from `date` as a number (as.POSIXlt()$wday: 0 = Sunday,
	# ..., 6 = Saturday) instead of comparing the `weekday` labels with strings:
	# the abbreviations returned by wday(label = TRUE) depend on the lubridate
	# version and locale (e.g. "Tue" vs. "Tues"), and a label that never matches
	# would silently produce a count of zero.
	emails_by_wday <- emails_from %>%
	  mutate(day_index = as.POSIXlt(date)$wday) %>%
	  group_by(advisor) %>%
	  summarize(Sun = sum(day_index == 0),
	            Mon = sum(day_index == 1),
	            Tues = sum(day_index == 2),
	            Wed = sum(day_index == 3),
	            Thurs = sum(day_index == 4),
	            Fri = sum(day_index == 5),
	            Sat = sum(day_index == 6)) %>%
	  # Long format: one row per advisor and day.
	  gather(day, n_emails, -advisor) %>%
	  # Turn the counts into proportions within each advisor. Grouping handles
	  # any number of advisors, so no row count is hard-coded.
	  group_by(advisor) %>%
	  mutate(proportion = n_emails / sum(n_emails)) %>%
	  ungroup() %>%
	  select(advisor, day, proportion) %>%
	  # Order the days Monday to Sunday on the x axis.
	  mutate(day = factor(day,
	                      levels = c("Mon", "Tues", "Wed", "Thurs", "Fri", "Sat", "Sun"),
	                      ordered = TRUE))

	ggplot(emails_by_wday, aes(day, proportion, fill = advisor)) +
	  geom_bar(stat = "identity", position = "dodge") +
	  ylab("Relative frequency")

	# ggsave() without a plot argument writes the most recent plot.
	ggsave("wday.png")

	#--- Advisors' emails by time of day

	# Reduce every timestamp to its clock time ("%H:%M:%S") and parse that string
	# back into a date-time, so that all emails share one time axis. The parsed
	# string carries no date, so as.POSIXct() fills in the date itself.
	emails_from %>%
	  mutate(time = format(date, format = "%H:%M:%S"),
	         time = as.POSIXct(time, format = "%H:%M:%S")) %>%
	  ggplot(aes(time, y = ..density.., color = advisor)) +
	  geom_freqpoly(lwd = 1.3, bins = 18) +
	  scale_x_datetime(date_labels = "%R")

	# ggsave() without a plot argument writes the most recent plot.
	ggsave("time.png")