Last active
November 21, 2016 08:34
-
-
Save tts/01fab062155305203756553e3afcc33f to your computer and use it in GitHub Desktop.
Shiny web app on the CRIS project emails
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(shiny) | |
library(highcharter) | |
library(dygraphs) | |
library(xts) | |
library(dplyr) | |
shinyApp( | |
ui = fluidPage( | |
fluidRow( | |
column(12, | |
mainPanel( | |
tabsetPanel( | |
tabPanel("Time series", | |
h4("My CRIS project emails"), | |
dygraphOutput("dp", width = "100%", height = "700") | |
), | |
tabPanel("Hour", | |
h4("Emails/hour"), | |
highchartOutput("hchr", width = "100%", height = "500") | |
), | |
tabPanel("Weekday", | |
h4("Emails/weekday"), | |
highchartOutput("hcwd", width = "100%", height = "500") | |
), | |
tabPanel("Top 400 words 2015", | |
imageOutput("preImage2015") | |
), | |
tabPanel("Top 400 words 2016", | |
imageOutput("preImage2016") | |
), | |
tabPanel("Co-occurence of most frequent words", | |
imageOutput("occ", height = 600)) | |
) | |
) | |
) | |
) | |
), | |
server = function(input, output) { | |
# Import daily stats for dygraph | |
emailstats <- read.csv("emailstats.csv", stringsAsFactors = F) | |
# hourly stats for highcharts | |
emailstatsdetail <- read.csv("emaildetailstats.csv", stringsAsFactors = F) | |
# and weekday stats | |
emailstatsweekday <- read.csv("emailwdstats.csv", stringsAsFactors = F) | |
# Prepare dygraph object | |
# | |
# Fill in missing dates | |
# http://stackoverflow.com/a/31484550 | |
ts <- seq.POSIXt(as.POSIXct("2012-03-23",'%Y-%m-%d'), as.POSIXct("2016-10-25",'%Y-%m-%d'), by="day") | |
df <- data.frame(timestamp=ts) | |
emailstats$date <- as.POSIXct(emailstats$date, '%Y-%m-%d') | |
data_with_missing_dates <- full_join(df,emailstats,by=c("timestamp"="date")) | |
# Replace NA with 0 | |
data_with_missing_dates$emails <- ifelse(is.na(data_with_missing_dates$emails), 0, data_with_missing_dates$emails) | |
# Add rownames | |
rownames(data_with_missing_dates) <- data_with_missing_dates[[1]] | |
# Cast to a XTS object | |
emailstats.xts <- as.xts(data_with_missing_dates, dateFormat = "Date") | |
dygraph_plot <- dygraph(emailstats.xts) %>% | |
dyOptions(stepPlot=TRUE, | |
pointSize=7, | |
fillGraph = TRUE, | |
colors = RColorBrewer::brewer.pal(3, "Set1"), | |
strokeWidth = 3, | |
strokePattern = "dashed") %>% | |
dyLegend(width = 200, | |
show = "follow", | |
hideOnMouseOut = FALSE) %>% | |
dyEvent("2012-05-24", "Project set", labelLoc = "top") %>% | |
dyEvent("2013-10-02", "Investment decision", labelLoc = "top") %>% | |
dyEvent("2013-12-19", "Call for tender", labelLoc = "top") %>% | |
dyEvent("2014-06-05", "Procurement decision", labelLoc = "top") %>% | |
dyEvent("2014-09-12", "Kick-off", labelLoc = "top") %>% | |
dyEvent("2016-09-05", "Portal open!", labelLoc = "top") | |
emailstatsdetail <- emailstatsdetail %>% | |
group_by(hour) %>% | |
summarise(emails = sum(emails)) | |
emailstatsdetail$hour <- factor(emailstatsdetail$hour, levels = 1:24) | |
# Hourly stats | |
hchart_hr <- hchart(emailstatsdetail, "column", x = hour, y = emails) %>% | |
hc_colors("orange") | |
# and weekday stats | |
hchart_wd <- hchart(emailstatsweekday, "column", x = weekday, y = emails) %>% | |
hc_colors("orange") | |
output$dp <- renderDygraph(dygraph_plot) | |
output$hchr <- renderHighchart(hchart_hr) | |
output$hcwd <- renderHighchart(hchart_wd) | |
output$preImage2015 <- renderImage({ | |
filename <- normalizePath(file.path('./images/emailcloud_2015.png')) | |
list(src = filename, | |
alt = "Wordcloud 2015") | |
}, deleteFile = FALSE) | |
output$preImage2016 <- renderImage({ | |
filename <- normalizePath(file.path('./images/emailcloud_2016.png')) | |
list(src = filename, | |
alt = "Wordcloud 2016") | |
}, deleteFile = FALSE) | |
output$occ <- renderImage({ | |
filename <- normalizePath(file.path('./images/word_occ_ggraph.png')) | |
list(src = filename, | |
alt = "Word occurences") | |
}, deleteFile = FALSE) | |
} | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(stringr) | |
library(tidyr) | |
library(wordcloud) | |
library(tm) | |
library(lubridate) | |
raw <- readLines("parsed_w_awk_notempty.txt", encoding = "UTF-8") | |
lines <- data_frame(raw = raw) | |
# http://stackoverflow.com/a/12985882 | |
# Number sequences of emails | |
lines$email <- NA | |
# Define, which head/body belongs to which email in sequence | |
x <- grepl("^head ", lines$raw) | |
spanEmail <- rle(x)$lengths[rle(x)$values == TRUE] | |
lines$email[x] <- rep(seq_along(spanEmail), times = spanEmail) | |
x <- grepl("^body ", lines$raw) | |
spanEmail <- rle(x)$lengths[rle(x)$values == TRUE] | |
lines$email[x] <- rep(seq_along(spanEmail), times = spanEmail) | |
# Group by email Within each group, paste and clean head and body | |
lines_g <- lines %>% | |
group_by(email) %>% | |
summarize(head_raw = str_c(raw, collapse = " "), body_raw = str_c(raw, collapse = " ")) %>% # hacky | |
mutate(head = gsub("body .*", "", head_raw)) %>% # cleaning the hack but a caveat: | |
# if the email is a fwd'ed one, Sent etc are in the body part | |
mutate(body = gsub("(^head .*)(body .*?)", "\\2", body_raw)) | |
lines_g$head <- gsub("head ", "", lines_g$head) | |
lines_g$body <- gsub("body ", "", lines_g$body) | |
# In header, separate From, Sent/To, and Subject to own columns. | |
# First, add a separator | |
lines_g$head <- gsub(" Sent:", "%Sent:", lines_g$head) | |
lines_g$head <- gsub(" Date:", "%Sent:", lines_g$head) | |
lines_g$head <- gsub(" To:", "%To:", lines_g$head) | |
lines_g$head <- gsub(" Subject:", "%Subject:", lines_g$head) | |
lines_g_h <- lines_g %>% | |
separate(head, c("From", "Sent", "To", "Subject"), sep = "%", remove = FALSE) %>% # only the first occurrences are parsed, which is OK | |
select(email, From, Sent, To, Subject, body) | |
# Remove labels and whitespace | |
lines_g_h$From <- gsub("From:", "", lines_g_h$From) | |
lines_g_h$Sent <- gsub("Sent:", "", lines_g_h$Sent) | |
lines_g_h$To <- gsub("To:", "", lines_g_h$To) | |
lines_g_h$Subject <- gsub("Subject:", "", lines_g_h$Subject) | |
lines_g_h$From <- gsub("\t", "", lines_g_h$From) | |
lines_g_h$Sent <- gsub("\t", "", lines_g_h$Sent) | |
lines_g_h$To <- gsub("\t", "", lines_g_h$To) | |
lines_g_h$Subject <- gsub("\t", "", lines_g_h$Subject) | |
# Trouble with those emails where the head doesn't contain all necessary fields. | |
# Leaving them out. | |
lines_g_h <- lines_g_h[!is.na(lines_g_h$Sent),] | |
# Cast day and time to Date and Time, after translating months | |
lines_g_h$Sent <- gsub("tammikuuta","January", lines_g_h$Sent) | |
lines_g_h$Sent <- gsub("helmikuuta","February", lines_g_h$Sent) | |
lines_g_h$Sent <- gsub("maaliskuuta","March", lines_g_h$Sent) | |
lines_g_h$Sent <- gsub("huhtikuuta","April", lines_g_h$Sent) | |
lines_g_h$Sent <- gsub("toukokuuta","May", lines_g_h$Sent) | |
lines_g_h$Sent <- gsub("kesäkuuta","June", lines_g_h$Sent) | |
lines_g_h$Sent <- gsub("heinäkuuta","July", lines_g_h$Sent) | |
lines_g_h$Sent <- gsub("elokuuta","August", lines_g_h$Sent) | |
lines_g_h$Sent <- gsub("syyskuuta","September", lines_g_h$Sent) | |
lines_g_h$Sent <- gsub("lokakuuta","October", lines_g_h$Sent) | |
lines_g_h$Sent <- gsub("marraskuuta","November", lines_g_h$Sent) | |
lines_g_h$Sent <- gsub("joulukuuta","December", lines_g_h$Sent) | |
lines_g_h$Sent <- gsub("maanantaina","Monday", lines_g_h$Sent) | |
lines_g_h$Sent <- gsub("tiistaina","Tuesday", lines_g_h$Sent) | |
lines_g_h$Sent <- gsub("keskiviikkona","Wednesday", lines_g_h$Sent) | |
lines_g_h$Sent <- gsub("torstaina","Thursday", lines_g_h$Sent) | |
lines_g_h$Sent <- gsub("perjantaina","Friday", lines_g_h$Sent) | |
lines_g_h$Sent <- gsub("lauantaina","Saturday", lines_g_h$Sent) | |
lines_g_h$Sent <- gsub("sununtaina","Sunday", lines_g_h$Sent) | |
library(lubridate) | |
dmy_hm <- dmy_hm(lines_g_h$Sent) | |
lines_g_h$Sent <- dmy_hm | |
# Leaving out those that failed to parse | |
lines_g_h_p <- lines_g_h[!is.na(lines_g_h$Sent),] | |
# xts object to-be-done is off by one day without this | |
Sys.setenv(TZ=Sys.timezone()) | |
# Daily statistics | |
dailysums <- lines_g_h_p %>% | |
mutate(day = format(Sent, "%d"), | |
month = format(Sent, "%m"), | |
year = format(Sent, "%Y")) %>% | |
group_by(day, month, year) %>% | |
summarise(emails = n()) | |
stats_sum <- dailysums %>% | |
mutate(date = as.POSIXct(paste(day, month, year, sep = "-"), | |
format = "%d-%m-%Y" ), tz=Sys.timezone()) | |
stats <- stats_sum[,c("date", "emails")] | |
# Hourly statistics | |
timesums <- lines_g_h_p %>% | |
mutate(hour = hour(Sent)) %>% | |
mutate(day = day(Sent)) %>% | |
mutate(month = month(Sent)) %>% | |
mutate(year = year(Sent)) %>% | |
group_by(year,month,day,hour) %>% | |
summarise(emails = n()) | |
timesums$hour <- factor(timesums$hour, levels = 1:24) | |
# Weekdays statistics | |
timesums_wd <- lines_g_h_p %>% | |
mutate(weekday = weekdays(Sent)) %>% | |
group_by(weekday) %>% | |
summarise(emails = n()) | |
timesums_wd$weekday <- factor(as.character(weekdaysums$weekday), | |
levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")) | |
timesums_wd <- timesums_wd[order(timesums_wd$weekday), ] | |
# First, a fair bit of cleaning (names, urls etc.) | |
lines_g_h_p$body <- gsub("[strings not shown here on purposes]", "", lines_g_h_p$body) | |
# No FI Pure emails | |
lines_g_h_p <- lines_g_h_p[!grepl(".*FIPure.*", lines_g_h_p$Subject),] | |
# Take the first 400 chars only | |
lines_g_h_p$body <- substr(lines_g_h_p$body,1,400) | |
# All years in data | |
years <- sort(unique(year(lines_g_h_p$Sent))) | |
# Plot wordcloud from every year | |
sapply(years[5], function(x) { | |
thisyear <- lines_g_h_p %>% | |
filter(year(Sent) == x) | |
do_cloud(thisyear, x) | |
}) | |
do_cloud <- function(df,year) { | |
res.corpus <- Corpus(DataframeSource(df[6])) | |
res.corpus <- tm_map(res.corpus, removePunctuation) | |
res.corpus <- tm_map(res.corpus, content_transformer(tolower)) | |
res.corpus <- tm_map(res.corpus, function(x) removeWords(x, | |
c(stopwords("english"), | |
"että", | |
"kun", | |
"jos", | |
"ole"))) | |
tdm <- TermDocumentMatrix(res.corpus) | |
m <- as.matrix(tdm) | |
v <- sort(rowSums(m),decreasing=TRUE) | |
d <- data.frame(word = names(v),freq=v) | |
table(d$freq) | |
pal2 <- brewer.pal(8, "Dark2") | |
png(paste0("emailcloud_",year,".png"), width=1280,height=800) | |
wcloud_plot <- wordcloud(d$word, | |
d$freq, | |
scale=c(8,.2), | |
min.freq=3, | |
max.words=400, | |
random.order=FALSE, | |
rot.per=.15, | |
colors=pal2) | |
dev.off() | |
} | |
# Save CSV data for the Shiny app | |
write.csv(stats, file = "emailstats.csv", row.names = F) | |
write.csv(timesums, file = "emaildetailstats.csv", row.names = F) | |
write.csv(timesums_wd, file = "emailwdstats.csv", row.names = F) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment