@tts
Last active November 21, 2016 08:34
Shiny web app on the CRIS project emails
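Two scripts follow: the Shiny app itself, and the preprocessing script that parses the raw email dump into the CSV files and wordcloud images the app reads.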
library(shiny)
library(highcharter)
library(dygraphs)
library(xts)
library(dplyr)
shinyApp(
  ui = fluidPage(
    fluidRow(
      column(12,
        mainPanel(
          tabsetPanel(
            tabPanel("Time series",
                     h4("My CRIS project emails"),
                     dygraphOutput("dp", width = "100%", height = "700")),
            tabPanel("Hour",
                     h4("Emails/hour"),
                     highchartOutput("hchr", width = "100%", height = "500")),
            tabPanel("Weekday",
                     h4("Emails/weekday"),
                     highchartOutput("hcwd", width = "100%", height = "500")),
            tabPanel("Top 400 words 2015",
                     imageOutput("preImage2015")),
            tabPanel("Top 400 words 2016",
                     imageOutput("preImage2016")),
            tabPanel("Co-occurrence of most frequent words",
                     imageOutput("occ", height = 600))
          )
        )
      )
    )
  ),
  server = function(input, output) {
    # Import daily stats for the dygraph
    emailstats <- read.csv("emailstats.csv", stringsAsFactors = FALSE)
    # hourly stats for highcharts
    emailstatsdetail <- read.csv("emaildetailstats.csv", stringsAsFactors = FALSE)
    # and weekday stats
    emailstatsweekday <- read.csv("emailwdstats.csv", stringsAsFactors = FALSE)
    # Prepare the dygraph object
    #
    # Fill in missing dates
    # http://stackoverflow.com/a/31484550
    ts <- seq.POSIXt(as.POSIXct("2012-03-23", format = "%Y-%m-%d"),
                     as.POSIXct("2016-10-25", format = "%Y-%m-%d"),
                     by = "day")
    df <- data.frame(timestamp = ts)
    # as.POSIXct() takes the format via the format argument; passing it
    # positionally would be read as the tz argument
    emailstats$date <- as.POSIXct(emailstats$date, format = "%Y-%m-%d")
    data_with_missing_dates <- full_join(df, emailstats, by = c("timestamp" = "date"))
    # Replace NA with 0
    data_with_missing_dates$emails <- ifelse(is.na(data_with_missing_dates$emails),
                                             0, data_with_missing_dates$emails)
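    # Alternative sketch (not in the original app): tidyr::complete() can fill
    # the missing dates and the zero counts in one step, assuming tidyr is
    # installed:
    # emailstats <- tidyr::complete(emailstats,
    #                               date = seq(min(date), max(date), by = "day"),
    #                               fill = list(emails = 0))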
    # Build the xts object directly from the two columns; converting the whole
    # data frame with as.xts() would sweep the timestamp column into the data matrix
    emailstats.xts <- xts(as.matrix(data_with_missing_dates["emails"]),
                          order.by = data_with_missing_dates$timestamp)
    dygraph_plot <- dygraph(emailstats.xts) %>%
      dyOptions(stepPlot = TRUE,
                pointSize = 7,
                fillGraph = TRUE,
                colors = RColorBrewer::brewer.pal(3, "Set1"),
                strokeWidth = 3,
                strokePattern = "dashed") %>%
      dyLegend(width = 200,
               show = "follow",
               hideOnMouseOut = FALSE) %>%
      dyEvent("2012-05-24", "Project set", labelLoc = "top") %>%
      dyEvent("2013-10-02", "Investment decision", labelLoc = "top") %>%
      dyEvent("2013-12-19", "Call for tender", labelLoc = "top") %>%
      dyEvent("2014-06-05", "Procurement decision", labelLoc = "top") %>%
      dyEvent("2014-09-12", "Kick-off", labelLoc = "top") %>%
      dyEvent("2016-09-05", "Portal open!", labelLoc = "top")
    emailstatsdetail <- emailstatsdetail %>%
      group_by(hour) %>%
      summarise(emails = sum(emails))
    # lubridate::hour() produces hours 0-23, so the factor levels must match
    emailstatsdetail$hour <- factor(emailstatsdetail$hour, levels = 0:23)
    # Hourly stats (newer highcharter versions take the aesthetics via hcaes())
    hchart_hr <- hchart(emailstatsdetail, "column", hcaes(x = hour, y = emails)) %>%
      hc_colors("orange")
    # and weekday stats
    hchart_wd <- hchart(emailstatsweekday, "column", hcaes(x = weekday, y = emails)) %>%
      hc_colors("orange")
    output$dp <- renderDygraph(dygraph_plot)
    output$hchr <- renderHighchart(hchart_hr)
    output$hcwd <- renderHighchart(hchart_wd)
    output$preImage2015 <- renderImage({
      filename <- normalizePath(file.path("./images/emailcloud_2015.png"))
      list(src = filename,
           alt = "Wordcloud 2015")
    }, deleteFile = FALSE)
    output$preImage2016 <- renderImage({
      filename <- normalizePath(file.path("./images/emailcloud_2016.png"))
      list(src = filename,
           alt = "Wordcloud 2016")
    }, deleteFile = FALSE)
    output$occ <- renderImage({
      filename <- normalizePath(file.path("./images/word_occ_ggraph.png"))
      list(src = filename,
           alt = "Word co-occurrences")
    }, deleteFile = FALSE)
  }
)
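# To run the app locally, save the block above as app.R next to the CSV files
# and the images/ directory produced by the script below, then call
# shiny::runApp().

# --- Preprocessing: parse the email dump, compute stats, draw wordclouds ---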
library(dplyr)
library(stringr)
library(tidyr)
library(wordcloud)
library(tm)
library(lubridate)
raw <- readLines("parsed_w_awk_notempty.txt", encoding = "UTF-8")
lines <- tibble(raw = raw) # data_frame() is deprecated; tibble() is re-exported by dplyr
# http://stackoverflow.com/a/12985882
# Number the emails in sequence
lines$email <- NA
# Determine which head/body lines belong to which email
x <- grepl("^head ", lines$raw)
spanEmail <- rle(x)$lengths[rle(x)$values == TRUE]
lines$email[x] <- rep(seq_along(spanEmail), times = spanEmail)
x <- grepl("^body ", lines$raw)
spanEmail <- rle(x)$lengths[rle(x)$values == TRUE]
lines$email[x] <- rep(seq_along(spanEmail), times = spanEmail)
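# A toy illustration of the rle() trick above (commented out, illustrative
# only): each consecutive run of TRUEs is assigned the next id, so all lines
# of one email share a number.
# x  <- c(TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE)
# sp <- rle(x)$lengths[rle(x)$values]    # lengths of the TRUE runs: 2 1 1
# id <- rep(NA_integer_, length(x))
# id[x] <- rep(seq_along(sp), times = sp)
# id                                     # 1 1 NA 2 NA NA 3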
# Group by email. Within each group, paste together and clean head and body
lines_g <- lines %>%
  group_by(email) %>%
  summarize(head_raw = str_c(raw, collapse = " "),
            body_raw = str_c(raw, collapse = " ")) %>% # hacky
  mutate(head = gsub("body .*", "", head_raw)) %>% # cleaning up the hack, with a caveat:
  # if the email is a forwarded one, Sent etc. end up in the body part
  mutate(body = gsub("(^head .*)(body .*?)", "\\2", body_raw))
lines_g$head <- gsub("head ", "", lines_g$head)
lines_g$body <- gsub("body ", "", lines_g$body)
# In header, separate From, Sent/To, and Subject to own columns.
# First, add a separator
lines_g$head <- gsub(" Sent:", "%Sent:", lines_g$head)
lines_g$head <- gsub(" Date:", "%Sent:", lines_g$head)
lines_g$head <- gsub(" To:", "%To:", lines_g$head)
lines_g$head <- gsub(" Subject:", "%Subject:", lines_g$head)
lines_g_h <- lines_g %>%
  separate(head, c("From", "Sent", "To", "Subject"), sep = "%", remove = FALSE) %>%
  # only the first occurrences are parsed, which is OK
  select(email, From, Sent, To, Subject, body)
# Remove labels and whitespace
lines_g_h$From <- gsub("From:", "", lines_g_h$From)
lines_g_h$Sent <- gsub("Sent:", "", lines_g_h$Sent)
lines_g_h$To <- gsub("To:", "", lines_g_h$To)
lines_g_h$Subject <- gsub("Subject:", "", lines_g_h$Subject)
lines_g_h$From <- gsub("\t", "", lines_g_h$From)
lines_g_h$Sent <- gsub("\t", "", lines_g_h$Sent)
lines_g_h$To <- gsub("\t", "", lines_g_h$To)
lines_g_h$Subject <- gsub("\t", "", lines_g_h$Subject)
# Trouble with those emails where the head doesn't contain all necessary fields.
# Leaving them out.
lines_g_h <- lines_g_h[!is.na(lines_g_h$Sent),]
# Parse Sent into a date-time, after translating Finnish month and weekday names
lines_g_h$Sent <- gsub("tammikuuta","January", lines_g_h$Sent)
lines_g_h$Sent <- gsub("helmikuuta","February", lines_g_h$Sent)
lines_g_h$Sent <- gsub("maaliskuuta","March", lines_g_h$Sent)
lines_g_h$Sent <- gsub("huhtikuuta","April", lines_g_h$Sent)
lines_g_h$Sent <- gsub("toukokuuta","May", lines_g_h$Sent)
lines_g_h$Sent <- gsub("kesäkuuta","June", lines_g_h$Sent)
lines_g_h$Sent <- gsub("heinäkuuta","July", lines_g_h$Sent)
lines_g_h$Sent <- gsub("elokuuta","August", lines_g_h$Sent)
lines_g_h$Sent <- gsub("syyskuuta","September", lines_g_h$Sent)
lines_g_h$Sent <- gsub("lokakuuta","October", lines_g_h$Sent)
lines_g_h$Sent <- gsub("marraskuuta","November", lines_g_h$Sent)
lines_g_h$Sent <- gsub("joulukuuta","December", lines_g_h$Sent)
lines_g_h$Sent <- gsub("maanantaina","Monday", lines_g_h$Sent)
lines_g_h$Sent <- gsub("tiistaina","Tuesday", lines_g_h$Sent)
lines_g_h$Sent <- gsub("keskiviikkona","Wednesday", lines_g_h$Sent)
lines_g_h$Sent <- gsub("torstaina","Thursday", lines_g_h$Sent)
lines_g_h$Sent <- gsub("perjantaina","Friday", lines_g_h$Sent)
lines_g_h$Sent <- gsub("lauantaina","Saturday", lines_g_h$Sent)
lines_g_h$Sent <- gsub("sununtaina","Sunday", lines_g_h$Sent)
# lubridate is already loaded; don't shadow dmy_hm() with a variable of the same name
lines_g_h$Sent <- dmy_hm(lines_g_h$Sent)
# Leaving out those that failed to parse
lines_g_h_p <- lines_g_h[!is.na(lines_g_h$Sent),]
# Without an explicit TZ, the xts object built later is off by one day
Sys.setenv(TZ = Sys.timezone())
# Daily statistics
dailysums <- lines_g_h_p %>%
  mutate(day = format(Sent, "%d"),
         month = format(Sent, "%m"),
         year = format(Sent, "%Y")) %>%
  group_by(day, month, year) %>%
  summarise(emails = n())
stats_sum <- dailysums %>%
  mutate(date = as.POSIXct(paste(day, month, year, sep = "-"),
                           format = "%d-%m-%Y", tz = Sys.timezone()))
# tz belongs inside as.POSIXct(); outside the call it would only add a stray column
stats <- stats_sum[, c("date", "emails")]
# Hourly statistics
timesums <- lines_g_h_p %>%
  mutate(hour = hour(Sent),
         day = day(Sent),
         month = month(Sent),
         year = year(Sent)) %>%
  group_by(year, month, day, hour) %>%
  summarise(emails = n())
# lubridate::hour() returns 0-23
timesums$hour <- factor(timesums$hour, levels = 0:23)
# Weekday statistics
timesums_wd <- lines_g_h_p %>%
  mutate(weekday = weekdays(Sent)) %>%
  group_by(weekday) %>%
  summarise(emails = n())
timesums_wd$weekday <- factor(as.character(timesums_wd$weekday),
                              levels = c("Monday", "Tuesday", "Wednesday", "Thursday",
                                         "Friday", "Saturday", "Sunday"))
timesums_wd <- timesums_wd[order(timesums_wd$weekday), ]
# First, a fair bit of cleaning (names, URLs etc.)
lines_g_h_p$body <- gsub("[strings not shown here on purpose]", "", lines_g_h_p$body)
# No FI Pure emails
lines_g_h_p <- lines_g_h_p[!grepl(".*FIPure.*", lines_g_h_p$Subject),]
# Take the first 400 chars only
lines_g_h_p$body <- substr(lines_g_h_p$body,1,400)
# All years in data
years <- sort(unique(year(lines_g_h_p$Sent)))
# Plot a wordcloud for each year in the data. do_cloud() builds a corpus from
# the email bodies, removes punctuation, lowercases, strips English plus a few
# Finnish stopwords, and writes the cloud to a PNG named after the year.
do_cloud <- function(df, year) {
  # Newer tm versions expect doc_id/text columns in DataframeSource,
  # so build the corpus from the body column directly
  res.corpus <- Corpus(VectorSource(df$body))
  res.corpus <- tm_map(res.corpus, removePunctuation)
  res.corpus <- tm_map(res.corpus, content_transformer(tolower))
  res.corpus <- tm_map(res.corpus, removeWords,
                       c(stopwords("english"), "että", "kun", "jos", "ole"))
  tdm <- TermDocumentMatrix(res.corpus)
  m <- as.matrix(tdm)
  v <- sort(rowSums(m), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v)
  pal2 <- brewer.pal(8, "Dark2")
  png(paste0("emailcloud_", year, ".png"), width = 1280, height = 800)
  wordcloud(d$word,
            d$freq,
            scale = c(8, .2),
            min.freq = 3,
            max.words = 400,
            random.order = FALSE,
            rot.per = .15,
            colors = pal2)
  dev.off()
}
# The function must be defined before it is called
sapply(years, function(x) {
  thisyear <- lines_g_h_p %>%
    filter(year(Sent) == x)
  do_cloud(thisyear, x)
})
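# The PNGs written above (emailcloud_<year>.png) are the wordcloud images the
# Shiny app serves with renderImage(); the app expects them under ./images/,
# so copy them there. word_occ_ggraph.png is produced outside this gist.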
# Save CSV data for the Shiny app
write.csv(stats, file = "emailstats.csv", row.names = F)
write.csv(timesums, file = "emaildetailstats.csv", row.names = F)
write.csv(timesums_wd, file = "emailwdstats.csv", row.names = F)
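# These three CSVs are the files read by read.csv() at the top of the app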