Skip to content

Instantly share code, notes, and snippets.

@MichaelChirico
Last active May 29, 2018 08:40
Show Gist options
  • Save MichaelChirico/aea38513b38a05b417fedad50ad5d5c7 to your computer and use it in GitHub Desktop.
Save MichaelChirico/aea38513b38a05b417fedad50ad5d5c7 to your computer and use it in GitHub Desktop.
Extract WhatsApp chat history into a data.table
library(data.table)
# Read the exported chat, one element per physical line of the file.
# The text is marked as UTF-8 because the emoji handling further down
# works on Unicode code points (assumes the export is UTF-8 -- TODO confirm).
whatsapp_raw = readLines('~/Downloads/WhatsApp Chat with PhDelphia.txt',
                         encoding = 'UTF-8')
# Have to deal with multi-line messages: only the first line of a message
# carries the "m/d/yy, h:mm" timestamp, so any line that does not start
# with one is a continuation of the message above it.
# Requiring the full date + time (rather than just "digits then slash")
# keeps continuation lines such as "10/10 would recommend" from being
# mistaken for the start of a new message.
idx = grepl('^1?[0-9]/[0-9]{1,2}/[0-9]{2,4}, [0-9]{1,2}:[0-9]{2}',
            whatsapp_raw)
# Running count of message starts = id of the message each line belongs to.
# Lines ahead of the first timestamp get id 0 and are dropped, instead of
# indexing position 0 as the old run-length loop did.
msg_id = cumsum(idx)
in_msg = msg_id > 0L
# Glue every message's lines back together with embedded newlines;
# split() orders the groups by numeric id, so chat order is preserved.
whatsapp_raw = unname(vapply(
  split(whatsapp_raw[in_msg], msg_id[in_msg]),
  paste, character(1L), collapse = '\n'
))
# Each record looks like "<timestamp> - <payload>". Cut it at the FIRST
# ' - ' that follows AM/PM only; splitting on every occurrence would break
# a message that itself contains e.g. "5 PM - ok" into extra pieces.
sep_at = regexpr('(?<=[AP]M) - ', whatsapp_raw, perl = TRUE)
no_sep = sep_at < 0L
stamp = substr(whatsapp_raw, 1L, sep_at - 1L)
payload = substring(whatsapp_raw, sep_at + 3L)
# Records without the separator: try to parse the whole line as the
# timestamp and leave the payload missing
stamp[no_sep] = whatsapp_raw[no_sep]
payload[no_sep] = NA_character_
# 'EDT' is not a recognised time zone name, and R treats unrecognised names
# as UTC. Spell that out: the timestamps are wall-clock values labelled UTC,
# which is also what as.Date(time) assumes when bucketing by day.
# NOTE(review): for true Eastern instants use 'America/New_York' here AND
# pass the same tz to every as.Date() call on this column.
whatsapp = data.table(
  V2 = payload,
  time = as.POSIXct(stamp, tz = 'UTC', format = '%m/%d/%y, %I:%M %p')
)
# Payloads with a colon are treated as "<sender>:<text>"; everything else
# is kept as a group alert
whatsapp[ , is_text := grepl(':', V2, fixed = TRUE)]
alerts = whatsapp[(!is_text), .(alert_text = V2, time)]
whatsapp = whatsapp[(is_text)]
# Sender is everything before the first colon, the message everything after
# it (minus the single space that follows the colon). Using positions rather
# than a '.*' regex keeps this independent of how '.' treats the newlines
# embedded in multi-line messages.
whatsapp[ , c('name', 'message') := {
  colon_at = regexpr(':', V2, fixed = TRUE)
  .(substr(V2, 1L, colon_at - 1L),
    sub('^ ', '', substring(V2, colon_at + 1L)))
}]
whatsapp[ , c('V2', 'is_text') := NULL]
# Per-message indicator columns, all derived from the message text:
#   has_media  - contains the literal '<Media omitted>' placeholder
#   has_emoji  - contains a code point in U+1F300..U+1F6FF
#   emoji_only - nothing is left once those code points and spaces are removed
#   has_url    - contains 'http' or a .edu/.org/.com suffix
whatsapp[ , `:=`(
  has_media = grepl('<Media omitted>', message, fixed = TRUE),
  has_emoji = grepl('[\U{1F300}-\U{1F6FF}]', message),
  emoji_only = !nzchar(gsub('[\U{1F300}-\U{1F6FF} ]', '', message)),
  has_url = grepl('http|[.](edu|org|com)', message)
)]
#sample plot: messages by day, and a trend line with
# a centered moving average; change half_window below to adjust the window
whatsapp[ , .N, keyby = .(day = as.Date(time))
  # join onto the full run of calendar days so silent days show up as rows;
  # the CJ column is named explicitly rather than relying on auto-naming
  ][CJ(day = seq.Date(min(day), max(day), by = 'day')), on = 'day'
  ][is.na(N), N := 0L
  ][ , {
    # days on each side of the current day that go into the average
    # (the window therefore spans 2 * half_window + 1 days)
    half_window = 30L
    plot(day, N,
         main = sprintf('Messages by Day\nWith %d-day Windowed Average',
                        2L * half_window),
         xlab = '', ylab = '#', cex = .6, xaxt = 'n')
    # label the x axis at the first day of every month
    month_starts = which(mday(day) == 1L)
    axis(side = 1L, at = day[month_starts], las = 2L,
         labels = format(day[month_starts], '%b-%y'))
    # one column per offset: N lagged (k < 0), as-is (k == 0) or led (k > 0)
    offsets = seq(-half_window, half_window)
    windowed = vapply(offsets, function(k) {
      as.numeric(
        if (k < 0L) shift(N, -k, type = 'lag')
        else if (k > 0L) shift(N, k, type = 'lead')
        else N
      )
    }, numeric(length(N)))
    # vapply() returns a plain vector when there is only one day;
    # force the days x offsets shape so rowMeans() always gets a matrix
    dim(windowed) = c(length(N), length(offsets))
    # na.rm = TRUE: near both ends the window is simply shorter
    mavg = rowMeans(windowed, na.rm = TRUE)
    lines(day, mavg, lwd = 3L, col = 'red')}]
#sample plot: Emoji per message
# and most frequent emoji, by sender
whatsapp[ , {
  # break every message into single characters and keep the emoji ones
  chars = unlist(strsplit(message, ''))
  emoji = chars[grepl('[\U{1F300}-\U{1F6FF}]', chars)]
  #U+1F3FB is a modifier (white skin)
  # the rest just wouldn't display on the plot
  skip = c('\U{1F389}', '\U{1F3FB}', '\U{1F44F}', '\U{1F37B}',
           '\U{1F483}', '\U{1F3B1}', '\U{1F44D}', '\U{1F4A3}',
           '\U{1F4A9}', '\U{1F37E}', '\U{1F38A}')
  omit_modif = emoji[!emoji %in% skip]
  # Most frequent remaining emoji, using base R only (table2() is not
  # provided by base R or data.table and nothing in this script loads it).
  # A sender with no emoji gets NA so every group returns the same columns.
  top_emoji = if (length(omit_modif)) {
    names(sort(table(omit_modif), decreasing = TRUE))[1L]
  } else {
    NA_character_
  }
  .(freq = top_emoji,
    avg = length(omit_modif)/.N)}, by = name
  ][avg > 0][order(-avg), {
    # wider bottom margin for the rotated sender names;
    # the previous graphics settings are put back after plotting
    old_par = par(mar = c(9.1, 4.1, 4.1, 2.1))
    x = barplot(avg, col = 'yellow', names.arg = name, las = 2,
                main = 'Emoji per Message\nAnd Most Frequent Emoji',
                cex.names = .8, ylim = c(0, 1.2*max(avg)))
    # print each sender's most frequent emoji just above their bar
    text(x, avg, freq, pos = 3)
    par(old_par)
    # return nothing, so data.table does not turn par()'s value into columns
    NULL
  }]
@RMHogervorst
Copy link

Nice! You should make this a small package!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment