Skip to content

Instantly share code, notes, and snippets.

@nacnudus
Created May 15, 2021 18:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nacnudus/b707276d705120963672623f6595d6c2 to your computer and use it in GitHub Desktop.
Save nacnudus/b707276d705120963672623f6595d6c2 to your computer and use it in GitHub Desktop.
Script to analyse Gmail message sizes by from/to/subject, useful when you're running out of space in the free tier
# Script to analyse Gmail message sizes by from/to/subject
# 1. Download Gmail from Google Takeout, specifically the folders "Inbox",
# "Archived", "Sent", and "Bin".
# 2. Extract it into the working directory. It should create the folders
# Takeout/Mail
# 3. Run this script in the working directory. It shells out to the
# command-line tools `perl` and `rg` (ripgrep), so both must be on the PATH.
library(tidyverse)
library(tm.plugin.mail)
library(fs)
library(processx)
library(sitools) # For Gigabyte ggplot2 scale
library(here)
# Find the first line starting with `<header>:` (case-insensitive) in every
# file below `dir`, using ripgrep.
#
# dir:    directory holding one file per message
# header: header name without the colon, e.g. "from"
#
# Returns a tibble with the character columns `file` and `<header>`, one row
# per file that has such a line, or zero rows when no file has one.
first_header_value <- function(dir, header) {
  result <- run(
    "rg",
    c(
      "--no-heading", # No empty lines between matches
      "--with-filename", # Filename beside every match
      "--no-line-number",
      "-i", # Case insensitive
      "-m", "1", # Return only the first match in each file
      paste0("(^", header, ": ?)(.*)"), # Header prefix, then its value
      "--replace", "$2", # Return only the value, not the prefix
      dir
    ),
    # rg exits with 1 when nothing matched, which is not a failure here, so
    # the exit status is inspected below instead of letting run() throw
    error_on_status = FALSE
  )
  if (!result$status %in% c(0L, 1L)) {
    stop(
      "rg failed while searching for '", header, ":' lines in ", dir, ": ",
      result$stderr,
      call. = FALSE
    )
  }
  if (result$status == 1L || !nzchar(result$stdout)) {
    empty <- tibble(file = character(0), value = character(0))
    return(setNames(empty, c("file", header)))
  }
  # Each output line is "<file>:<value>". Split on the first colon only, since
  # the value may itself contain colons.
  # NOTE(review): this assumes the file path contains no colon -- confirm
  # before running where paths can contain one (e.g. Windows drive letters).
  result$stdout %>%
    read_lines() %>%
    tibble(string = .) %>%
    separate(
      col = string,
      into = c("file", header),
      sep = ":",
      extra = "merge"
    )
}

# Build a per-message dataset for one mailbox of a Google Takeout export.
#
# The mbox file Takeout/Mail/<box>.mbox is split with perl into one file per
# message under data/<lower-case box>/, and ripgrep then extracts the first
# "From:", "To:" and "Subject:" line of every message file.
#
# box:        mailbox name, i.e. the mbox file name without ".mbox"
# overwrite:  if TRUE, delete any previous split and split the mbox again. A
#             missing or empty split directory is always (re)created, because
#             there is nothing to reuse.
# sender:     if TRUE, collect the first "From:" line of each message
# recipients: if TRUE, collect the first "To:" line of each message
#
# Returns a tibble with one row per message file and the columns `file`,
# `size`, `from`, `to` and `subject`. Values that were not collected or not
# found are NA.
box_data <- function(box, overwrite = FALSE, sender = TRUE, recipients = FALSE) {
  unbox <- here("data", tolower(box))
  mbox <- here("Takeout", "Mail", paste0(box, ".mbox"))

  split_needed <-
    overwrite || !dir_exists(unbox) || length(dir_ls(unbox)) == 0
  if (split_needed) {
    if (!file_exists(mbox)) {
      stop("Cannot find the mbox file ", mbox, call. = FALSE)
    }
    # Start from an empty directory. It has to exist before perl runs,
    # because perl opens its output files inside it.
    if (dir_exists(unbox)) dir_delete(unbox)
    dir_create(unbox)
    cat("Splitting mbox file\n")
    # Redirect perl's STDOUT to a new numbered file at every "From " line
    # NOTE(review): `unbox` is interpolated into a Perl double-quoted string,
    # so a path containing `"`, `$` or `@` would still need escaping.
    split_script <- paste0("open STDOUT, \">", unbox, "/\".++$n if /^From /")
    # Quote both the script and the path so that spaces and shell
    # metacharacters in the project path cannot break the command
    status <- system(paste(
      "perl -pe", shQuote(split_script, type = "sh"),
      "<", shQuote(mbox, type = "sh")
    ))
    if (status != 0) {
      stop(
        "Splitting ", mbox, " failed with exit status ", status,
        call. = FALSE
      )
    }
  }

  # Collect the file names
  cat("Collecting file names\n")
  files <- dir_ls(unbox)

  # Collect the file sizes
  cat("Collecting file sizes\n")
  size <- tibble(file = files, size = file_size(files))

  if (sender) {
    # Collect the "From:" lines: address and sometimes name of the sender
    cat("Collecting sender details\n")
    from <- first_header_value(unbox, "from")
  } else {
    from <- tibble(file = character(0), from = character(0))
  }

  if (recipients) {
    # Collect the "To:" lines: address and sometimes name of the recipient
    cat("Collecting recipient details\n")
    to <- first_header_value(unbox, "to")
  } else {
    to <- tibble(file = character(0), to = character(0))
  }

  # Collect the "Subject:" lines
  cat("Collecting subjects\n")
  subject <- first_header_value(unbox, "subject")

  cat("Returning dataset\n")
  size %>%
    left_join(from, by = "file") %>%
    left_join(to, by = "file") %>%
    left_join(subject, by = "file")
}
# Build one dataset per mailbox. "Sent" is the only mailbox analysed by
# recipient instead of by sender.
inbox <- box_data(box = "Inbox")
bin <- box_data(box = "Bin")
archived <- box_data(box = "Archived")
sent <- box_data(box = "Sent", recipients = TRUE, sender = FALSE)

# Stack the mailboxes into one table; the `box` column records which mailbox
# each row came from
all <- bind_rows(
  list(inbox = inbox, bin = bin, archived = archived, sent = sent),
  .id = "box"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment