Created
April 6, 2020 19:53
-
-
Save olivierroy/4c88930807a733d3d40409636d9f98b9 to your computer and use it in GitHub Desktop.
Gmail mbox parser to find what is using your space
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This program parses an mbox file from Google Takeout to identify the senders and subject lines that use the most space in a Gmail account.
# Disclaimer: sizes and message detection may not be accurate; parsing takes a few minutes for a 30GB file.
# Usage: ruby gmail_stats.rb filename.mbox
# Running totals, keyed by sender, subject line and Gmail label respectively.
# Each table maps a key to a {count:, size:} hash, where size is in bytes.
$senders = {}
$subjects = {}
$labels = {}
# Records one more message of +size+ bytes in the bucket stored under +key+.
#
# @param statistic [Hash] totals table, mutated in place; maps a key to a
#   +{count:, size:}+ hash
# @param key [Object] bucket to update; created on first use (nil is accepted
#   and tracked like any other key)
# @param size [Integer] number of bytes to add to the bucket
# @return [Integer] the bucket's updated byte total
def trackStatistic(statistic, key, size)
  entry = (statistic[key] ||= { count: 0, size: 0 })
  entry[:count] += 1
  entry[:size] += size
end
# Prints the +count+ largest entries of a totals table, biggest first.
#
# @param variable [Hash] totals table mapping a key to a +{count:, size:}+ hash
# @param name [String] label used in the section heading
# @param count [Integer] maximum number of rows to print
# @return [nil]
def displayStatistic(variable, name, count)
  puts "=== Top #{count} #{name} ==="
  # Negating the size sorts descending. Rows are limited to +count+ so the
  # heading always matches what is actually printed.
  variable.sort_by { |_key, data| -data[:size] }.first(count).each do |key, data|
    megabytes = data[:size] / (1024.0 * 1024.0)
    puts "#{key} Count: #{data[:count]} Size: #{megabytes.round(2)}mb"
  end
  puts ""
end
# Extracts the sender, subject and size of one message and adds them to the
# global $senders, $subjects and $labels totals.
#
# @param message [Array<String>] the lines of a single mbox message, starting
#   with its "From " separator line
# @return [Hash] +{from:, subject:, size:}+; +from+ and +subject+ are nil when
#   the corresponding header is absent, +size+ is the summed byte length of
#   the given lines
def parseMessage(message)
  headers = parseMessageHeaders(message)

  result = {
    from: headers["from"],
    subject: headers["subject"],
    # Summing avoids building one giant string just to measure it.
    size: message.sum(&:bytesize)
  }

  # Statistics
  trackStatistic($senders, result[:from], result[:size])
  trackStatistic($subjects, result[:subject], result[:size])
  (headers["x-gmail-labels"] || "").split(",").each do |label|
    trackStatistic($labels, label, result[:size])
  end

  result
end

# Collects the "Name: value" header lines found before the message body.
#
# @param message [Array<String>] the lines of a single mbox message
# @return [Hash{String => String}] header values keyed by lower-cased header
#   name (header names are case-insensitive); a repeated header keeps its
#   last value
def parseMessageHeaders(message)
  headers = {}
  header_started = false

  message.each do |line|
    if line.strip.empty?
      # The first blank line after the headers marks the start of the body;
      # anything past it (quoted "Subject: ..." lines, etc.) is not a header.
      break if header_started

      next
    end

    header_started = true
    # Split on the first ": " only, so values such as "Re: Hello" stay intact.
    name, separator, value = line.partition(": ")
    next if separator.empty?

    headers[name.downcase] = value.strip
  end

  headers
end
# Streams an mbox file line by line, splits it into messages and hands each
# one to parseMessage.
#
# A new message starts on a line that begins with "From ", contains an "@",
# and is either the first line seen or directly follows a blank line.
#
# @param filepath [String] path of the mbox file to read
# @return [Object, nil] whatever parseMessage returned for the final message,
#   or nil when the file ends without a non-blank pending message
def parseFile(filepath)
  message = []

  # File.foreach closes the file when iteration ends, even on an exception.
  File.foreach(filepath) do |line|
    starts_message = line.start_with?("From ") && line.include?("@") &&
                     (message.empty? || message.last == "")

    if starts_message
      # Flush the message collected so far, unless it is only blank lines.
      parseMessage(message) if message.any? { |l| l != "" }
      message = []
    end

    message << line.strip
  end

  # Parse the last message, which has no following separator to flush it.
  parseMessage(message) if message.any? { |l| l != "" }
end
# Entry point: parse the mbox file named on the command line, then print the
# largest senders, subjects and labels.
# Without an argument ARGV[0] is nil, so exit with a usage message instead of
# letting the file open fail with an opaque TypeError.
abort "Usage: ruby gmail_stats.rb filename.mbox" if ARGV.empty?

parseFile(ARGV[0])
displayStatistic($senders, "Senders", 20)
displayStatistic($subjects, "Subjects", 20)
displayStatistic($labels, "Labels", 5)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment