# Gmail mbox parser to find what is using your space
# This program parses an mbox file from Google Takeout to identify the senders, subject lines and labels that use the most space in a Gmail account.
# Disclaimer: sizes and message detection may not be accurate; a 30GB file takes a few minutes to process.
# Usage: ruby gmail_stats.rb filename.mbox
# Running totals, filled in while the mbox file is parsed.
# Each hash maps a key (sender, subject line or label) to a bucket of the
# form {count: <number of messages>, size: <accumulated bytes>}.
$senders = {}
$subjects = {}
$labels = {}
# Adds one message of +size+ bytes to the bucket stored under +key+.
#
# statistic - Hash of key => {count:, size:} buckets; mutated in place.
# key       - bucket identifier (sender, subject or label); a bucket is
#             created on first use.
# size      - number of bytes to add to the bucket.
#
# Returns the bucket's new accumulated size.
def trackStatistic(statistic, key, size)
  bucket = (statistic[key] ||= { count: 0, size: 0 })
  bucket[:count] += 1
  bucket[:size] += size
end
# Prints the +count+ largest buckets of a statistic, biggest first.
#
# variable - Hash of key => {count:, size:} buckets.
# name     - label used in the heading (e.g. "Senders").
# count    - how many entries to list; also shown in the heading.
#
# Output is a heading line, one line per entry with its message count and
# size in megabytes (rounded to two decimals), then a blank line.
def displayStatistic(variable, name, count)
  puts "=== Top #{count} #{name} ==="
  # Honour the requested count: the list length used to be hard-coded to 20,
  # so the heading and the number of rows could disagree.
  largest = variable.sort_by { |_key, data| -data[:size] }.first(count)
  largest.each do |key, data|
    megabytes = data[:size] / (1024.0 * 1024.0)
    puts "#{key} Count: #{data[:count]} Size: #{megabytes.round(2)}mb"
  end
  puts ""
end
# Extracts sender, subject and size from one message and records them in
# the $senders, $subjects and $labels statistics.
#
# message - Array of the message's lines (as collected by parseFile, i.e.
#           already stripped of surrounding whitespace).
#
# Returns a Hash {from:, subject:, size:}; :from and :subject are nil when
# the corresponding header is absent, :size is the summed byte size of the
# given lines.
def parseMessage(message)
  # The header section ends at the first blank line (RFC 5322). Only that
  # section is scanned so that body text such as a quoted "Subject: ..."
  # cannot overwrite the real headers, and so that a header value that
  # happens to contain "mimepart" no longer cuts header parsing short.
  header_lines = message
                 .drop_while { |line| line.strip.empty? }
                 .take_while { |line| !line.strip.empty? }

  message_meta = {}
  header_lines.each do |line|
    next unless line.include?(": ")

    # Split on the first ": " only; the value may itself contain ": ".
    key, value = line.split(": ", 2)
    message_meta[key] = value.strip
  end

  result = {
    from: message_meta["From"],
    subject: message_meta["Subject"],
    # Same total as joining the lines, without building one huge string.
    size: message.sum(&:bytesize)
  }

  # Statistics
  trackStatistic($senders, result[:from], result[:size])
  trackStatistic($subjects, result[:subject], result[:size])
  labels = message_meta["X-Gmail-Labels"].to_s.split(",").map(&:strip).reject(&:empty?)
  labels.each do |label|
    trackStatistic($labels, label, result[:size])
  end

  result
end
# Reads the mbox file at +filepath+ line by line, groups the lines into
# messages and hands each message to parseMessage.
#
# A new message starts at a line that begins with "From ", contains an "@"
# and is either the first line collected or directly follows a blank line.
# Lines are stripped of surrounding whitespace before being collected.
#
# filepath - path of the mbox file to read.
def parseFile(filepath)
  message = []
  # Block form so the file handle is closed when reading finishes (or fails).
  File.open(filepath) do |file|
    file.each_line do |line|
      separator = line.start_with?("From ") && line.include?("@") &&
                  (message.empty? || message.last == "")
      if separator
        # Hand over what was collected so far, unless it is only blank lines.
        parseMessage(message) if message.any? { |l| l != "" }
        # start of a new message
        message = []
      end
      message << line.strip
    end
  end
  # parse the last message
  parseMessage(message) if message.any? { |l| l != "" }
end
# Entry point: parse the mbox file named on the command line, then print
# the largest senders, subjects and labels.
# Without an argument File.open would be handed nil and raise a TypeError,
# so show the usage line instead.
abort "Usage: ruby gmail_stats.rb filename.mbox" if ARGV.empty?

parseFile(ARGV[0])
displayStatistic($senders, "Senders", 20)
displayStatistic($subjects, "Subjects", 20)
displayStatistic($labels, "Labels", 5)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment