Skip to content

Instantly share code, notes, and snippets.

@pedramamini
Last active August 29, 2015 14:11
Show Gist options
  • Save pedramamini/03e3a42d521a6a7edf36 to your computer and use it in GitHub Desktop.
Save pedramamini/03e3a42d521a6a7edf36 to your computer and use it in GitHub Desktop.
Break an mbox file out into multiple files by year. Written to chunk my GMail Takeout into manageable pieces.
#!/usr/bin/env python
"""
What
Break an mbox file out into multiple files by year.
Written to chunk my GMail Takeout into manageable pieces.
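Each output file is named YEAR-<input mbox name>; messages are *appended* to it, so re-running on the same input duplicates messages unless old output is removed first.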
Who
Pedram Amini
http://pedramamini.com
https://gist.github.com/pedramamini/03e3a42d521a6a7edf36
Usage
split_mbox_by_year.py <input_mbox>
Example
$ ./split_mbox_by_year.py GMail.mbox
[**] processing GMail.mbox...
[**] setting output spool 2014-GMail.mbox.
[**] closing output spool 2014-GMail.mbox with 3248 messages.
[**] setting output spool 2013-GMail.mbox.
[**] closing output spool 2013-GMail.mbox with 7 messages.
[**] setting output spool 2014-GMail.mbox.
[**] closing output spool 2014-GMail.mbox with 3306 messages.
[**] setting output spool 2013-GMail.mbox.
[**] closing output spool 2013-GMail.mbox with 8 messages.
[**] setting output spool 2014-GMail.mbox.
[**] closing output spool 2014-GMail.mbox with 3593 messages.
[**] setting output spool 2013-GMail.mbox.
[**] closing output spool 2013-GMail.mbox with 10 messages.
[**] setting output spool 2014-GMail.mbox.
[**] closing output spool 2014-GMail.mbox with 6075 messages.
[**] setting output spool 2013-GMail.mbox.
[**] closing output spool 2013-GMail.mbox with 12 messages.
...
[**] closing output spool 2013-GMail.mbox with 23181 messages.
[**] setting output spool 2012-GMail.mbox.
[**] closing output spool 2012-GMail.mbox with 23689 messages.
[**] setting output spool 2007-GMail.mbox.
[**] closing output spool 2007-GMail.mbox with 3 messages.
[**] setting output spool 2012-GMail.mbox.
[**] completed processing 73151 messages into 4 spools in 213 seconds.
"""
import time
import sys
import os
import re
def USAGE():
    """
    Print a one-line usage summary to stderr and terminate the process.

    The script path shown in the message is taken from ``__file__``.
    Never returns: exits via ``sys.exit(1)`` (raises ``SystemExit``).
    """
    sys.stderr.write("Usage: %s <input_mbox>\n" % __file__)
    sys.exit(1)
def dbg(msg, newline=True, wrap=False):
    """
    Write a "[**] "-prefixed status message to stdout.

    msg:     text to print.
    newline: when true (and wrap is false) a "\\n" is appended.
    wrap:    when true a "\\r" is appended instead of a newline, so the next
             message starts at column 0 of the same terminal line (used for
             the running progress counter).

    The stream is flushed after every write. Without this, messages ending
    in "\\r" carry no newline and stay in the stdout buffer, so the progress
    line would not update as messages are processed.
    """
    if newline and not wrap:
        msg += "\n"

    if wrap:
        msg += "\r"

    sys.stdout.write("[**] %s" % msg)
    sys.stdout.flush()
# Matches an mbox "From " separator line (after stripping surrounding
# whitespace) and captures the 4-digit year that ends it. This is a bytes
# pattern because the mbox is read in binary mode.
MBOX_FROM_RE = re.compile(br"^From .* (\d{4})$")


def split_mbox_by_year(input_mbox_path, log):
    """
    Split the mbox at input_mbox_path into one spool file per year.

    Every line matching MBOX_FROM_RE starts a new message; the year captured
    from that line selects the output spool, named "<year>-<input basename>"
    in the same directory as the input. Spools are opened in append mode, so
    existing files of that name are extended rather than overwritten.

    Files are handled as bytes so that mail containing bytes which are not
    valid in the default text encoding is copied through unchanged.

    input_mbox_path: path of the mbox file to read.
    log:             callable invoked as log(message) or
                     log(message, wrap=True) for status output.

    Returns a (msg_count, spool_counts) tuple, where spool_counts maps each
    year string to the number of messages written to that year's spool.
    """
    input_mbox_dir = os.path.dirname(input_mbox_path)
    input_mbox_base = os.path.basename(input_mbox_path)

    spool_year = None
    spool_path = None
    spool_handle = None
    spool_counts = {}
    msg_count = 0
    skipped_lines = 0

    try:
        with open(input_mbox_path, "rb") as mbox:
            for line in mbox:
                msg_start = MBOX_FROM_RE.match(line.strip())

                if msg_start:
                    msg_count += 1
                    log("processing message #%d" % msg_count, wrap=True)
                    year = msg_start.group(1).decode("ascii")

                    # only switch files when the year actually changes.
                    if year != spool_year:
                        if spool_handle:
                            spool_handle.close()
                            log("closing output spool %s with %d messages." % (spool_path, spool_counts[spool_year]))

                        spool_year = year
                        spool_path = os.path.join(input_mbox_dir, "%s-%s" % (spool_year, input_mbox_base))
                        spool_handle = open(spool_path, "ab")
                        log("setting output spool %s." % spool_path)

                    spool_counts[spool_year] = spool_counts.get(spool_year, 0) + 1

                if spool_handle is None:
                    # content ahead of the first recognizable separator has no
                    # year to file it under; count it instead of crashing.
                    skipped_lines += 1
                    continue

                spool_handle.write(line)
    finally:
        # make sure the last spool is flushed and closed, even on error.
        if spool_handle:
            spool_handle.close()

    if spool_handle:
        log("closing output spool %s with %d messages." % (spool_path, spool_counts[spool_year]))

    if skipped_lines:
        log("skipped %d lines found before the first message separator." % skipped_lines)

    return msg_count, spool_counts


if __name__ == "__main__":
    if len(sys.argv) != 2:
        USAGE()

    input_mbox_path = sys.argv[1]

    if not os.path.exists(input_mbox_path):
        sys.stderr.write("[!!] mbox not found: %s\n" % input_mbox_path)
        USAGE()

    # cheap sanity check: an mbox starts with a "From " separator line.
    with open(input_mbox_path, "rb") as fh:
        looks_like_mbox = fh.read(4) == b"From"

    if not looks_like_mbox:
        sys.stderr.write("[!!] %s does not look like a valid mbox file\n" % input_mbox_path)
        USAGE()

    dbg("processing %s..." % input_mbox_path)

    start_time = int(time.time())
    msg_count, spool_counts = split_mbox_by_year(input_mbox_path, dbg)
    delta = int(time.time()) - start_time

    dbg("completed processing %d messages into %d spools in %d seconds." % (msg_count, len(spool_counts), delta))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment