Skip to content

Instantly share code, notes, and snippets.

@bonzini
Last active March 30, 2024 02:40
Show Gist options
  • Save bonzini/d5bc1946475487167c529f9699e39512 to your computer and use it in GitHub Desktop.
Save bonzini/d5bc1946475487167c529f9699e39512 to your computer and use it in GitHub Desktop.
Split a mailbox into separate patch files, stripping the transfer encoding and minimizing the headers along the way.
#! /usr/bin/env python3
# mbox_split.py
#
# Split a mailbox into separate patch files, stripping the transfer encoding
# and minimizing the headers along the way.
#
# Written by Paolo Bonzini <pbonzini@redhat.com>
import argparse
import re
import sys
import email.parser, email.header
def subj_to_name(subj):
    """Convert a patch e-mail subject to a filename.

    The result has the form ``NNNN-area-text.patch`` where:

    * ``NNNN`` is the patch number taken from a leading ``[... N/M ...]``
      tag (1 if there is no such tag, or it carries no ``N/M`` counter);
    * ``area`` is the ``[tag]`` or ``prefix:`` word that follows
      ("misc" if there is none);
    * ``text`` is whatever remains of the subject.

    Every character outside ``[a-zA-Z0-9_-]`` is turned into a dash, runs
    of dashes are collapsed and leading/trailing dashes are dropped.

    Args:
        subj: The subject text.  ``None`` (no Subject header) is treated
            as an empty subject.

    Returns:
        The file name as a ``str``.
    """

    # You can write Perl in any language. - Edsger Dijkstra, probably.
    def dashify(text):
        """Reduce text to dash-separated, filename-safe characters."""
        text = re.sub(r"[^a-zA-Z0-9_-]", "-", text)
        text = re.sub(r"--+", "-", text)
        # Only dashes can remain at the edges: dots were replaced above.
        return text.strip("-")

    subj = "" if subj is None else str(subj)

    # Unfold continuation lines.  No flags are needed here, and they must
    # not be passed positionally anyway: the fourth positional parameter of
    # re.sub() is ``count``, not ``flags``.
    subj = re.sub(r"\n\s+", " ", subj)

    # Leading "[PATCH n/m]"-style tag: extract the patch number and drop it.
    m = re.match(r"""\s* (\[ [^]]* \] )""", subj, re.X)
    num = 1
    if m:
        m2 = re.search(r"""([0-9]+)/[0-9]+""", m.group(0), re.X)
        if m2:
            num = int(m2.group(1))
        subj = subj[m.end():]

    # Optional second "[tag]" or "subsystem:" prefix becomes the area.
    m = re.match(r"""\s* ( \[ [^]]* \] | \S+: )?""", subj, re.X)
    area = "misc"
    if m and m.group(1):
        area = dashify(m.group(1))
        subj = subj[m.end():]

    text = dashify(subj.strip())
    return "%04d-%s-%s.patch" % (num, area, text)
# Recognizes the three shapes a patch body can take.  The pattern is compiled
# in verbose mode, where bare whitespace is ignored, so every space that must
# match literally is written as "[ ]".
_PATCH_RE = re.compile(
    rb"""
      ^---.* ^\+\+\+.* ^@@                        # textual diff with a hunk
    | ^diff.* ^index.* ^GIT[ ]binary[ ]patch      # git binary patch
    | ^diff.* ^old[ ]mode[ ].* ^new[ ]mode        # mode-only change
    """,
    re.M | re.S | re.X,
)


def has_patch(body):
    """Return whether the body includes a patch.

    Args:
        body: The decoded message body as ``bytes``.  An empty or ``None``
            body contains no patch.

    Returns:
        A truthy match object if the body contains a textual diff, a git
        binary patch or a mode-only change; ``None`` otherwise.
    """
    if not body:
        return None
    return _PATCH_RE.search(body)
def header_to_string(v):
    """Decode a MIME (RFC 2047) encoded header value.

    Returns an ``email.header.Header`` built from the decoded parts of v;
    calling ``str()`` on it (or formatting it with ``%s``) gives the
    Unicode text.
    """
    decoded_parts = email.header.decode_header(v)
    return email.header.make_header(decoded_parts)
def do_single(msg, outfile=None):
    """Write a minimized copy of a message.

    Only the From, Subject, Date and Content-Type headers are kept, and the
    body is written with its content-transfer-encoding removed.  For a
    multipart message the body (and the Content-Type describing it) comes
    from the first non-multipart part.

    The module-level ``args`` controls two things: unless ``args.keep_cr``
    is set, CRLF line endings in the body become LF; unless
    ``args.nopatch`` is set, messages whose body does not contain a patch
    are skipped.

    Args:
        msg: The parsed ``email.message.Message``.
        outfile: Binary stream to write to.  It is left open, since the
            caller owns it.  If ``None``, the output goes to a new file
            named after the subject, and that name is printed to stdout.
    """

    def first_leaf(message):
        """Return the first non-multipart part of message."""
        for part in message.walk():
            if not part.is_multipart():
                return part
        return message

    def open_output_file(message):
        """Open (for binary writing) the file named after the subject."""
        subject = message["Subject"]
        # Decode RFC 2047 encoded-words first; otherwise a "[PATCH n/m]"
        # tag hidden inside "=?utf-8?q?...?=" would not be recognized.
        subject = "" if subject is None else str(header_to_string(subject))
        name = subj_to_name(subject)
        print(name)
        return open(name, "wb")

    def write_message(f):
        """Write the retained headers, a blank line and the body to f."""
        for k in ("From", "Subject", "Date", "Content-Type"):
            # The body is a single part, so its own Content-Type must be
            # used rather than the enclosing "multipart/...; boundary=".
            source = container if k == "Content-Type" else msg
            if k in source:
                f.write(("%s: %s\n" % (k, header_to_string(source[k]))).encode())

        f.write(b"\n")
        f.write(body)

    container = first_leaf(msg)
    # get_payload(decode=True) returns None when there is nothing to decode
    # (e.g. a multipart container without parts).
    body = container.get_payload(decode=True) or b""
    if not args.keep_cr:
        body = body.replace(b"\r\n", b"\n")
    if not args.nopatch and not has_patch(body):
        return

    if outfile is not None:
        # Do not close a stream that belongs to the caller.
        write_message(outfile)
    else:
        with open_output_file(msg) as f:
            write_message(f)
def split_mbox(stream, func):
    """Split an mbox file and pass each message to a function.

    Lines starting with "From " separate messages and are not part of any
    message.  Blank lines before the first line of a message are skipped,
    and body lines quoted as ">From " get their ">" removed again.

    Args:
        stream: An iterable of ``bytes`` lines, such as a file opened in
            binary mode.
        func: Called once per message with the parsed
            ``email.message.Message``, in mailbox order.
    """
    parser = None
    for line in stream:
        if line.startswith(b"From "):
            # A separator line finishes the previous message, if any.
            if parser is not None:
                func(parser.close())
                parser = None
            continue

        if parser is None:
            if line.strip() == b"":
                continue
            parser = email.parser.BytesFeedParser()

        # Undo mbox quoting.  The trailing space matters: lines such as
        # ">Fromage" or ">From: x" were never quoted and must be kept.
        if line.startswith(b">From "):
            line = line[1:]
        parser.feed(line)

    if parser is not None:
        func(parser.close())
# Command-line entry point: parse the options, then either treat the input
# as one message written to stdout (--single) or split it as an mbox into
# one file per message.
parser = argparse.ArgumentParser(
    description="Splits a given mailbox into separate patch files"
)
parser.add_argument(
    "--nopatch",
    action="store_true",
    default=False,
    help="exports even if it's not a patch",
)
parser.add_argument(
    "--single",
    action="store_true",
    default=False,
    help="do not split mbox file",
)
parser.add_argument(
    "--keep-cr",
    action="store_true",
    default=False,
    help=r"do not remove \r from lines ending with \r\n",
)
parser.add_argument(
    "mbox",
    metavar="<mailbox file>",
    nargs="?",
    help='specifies the mailbox file; if "-" or absent, read from stdin',
)

# NOTE: args must remain a module-level name, because do_single() reads
# args.keep_cr and args.nopatch from it.
args = parser.parse_args()

if not args.mbox or args.mbox == "-":
    infile = sys.stdin.buffer
else:
    infile = open(args.mbox, "rb")

try:
    if args.single:
        msg = email.parser.BytesParser().parse(infile)
        do_single(msg, sys.stdout.buffer)
    else:
        split_mbox(infile, do_single)
finally:
    # Close the mailbox if it was opened here; stdin is left alone.
    if infile is not sys.stdin.buffer:
        infile.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment