Skip to content

Instantly share code, notes, and snippets.

@corydolphin
Created February 3, 2012 06:57
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save corydolphin/1728592 to your computer and use it in GitHub Desktop.
Save corydolphin/1728592 to your computer and use it in GitHub Desktop.
Convert Mailman archive to text and mbox formatted archives.
#!/usr/bin/env python
"""
mailmanToMBox.py: Inserts line feeds to create mbox format from Mailman Gzip'd
Text archives
Usage: ./to-mbox.py dir
Where dir is a directory containing .txt.gz files pulled from mailman Gzip'd Text
"""
import sys
import os
def makeMBox(fIn,fOut):
    """
    Convert one gzip-compressed Mailman text archive into an mbox file.

    Adapted from
    http://lists2.ssc.com/pipermail/linux-list/2006-February/026220.html

    fIn  -- path of the ``.txt.gz`` archive to read.
    fOut -- path of the mbox file to create.

    Every line that starts with "From " is treated as a message separator:
    a blank line is written before it (except before the first one) and the
    " at " obfuscation on that line is turned back into "@".  All other lines
    are copied unchanged.

    Returns True when fOut was written, and False when fIn does not exist or
    fOut already exists (an existing file is never overwritten).
    """
    # Imported here so this function does not depend on the file's import block.
    import gzip

    if not os.path.exists(fIn):
        return False
    if os.path.exists(fOut):
        return False

    messageCount = 0
    # The archive is gzip-compressed, so it has to be read through gzip;
    # reading it with plain open() copies compressed bytes into the output.
    # Working on bytes keeps whatever encoding each message used intact.
    with gzip.open(fIn, "rb") as src:
        with open(fOut, "wb") as out:
            for line in src:
                if line.startswith(b"From "):
                    if messageCount:
                        out.write(b"\n")
                    messageCount += 1
                    line = line.replace(b" at ", b"@")
                out.write(line)
    return True
if __name__ == '__main__':
    # Command-line entry point: convert every ``*.txt.gz`` archive found
    # under the directory given as the only argument.
    if len(sys.argv) != 2:
        # Wrong usage: show the module docstring and signal failure.
        print(__doc__)
        sys.exit(1)
    rootDir = sys.argv[1]
    numConv = 0
    suffix = '.txt.gz'
    for root, dirs, files in os.walk(rootDir):
        for fil in files:
            # Match the suffix only, not '.txt.gz' anywhere in the name.
            if not fil.endswith(suffix):
                continue
            # os.walk descends into subdirectories, so the file lives in
            # `root`, which is not necessarily `rootDir`.
            inFile = os.path.join(root, fil)
            outFile = inFile[:-len(suffix)] + '.mbox'
            if makeMBox(inFile, outFile):
                numConv += 1
            else:
                # A single string argument prints the same text under
                # Python 2 and Python 3 (no tuple repr).
                print(outFile + ' already exists, did not overwrite')
    print('Converted ' + str(numConv) + ' archives to mbox format')
@corydolphin
Copy link
Author

Hey @OlinSLAC we can perhaps salvage some of this for indexing email archives.

@lukemunn
Copy link

Hey Cory, it looks like you have an error in this - or at least it didn't work for me as written. The files should be gunzipped before being processed. Right now they're processed as-is, and I get what look like binary mbox files, which totally don't work! :-) Two changes fix it:

line 9: import gzip

line 23: for line in gzip.open(fIn):

@gagarine
Copy link

gagarine commented May 9, 2018

I converted it to Python 3. I also stopped decompressing the .gz files in Python.

#!/bin/bash

# Download the list's monthly archives and decompress them
wget -r -l1 --no-parent --no-directories "http://www.host.com/pipermail/yourList.com/" -P ./yourList.com -A "*-*.txt.gz"
gzip -d yourList.com/*.txt.gz

# Convert each decompressed archive to mbox
./mailmanToMBox.py yourList.com

# Concatenate the per-month mbox files into a single mailbox.
# The result is written outside yourList.com so that a re-run does not pick
# it up again through the *.mbox glob.
cat yourList.com/*.mbox > yourList.com-all.mbox
#!/usr/bin/env python3
"""
mailmanToMBox.py:  Inserts line feeds to create mbox format from Mailman Gzip'd
Text archives decompressed
Usage:   ./to-mbox.py  dir
Where dir is a directory containing .txt files pulled from mailman Gzip'd Text and decompressed
"""
import sys
import os
import tokenize

def main():
    """
    Convert every ``*.txt`` archive under the directory named on the command line.

    Expects exactly one argument, the directory to scan (subdirectories are
    included).  Each ``name.txt`` is converted to ``name.mbox`` next to it via
    makeMBox(); files that makeMBox() declines to convert are reported and
    left alone.  A summary count is printed at the end.

    With any other number of arguments the module docstring is printed and
    the process exits with status 1.
    """
    if len(sys.argv) != 2:
        print(__doc__)
        sys.exit(1)

    rootDir = sys.argv[1]
    numConv = 0
    suffix = '.txt'
    for root, dirs, files in os.walk(rootDir):
        for fil in files:
            # Match the suffix only, not '.txt' anywhere in the name
            # (which would also pick up e.g. still-compressed '.txt.gz').
            if not fil.endswith(suffix):
                continue
            # os.walk descends into subdirectories, so the file lives in
            # `root`, which is not necessarily `rootDir`.
            inFile = os.path.join(root, fil)
            outFile = inFile[:-len(suffix)] + '.mbox'
            print('Converting ',fil,' to mbox format')
            if makeMBox(inFile, outFile):
                numConv += 1
            else:
                # Separate arguments, not a tuple, so the message prints as text.
                print(outFile,' already exists, did not overwrite')
    print('Converted ' ,str(numConv),'archives to mbox format')
    

def makeMBox(fIn,fOut):
    """
    Convert one decompressed Mailman text archive into an mbox file.

    Adapted from
    http://lists2.ssc.com/pipermail/linux-list/2006-February/026220.html

    fIn  -- path of the decompressed ``.txt`` archive to read.
    fOut -- path of the mbox file to create.

    Every line that starts with "From " is treated as a message separator:
    a blank line is written before it (except before the first one) and the
    " at " obfuscation on that line is turned back into "@".  All other lines
    are copied unchanged.  Bytes that cannot be decoded are replaced with
    U+FFFD, and the output is always written as UTF-8.

    Returns True when fOut was written, and False when fIn does not exist or
    fOut already exists (an existing file is never overwritten).
    """
    if not os.path.exists(fIn):
        return False
    if os.path.exists(fOut):
        return False

    # Detect the input encoding (BOM or coding cookie, UTF-8 otherwise).
    # detect_encoding() is meant for Python source and raises SyntaxError when
    # the first lines are not valid UTF-8 or name an unknown codec; in that
    # case fall back to UTF-8 and let errors="replace" absorb the bad bytes.
    try:
        with open(fIn, 'rb') as probe:
            fInCodec = tokenize.detect_encoding(probe.readline)[0]
    except SyntaxError:
        fInCodec = 'utf-8'

    messageCount = 0
    # The output encoding is explicit so that replacement characters can
    # always be written, whatever the locale's default encoding is.
    with open(fIn, 'rt', encoding=fInCodec, errors="replace") as src, \
            open(fOut, 'w', encoding='utf-8') as out:
        for line in src:
            if line.startswith("From "):
                if messageCount:
                    out.write("\n")
                messageCount += 1
                line = line.replace(" at ", "@")
            out.write(line)
    return True

# Entry point: run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment