Created
February 3, 2012 06:57
-
-
Save corydolphin/1728592 to your computer and use it in GitHub Desktop.
Convert Mailman archive to text and mbox formatted archives.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
mailmanToMBox.py: Inserts line feeds to create mbox format from Mailman Gzip'd | |
Text archives | |
Usage: ./mailmanToMBox.py dir | |
Where dir is a directory containing .txt.gz files pulled from mailman Gzip'd Text | |
""" | |
import sys | |
import os | |
def makeMBox(fIn, fOut):
    """Convert one Mailman text archive into an mbox file.

    Based on
    http://lists2.ssc.com/pipermail/linux-list/2006-February/026220.html

    Every line that starts with "From " is treated as a message separator:
    a blank line is written before it (except before the first one) and the
    Mailman " at " obfuscation on that line is turned back into "@".  All
    other lines are copied unchanged.

    fIn  -- path of the archive to read; gzip-compressed input (as downloaded
            from Mailman's "Gzip'd Text" link) is decompressed on the fly.
    fOut -- path of the mbox file to create.

    Returns True when fOut was written, False when fIn does not exist or
    fOut already exists (existing output is never overwritten).
    """
    # Imported here because the file's top-level imports do not include gzip.
    import gzip

    if not os.path.exists(fIn) or os.path.exists(fOut):
        return False

    # Sniff the gzip magic number instead of trusting the file name, so both
    # compressed and already-decompressed archives are handled.
    with open(fIn, "rb") as probe:
        is_gzip = probe.read(2) == b"\x1f\x8b"
    opener = gzip.open if is_gzip else open

    # Work on bytes: an archive mixes messages in different charsets, so
    # there is no single text encoding that is safe to decode it with.
    seen_separator = False
    with opener(fIn, "rb") as src, open(fOut, "wb") as out:
        for line in src:
            if line.startswith(b"From "):
                if seen_separator:
                    # mbox expects an empty line between messages.
                    out.write(b"\n")
                seen_separator = True
                line = line.replace(b" at ", b"@")
            out.write(line)
    return True
# Script entry point: convert every *.txt.gz archive found under the directory
# given on the command line into a sibling *.mbox file.
if __name__ == '__main__':
    if len(sys.argv) != 2:
        # Single-argument print() behaves the same on Python 2 and 3.
        print(__doc__)
        sys.exit()
    rootDir = sys.argv[1]
    numConv = 0
    suffix = '.txt.gz'
    for root, dirs, files in os.walk(rootDir):
        for fil in files:
            # Match the extension only, not any name that merely contains it.
            if not fil.endswith(suffix):
                continue
            # Join with the directory os.walk is currently visiting; joining
            # with rootDir breaks for archives found in subdirectories.
            inFile = os.path.join(root, fil)
            # Swap only the trailing suffix so directory names are untouched.
            outFile = inFile[:-len(suffix)] + '.mbox'
            if makeMBox(inFile, outFile):
                numConv += 1
            else:
                print('%s already exists, did not overwrite' % outFile)
    print('Converted %d archives to mbox format' % numConv)
hey cory, looks like you have an error in this - or at least it didn't work for me like this. The files should be gunzipped before being processed. Right now they're just being processed as is, and I get what look like binary mbox files, which totally don't work! :-)
line 9: import gzip
line 23: for line in gzip.open(fIn):
I converted it to Python 3. I also stopped decompressing the .gz files in Python.
#!/bin/bash
# Download a Mailman list archive, convert each monthly file to mbox and merge
# the results into one mailbox.

# Placeholder: local directory for the archive; replace with your list's name.
list_dir="yourList.com"

# Download the monthly gzip'd text archives (one level deep, flat into $list_dir)
wget -r -l1 --no-parent --no-directories "http://www.host.com/pipermail/yourList.com/" -P "./${list_dir}" -A "*-*.txt.gz"
# Unzip them: the converter below expects plain .txt files
gzip -d "${list_dir}"/*.txt.gz
# Convert to mbox
./mailmanToMBox.py "${list_dir}"
# Concatenate the monthly mbox files.
# The merged file is written next to (not inside) $list_dir so that a rerun's
# *.mbox glob does not pick up the previous merge result as an input.
cat "${list_dir}"/*.mbox > "${list_dir}-all.mbox"
#!/usr/bin/env python3
"""
mailmanToMBox.py: Inserts line feeds to create mbox format from Mailman Gzip'd
Text archives decompressed
Usage: ./mailmanToMBox.py dir
Where dir is a directory containing .txt files pulled from mailman Gzip'd Text and decompressed
"""
import sys
import os
import tokenize
def main():
    """Convert every .txt archive under the directory named in sys.argv[1].

    Walks the directory tree and, for each file whose name ends in ".txt",
    calls makeMBox() to write a sibling file with the ".mbox" extension.
    Prints the module docstring and exits when the argument count is wrong,
    and prints a summary count when done.
    """
    if len(sys.argv) != 2:
        print(__doc__)
        sys.exit()
    rootDir = sys.argv[1]
    numConv = 0
    for root, _dirs, files in os.walk(rootDir):
        for fil in files:
            # Match the extension only, not any name that merely contains it.
            if not fil.endswith('.txt'):
                continue
            # Join with the directory os.walk is currently visiting; joining
            # with rootDir breaks for archives found in subdirectories.
            inFile = os.path.join(root, fil)
            # splitext swaps only the trailing extension, leaving any ".txt"
            # inside directory names alone.
            outFile = os.path.splitext(inFile)[0] + '.mbox'
            print('Converting', fil, 'to mbox format')
            if makeMBox(inFile, outFile):
                numConv += 1
            else:
                print(outFile, 'already exists, did not overwrite')
    print('Converted', numConv, 'archives to mbox format')
def makeMBox(fIn, fOut):
    """Convert one decompressed Mailman text archive into an mbox file.

    Based on
    http://lists2.ssc.com/pipermail/linux-list/2006-February/026220.html

    Every line that starts with "From " is treated as a message separator:
    a blank line is written before it (except before the first one) and the
    Mailman " at " obfuscation on that line is turned back into "@".  All
    other lines are copied unchanged.

    fIn  -- path of the plain-text archive to read.
    fOut -- path of the mbox file to create.

    Returns True when fOut was written, False when fIn does not exist or
    fOut already exists (existing output is never overwritten).
    """
    if not os.path.exists(fIn) or os.path.exists(fOut):
        return False

    # Work on raw bytes rather than guessing a text encoding: an archive
    # mixes messages in different charsets, and copying the bytes through
    # untouched avoids both decode failures and replacement-character
    # corruption of message bodies.
    seen_separator = False
    with open(fIn, "rb") as src, open(fOut, "wb") as out:
        for line in src:
            if line.startswith(b"From "):
                if seen_separator:
                    # mbox expects an empty line between messages.
                    out.write(b"\n")
                seen_separator = True
                line = line.replace(b" at ", b"@")
            out.write(line)
    return True
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hey @OlinSLAC we can perhaps salvage some of this for indexing email archives.