Last active
July 15, 2023 19:07
-
-
Save edsu/760ff538274756b6a793e1982a1f2084 to your computer and use it in GitHub Desktop.
Convert Yahoo Groups WARC archive files to MBOX files: see https://archive.org/search?query=subject%3A%22yahoo+groups%22
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# run like this:
#
# $ python3 warc2mbox.py yahoo-groups-2016-03-20T12:45:19Z-nyzp9w.warc.gz
#
# and it will generate an mbox file for each Yahoo Group:
#
# $ ls -l mboxes
# -rw-r--r-- 1 edsummers staff 12522488 Jul 15 14:14 amicigranata.mbox
# -rw-r--r-- 1 edsummers staff 6377115 Jul 15 14:14 black-white_a.mbox
# -rw-r--r-- 1 edsummers staff 2207823 Jul 15 14:14 boukman.mbox
# -rw-r--r-- 1 edsummers staff 781270 Jul 15 14:14 deardavidbeckham.mbox
# -rw-r--r-- 1 edsummers staff 95302 Jul 15 14:14 drawingroom2.mbox
# -rw-r--r-- 1 edsummers staff 3048044 Jul 15 14:14 dreamwavescomics.mbox
# -rw-r--r-- 1 edsummers staff 1962908885 Jul 15 14:14 evolutionary-psychology.mbox
# ...
#
import sys | |
import json | |
import pathlib | |
import mailbox | |
from warcio.archiveiterator import ArchiveIterator | |
# The WARC file to convert is the script's first command-line argument.
# Exit with a usage message instead of an IndexError traceback when it is
# missing.
if len(sys.argv) < 2:
    sys.exit(f"usage: {sys.argv[0]} <yahoo-groups.warc.gz>")
warc_file = sys.argv[1]

# The file object is handed straight to ArchiveIterator and has to stay open
# while `warc` is iterated further down; it is not closed explicitly here.
warc = ArchiveIterator(open(warc_file, 'rb'))

# Already-opened mailboxes, keyed by group name.
mboxes = {}

# Every mbox file is written beneath ./mboxes. exist_ok=True makes the call a
# no-op when the directory is already there, without a separate is_dir()
# check that could race with another process creating it.
mboxes_dir = pathlib.Path('mboxes')
mboxes_dir.mkdir(exist_ok=True)
def get_mbox(uri):
    """Return the mailbox for the group named in *uri*, opening it on first use.

    The group name is the third '/'-separated component of *uri*. Each group
    gets one ``mailbox.mbox`` at ``mboxes_dir / "<group>.mbox"``; the object
    is cached in the module-level ``mboxes`` dict so that later calls for the
    same group return the same instance.

    Raises IndexError if *uri* has fewer than three '/'-separated components.
    """
    name = uri.split('/')[2]
    if name not in mboxes:
        mboxes[name] = mailbox.mbox(mboxes_dir / f"{name}.mbox")
    return mboxes[name]
# Walk the WARC and append every raw e-mail to the mbox of its group.
try:
    for record in warc:
        # Only 'resource' records are considered.
        if record.rec_type != 'resource':
            continue
        uri = record.rec_headers['WARC-Target-URI']
        # Only URIs ending in 'raw' are treated as raw-message payloads.
        if not uri.endswith('raw'):
            continue
        # The payload is JSON; the message text is under the 'rawEmail' key.
        raw = json.load(record.raw_stream)
        # mbox.add() is given bytes, so the text is encoded as UTF-8 first.
        get_mbox(uri).add(raw['rawEmail'].encode('utf8'))
finally:
    # Flush and close every mailbox that was opened, including when a record
    # fails to parse part-way through, so no mbox file handle is left open.
    for opened in mboxes.values():
        opened.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment