Last active
November 6, 2022 21:49
-
-
Save mosajjal/592d65db89293a90449a38baf8363d1e to your computer and use it in GitHub Desktop.
Parse a directory containing .tgz mailbox imports into one JSONL doc + a folder of attachments.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Email parser built to process large numbers of .tgz exports, each containing one or more .eml files.
package main | |
import ( | |
"archive/tar" | |
"bytes" | |
"compress/gzip" | |
"crypto/sha256" | |
"encoding/json" | |
"fmt" | |
"io" | |
"os" | |
"path/filepath" | |
"strings" | |
flag "github.com/spf13/pflag" | |
"github.com/ztrue/tracerr" | |
"github.com/DusanKasan/parsemail" | |
"golang.org/x/exp/slog" | |
) | |
// config carries the command-line settings plus the open output file that
// the rest of the program writes to.
type config struct {
	// InputRoot is the directory that is walked looking for .tgz files.
	InputRoot string
	// AttachmentFolder is the directory extracted attachments are written to.
	AttachmentFolder string
	// OutJSON is the path of the JSONL output file.
	OutJSON string

	// out is the handle opened from OutJSON; one JSON document is written
	// to it per line.
	out *os.File
}
var c config | |
var log = slog.New(slog.NewTextHandler(os.Stderr)) | |
// attachmentHash pairs an attachment's filename, as reported by the parsed
// email, with the hex-encoded SHA-256 digest of its contents.
type attachmentHash struct {
	// Filename is the attachment name taken from the email.
	Filename string
	// SHA256 is the lowercase hex digest of the attachment bytes.
	SHA256 string
}
type emailWithAttachment struct { | |
Email parsemail.Email | |
AttachmentFiles []attachmentHash | |
} | |
func createAttachmentFile(fileBytes []byte, filename string) error { | |
f, err := os.Create(filepath.Join(c.AttachmentFolder, filename)) | |
if err != nil { | |
return tracerr.Wrap(err) | |
} | |
_, err = f.Write(fileBytes) | |
return tracerr.Wrap(err) | |
} | |
func parseEML(reader io.Reader) (emailWithAttachment, error) { | |
outEmail := emailWithAttachment{} | |
var email parsemail.Email | |
email, err := parsemail.Parse(reader) | |
if err != nil { | |
return outEmail, tracerr.Wrap(err) | |
} | |
outEmail.Email = email | |
for _, attach := range email.Attachments { | |
buf := new(bytes.Buffer) | |
_, err = buf.ReadFrom(attach.Data) | |
if err != nil { | |
return outEmail, tracerr.Wrap(err) | |
} | |
// create hash of the data bytes | |
h := sha256.New() | |
h.Write(buf.Bytes()) | |
hash := fmt.Sprintf("%x", h.Sum(nil)) | |
outEmail.AttachmentFiles = append(outEmail.AttachmentFiles, attachmentHash{Filename: attach.Filename, SHA256: hash}) | |
err = createAttachmentFile(buf.Bytes(), fmt.Sprintf("%s.attachment", hash)) | |
if err != nil { | |
return outEmail, tracerr.Wrap(err) | |
} | |
} | |
return outEmail, tracerr.Wrap(err) | |
} | |
func main() { | |
//configure | |
flag.StringVar(&c.InputRoot, "input", "", "Directory containing all the .tgz eml files") | |
flag.StringVar(&c.AttachmentFolder, "attachments", "", "Output Directory for all the extracted attachments") | |
flag.StringVar(&c.OutJSON, "output", "", "output JSONL file path") | |
flag.Parse() | |
c.out, _ = os.Create(c.OutJSON) | |
err := filepath.Walk(c.InputRoot, func(path string, info os.FileInfo, err error) error { | |
if !info.IsDir() && strings.HasSuffix(path, ".tgz") { | |
log.Info("processing " + path) | |
processTarFile(path) | |
} | |
return nil | |
}) | |
if err != nil { | |
tracerr.Print(err) | |
} | |
} | |
func processTarFile(srcFile string) { | |
f, err := os.Open(srcFile) | |
if err != nil { | |
log.Error("error in openning tar file", err) | |
os.Exit(1) | |
} | |
defer f.Close() | |
gzf, err := gzip.NewReader(f) | |
if err != nil { | |
log.Error("error in deflating tar file", err) | |
os.Exit(1) | |
} | |
tarReader := tar.NewReader(gzf) | |
for { | |
header, err := tarReader.Next() | |
if err == io.EOF { | |
break | |
} | |
if err != nil { | |
log.Error("error in reading tar file", err) | |
os.Exit(1) | |
} | |
name := header.Name | |
switch header.Typeflag { | |
case tar.TypeDir: | |
continue | |
case tar.TypeReg: | |
if strings.HasSuffix(name, ".eml") { | |
email, err := parseEML(tarReader) | |
if err != nil { | |
tracerr.Print(err) | |
} | |
marshalledEmail, err := json.Marshal(email) | |
if err != nil { | |
tracerr.Print(err) | |
} | |
fmt.Fprintf(c.out, "%s\n", marshalledEmail) | |
} | |
default: | |
log.Info(fmt.Sprintf("%s : %c %s %s\n", | |
"Yikes! Unable to figure out type", | |
header.Typeflag, | |
"in file", | |
name, | |
)) | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment