Skip to content

Instantly share code, notes, and snippets.

@mosajjal
Last active November 6, 2022 21:49
Show Gist options
  • Save mosajjal/592d65db89293a90449a38baf8363d1e to your computer and use it in GitHub Desktop.
Save mosajjal/592d65db89293a90449a38baf8363d1e to your computer and use it in GitHub Desktop.
Parse a directory containing .tgz mailbox imports into one JSONL doc + a folder of attachments.
// Email parser built to process large numbers of .tgz exports, each containing one or more .eml files.
package main
import (
"archive/tar"
"bytes"
"compress/gzip"
"crypto/sha256"
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"strings"
flag "github.com/spf13/pflag"
"github.com/ztrue/tracerr"
"github.com/DusanKasan/parsemail"
"golang.org/x/exp/slog"
)
// config collects the command-line settings and the open output handle used
// while processing archives.
type config struct {
	// InputRoot is the directory containing all the .tgz eml files.
	InputRoot string
	// AttachmentFolder is the output directory for the extracted attachments.
	AttachmentFolder string
	// OutJSON is the path of the output JSONL file.
	OutJSON string

	// out is the file created at OutJSON; parsed e-mails are written to it.
	out *os.File
}
// Package-level state shared by the functions in this file.
var (
	// c holds the settings that main fills in from the command-line flags.
	c config

	// log is a text-format logger that writes to standard error.
	log = slog.New(slog.NewTextHandler(os.Stderr))
)
// attachmentHash pairs an attachment's file name, as given in the e-mail,
// with the digest of its contents.
type attachmentHash struct {
	// Filename is the attachment's name as reported by the parsed e-mail.
	Filename string
	// SHA256 is the lowercase hex SHA-256 digest of the attachment bytes.
	SHA256 string
}
// emailWithAttachment is the record serialised for each e-mail: the parsed
// message together with one attachmentHash entry per attachment.
type emailWithAttachment struct {
	// Email is the message as returned by parsemail.Parse.
	Email parsemail.Email
	// AttachmentFiles lists the attachments in the order they appear in
	// Email.Attachments.
	AttachmentFiles []attachmentHash
}
// createAttachmentFile writes fileBytes to a file called filename inside
// c.AttachmentFolder, creating the file or truncating an existing one.
//
// os.WriteFile is used so that the file handle is always closed, whether or
// not the write succeeds; the 0666 mode (before umask) matches what os.Create
// would apply.
//
// It returns nil on success, or the underlying error wrapped with tracerr.
func createAttachmentFile(fileBytes []byte, filename string) error {
	dst := filepath.Join(c.AttachmentFolder, filename)
	if err := os.WriteFile(dst, fileBytes, 0o666); err != nil {
		return tracerr.Wrap(err)
	}
	return nil
}
// parseEML parses a single e-mail message from reader and stores each of its
// attachments on disk.
//
// Every attachment is read fully into memory, hashed with SHA-256, recorded
// in the result under its original file name, and then written through
// createAttachmentFile as "<hex digest>.attachment".
//
// On failure the error is wrapped with tracerr and the result holds whatever
// had been collected up to that point.
func parseEML(reader io.Reader) (emailWithAttachment, error) {
	var result emailWithAttachment

	msg, err := parsemail.Parse(reader)
	if err != nil {
		return result, tracerr.Wrap(err)
	}
	result.Email = msg

	for _, att := range msg.Attachments {
		data, readErr := io.ReadAll(att.Data)
		if readErr != nil {
			return result, tracerr.Wrap(readErr)
		}

		// The digest doubles as the on-disk file name of the attachment.
		sum := sha256.Sum256(data)
		digest := fmt.Sprintf("%x", sum[:])

		// The entry is recorded before the write, so it is present in the
		// result even when writing the file fails.
		result.AttachmentFiles = append(result.AttachmentFiles, attachmentHash{
			Filename: att.Filename,
			SHA256:   digest,
		})

		if writeErr := createAttachmentFile(data, digest+".attachment"); writeErr != nil {
			return result, tracerr.Wrap(writeErr)
		}
	}
	return result, nil
}
// main parses the command-line flags, creates the output JSONL file, and
// walks the input directory, handing every non-directory entry whose path
// ends in ".tgz" to processTarFile.
//
// The process exits with status 1 if the output file cannot be created or
// closed, or if the directory walk reports an error.
func main() {
	// configure
	flag.StringVar(&c.InputRoot, "input", "", "Directory containing all the .tgz eml files")
	flag.StringVar(&c.AttachmentFolder, "attachments", "", "Output Directory for all the extracted attachments")
	flag.StringVar(&c.OutJSON, "output", "", "output JSONL file path")
	flag.Parse()

	out, err := os.Create(c.OutJSON)
	if err != nil {
		// Without an output file every later write would hit a nil handle.
		log.Error("error in creating output file", err)
		os.Exit(1)
	}
	c.out = out

	walkErr := filepath.Walk(c.InputRoot, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			// info may be nil when err is set (for example when the root
			// itself cannot be read), so it must not be touched here.
			return tracerr.Wrap(err)
		}
		if !info.IsDir() && strings.HasSuffix(path, ".tgz") {
			log.Info("processing " + path)
			processTarFile(path)
		}
		return nil
	})

	// Close explicitly rather than with defer: os.Exit below would skip
	// deferred calls, and a failed close can mean lost output.
	closeErr := c.out.Close()

	if walkErr != nil {
		tracerr.Print(walkErr)
	}
	if closeErr != nil {
		tracerr.Print(tracerr.Wrap(closeErr))
	}
	if walkErr != nil || closeErr != nil {
		os.Exit(1)
	}
}
// processTarFile reads the gzip-compressed tar archive at srcFile and, for
// every regular entry whose name ends in ".eml", parses the message with
// parseEML and appends it to c.out as one JSON document per line.
//
// Failures to open, decompress or iterate the archive, and failures to write
// to c.out, are logged and terminate the process with exit status 1. A
// message that fails to parse or to marshal is reported through
// tracerr.Print and skipped, so no empty or partial record is written for
// it. Directory entries are ignored; any other entry type is logged and
// skipped.
func processTarFile(srcFile string) {
	f, err := os.Open(srcFile)
	if err != nil {
		log.Error("error in openning tar file", err)
		os.Exit(1)
	}
	defer f.Close()

	gzf, err := gzip.NewReader(f)
	if err != nil {
		log.Error("error in deflating tar file", err)
		os.Exit(1)
	}
	// Closing the gzip reader does not close f; both need their own Close.
	defer gzf.Close()

	tarReader := tar.NewReader(gzf)
	for {
		header, err := tarReader.Next()
		if err == io.EOF {
			return
		}
		if err != nil {
			log.Error("error in reading tar file", err)
			os.Exit(1)
		}

		name := header.Name
		switch header.Typeflag {
		case tar.TypeDir:
			continue
		case tar.TypeReg:
			if !strings.HasSuffix(name, ".eml") {
				continue
			}

			// tarReader is positioned on the current entry, so parseEML
			// reads exactly this message.
			email, err := parseEML(tarReader)
			if err != nil {
				tracerr.Print(err)
				continue
			}

			marshalledEmail, err := json.Marshal(email)
			if err != nil {
				tracerr.Print(err)
				continue
			}

			if _, err := fmt.Fprintf(c.out, "%s\n", marshalledEmail); err != nil {
				// A failing output file makes all further work pointless.
				log.Error("error in writing to output file", err)
				os.Exit(1)
			}
		default:
			log.Info(fmt.Sprintf("%s : %c %s %s\n",
				"Yikes! Unable to figure out type",
				header.Typeflag,
				"in file",
				name,
			))
		}
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment