beamzer/parseml.py

## parseml.py
#!/usr/bin/env python3.7

"""
source: https://gist.github.com/urschrei/5258588 by Stephan Hügel
2020 update:
- More iterators, fewer lists
- Python 3 compatible
- Processes files in parallel
(one worker process per CPU core, via multiprocessing.Pool)

Ewald ( https://gist.github.com/beamzer/8a1e9629c203eaa9eb8d2fb4725b053a )
2020-09-20:
- handling of same filenames (write everything because contents might be different)
- handling of filenames with *
- handling of mkdir errors
- added arguments
2020-09-23 (v1.2)
- version_nr before extension
- error handling for utf-8 chars in eml (on error continue)
2020-10-02 (v1.3)
- now correctly handles RFC2047 MIME encoded filenames
2020-10-06 (v1.4)
- now handles multi-line filenames
- fixed handling of emails with no attachments
"""

import glob
import os
import email
import argparse
from multiprocessing import Pool
from cs.rfc2047 import unrfc2047

EXTENSION = "eml"

# Command line interface: three independent boolean switches.
parser = argparse.ArgumentParser(description='extract attachments from eml files')
# NOTE(review): the help text says stderr, but the debug messages in this file
# are emitted with plain print(), i.e. on stdout.
parser.add_argument('-d', '--debug', action='store_true',
                    help='print debug messages to stderr')
parser.add_argument('-s', '--single', action='store_true',
                    help='run as single thread (default = multithreaded, one thread per core)')
parser.add_argument('-q', '--quiet', action='store_true',
                    help='no output')
args = parser.parse_args()
debug, single, quiet = args.debug, args.single, args.quiet
if debug:
    print("debug output is active")

# Every extracted attachment is written below this directory; it is created
# once here instead of inside extract().
od = "attachments"
# exist_ok=True tolerates the directory appearing between the existence check
# and the makedirs() call.
if not os.path.exists(od):
    os.makedirs(od, exist_ok=True)

def _clean_attachment_name(raw_name):
    """
    Turn the filename announced by a MIME part into a safe, single path component.

    :param raw_name: filename string as returned by ``Message.get_filename()``
    :return: the cleaned name, or ``""`` when nothing usable is left
    """
    # handle multi-line (folded) names and other problematic characters;
    # "\r" is stripped too because the message is parsed from raw bytes and
    # may therefore still carry CRLF line ends
    name = raw_name.replace("\r", "").replace("\n", "")
    name = name.replace("\t", "_").replace("*", "#")

    # this is to handle RFC2047 MIME encoded filenames (often used for obfuscation)
    try:
        decoded = unrfc2047(name)
    except Exception as inst:
        # fall back to the undecoded name instead of losing the attachment
        print("could not decode attachment name %r: %r" % (name, inst))
        decoded = name

    # the decoded text can again contain characters we do not want in a filename
    decoded = decoded.replace("\r", "").replace("\n", "")
    decoded = decoded.replace("\t", "_").replace("*", "#")
    # Attachment names are untrusted input: drop any directory part (POSIX or
    # Windows style) so nothing can be written outside the output directory.
    decoded = os.path.basename(decoded.replace("\\", "/"))
    if decoded in (".", ".."):
        return ""
    return decoded


def _open_unique(directory, name):
    """
    Create a new file for *name* inside *directory* and open it for binary writing.

    If the name is taken, ``_1``, ``_2``, ... is inserted before the extension
    until a free name is found. The file is created in exclusive mode, so
    parallel workers cannot pick the same name and overwrite each other.

    :return: the open binary file object (its ``name`` holds the final path)
    """
    stem, ext = os.path.splitext(name)
    candidate = name
    counter = 0
    while True:
        try:
            return open(os.path.join(directory, candidate), "xb")
        except FileExistsError:
            counter += 1
            # add the increment before the filename extension
            candidate = "%s_%d%s" % (stem, counter, ext)


def extract(filename):
    """
    Try to extract the attachments from filename

    The message is parsed from raw bytes. Every top-level MIME part after the
    first one that announces a filename is written to the output directory
    ``od``; existing files are never overwritten. Problems are reported with
    print() instead of being raised, so one bad message or attachment does not
    stop the run.

    :param filename: path of the .eml file to process
    :return: tuple ``(1, output_count)`` - one processed file and the number
        of attachments written for it
    """
    if debug:
        print("=> reading %s" % filename)
    output_count = 0

    try:
        # binary mode: the email package deals with charsets itself, so bytes
        # that are invalid in the locale encoding no longer abort the parse
        with open(filename, "rb") as f:
            msg = email.message_from_binary_file(f)
    except OSError:
        if not quiet:
            print("Problem with %s or one of its attachments!" % filename)
        return 1, output_count

    # Only a multipart message has a list of parts; the payload of any other
    # message is a plain string and holds no attachments.
    parts = msg.get_payload()[1:] if msg.is_multipart() else []
    for attachment in parts:
        try:
            raw_name = attachment.get_filename()
            if not raw_name:
                # part without a filename (e.g. an alternative body): not an attachment
                continue
            if debug:
                print("attachment name: %s" % raw_name)

            output_filename = _clean_attachment_name(raw_name)
            if debug and output_filename != raw_name:
                print("decoded attachment name: %s" % output_filename)

            payload = attachment.get_payload(decode=True)
            # get_payload(decode=True) returns None for a part that is itself multipart
            if not output_filename or payload is None:
                continue

            if debug:
                print("checking existence of %s" % (od + "/" + output_filename))
            with _open_unique(od, output_filename) as out:
                if not quiet:
                    print("Writing %s " % os.path.basename(out.name))
                out.write(payload)
            output_count += 1
        except OSError:
            # this catches write errors
            if not quiet:
                print("Problem with %s or one of its attachments!" % filename)
        except Exception as err:
            # keep going with the remaining parts of this message
            print('Fail: %s (%r)\n' % (filename, err))

    if output_count == 0 and not quiet:
        print("No attachment found for file %s!" % filename)
    return 1, output_count

if __name__ == "__main__":
    # Collect the input files once so both modes work on the same list.
    eml_files = glob.glob("*.%s" % EXTENSION)

    if single:
        if debug:
            print("running single threaded")
        res = [extract(eml_file) for eml_file in eml_files]
    else:
        if debug:
            print("running multithreaded")
        # let's do this in parallel; Pool(None) sizes the pool from the CPU count.
        # The with-block releases the workers once map() has returned.
        with Pool(None) as pool:
            res = pool.map(extract, eml_files)

    # Every result is (1, attachments_written). Summing each column separately
    # also works when no .eml file was found; the former zip(*res) unpacking
    # produced an empty list there and the format() call raised IndexError.
    filecnt = sum(result[0] for result in res)
    cnt = sum(result[1] for result in res)
    if not quiet:
        print("Done: Processed %s files with %s attachments." % (filecnt, cnt))
	#!/usr/bin/env python3.7

	"""
	source: https://gist.github.com/urschrei/5258588 by Stephan Hügel
	2020 update:
	- More iterators, fewer lists
	- Python 3 compatible
	- Processes files in parallel
	(one worker process per CPU core, via multiprocessing.Pool)

	Ewald ( https://gist.github.com/beamzer/8a1e9629c203eaa9eb8d2fb4725b053a )
	2020-09-20:
	- handling of same filenames (write everything because contents might be different)
	- handling of filenames with *
	- handling of mkdir errors
	- added arguments
	2020-09-23 (v1.2)
	- version_nr before extension
	- error handling for utf-8 chars in eml (on error continue)
	2020-10-02 (v1.3)
	- now correctly handles RFC2047 MIME encoded filenames
	2020-10-06 (v1.4)
	- now handles multi-line filenames
	- fixed handling of emails with no attachments
	"""

	import glob
	import os
	import email
	import argparse
	from multiprocessing import Pool
	from cs.rfc2047 import unrfc2047

	EXTENSION = "eml"

	# Command line interface: three independent boolean switches.
	parser = argparse.ArgumentParser(description='extract attachments from eml files')
	# NOTE(review): the help text says stderr, but the debug messages in this file
	# are emitted with plain print(), i.e. on stdout.
	parser.add_argument('-d', '--debug', action='store_true',
		help='print debug messages to stderr')
	parser.add_argument('-s', '--single', action='store_true',
		help='run as single thread (default = multithreaded, one thread per core)')
	parser.add_argument('-q', '--quiet', action='store_true',
		help='no output')
	args = parser.parse_args()
	debug, single, quiet = args.debug, args.single, args.quiet
	if debug:
		print("debug output is active")

	# Every extracted attachment is written below this directory; it is created
	# once here instead of inside extract().
	od = "attachments"
	# exist_ok=True tolerates the directory appearing between the existence check
	# and the makedirs() call.
	if not os.path.exists(od):
		os.makedirs(od, exist_ok=True)

	def _clean_attachment_name(raw_name):
		"""
		Turn the filename announced by a MIME part into a safe, single path component.

		:param raw_name: filename string as returned by ``Message.get_filename()``
		:return: the cleaned name, or ``""`` when nothing usable is left
		"""
		# handle multi-line (folded) names and other problematic characters;
		# "\r" is stripped too because the message is parsed from raw bytes and
		# may therefore still carry CRLF line ends
		name = raw_name.replace("\r", "").replace("\n", "")
		name = name.replace("\t", "_").replace("*", "#")

		# this is to handle RFC2047 MIME encoded filenames (often used for obfuscation)
		try:
			decoded = unrfc2047(name)
		except Exception as inst:
			# fall back to the undecoded name instead of losing the attachment
			print("could not decode attachment name %r: %r" % (name, inst))
			decoded = name

		# the decoded text can again contain characters we do not want in a filename
		decoded = decoded.replace("\r", "").replace("\n", "")
		decoded = decoded.replace("\t", "_").replace("*", "#")
		# Attachment names are untrusted input: drop any directory part (POSIX or
		# Windows style) so nothing can be written outside the output directory.
		decoded = os.path.basename(decoded.replace("\\", "/"))
		if decoded in (".", ".."):
			return ""
		return decoded


	def _open_unique(directory, name):
		"""
		Create a new file for *name* inside *directory* and open it for binary writing.

		If the name is taken, ``_1``, ``_2``, ... is inserted before the extension
		until a free name is found. The file is created in exclusive mode, so
		parallel workers cannot pick the same name and overwrite each other.

		:return: the open binary file object (its ``name`` holds the final path)
		"""
		stem, ext = os.path.splitext(name)
		candidate = name
		counter = 0
		while True:
			try:
				return open(os.path.join(directory, candidate), "xb")
			except FileExistsError:
				counter += 1
				# add the increment before the filename extension
				candidate = "%s_%d%s" % (stem, counter, ext)


	def extract(filename):
		"""
		Try to extract the attachments from filename

		The message is parsed from raw bytes. Every top-level MIME part after the
		first one that announces a filename is written to the output directory
		``od``; existing files are never overwritten. Problems are reported with
		print() instead of being raised, so one bad message or attachment does not
		stop the run.

		:param filename: path of the .eml file to process
		:return: tuple ``(1, output_count)`` - one processed file and the number
			of attachments written for it
		"""
		if debug:
			print("=> reading %s" % filename)
		output_count = 0

		try:
			# binary mode: the email package deals with charsets itself, so bytes
			# that are invalid in the locale encoding no longer abort the parse
			with open(filename, "rb") as f:
				msg = email.message_from_binary_file(f)
		except OSError:
			if not quiet:
				print("Problem with %s or one of its attachments!" % filename)
			return 1, output_count

		# Only a multipart message has a list of parts; the payload of any other
		# message is a plain string and holds no attachments.
		parts = msg.get_payload()[1:] if msg.is_multipart() else []
		for attachment in parts:
			try:
				raw_name = attachment.get_filename()
				if not raw_name:
					# part without a filename (e.g. an alternative body): not an attachment
					continue
				if debug:
					print("attachment name: %s" % raw_name)

				output_filename = _clean_attachment_name(raw_name)
				if debug and output_filename != raw_name:
					print("decoded attachment name: %s" % output_filename)

				payload = attachment.get_payload(decode=True)
				# get_payload(decode=True) returns None for a part that is itself multipart
				if not output_filename or payload is None:
					continue

				if debug:
					print("checking existence of %s" % (od + "/" + output_filename))
				with _open_unique(od, output_filename) as out:
					if not quiet:
						print("Writing %s " % os.path.basename(out.name))
					out.write(payload)
				output_count += 1
			except OSError:
				# this catches write errors
				if not quiet:
					print("Problem with %s or one of its attachments!" % filename)
			except Exception as err:
				# keep going with the remaining parts of this message
				print('Fail: %s (%r)\n' % (filename, err))

		if output_count == 0 and not quiet:
			print("No attachment found for file %s!" % filename)
		return 1, output_count

	if __name__ == "__main__":
		# Collect the input files once so both modes work on the same list.
		eml_files = glob.glob("*.%s" % EXTENSION)

		if single:
			if debug:
				print("running single threaded")
			res = [extract(eml_file) for eml_file in eml_files]
		else:
			if debug:
				print("running multithreaded")
			# let's do this in parallel; Pool(None) sizes the pool from the CPU count.
			# The with-block releases the workers once map() has returned.
			with Pool(None) as pool:
				res = pool.map(extract, eml_files)

		# Every result is (1, attachments_written). Summing each column separately
		# also works when no .eml file was found; the former zip(*res) unpacking
		# produced an empty list there and the format() call raised IndexError.
		filecnt = sum(result[0] for result in res)
		cnt = sum(result[1] for result in res)
		if not quiet:
			print("Done: Processed %s files with %s attachments." % (filecnt, cnt))