# reverie/lenient_email_extractor.py

## lenient_email_extractor.py
def extract_first_email(s):
    """
    Split the first email address off the front of `s`.

    Assumes `s` is a list of email addresses split by (mixed) spaces or commas.
    An address may be preceded by a double-quoted display name, which may
    itself contain spaces, commas or '@' characters.

    Returns (first_email, rest) where first_email + rest == s,
    or (None, None) if we don't think this is an email address
    (no '@' at all, an unterminated quote, or no '@' after a quoted name).
    """
    # Local import: `re` is not imported at module level in this file.
    import re

    first_quote = s.find('"')
    first_at = s.find('@')
    if first_at == -1:
        return (None, None)
    if first_quote != -1 and first_quote < first_at:
        # The first '@' may sit inside the quoted display name, so look for
        # the address '@' only after the closing quote.
        second_quote = s.find('"', first_quote + 1)
        if second_quote == -1:
            return (None, None)
        first_at = s.find('@', second_quote)
        if first_at == -1:
            # Nothing address-like follows the quoted name; without this
            # guard -1 would be used as a slice index below.
            return (None, None)
    # The address ends at the first space or comma after its '@'.
    next_separator = re.search(r'[ ,]', s[first_at:])
    if not next_separator:
        return (s, '')
    first_email_end = first_at + next_separator.start()
    return (s[:first_email_end], s[first_email_end:])


def split_email_line_by_spaces_or_commas(s):
    """
    Break one line of text into candidate email addresses.

    Repeatedly peels the leading address off the line with
    extract_first_email, trimming surrounding spaces and commas from each
    piece. As soon as the remainder is not recognised as an address, that
    whole remainder is recorded as invalid and scanning stops.

    Returns a pair of ([maybe_valid_emails], [invalid_emails]) in s.
    """
    separators = ' ,'
    candidates, rejected = [], []
    remaining = s.strip(separators)
    while remaining:
        head, tail = extract_first_email(remaining)
        if head is None:
            # Nothing address-like left: give up on the rest of the line.
            rejected.append(remaining)
            return candidates, rejected
        candidates.append(head.strip(separators))
        remaining = tail.strip(separators)
    return candidates, rejected


def lenient_email_extractor(text):
    """
    Extract email addresses from free-form, multi-line text.

    Each line of `text` is split into candidates by
    split_email_line_by_spaces_or_commas, and every candidate is then parsed
    with email.utils.parseaddr.

    Returns a pair (address_pairs, invalid_addresses),
    where address_pairs is a list of (name, address) tuples of the kind
    returned by email.utils.parseaddr, and invalid_addresses is a list of
    the strings that could not be split or parsed.


    Test case:
    '''

    16@example.com, "Andrew, Esq." <17@example.com>,

    "Mr. Bob Ross" <foo@example.com> test@foo.bar,23432@example.com,

    2235233432@example.com,dsfjkadls@example.com      "Full Name with quotes and <weird@chars.com>" <weird@example.com>

    sdflkadsjfkdalfds@example.com

        1@example.com 2@example.com   3@example.com

    '''
    (note: spaces, commas, blank lines)
    """
    from email.utils import parseaddr
    address_pairs = []
    invalid_addresses = []
    for line in text.strip().splitlines():
        emails, invalid = split_email_line_by_spaces_or_commas(line)
        invalid_addresses.extend(invalid)
        for e in emails:
            name, real = parseaddr(e)
            if name == real == '':
                # parseaddr reports a parse failure as ('', '').
                invalid_addresses.append(e)
            else:
                address_pairs.append((name, real))
    # The documented result pair; without this the function returned None.
    return address_pairs, invalid_addresses
	def extract_first_email(s):
		"""
		Split the first email address off the front of `s`.

		Assumes `s` is a list of email addresses split by (mixed) spaces or commas.
		An address may be preceded by a double-quoted display name, which may
		itself contain spaces, commas or '@' characters.

		Returns (first_email, rest) where first_email + rest == s,
		or (None, None) if we don't think this is an email address
		(no '@' at all, an unterminated quote, or no '@' after a quoted name).
		"""
		# Local import so the function does not rely on a module-level `re`.
		import re

		first_quote = s.find('"')
		first_at = s.find('@')
		if first_at == -1:
			return (None, None)
		if first_quote != -1 and first_quote < first_at:
			# The first '@' may sit inside the quoted display name, so look
			# for the address '@' only after the closing quote.
			second_quote = s.find('"', first_quote + 1)
			if second_quote == -1:
				return (None, None)
			first_at = s.find('@', second_quote)
			if first_at == -1:
				# Nothing address-like follows the quoted name; without this
				# guard -1 would be used as a slice index below.
				return (None, None)
		# The address ends at the first space or comma after its '@'.
		next_separator = re.search(r'[ ,]', s[first_at:])
		if not next_separator:
			return (s, '')
		first_email_end = first_at + next_separator.start()
		return (s[:first_email_end], s[first_email_end:])


	def split_email_line_by_spaces_or_commas(s):
		"""
		Break one line of text into candidate email addresses.

		Repeatedly peels the leading address off the line with
		extract_first_email, trimming surrounding spaces and commas from each
		piece. As soon as the remainder is not recognised as an address, that
		whole remainder is recorded as invalid and scanning stops.

		Returns a pair of ([maybe_valid_emails], [invalid_emails]) in s.
		"""
		emails = []
		invalid = []
		s = s.strip(' ,')
		while s:
			first, rest = extract_first_email(s)
			if first is None:
				# Nothing address-like left: give up on the rest of the line.
				invalid.append(s)
				break
			emails.append(first.strip(' ,'))
			s = rest.strip(' ,')
		return emails, invalid


	def lenient_email_extractor(text):
		"""
		Extract email addresses from free-form, multi-line text.

		Each line of `text` is split into candidates by
		split_email_line_by_spaces_or_commas, and every candidate is then
		parsed with email.utils.parseaddr.

		Returns a pair (address_pairs, invalid_addresses),
		where address_pairs is a list of (name, address) tuples of the kind
		returned by email.utils.parseaddr, and invalid_addresses is a list of
		the strings that could not be split or parsed.


		Test case:
		'''

		16@example.com, "Andrew, Esq." <17@example.com>,

		"Mr. Bob Ross" <foo@example.com> test@foo.bar,23432@example.com,

		2235233432@example.com,dsfjkadls@example.com "Full Name with quotes and <weird@chars.com>" <weird@example.com>

		sdflkadsjfkdalfds@example.com

		1@example.com 2@example.com 3@example.com

		'''
		(note: spaces, commas, blank lines)
		"""
		from email.utils import parseaddr
		address_pairs = []
		invalid_addresses = []
		for line in text.strip().splitlines():
			emails, invalid = split_email_line_by_spaces_or_commas(line)
			invalid_addresses.extend(invalid)
			for e in emails:
				name, real = parseaddr(e)
				if name == real == '':
					# parseaddr reports a parse failure as ('', '').
					invalid_addresses.append(e)
				else:
					address_pairs.append((name, real))
		# The documented result pair.
		return address_pairs, invalid_addresses