Last active
August 29, 2015 13:56
-
-
Save djds23/9216747 to your computer and use it in GitHub Desktop.
Parses text to find email addresses; accepts a text file or raw input.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import sys | |
# Email regex taken from django/django
# https://github.com/django/django/blob/master/django/core/validators.py#L137
#
# The pattern is anchored with ^ and $, so it validates one whole candidate
# token (local part, "@", domain) rather than searching inside longer text.
# NOTE: the top-level domain is limited to 2-6 letters, so longer TLDs are
# rejected.
email_re = re.compile(
    r"(^[-!#$%&'*+/=?^_`{}|~0-9A-Z]+(\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*"  # dot-atom
    # quoted-string: the class after the escaping backslash must use the octal
    # range \001-\011. Written as \001-011 it is the range \001-"0" plus
    # literal "1"s, which lets a backslash-escaped LF or CR through.
    r'|^"([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\001-\011\013\014\016-\177])*"'
    r')@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?$',  # domain
    re.IGNORECASE)
def parse_emails(string):
    """Return the whitespace-separated tokens of ``string`` that are emails.

    The text is split on whitespace and each token is tested against the
    module-level ``email_re`` pattern. That pattern is anchored at both ends,
    so a token is kept only when the entire token is an address; an address
    with punctuation attached (for example a trailing comma) is not matched.

    Returns a list of the matching tokens in their original order, with
    duplicates kept. A real list is built (rather than chaining map/filter)
    so the result can be printed and concatenated on Python 2 and Python 3
    alike.
    """
    return [word for word in string.split() if email_re.match(word)]
if __name__ == '__main__':
    # Optional first command-line argument: path of a text file to scan.
    filename = sys.argv[1] if len(sys.argv) > 1 else ''

    if filename:
        # Default text mode is sufficient: parse_emails() splits on any
        # whitespace, so the newline convention of the file does not affect
        # the result (the 'U' mode flag is gone in recent Python 3 releases).
        with open(filename) as f:
            bucket = []
            for line in f:
                bucket.extend(parse_emails(line))
        print(bucket)
    else:
        # Use raw_input() where it exists (Python 2), otherwise input().
        try:
            read_line = raw_input
        except NameError:
            read_line = input

        print('Please use Ctrl-C to quit')
        while True:
            try:
                from_user = read_line('paste text here to extract emails: ')
            except (KeyboardInterrupt, EOFError):
                # Ctrl-C is the advertised way out (and Ctrl-D / end of input
                # is treated the same): leave quietly instead of dumping a
                # traceback.
                print('')
                break
            # list() keeps the output a plain list even if parse_emails()
            # hands back a lazy iterable.
            print(list(parse_emails(from_user)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment