leonardreidy/simple-email-extractor

## simple-email-extractor
import os
from bs4 import BeautifulSoup

# Snapshot the names of every entry in the current working directory.
# os.listdir() returns bare names (no directory prefix) for files and
# subdirectories alike, in arbitrary order.
# NOTE(review): presumably meant to be passed to process(); no call is
# visible in this file -- confirm against the caller.
here = os.listdir(os.getcwd())

# define preprocessing method to extract email addresses from a given
# html file

def preproc(infile, outfile):
  """Write the link text of every ``mailto:`` anchor in an HTML file.

  The file at ``infile`` is parsed with Beautiful Soup, every ``<a>``
  element whose ``href`` starts with ``mailto`` is selected, and the
  stripped text of those anchors is written to ``outfile`` as a single
  comma-separated line encoded as UTF-8 (no trailing comma or newline).

  Anchors whose ``.string`` is ``None`` (no single text child) and
  anchors whose text is empty after stripping are skipped.

  Args:
    infile: Path of the HTML file to read.
    outfile: Path of the text file to create or overwrite.

  Raises:
    IOError/OSError: If ``infile`` cannot be read or ``outfile`` cannot
      be written.
  """
  # Read as bytes so Beautiful Soup performs its own encoding detection,
  # and use a with-block so the handle is closed once parsing is done.
  # The parser is named explicitly to keep results independent of which
  # optional parsers happen to be installed.
  with open(infile, 'rb') as source:
    soup = BeautifulSoup(source, 'html.parser')

  addresses = []
  for anchor in soup.select('a[href^=mailto]'):
    text = anchor.string
    # .string is None when the anchor has no single text child.
    if text is None:
      continue
    text = text.strip()
    if text:
      addresses.append(text)

  # Joining the collected strings puts commas only *between* entries, so
  # skipped or duplicate anchors can never leave a stray trailing comma.
  # Encoding once and writing bytes behaves the same on Python 2 and 3.
  with open(outfile, 'wb') as sink:
    sink.write(','.join(addresses).encode('utf-8'))


# define method to iterate through the files in the directory
# and invoke the preproc() function above on each

def process(a):
  """Run preproc() on every path in ``a``, one output file per input.

  The entry at position ``n`` (counting from 0) is written to
  ``out<n>.txt`` in the current working directory.

  Args:
    a: Iterable of input file paths.
  """
  # enumerate() yields each entry's own position; the previous
  # a.index(i) lookup rescanned the list on every iteration and mapped
  # repeated entries to the same output file.
  for position, path in enumerate(a):
    preproc(path, "out" + str(position) + ".txt")
	import os
	from bs4 import BeautifulSoup

	# Snapshot the names of every entry in the current working directory.
	# os.listdir() returns bare names (no directory prefix) for files and
	# subdirectories alike, in arbitrary order.
	# NOTE(review): presumably meant to be passed to process(); no call is
	# visible in this file -- confirm against the caller.
	here = os.listdir(os.getcwd())

	# define preprocessing method to extract email addresses from a given
	# html file

	def preproc(infile, outfile):
		"""Write the link text of every ``mailto:`` anchor in an HTML file.

		The file at ``infile`` is parsed with Beautiful Soup, every ``<a>``
		element whose ``href`` starts with ``mailto`` is selected, and the
		stripped text of those anchors is written to ``outfile`` as a single
		comma-separated line encoded as UTF-8 (no trailing comma or newline).

		Anchors whose ``.string`` is ``None`` (no single text child) and
		anchors whose text is empty after stripping are skipped.

		Args:
			infile: Path of the HTML file to read.
			outfile: Path of the text file to create or overwrite.

		Raises:
			IOError/OSError: If ``infile`` cannot be read or ``outfile``
				cannot be written.
		"""
		# Read as bytes so Beautiful Soup performs its own encoding
		# detection, and use a with-block so the handle is closed once
		# parsing is done. The parser is named explicitly to keep results
		# independent of which optional parsers happen to be installed.
		with open(infile, 'rb') as source:
			soup = BeautifulSoup(source, 'html.parser')

		addresses = []
		for anchor in soup.select('a[href^=mailto]'):
			text = anchor.string
			# .string is None when the anchor has no single text child.
			if text is None:
				continue
			text = text.strip()
			if text:
				addresses.append(text)

		# Joining the collected strings puts commas only *between* entries,
		# so skipped or duplicate anchors can never leave a stray trailing
		# comma. Encoding once and writing bytes behaves the same on
		# Python 2 and 3.
		with open(outfile, 'wb') as sink:
			sink.write(','.join(addresses).encode('utf-8'))


	# define method to iterate through the files in the directory
	# and invoke the preproc() function above on each

	def process(a):
		"""Run preproc() on every path in ``a``, one output file per input.

		The entry at position ``n`` (counting from 0) is written to
		``out<n>.txt`` in the current working directory.

		Args:
			a: Iterable of input file paths.
		"""
		# enumerate() yields each entry's own position; the previous
		# a.index(i) lookup rescanned the list on every iteration and
		# mapped repeated entries to the same output file.
		for position, path in enumerate(a):
			preproc(path, "out" + str(position) + ".txt")