Created
July 5, 2013 03:09
-
-
Save leonardreidy/5931417 to your computer and use it in GitHub Desktop.
A simple Python script that iterates through all the (HTML) files in a directory, extracts the email addresses from each, and writes them as a comma-separated list to a separate outfile for each HTML file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from bs4 import BeautifulSoup | |
# Snapshot of the entry names found in the directory the script is run from.
# The listing is taken once, at import time, so files created afterwards
# are not part of it.
_working_dir = os.getcwd()
here = os.listdir(_working_dir)
def preproc(infile, outfile):
    """Write the mailto-link texts found in an HTML file as one CSV line.

    Every ``<a>`` element whose ``href`` starts with ``mailto`` is selected.
    For each such anchor that has a single text child (``.string`` is not
    ``None``), the surrounding whitespace is stripped from that text. The
    resulting values are written to *outfile* separated by commas, with no
    trailing comma and no trailing newline.

    Note that the anchor *text* is written, not the address in the
    ``href`` attribute.

    Args:
        infile: Path of the HTML file to read.
        outfile: Path of the text file to create or overwrite.
    """
    # Read raw bytes so Beautiful Soup can detect the document encoding
    # itself, and use a context manager so the handle is always closed.
    # The parser is named explicitly so results do not depend on which
    # optional parsers happen to be installed.
    with open(infile, 'rb') as source:
        soup = BeautifulSoup(source, 'html.parser')

    anchors = soup.select('a[href^=mailto]')

    # Anchors with nested markup (or no content) have .string == None and
    # are skipped. Filtering first and joining afterwards guarantees that
    # a comma only ever appears *between* two written values, even when the
    # last anchor is skipped or is equal to an earlier one.
    addresses = [
        anchor.string.strip()
        for anchor in anchors
        if anchor.string is not None
    ]

    # Stay in str until the I/O boundary and let the file object do the
    # UTF-8 encoding; mixing bytes and str raises TypeError on Python 3.
    with open(outfile, 'w', encoding='utf-8') as sink:
        sink.write(','.join(addresses))
def process(a):
    """Run preproc() on every entry of *a*, one numbered outfile per entry.

    The entry at position ``n`` (counting from 0) is written to
    ``out<n>.txt`` in the current working directory.

    Args:
        a: Iterable of input file paths. Any iterable works, not only lists.
    """
    # enumerate() yields the position directly. Looking it up with
    # a.index() on every iteration is quadratic and, for a repeated name,
    # always returns the position of its first occurrence.
    for position, path in enumerate(a):
        preproc(path, "out" + str(position) + ".txt")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment