Skip to content

Instantly share code, notes, and snippets.

@leonardreidy
Created July 5, 2013 14:07
Show Gist options
  • Save leonardreidy/5934770 to your computer and use it in GitHub Desktop.
Save leonardreidy/5934770 to your computer and use it in GitHub Desktop.
Parse HTML files with Beautiful Soup, find emails and names, and output them as JSON, ready for ponymailer.rb. Emails are taken from links whose href starts with mailto, and names from inside <strong> tags. The program pairs the emails and names into a single list and then writes it out as JSON, ready for ponymailer to send.
# A simple Python script to extract names and emails from
# a certain online directory
import os, json
from bs4 import BeautifulSoup
# Collect the regular files in the current directory. Sub-directories are
# left out because preproc open()s every entry it is given, and open()
# fails on a directory, which would abort the whole run.
inputfiles = [
    entry for entry in os.listdir(os.getcwd())
    if os.path.isfile(entry)
]
def postproc(inputfiles):
    """Run preproc on every entry of inputfiles, one output file per entry.

    The entry at position n (counting from 0) is passed to preproc together
    with the output file name "out<n>.txt".

    Args:
        inputfiles: the input file paths, in processing order.
    """
    # enumerate supplies each entry's position directly. Looking it up with
    # inputfiles.index() rescans the list for every entry and always returns
    # the first match, so repeated entries would share one output file.
    for position, infile in enumerate(inputfiles):
        preproc(infile, "out" + str(position) + ".txt")
def preproc(infile, outfile):
    """Extract (email, name) pairs from one HTML file and write them as JSON.

    Emails are the text of <a> elements whose href starts with "mailto".
    Names are taken from the children of <strong> elements: every child's
    .string is collected in document order and only the entries at odd
    positions are kept (presumably the directory's markup alternates a
    non-name child and a name child -- confirm against the real pages).
    Emails and names are paired positionally with zip, so surplus entries
    on the longer side are dropped.

    Args:
        infile: path of the HTML file to parse.
        outfile: path of the file that receives the JSON text.
    """
    # Read inside a with-block so the input handle is closed again;
    # BeautifulSoup consumes the file object while it is being constructed.
    # No parser is named, so bs4 picks whichever one it finds installed.
    with open(infile, 'r') as source:
        soup = BeautifulSoup(source)

    # find all <strong></strong> elements
    strongs = soup.select('strong')
    # find all mailto (email) elements
    mailtos = soup.select('a[href^=mailto]')

    # One entry per direct child of every <strong>; .string is None for a
    # child tag that does not wrap exactly one string.
    prenames = [child.string for strong in strongs for child in strong]

    # Extract names: keep odd positions only. The position comes from
    # enumerate rather than prenames.index(), which returns the FIRST
    # occurrence and therefore the wrong parity for repeated values. None is
    # tested before the value is used, so an empty child cannot raise.
    names = []
    for position, text in enumerate(prenames):
        if position % 2 != 0 and text is not None and text != '\n':
            # Kept as text instead of UTF-8 bytes: json.dumps escapes
            # non-ASCII text itself and cannot serialise bytes on Python 3.
            names.append(text.strip())

    # Extract emails
    emails = [
        link.string.strip() for link in mailtos if link.string is not None
    ]

    # zip together emails and names; list() materialises the pairs because
    # zip is lazy on Python 3 and json.dumps needs a real sequence.
    contactzip = list(zip(emails, names))
    # convert the list of pairs to json for processing by ponymailer
    jsondump = json.dumps(contactzip)
    # write to file
    with open(outfile, 'w') as sink:
        sink.write(jsondump)
# run the script: process every entry collected in inputfiles. This is a
# module-level call, so it executes whenever this file is run or imported.
postproc(inputfiles)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment