Skip to content

Instantly share code, notes, and snippets.

@ivh
Created August 13, 2012 14:58
Show Gist options
  • Save ivh/3341501 to your computer and use it in GitHub Desktop.
Save ivh/3341501 to your computer and use it in GitHub Desktop.
Extract the e-mail address from each profile page in a list of links (one address per page).
#!/usr/bin/env python
import codecs
import lxml.html
def extract_email(fragments):
    """Return the e-mail token from the first fragment starting with 'Email'.

    fragments: iterable of text strings (e.g. from ``Element.itertext()``).
    Returns the last whitespace-separated token of the first matching
    fragment, or '' when no fragment starts with 'Email'.
    """
    for fragment in fragments:
        if fragment.startswith('Email'):
            return fragment.split()[-1]
    # No match: fall back to an empty address instead of leaving the name
    # unbound (the flat script raised NameError here on the first profile).
    return ''


def scrape_profile(link):
    """Parse the page at *link* and return ``(name_text, email)``.

    Raises IndexError if the page has no ``div`` with class "profile_box",
    and ValueError if that div has fewer than four child elements.
    """
    page = lxml.html.parse(link)
    box = page.xpath('.//div[@class="profile_box"]')[0]
    # Images whose alt text is "." or "@" get that character as their text,
    # so it is kept in the surrounding text once the <img> tags are stripped.
    for symbol in ('.', '@'):
        for img in box.findall('.//img[@alt="%s"]' % symbol):
            img.text = symbol
    lxml.html.etree.strip_tags(box, 'img')
    # Of the first four children only the 2nd (name) and 4th (contact text)
    # are used; list(box) replaces the deprecated getchildren().
    _, name, _, contact = list(box)[:4]
    return name.text, extract_email(contact.itertext())


def main():
    """Read links from comlinks.txt and write the extracted addresses.

    Writes '"name" <email>' lines to names_emails.dat (UTF-8) and bare
    addresses to emails.dat, echoing each formatted entry to stdout.
    """
    # Context managers make sure all three files are closed (and the output
    # flushed) even when a page fails to parse.
    with codecs.open('names_emails.dat', 'w', 'utf-8') as names_out, \
            open('emails.dat', 'w') as emails_out, \
            open('comlinks.txt') as links:
        for line in links:
            link = line.strip()
            if not link:
                # Skip blank lines rather than handing them to the parser.
                continue
            name, email = scrape_profile(link)
            final = '"%s" <%s>' % (name, email)
            print(final)
            names_out.write(final)
            names_out.write('\n')
            emails_out.write('%s\n' % email)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment