Skip to content

Instantly share code, notes, and snippets.

@theSage21
Last active August 29, 2015 14:21
Show Gist options
  • Save theSage21/947323396fa9b8225c53 to your computer and use it in GitHub Desktop.
Save theSage21/947323396fa9b8225c53 to your computer and use it in GitHub Desktop.
Email extractor from html table.
import os
import re
from bs4 import BeautifulSoup
def get_files():
    """Return the sorted paths of the regular files inside ``./html``.

    The ``html`` folder is resolved against the current working directory.
    Sub-directories are skipped: every returned path is meant to be opened
    and read as a file, which would fail for a directory.

    Returns:
        list[str]: Full paths of the files, in ascending string order.

    Raises:
        OSError: If the ``html`` folder is missing or cannot be listed.
    """
    folder = os.path.join(os.getcwd(), 'html')
    paths = (os.path.join(folder, name) for name in os.listdir(folder))
    return sorted(path for path in paths if os.path.isfile(path))
def get_html(filename):
    """Read *filename* and undo its ``(at)`` / ``(dot)`` obfuscation.

    Args:
        filename: Path of the text file to read.

    Returns:
        str: The file contents with every ``(at)`` replaced by ``@`` and
        every ``(dot)`` replaced by ``.``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    print('Getting html')
    # The context manager closes the handle even when reading raises.
    with open(filename, 'r') as f:
        html = f.read()
    return html.replace('(at)', '@').replace('(dot)', '.')
def get_emails(html):
    """Extract Gmail addresses written as ``<name@gmail.com>`` in table cells.

    The text of every ``<td>`` element is joined with spaces and then
    searched for angle-bracketed ``@gmail.com`` addresses.

    Args:
        html: HTML markup as a string.

    Returns:
        list[str]: The matched addresses with the surrounding angle brackets
        removed, in order of appearance (duplicates are kept).
    """
    print('boiling soup')
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed, so results can differ between machines and a warning is
    # emitted. 'html.parser' ships with the standard library.
    soup = BeautifulSoup(html, 'html.parser')
    print('getting html')
    cell_text = ' '.join(cell.text for cell in soup.find_all('td'))
    print('doing regex matching')
    email_regex = re.compile(r'<[^<>]+?@gmail\.com>')
    # Slice off the leading '<' and trailing '>' of each match.
    return [match[1:-1] for match in email_regex.findall(cell_text)]
if __name__ == '__main__':
    # For every file in ./html: print its path, extract the Gmail addresses
    # and append them, one per line, to the file 'mails' in the working
    # directory.
    files = get_files()
    for f in files:
        print(f)
        html = get_html(f)
        emails = get_emails(html)
        print(len(emails), 'found')
        # Opened per input file in append mode, so addresses already written
        # stay on disk if a later file fails; 'with' closes the handle even
        # when the write raises.
        with open('mails', 'a') as fl:
            fl.writelines(i + '\n' for i in emails)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment