theSage21/get_emails.py

## get_emails.py
import os
import re
from bs4 import BeautifulSoup


def get_files():
    """Return the sorted full paths of every entry in the ``html`` folder.

    The folder is looked up relative to the current working directory.
    """
    html_dir = os.path.join(os.getcwd(), 'html')
    entries = os.listdir(html_dir)
    return sorted(os.path.join(html_dir, entry) for entry in entries)


def get_html(filename):
    """Read a file and undo simple e-mail obfuscation in its text.

    Args:
        filename: Path of the file to read; it is opened in text mode with
            the platform's default encoding.

    Returns:
        The file's contents as one string, with every ``(at)`` replaced by
        ``@`` and every ``(dot)`` replaced by ``.``.
    """
    print('Getting html')
    # The context manager closes the file even if reading raises.
    with open(filename, 'r') as source:
        html = source.read()
    return html.replace('(at)', '@').replace('(dot)', '.')


def get_emails(html):
    """Extract Gmail addresses written as ``<user@gmail.com>`` in table cells.

    Args:
        html: HTML markup as a string.

    Returns:
        A list of the matched addresses, in the order the regex finds them
        in the joined ``<td>`` text, without the surrounding angle
        brackets. Duplicates are kept.
    """
    print('boiling soup')
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed, warns about it, and results can differ between machines.
    soup = BeautifulSoup(html, 'html.parser')
    print('getting html')
    cell_text = ' '.join(cell.text for cell in soup.find_all('td'))
    print('doing regex matching')
    email_regex = re.compile(r'<[^<>]+?@gmail\.com>')
    # Strip the leading '<' and trailing '>' from each match.
    return [match[1:-1] for match in email_regex.findall(cell_text)]


if __name__ == '__main__':
    # Scan every file returned by get_files() and append the addresses
    # found to the 'mails' file in the current directory, one per line.
    for path in get_files():
        print(path)
        emails = get_emails(get_html(path))
        print(len(emails), 'found')
        # Append after each input file so earlier results survive a later
        # failure; the context manager closes the file even on error.
        with open('mails', 'a') as out:
            out.writelines(email + '\n' for email in emails)
	import os
	import re
	from bs4 import BeautifulSoup


	def get_files():
		"""Return the sorted full paths of every entry in the ``html`` folder.

		The folder is looked up relative to the current working directory.
		"""
		# Body re-nested under the def; it had been flattened to the same
		# indentation level as the signature, which does not parse.
		folder = os.path.join(os.getcwd(), 'html')
		return sorted(os.path.join(folder, i) for i in os.listdir(folder))


	def get_html(filename):
		"""Read a file and undo simple e-mail obfuscation in its text.

		Args:
			filename: Path of the file to read; it is opened in text mode
				with the platform's default encoding.

		Returns:
			The file's contents as one string, with every ``(at)`` replaced
			by ``@`` and every ``(dot)`` replaced by ``.``.
		"""
		print('Getting html')
		# The context manager closes the file even if reading raises.
		with open(filename, 'r') as f:
			html = f.read()
		html = html.replace('(at)', '@')
		html = html.replace('(dot)', '.')
		return html


	def get_emails(html):
		"""Extract Gmail addresses written as ``<user@gmail.com>`` in table cells.

		Args:
			html: HTML markup as a string.

		Returns:
			A list of the matched addresses, in the order the regex finds
			them in the joined ``<td>`` text, without the surrounding angle
			brackets. Duplicates are kept.
		"""
		print('boiling soup')
		# Name the parser explicitly: without it bs4 picks whichever parser
		# is installed, warns about it, and results can differ per machine.
		soup = BeautifulSoup(html, 'html.parser')
		print('getting html')
		td = [i.text for i in soup.find_all('td')]
		lines = ' '.join(td)
		print('doing regex matching')
		email_regex = re.compile(r'<[^<>]+?@gmail\.com>')
		# Strip the leading '<' and trailing '>' from each match.
		emails = [i[1:-1] for i in email_regex.findall(lines)]
		return emails


	if __name__ == '__main__':
		# Scan every file returned by get_files() and append the addresses
		# found to the 'mails' file in the current directory, one per line.
		files = get_files()
		for f in files:
			print(f)
			html = get_html(f)
			emails = get_emails(html)
			print(len(emails), 'found')
			# Append after each input file; the context manager closes the
			# output file even if writing raises.
			with open('mails', 'a') as fl:
				fl.writelines([i + '\n' for i in emails])