
@hernamesbarbara
Last active October 20, 2018 20:21
fetch any mailto email addresses found on a web page

Basic script to extract any email addresses that can be found on a website.

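The core idea is that `mailto:` links are just `<a>` tags whose `href` starts with `mailto:`; the address is whatever follows the colon, and the tag's text is a human-readable label. As a minimal sketch of that extraction step using only the standard library (no `requests`/`bs4`; the HTML string here is made up for illustration):

```python
from html.parser import HTMLParser


class MailtoParser(HTMLParser):
    """Collect (email, label) pairs from <a href="mailto:..."> tags."""

    def __init__(self):
        super().__init__()
        self.emails = []
        self._current = None  # email for the <a> tag we are inside, if any

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            href = dict(attrs).get("href") or ""
            if href.startswith("mailto:"):
                # Everything after the first colon is the address
                self._current = href.split(":", 1)[1].strip()

    def handle_data(self, data):
        if self._current is not None and data.strip():
            self.emails.append({"email": self._current, "label": data.strip()})
            self._current = None

    def handle_endtag(self, tag):
        if tag == "a":
            self._current = None


parser = MailtoParser()
parser.feed('<a href="mailto:firefly@zombierecords.com">Contact</a>')
print(parser.emails)
# [{'email': 'firefly@zombierecords.com', 'label': 'Contact'}]
```

The full script below does the same thing with `requests` + BeautifulSoup, which handles real-world (often malformed) HTML far more robustly.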
usage

Get all the emails that can be found on this page:

http://zombierecords.com/staff

$   python3 findemails.py https://zombierecords.com/staff/
email,label
support@zombierecords.com,support@zombierecords.com
firefly@zombierecords.com,Contact
huttch@zombierecords.com,Contact
99ntt@zombierecords.com,Contact
badassmonkey@zombierecords.com,Contact
canadubstep@zombierecords.com,Contact
captainrevolution@zombierecords.com,Contact
choco@zombierecords.com,Contact
edlac@zombierecords.com,Contact
Hamptonkidd318@zombierecords.com,Contact
juggarnost@zombierecords.com,Contact
killerchaz@zombierecords.com,Contact
mashmanos@zombierecords.com,Contact
plasmidjeff@zombierecords.com,Contact
semtex@zombierecords.com,Contact
smarter@zombierecords.com,Contact
suzuki@zombierecords.com,Contact
tnegrao@zombierecords.com,Contact
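
The output is a plain two-column CSV (`email,label`) with a header row, so any downstream consumer can read it back without pandas. A small sketch using the stdlib `csv` module (the sample string mimics the output above):

```python
import csv
import io

# Two-column CSV as emitted by findemails.py; header row supplies the keys.
sample = (
    "email,label\n"
    "support@zombierecords.com,support@zombierecords.com\n"
    "firefly@zombierecords.com,Contact\n"
)

rows = list(csv.DictReader(io.StringIO(sample)))
print(rows[1]["email"])
# firefly@zombierecords.com
```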
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""findemails.py

Usage:
    findemails URL [--output OUTPUT]

Arguments:
    URL                 Website from which you want to extract email addresses

Options:
    -o --output OUTPUT  Where to write the CSV [default: stdout].
    -h --help           Show this message.

Examples:
    findemails https://zombierecords.com/staff/
    findemails https://zombierecords.com/staff/ --output zombie-emails.csv
"""
import sys

import pandas as pd
import requests
from bs4 import BeautifulSoup
from docopt import docopt

import html5lib  # parser backend for BeautifulSoup; fail early if missing


def get_soup(url):
    """Fetch a URL and parse it; return None if fetching or parsing fails."""
    try:
        r = requests.get(url)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html5lib")
    except Exception as err:
        sys.stderr.write(str(err) + "\n")
        soup = None
    return soup


def find_emails(soup):
    """Collect {email, label} records from mailto: links in the parsed page."""
    emails = []
    for tag in soup.find_all("a"):
        link = tag.get("href", "").strip()
        if link.startswith("mailto:"):
            email = link.split(":", 1)[-1].strip()
            label = tag.get_text().strip()
            emails.append({"email": email, "label": label})
    return emails


def main():
    args = docopt(__doc__)
    url = args["URL"]
    output = args["--output"]

    soup = get_soup(url)
    if soup is None:
        sys.stderr.write("couldn't access the URL provided\n")
        sys.exit(1)

    emails = find_emails(soup)
    if not emails:
        sys.stderr.write("couldn't find any emails\n")
        sys.exit(1)

    df = pd.DataFrame(emails)
    outfile = sys.stdout if output == "stdout" else output
    df.to_csv(outfile, index=False, encoding="utf-8")
    sys.exit(0)


if __name__ == "__main__":
    main()