Skip to content

Instantly share code, notes, and snippets.

@psy901
Last active January 18, 2018 04:30
Show Gist options
  • Save psy901/8da054bf9743dbc377cffa36e28b9bed to your computer and use it in GitHub Desktop.
Save psy901/8da054bf9743dbc377cffa36e28b9bed to your computer and use it in GitHub Desktop.
Given a URL, it scrapes email addresses from the landing page and its second-level pages
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def extract_email_from_url(url):
'''
From the provided url, it extracts all valid email addresses
:param url: URL to enter
:return: Set of email addresses in string
'''
# enter the URL and retrieved all text
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, "html.parser")
text = soup.text
# from the text, it collects all valid email addresses
regex = re.compile(("([a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`"
"{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
"\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)"))
emails_tuples = re.findall(regex, text)
# returns a set of email addresses
emails = set()
for email in emails_tuples:
emails.add(email[0])
return emails
def get_links(url):
'''
From the URL, it returns all the links embedded
:param url: URL to enter
:return: Set of links in string
'''
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, "html.parser")
set_links = set()
# finds all the links and return them as a list
for link in soup.findAll('a'):
href = link.get('href')
set_links.add(href)
return set_links
def get_email(url):
'''
From the URL, it tries to extract emails from the landing page
and its 2nd level pages. And it prints all the email addresses it retrieved
:param url: provided URL from the user
'''
emails = set()
# 1st level
emails.update(extract_email_from_url(url))
# 2nd level -- iterate over the set of links
set_links = get_links(url)
for link_url in set_links:
try:
print("working on: " + link_url)
emails.update(extract_email_from_url(link_url))
except requests.exceptions.MissingSchema:
# some links relative paths, so change them into absolute path
new_url = url.strip('/') + link_url
print("working on changed one..." + new_url)
emails.update(extract_email_from_url(new_url))
except:
pass
# print all email addresses found
for email in emails:
print(email)
get_email("https://www.wisc.edu/")
@psy901
Copy link
Author

psy901 commented Jan 18, 2018

For now, it uses the URL "https://www.wisc.edu" as its starting point.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment