@jackcrane
Created August 29, 2020 01:40
# Simple, not especially robust email scraper that recursively indexes webpages from a domain
# (replace the variable "site" in the __main__ block below).
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests

# output file for the scraped addresses
f = open("data.txt", "a")
# pages and mailto: hrefs already seen, so nothing is visited or written twice
urls = []

def scrape(site):
    # fetch the page
    r = requests.get(site)
    # parse the HTML
    s = BeautifulSoup(r.text, "html.parser")
    # walk every anchor tag on the page
    for i in s.find_all("a"):
        href = i.attrs.get("href", "")
        print(href)
        if "@" in href:
            # mailto: link -- keep the address only (drop the scheme and any ?subject= query)
            if href not in urls:
                urls.append(href)
                f.write(href.split(":")[1].split("?")[0] + "\n")
        else:
            # ordinary link -- resolve it against the current page and recurse into it
            link = urljoin(site, href)
            if link.startswith("http") and link not in urls:
                urls.append(link)
                scrape(link)

# main
if __name__ == "__main__":
    # website to be scraped
    site = "https://medill.northwestern.edu/directory/faculty/journalism/index.html"
    # start the crawl
    scrape(site)
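
For reference, the write-out step keeps only the address portion of a mailto: link. A quick sanity check with a made-up address (jane.doe@example.edu is purely illustrative):

# splitting on ":" drops the "mailto" scheme; splitting on "?" drops any query such as a subject line
href = "mailto:jane.doe@example.edu?subject=Hello"
print(href.split(":")[1].split("?")[0])  # prints jane.doe@example.edu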
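
The description says the crawl stays on one domain, but nothing in the script enforces that, so a long run can wander off-site. A minimal sketch of such a guard, assuming the link and site variables from the script above (the helper name same_domain is hypothetical):

from urllib.parse import urlparse

def same_domain(link, start):
    # follow a link only when its host matches the host of the page it was found on
    return urlparse(link).netloc == urlparse(start).netloc

# inside the loop, before recursing:
# if same_domain(link, site) and link not in urls:
#     urls.append(link)
#     scrape(link)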