@jackcrane
Created August 29, 2020 01:40
# Simple, not especially robust email scraper that recursively indexes webpages from a domain
# (replace the variable "site" in the __main__ block below).
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests

# output file for the scraped addresses
f = open("data.txt", "a")
# pages and mailto: hrefs already seen, so nothing is visited or written twice
urls = []

def scrape(site):
    # fetch the page
    r = requests.get(site)
    # parse the HTML
    s = BeautifulSoup(r.text, "html.parser")
    # walk every anchor tag on the page
    for i in s.find_all("a"):
        href = i.attrs.get("href", "")
        print(href)
        if "@" in href:
            # mailto: link -- keep the address only (drop the scheme and any ?subject= query)
            if href not in urls:
                urls.append(href)
                f.write(href.split(":")[1].split("?")[0] + "\n")
        else:
            # ordinary link -- resolve it against the current page and recurse into it
            link = urljoin(site, href)
            if link.startswith("http") and link not in urls:
                urls.append(link)
                scrape(link)

# main
if __name__ == "__main__":
    # website to be scraped
    site = "https://medill.northwestern.edu/directory/faculty/journalism/index.html"
    # start the crawl
    scrape(site)
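
For reference, the write-out step keeps only the address portion of a mailto: link. A quick sanity check with a made-up address (jane.doe@example.edu is purely illustrative):

# splitting on ":" drops the "mailto" scheme; splitting on "?" drops any query such as a subject line
href = "mailto:jane.doe@example.edu?subject=Hello"
print(href.split(":")[1].split("?")[0])  # prints jane.doe@example.edu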
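
The description says the crawl stays on one domain, but nothing in the script enforces that, so a long run can wander off-site. A minimal sketch of such a guard, assuming the link and site variables from the script above (the helper name same_domain is hypothetical):

from urllib.parse import urlparse

def same_domain(link, start):
    # follow a link only when its host matches the host of the page it was found on
    return urlparse(link).netloc == urlparse(start).netloc

# inside the loop, before recursing:
# if same_domain(link, site) and link not in urls:
#     urls.append(link)
#     scrape(link)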