# dlrobertson/spider_crawler_exercise.py

## spider_crawler_exercise.py
# Must install requests first
import requests
# Must install beautifulsoup4 first
from bs4 import BeautifulSoup
# Used to transform relative links to absolute links
# and should already be installed
from urllib.parse import urljoin

# NOTE: Every `...` below is a blank for you to fill in. It is valid
# Python (the Ellipsis object), so the file parses, but the crawler
# will not work until each TODO has been replaced with real code.

# TODO: Initial population of sites to visit
to_visit = ...

# TODO: After we visit a site we need to add it to our history.
# When we start this will be empty, but as we go this will be
# populated
history = ...

# TODO: As long as there are still sites to visit, keep scraping
while ...:
    # TODO: Take the next site off the list of sites we're visiting
    next_url = ...

    # Send an HTTP GET request to the url and return the webpage
    req = requests.get(next_url)

    # TODO: Append the url to our history here
    ...

    # TODO: Ensure the status code is OK (200)
    if ...:
        # Parse the html into something we can work with.
        # The parser is named explicitly so the result does not depend
        # on which optional parsers happen to be installed, and so
        # BeautifulSoup does not warn about having to guess one.
        soup = BeautifulSoup(req.text, "html.parser")

        # TODO: Do something more interesting than
        # printing the url of the site we just
        # fetched data from here
        print(next_url)

        # Find all the links or "a" tags in the html
        # e.g. <a href="www.google.com">Google</a>
        # (href=True skips "a" tags that have no href attribute)
        for link in soup.find_all("a", href=True):
            # Snag the link from the link tag
            href = link["href"]

            # The link may be a relative link (e.g. /).
            # So we urljoin this site (the variable next_url)
            # and the new link (the variable href)
            url = urljoin(next_url, href)

            # TODO: Add the variable url to the to_visit
            # variable here
            ...
	# Must install requests first
import requests
# Must install beautifulsoup4 first
from bs4 import BeautifulSoup
# Used to transform relative links to absolute links
# and should already be installed
from urllib.parse import urljoin

# NOTE: This section repeats the exercise skeleton from earlier in the
# file. Every `...` below is a blank for you to fill in. It is valid
# Python (the Ellipsis object), so the file parses, but the crawler
# will not work until each TODO has been replaced with real code.

# TODO: Initial population of sites to visit
to_visit = ...

# TODO: After we visit a site we need to add it to our history.
# When we start this will be empty, but as we go this will be
# populated
history = ...

# TODO: As long as there are still sites to visit, keep scraping
while ...:
    # TODO: Take the next site off the list of sites we're visiting
    next_url = ...

    # Send an HTTP GET request to the url and return the webpage
    req = requests.get(next_url)

    # TODO: Append the url to our history here
    ...

    # TODO: Ensure the status code is OK (200)
    if ...:
        # Parse the html into something we can work with.
        # The parser is named explicitly so the result does not depend
        # on which optional parsers happen to be installed, and so
        # BeautifulSoup does not warn about having to guess one.
        soup = BeautifulSoup(req.text, "html.parser")

        # TODO: Do something more interesting than
        # printing the url of the site we just
        # fetched data from here
        print(next_url)

        # Find all the links or "a" tags in the html
        # e.g. <a href="www.google.com">Google</a>
        # (href=True skips "a" tags that have no href attribute)
        for link in soup.find_all("a", href=True):
            # Snag the link from the link tag
            href = link["href"]

            # The link may be a relative link (e.g. /).
            # So we urljoin this site (the variable next_url)
            # and the new link (the variable href)
            url = urljoin(next_url, href)

            # TODO: Add the variable url to the to_visit
            # variable here
            ...