# batemapf/scraping.py

## scraping.py
import requests
from bs4 import BeautifulSoup

# Set the variable `url` to a URL of your choice.
url = 'https://www.crummy.com/software/BeautifulSoup/bs4/doc/'

# Send a request to the URL and save what we get back in the variable `response`.
# The timeout (in seconds) keeps the script from hanging forever if the server
# never answers.
response = requests.get(url, timeout=30)

# Stop with a clear error if the server answered with an error status
# (404, 500, ...) instead of scraping links out of an error page.
response.raise_for_status()

# Get the raw text of the response. This will be a whole bunch of HTML.
raw_text = response.text

# Make some soup! That is, create an instance of the BeautifulSoup
# class and feed it the raw text of the response. Naming the parser explicitly
# ('html.parser' ships with Python) means the page is parsed the same way on
# every machine and avoids the "no parser was explicitly specified" warning.
soup = BeautifulSoup(raw_text, 'html.parser')

# Now let's get all of the link addresses in the page, which is the value contained
# in the `href` attribute of each `<a>` tag.
#
# First, create an empty list to store the link addresses.
hrefs = []

# Second, get all of the <a> tags on the page using BeautifulSoup's handy
# `.find_all()` method, which returns a list-like collection of tags.
a_tags = soup.find_all('a')

# Third, loop through all of the <a> tags and append the `href` value for each one to our list.
for tag in a_tags:
    # Get the `href` attribute for the tag. Not every <a> tag has one (named
    # anchors, for example), so use `.get()`, which returns None instead of
    # raising a KeyError when the attribute is missing.
    href = tag.get('href')
    # Append it to the list, skipping tags that had no `href` at all.
    if href is not None:
        hrefs.append(href)

# Check out your work!
print(len(hrefs))
# Only peek at the last and first links if we actually found any; indexing an
# empty list would raise an IndexError.
if hrefs:
    print(hrefs[-1])
    print(hrefs[0])
# Print every link containing more than one slash, which here mostly picks out
# the absolute URLs (e.g. "http://...") rather than short relative links.
for h in hrefs:
    if h.count('/') > 1:
        print(h)

# Your result should look something like this:
"""
347
http://sphinx-doc.org/
genindex.html
http://www.crummy.com/software/BeautifulSoup/
http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html
http://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
http://kondou.com/BS4/
http://coreapython.hosting.paran.com/etc/beautifulsoup4.html
https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup
http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html
http://www.crummy.com/software/BeautifulSoup/download/4.x/
http://lxml.de/
http://code.google.com/p/html5lib/
http://example.com/elsie
http://example.com/lacie
http://www.w3.org/TR/html5/syntax.html#syntax
http://wiki.python.org/moin/PrintFails
http://lxml.de/
http://pypi.python.org/pypi/cchardet/
http://www.crummy.com/software/BeautifulSoup/bs3/download/3.x/BeautifulSoup-3.2.0.tar.gz
http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html
http://www.python.org/dev/peps/pep-0008/
http://sphinx-doc.org/
"""
	import requests
	from bs4 import BeautifulSoup

	# Set the variable `url` to a URL of your choice.
	url = 'https://www.crummy.com/software/BeautifulSoup/bs4/doc/'

	# Send a request to the URL and save what we get back in the variable `response`.
	# The timeout (in seconds) keeps the script from hanging forever if the server
	# never answers.
	response = requests.get(url, timeout=30)

	# Stop with a clear error if the server answered with an error status
	# (404, 500, ...) instead of scraping links out of an error page.
	response.raise_for_status()

	# Get the raw text of the response. This will be a whole bunch of HTML.
	raw_text = response.text

	# Make some soup! That is, create an instance of the BeautifulSoup
	# class and feed it the raw text of the response. Naming the parser explicitly
	# ('html.parser' ships with Python) means the page is parsed the same way on
	# every machine and avoids the "no parser was explicitly specified" warning.
	soup = BeautifulSoup(raw_text, 'html.parser')

	# Now let's get all of the link addresses in the page, which is the value contained
	# in the `href` attribute of each `<a>` tag.
	#
	# First, create an empty list to store the link addresses.
	hrefs = []

	# Second, get all of the <a> tags on the page using BeautifulSoup's handy
	# `.find_all()` method, which returns a list-like collection of tags.
	a_tags = soup.find_all('a')

	# Third, loop through all of the <a> tags and append the `href` value for each one to our list.
	for tag in a_tags:
	    # Get the `href` attribute for the tag. Not every <a> tag has one (named
	    # anchors, for example), so use `.get()`, which returns None instead of
	    # raising a KeyError when the attribute is missing.
	    href = tag.get('href')
	    # Append it to the list, skipping tags that had no `href` at all.
	    if href is not None:
	        hrefs.append(href)

	# Check out your work!
	print(len(hrefs))
	# Only peek at the last and first links if we actually found any; indexing an
	# empty list would raise an IndexError.
	if hrefs:
	    print(hrefs[-1])
	    print(hrefs[0])
	# Print every link containing more than one slash, which here mostly picks out
	# the absolute URLs (e.g. "http://...") rather than short relative links.
	for h in hrefs:
	    if h.count('/') > 1:
	        print(h)

	# Your result should look something like this:
	"""
	347
	http://sphinx-doc.org/
	genindex.html
	http://www.crummy.com/software/BeautifulSoup/
	http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html
	http://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
	http://kondou.com/BS4/
	http://coreapython.hosting.paran.com/etc/beautifulsoup4.html
	https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup
	http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html
	http://www.crummy.com/software/BeautifulSoup/download/4.x/
	http://lxml.de/
	http://code.google.com/p/html5lib/
	http://example.com/elsie
	http://example.com/lacie
	http://www.w3.org/TR/html5/syntax.html#syntax
	http://wiki.python.org/moin/PrintFails
	http://lxml.de/
	http://pypi.python.org/pypi/cchardet/
	http://www.crummy.com/software/BeautifulSoup/bs3/download/3.x/BeautifulSoup-3.2.0.tar.gz
	http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html
	http://www.python.org/dev/peps/pep-0008/
	http://sphinx-doc.org/
	"""