# Jwpe/html_parser.py

## html_parser.py
# First we 'import' a Python package - a piece of code written for a specific
# job and freely available to install - called BeautifulSoup. This package
# lets us search through the structure of HTML.
# For installation instructions, see the guide to pip at the end of the post.
from bs4 import BeautifulSoup

# Here we add our HTML document. In a more advanced example, we could fetch
# this directly from the web, but here we just copy it into our Python file.
# Notice the triple-quotes around the HTML!
html = """
    <html>
      <body>
        <div class="lead-list">
          <div class="lead">
            <p class="first-name">Paul</p>
            <p class="last-name">McCartney</p>
            <p class="company">Beatles Inc.</p>
            <p class="title">Chief Marketing Officer</p>
            <p class="contact-email">paul@abbeyroadstudios.co.uk</p>
          </div>
          <div class="lead">
            <p class="first-name">Nina</p>
            <p class="last-name">Simone</p>
            <p class="company">Jazz & Soul</p>
            <p class="title">CEO</p>
            <p class="contact-email">ninasimone@feelinggood.com</p>
          </div>
          <div class="lead">
            <p class="first-name">Michael</p>
            <p class="last-name">Jagger</p>
            <p class="company">Rolling Stone Corp.</p>
            <p class="title">SVP Marketing</p>
            <p class="contact-email">mick@rollingstones.com</p>
          </div>
        </div>
      </body>
    </html>
"""

# Each piece of lead information lives in a <p> tag with its own class.
# This table pairs the dictionary key we want (left) with the class name used
# in the HTML (right), so supporting a new field only takes one extra line.
LEAD_FIELDS = (
    ('first_name', 'first-name'),
    ('last_name', 'last-name'),
    ('company', 'company'),
    ('title', 'title'),
    ('contact_email', 'contact-email'),
)

# We set up an empty list of data, which we will fill with our lead info.
lead_list = []

# Next, we tell BeautifulSoup to parse the HTML into a form that it understands.
# We name the parser explicitly ('html.parser' ships with Python itself) so the
# script behaves the same on every machine, and so BeautifulSoup does not warn
# us that it had to guess which parser to use.
html_data = BeautifulSoup(html, 'html.parser')

# Then we search through the HTML data to find each of the <div> elements
# with the class 'lead', and put them into a list. These elements contain
# all of the HTML elements inside them.
lead_data = html_data.find_all('div', attrs={'class': 'lead'})

# We use another for loop to iterate through each lead in our lead data list.
for lead in lead_data:

    # Now, we want to build a dictionary like we saw in Example 2, containing
    # all the information about the lead
    lead_dict = {}

    for key, css_class in LEAD_FIELDS:
        # For each piece of information, we find the <p> tag with
        # the class that corresponds to it...
        tag = lead.find('p', attrs={'class': css_class})

        # ...and extract its text - AKA the words inside it. get_text() gives
        # us an ordinary string instead of a BeautifulSoup object that keeps
        # the whole parsed document in memory. If a lead is missing one of the
        # tags, we record None for that field rather than crashing.
        lead_dict[key] = tag.get_text() if tag is not None else None

    # Finally, we add our lead dictionary to our list of leads!
    lead_list.append(lead_dict)

# Let's take a look at what we've got! If everything goes to plan, we should
# see a list of dictionaries very similar to the one we used in Example 2.
# Writing print with parentheses works on both Python 2 and Python 3.
print(lead_list)