@bschoenfeld, last active March 31, 2016

import urllib2
from bs4 import BeautifulSoup
# get the web page
url = "http://healthspace.com/Clients/VDH/NewRiver/web.nsf/module_facilities.xsp?module=Food"
web_page = urllib2.urlopen(url)
web_page_html = web_page.read()
# write the web page to a file
# google "python write to file"
with open("webpage.html", "w") as web_page_file:
    web_page_file.write(web_page_html)

soup = BeautifulSoup(web_page_html, "html.parser")
print "Number of Tables", len(soup.find_all("table"))
table = soup.find_all("table")[0]
print "Number of rows", len(table.find_all("tr"))
for row in table.find_all("tr"):
print "\t".join(row.stripped_strings)
import urllib2
import csv
from bs4 import BeautifulSoup
from time import sleep

# the base url; the loop below requests url + "?page=1", url + "?page=2",
# and so on
url = "http://data.richmond.com/salaries/2014/state/virginia-polytechnic-institute-and-state-university-virginia-tech"
page = 1
next_page_exists = True

# Let's open a file to write the data to. In Python 2, the csv module
# wants the file opened in binary mode ("wb") so it can control the line
# endings itself
with open("salaries.csv", "wb") as salaries:
    # And instead of writing to the file ourselves, we will use a CSV
    # writer, which will automatically add quotes, commas, etc.
    writer = csv.writer(salaries)
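    # For example, writer.writerow(["Smith, John", "$100,000"]) writes the
    # line "Smith, John","$100,000" -- the quoting is automatic because
    # both fields contain commas. (The name and dollar figure here are
    # made-up values, just to show the quoting.)
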
    # we are going to get every page by adding the page field to the url,
    # then checking for the Next button. When the Next button is gone,
    # we will know we are at the end
    while next_page_exists:
        # open the page
        web_page = urllib2.urlopen(url + "?page=" + str(page))
        web_page_html = web_page.read()

        # use the lxml parser, because it handles bad HTML better
        # (lxml is a separate package -- "pip install lxml" if it's missing)
        # I just learned about this today when I described your problem to a friend!
        soup = BeautifulSoup(web_page_html, "lxml")

        # The HTML is bad, so all the TR tags got thrown out by BeautifulSoup.
        # There's a TABLE tag and then a bunch of TD tags inside, so we will
        # get all the TD tags
        table = soup.find(id="namelist")
        cells = table.find_all("td")

        # Since the TR tags are gone, we have to tell the code that a row
        # only has 3 cells. The range function is built into Python. This
        # code says, "create a list of numbers from 0 to however many cells
        # there are, but only include every third number" e.g. [0, 3, 6, 9, ...]
        for i in range(0, len(cells), 3):
            # Get the text out of the current three cells. Each step of the
            # loop increases i by 3
            salary = [cells[i].text, cells[i+1].text, cells[i+2].text]

            # print the current row and write it to our file
            print salary
            writer.writerow(salary)

        # Check if there is another page and increase the page number
        next_page_exists = "Next" in soup.find(class_="pagination").text
        page += 1
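
# ----------------------------------------------------------------------
# Note: "from time import sleep" is imported above but never called. If
# you want to be gentler on data.richmond.com, one option (my suggestion,
# not part of the original script) is a short pause at the bottom of the
# while loop, right after "page += 1":
#
#     sleep(1)  # assumed one-second politeness delay between page fetches
# ----------------------------------------------------------------------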