Last active
March 31, 2016 23:30
-
-
Save bschoenfeld/b51c6d3555972b7805bc1edcf72f4462 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Download the page at `url`, save a copy to disk, and print its first table.

This is a Python 2 script (it relies on `urllib2`). Every `print(...)` call
below takes a single argument, so the output is the same as with the
Python 2 print statement.
"""
import urllib2
from contextlib import closing

from bs4 import BeautifulSoup

# get the web page
url = "http://healthspace.com/Clients/VDH/NewRiver/web.nsf/module_facilities.xsp?module=Food"
# urllib2 responses are not context managers in Python 2, so wrap the
# response in closing() to make sure the connection is released.
with closing(urllib2.urlopen(url)) as web_page:
    web_page_html = web_page.read()

# write the web page to a file
# The response body is raw bytes, so open the file in binary mode to store
# it exactly as received (text mode would translate newlines on Windows).
with open("webpage.html", "wb") as web_page_file:
    web_page_file.write(web_page_html)

soup = BeautifulSoup(web_page_html, "html.parser")

# Search for the tables once and reuse the result.
tables = soup.find_all("table")
print("Number of Tables %d" % len(tables))
if not tables:
    # Without this guard, tables[0] would fail with a bare IndexError.
    raise SystemExit("No <table> elements found at " + url)

table = tables[0]
rows = table.find_all("tr")
print("Number of rows %d" % len(rows))

# Print each row as one tab-separated line of its whitespace-stripped strings.
for row in rows:
    print("\t".join(row.stripped_strings))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Scrape every page of the salary listing at `url` into salaries.csv.

This is a Python 2 script (it relies on `urllib2`). Pages are requested as
`url + "?page=N"` starting at N = 1, and the loop stops once a page no longer
shows a "Next" link in its pagination element.
"""
import csv
import urllib
import urllib2
from contextlib import closing
from time import sleep

from bs4 import BeautifulSoup

# get the web page
url = "http://data.richmond.com/salaries/2014/state/virginia-polytechnic-institute-and-state-university-virginia-tech"
page = 1
next_page_exists = True

# A row is made of this many consecutive TD cells (see the note in the loop).
CELLS_PER_ROW = 3

# Let's open a file to write the data to.
# The Python 2 csv module wants the file opened in binary mode; in text mode
# Windows would add an extra blank line after every row.
with open("salaries.csv", "wb") as salaries:
    # And instead of writing to the file ourselves, we will use a CSV
    # writer, which will automatically add quotes, commas, etc
    writer = csv.writer(salaries)

    # we are going to get every page by adding the page field to the url
    # then checking for the Next button. When the Next button is gone,
    # we will know we are at the end
    while next_page_exists:
        # open the page, making sure the connection is closed afterwards
        with closing(urllib2.urlopen(url + "?page=" + str(page))) as web_page:
            web_page_html = web_page.read()

        # use the lxml parser, because it handles bad HTML better
        soup = BeautifulSoup(web_page_html, "lxml")

        # The HTML is bad so all the TR tags got thrown out by BeautifulSoup.
        # There's a Table tag and then a bunch of TD tags inside, so we will
        # get all the TD tags
        table = soup.find(id="namelist")
        if table is None:
            # find() returns None when nothing matches; stop cleanly instead
            # of crashing with an AttributeError on table.find_all.
            print("No element with id 'namelist' on page %d; stopping." % page)
            break
        cells = table.find_all("td")

        # Since the TR tags are gone, we have to tell the code that a row
        # only has 3 cells. range(0, len(cells), 3) yields [0, 3, 6, 9, ...],
        # i.e. the index of the first cell of every row.
        for i in range(0, len(cells), CELLS_PER_ROW):
            # Slicing (rather than indexing i+1 and i+2) cannot raise
            # IndexError when the cell count is not a multiple of 3; a short
            # trailing row is kept instead of being lost.
            salary = [cell.text for cell in cells[i:i + CELLS_PER_ROW]]

            # print the current row and write it to our file
            print(salary)
            # The Python 2 csv writer cannot handle unicode objects that
            # contain non-ASCII characters, so encode each field to UTF-8.
            writer.writerow([field.encode("utf-8") for field in salary])

        # Check if there is another page and increase the page number.
        # A page without a pagination element is treated as the last page.
        pagination = soup.find(class_="pagination")
        next_page_exists = pagination is not None and "Next" in pagination.text
        page += 1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment