@bschoenfeld, last active March 31, 2016

import urllib2
from bs4 import BeautifulSoup
# get the web page
url = "http://healthspace.com/Clients/VDH/NewRiver/web.nsf/module_facilities.xsp?module=Food"
web_page = urllib2.urlopen(url)
web_page_html = web_page.read()
# write the web page to a file
# google "python write to file"
with open("webpage.html", "w") as web_page_file:
    web_page_file.write(web_page_html)

soup = BeautifulSoup(web_page_html, "html.parser")
print "Number of Tables", len(soup.find_all("table"))
table = soup.find_all("table")[0]
print "Number of rows", len(table.find_all("tr"))
for row in table.find_all("tr"):
print "\t".join(row.stripped_strings)
import urllib2
import csv
from bs4 import BeautifulSoup
from time import sleep

# the base url; the loop below requests url + "?page=1", url + "?page=2",
# and so on
url = "http://data.richmond.com/salaries/2014/state/virginia-polytechnic-institute-and-state-university-virginia-tech"
page = 1
next_page_exists = True

# Let's open a file to write the data to. In Python 2, the csv module
# wants the file opened in binary mode ("wb") so it can control the line
# endings itself
with open("salaries.csv", "wb") as salaries:
    # And instead of writing to the file ourselves, we will use a CSV
    # writer, which will automatically add quotes, commas, etc.
    writer = csv.writer(salaries)
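    # For example, writer.writerow(["Smith, John", "$100,000"]) writes the
    # line "Smith, John","$100,000" -- the quoting is automatic because
    # both fields contain commas. (The name and dollar figure here are
    # made-up values, just to show the quoting.)
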
    # we are going to get every page by adding the page field to the url,
    # then checking for the Next button. When the Next button is gone,
    # we will know we are at the end
    while next_page_exists:
        # open the page
        web_page = urllib2.urlopen(url + "?page=" + str(page))
        web_page_html = web_page.read()

        # use the lxml parser, because it handles bad HTML better
        # (lxml is a separate package -- "pip install lxml" if it's missing)
        # I just learned about this today when I described your problem to a friend!
        soup = BeautifulSoup(web_page_html, "lxml")

        # The HTML is bad, so all the TR tags got thrown out by BeautifulSoup.
        # There's a TABLE tag and then a bunch of TD tags inside, so we will
        # get all the TD tags
        table = soup.find(id="namelist")
        cells = table.find_all("td")

        # Since the TR tags are gone, we have to tell the code that a row
        # only has 3 cells. The range function is built into Python. This
        # code says, "create a list of numbers from 0 to however many cells
        # there are, but only include every third number" e.g. [0, 3, 6, 9, ...]
        for i in range(0, len(cells), 3):
            # Get the text out of the current three cells. Each step of the
            # loop increases i by 3
            salary = [cells[i].text, cells[i+1].text, cells[i+2].text]

            # print the current row and write it to our file
            print salary
            writer.writerow(salary)

        # Check if there is another page and increase the page number
        next_page_exists = "Next" in soup.find(class_="pagination").text
        page += 1
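
# ----------------------------------------------------------------------
# Note: "from time import sleep" is imported above but never called. If
# you want to be gentler on data.richmond.com, one option (my suggestion,
# not part of the original script) is a short pause at the bottom of the
# while loop, right after "page += 1":
#
#     sleep(1)  # assumed one-second politeness delay between page fetches
# ----------------------------------------------------------------------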