Skip to content

Instantly share code, notes, and snippets.

@justgrimes
Created April 10, 2012 05:16
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save justgrimes/2348459 to your computer and use it in GitHub Desktop.
Save justgrimes/2348459 to your computer and use it in GitHub Desktop.
publiclibraries.com scraper
# publiclibraries.com scraper
import BeautifulSoup
import urllib2
# (page URL, two-letter postal abbreviation) for each of the 50 states plus DC.
# The short name `l` is kept because the scraping loop below iterates over it.
l = [
    ("http://www.publiclibraries.com/alabama.htm", "AL"),
    ("http://www.publiclibraries.com/alaska.htm", "AK"),
    ("http://www.publiclibraries.com/arizona.htm", "AZ"),
    ("http://www.publiclibraries.com/arkansas.htm", "AR"),
    ("http://www.publiclibraries.com/california.htm", "CA"),
    ("http://www.publiclibraries.com/colorado.htm", "CO"),
    ("http://www.publiclibraries.com/connecticut.htm", "CT"),
    ("http://www.publiclibraries.com/delaware.htm", "DE"),
    ("http://www.publiclibraries.com/dc.htm", "DC"),
    ("http://www.publiclibraries.com/florida.htm", "FL"),
    ("http://www.publiclibraries.com/georgia.htm", "GA"),
    ("http://www.publiclibraries.com/hawaii.htm", "HI"),
    ("http://www.publiclibraries.com/idaho.htm", "ID"),
    ("http://www.publiclibraries.com/illinois.htm", "IL"),
    ("http://www.publiclibraries.com/indiana.htm", "IN"),
    ("http://www.publiclibraries.com/iowa.htm", "IA"),
    ("http://www.publiclibraries.com/kansas.htm", "KS"),
    ("http://www.publiclibraries.com/kentucky.htm", "KY"),
    ("http://www.publiclibraries.com/louisiana.htm", "LA"),
    ("http://www.publiclibraries.com/maine.htm", "ME"),
    ("http://www.publiclibraries.com/maryland.htm", "MD"),
    ("http://www.publiclibraries.com/massachusetts.htm", "MA"),
    ("http://www.publiclibraries.com/michigan.htm", "MI"),
    ("http://www.publiclibraries.com/minnesota.htm", "MN"),
    ("http://www.publiclibraries.com/mississippi.htm", "MS"),
    ("http://www.publiclibraries.com/missouri.htm", "MO"),
    ("http://www.publiclibraries.com/montana.htm", "MT"),
    ("http://www.publiclibraries.com/nebraska.htm", "NE"),
    ("http://www.publiclibraries.com/nevada.htm", "NV"),
    ("http://www.publiclibraries.com/newhampshire.htm", "NH"),
    ("http://www.publiclibraries.com/newjersey.htm", "NJ"),
    ("http://www.publiclibraries.com/newmexico.htm", "NM"),
    ("http://www.publiclibraries.com/newyork.htm", "NY"),
    ("http://www.publiclibraries.com/northcarolina.htm", "NC"),
    ("http://www.publiclibraries.com/northdakota.htm", "ND"),
    ("http://www.publiclibraries.com/ohio.htm", "OH"),
    ("http://www.publiclibraries.com/oklahoma.htm", "OK"),
    ("http://www.publiclibraries.com/oregon.htm", "OR"),
    ("http://www.publiclibraries.com/pennsylvania.htm", "PA"),
    ("http://www.publiclibraries.com/rhodeisland.htm", "RI"),
    ("http://www.publiclibraries.com/southcarolina.htm", "SC"),
    ("http://www.publiclibraries.com/southdakota.htm", "SD"),
    ("http://www.publiclibraries.com/tennessee.htm", "TN"),
    ("http://www.publiclibraries.com/texas.htm", "TX"),
    ("http://www.publiclibraries.com/utah.htm", "UT"),
    ("http://www.publiclibraries.com/vermont.htm", "VT"),
    ("http://www.publiclibraries.com/virginia.htm", "VA"),
    ("http://www.publiclibraries.com/washington.htm", "WA"),
    ("http://www.publiclibraries.com/westvirginia.htm", "WV"),
    ("http://www.publiclibraries.com/wisconsin.htm", "WI"),
    ("http://www.publiclibraries.com/wyoming.htm", "WY"),
]
def _cell_field(td):
    """Return the text to write for one ``<td>`` cell, or None to skip it.

    A cell whose first nested tag renders with ``<strong>`` in it is skipped
    (presumably these are header cells -- confirm against the live pages).
    Otherwise the first nested tag is emitted as its markup string; a cell
    with no nested tag is emitted as its first text node wrapped in double
    quotes.  Embedded quotes are not escaped, matching the original output.
    """
    first_tag = td.find()  # first tag found inside the cell, or None
    if "<strong>" in str(first_tag):
        return None
    if first_tag is not None:
        return str(first_tag)
    return "\"" + str(td.find(text=True)) + "\""


# Fetch every state page and dump its table rows to output.csv, one line per
# <tr>, each line prefixed with the state's abbreviation.
#
# NOTE(review): the indentation of this script was lost in the paste.  The
# nesting below assumes the `else` belonged to the `!= None` test and that the
# trailing comma was written only for cells that were not skipped -- confirm
# against a known-good output file.
with open("output.csv", "w") as z:  # `with` guarantees the file is flushed/closed
    for i, j in l:
        o = urllib2.urlopen(i)
        try:
            soup = BeautifulSoup.BeautifulSoup(o)
        finally:
            # The soup is fully parsed at construction, so the response can
            # be released before the rows are walked.
            o.close()
        # Every table after the first one on the page is treated as data.
        for table in soup.findAll('table')[1:]:
            for tr in table.findAll('tr'):
                z.write(j)
                z.write(",")
                for td in tr.findAll('td'):
                    field = _cell_field(td)
                    if field is not None:
                        z.write(field)
                        z.write(",")
                z.write("\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment