Skip to content

Instantly share code, notes, and snippets.

@wrgoldstein
Created December 5, 2014 01:24
Show Gist options
  • Save wrgoldstein/07427f27efa14a52c02d to your computer and use it in GitHub Desktop.
Save wrgoldstein/07427f27efa14a52c02d to your computer and use it in GitHub Desktop.
quick and dirty scraping script for DOB landmark violations
import re
import sys
import time
import urllib2

import pandas as pd
def open_url(url):
    """Fetch *url* and return the raw response body.

    The request is sent with a ``Mozilla/5.0`` User-agent header instead of
    urllib2's default one.

    Args:
        url: Address to fetch.

    Returns:
        The response body exactly as returned by ``read()``.

    Raises:
        Whatever ``urllib2`` raises when the request fails (for example
        ``urllib2.URLError``); nothing is caught here.
    """
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    response = opener.open(url)
    try:
        return response.read()
    finally:
        # Release the connection explicitly; the caller fetches pages in a
        # long loop, so leaving this to the garbage collector leaks sockets.
        response.close()
def list_of_bins():
    """Return the collection of BINs that the scraper should look up.

    The values come from the Property Address Directory dataset:
    https://nycopendata.socrata.com/City-Government/Property-Address-Directory/mw4z-r6bv
    """
    # shady: SPECIFIC_BINS is a module-level name that has to be defined
    # elsewhere before this function is called.
    return SPECIFIC_BINS
def dob_url(bin_):
    """Build the bisweb property-profile overview URL for *bin_*.

    Args:
        bin_: Value substituted into the ``bin`` query parameter.

    Returns:
        The complete URL as a string.
    """
    template = "http://a810-bisweb.nyc.gov/bisweb/PropertyProfileOverviewServlet?bin=%s&requestid=1"
    return template % bin_
def landmark_status(s):
    """Extract the text of the table cell that follows "Landmark Status:".

    Args:
        s: HTML of a property profile page.

    Returns:
        The text captured from the first ``<td>`` after the label, taken
        from the first match in *s*.

    Raises:
        IndexError: if the label/cell pair does not occur in *s*.
    """
    matches = re.findall("Landmark Status:.*[\n\t\r]*<td[^>]*>([^<]*)", s)
    # Deliberately unguarded: indexing an empty list signals "not found".
    return matches[0]
def violations_dob(s):
    """Extract the two table cells that follow the "Violations-DOB" label.

    Args:
        s: HTML of a property profile page.

    Returns:
        A 2-tuple with the text of the first and second ``<td>`` cells after
        the label, taken from the first match in *s*.
        NOTE(review): presumably these are violation counts; confirm against
        the page layout.

    Raises:
        IndexError: if the label and its two cells do not occur in *s*.
    """
    # Adjacent literals concatenate into the single pattern string.
    pattern = (
        "Violations-DOB.*[\n\t\r]*"
        "<td[^>]*>([^<]*).*[\n\t]*"
        "<td[^>]*>([^<]*)"
    )
    first_match = re.findall(pattern, s)[0]
    return first_match
def main_info(s):
    """Collect the text that follows every tag mentioning "maininfo".

    Args:
        s: HTML of a property profile page.

    Returns:
        A list with one string per occurrence, in document order; empty when
        "maininfo" never appears.
    """
    return [hit.group(1) for hit in re.finditer("maininfo[^>]*>([^<]*)", s)]
def nice_row(s):
    """Flatten the interesting fields of one property page into a list.

    Args:
        s: HTML of a property profile page.

    Returns:
        A list holding, in order: every ``main_info`` value, the landmark
        status, and the two ``violations_dob`` values.

    Raises:
        IndexError: propagated from ``landmark_status`` / ``violations_dob``
        when the page lacks the expected sections.
    """
    # Operands are evaluated left to right, matching the extraction order.
    return main_info(s) + [landmark_status(s)] + list(violations_dob(s))
def chunks(l, n):
    """Lazily split sequence *l* into consecutive slices of length *n*.

    Args:
        l: Any sliceable sequence with a length.
        n: Slice size (step between slice starts).

    Yields:
        ``l[start:start + n]`` for each start offset; the last slice may be
        shorter than *n*.
    """
    total = len(l)
    for start in range(0, total, n):
        stop = start + n
        yield l[start:stop]
if __name__ == "__main__":
    # Scrape every BIN in batches of 50 and write a CSV after each batch.
    # `rows` is never cleared, so each CSV holds everything scraped so far.
    rows = []
    batches = chunks(list_of_bins(), 50)  # ~48,000
    for i, bins in enumerate(batches):
        for bin_ in bins:
            try:
                raw_html = open_url(dob_url(bin_))
                row = nice_row(raw_html)
            except Exception as exc:
                # Best effort: skip pages that fail to download or parse,
                # but report them. Catching Exception rather than using a
                # bare except keeps Ctrl-C / SystemExit working.
                sys.stderr.write("skipping bin %s: %r\n" % (bin_, exc))
            else:
                rows.append(row)
            # Throttle to roughly one request per second. This sits outside
            # the try so a problem here can never be silently swallowed.
            time.sleep(1)
        # NOTE: the "output" directory must already exist.
        pd.DataFrame(rows).to_csv("output/run%s.csv" % i)
        print("wrote %s" % i)
    print("...done")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment