Scraper for VGSI-style property tax database
from bs4 import BeautifulSoup
import urllib2
import re
from string import ascii_uppercase, replace

with open('results.tsv', 'w') as outfile:
    # Walk the alphabetical street index, one letter at a time.
    for letter in ascii_uppercase:
        street_listing_page = urllib2.urlopen('http://gis.vgsi.com/hanovernh/Streets.aspx?Letter={}'.format(letter))
        parsed_street_listing_page = BeautifulSoup(street_listing_page, 'html.parser')
        # Follow each street link on the letter's index page.
        for street_url in parsed_street_listing_page.find_all(href=re.compile(r'Streets\.aspx\?Name=')):
            req_url = "http://gis.vgsi.com/hanovernh/{}".format(street_url['href'])
            # Percent-encode spaces in multi-word street names before requesting.
            single_street_page = urllib2.urlopen(replace(req_url, " ", "%20"))
            parsed_single_street_page = BeautifulSoup(single_street_page, 'html.parser')
            # Follow each parcel link on the street's page.
            for property_url in parsed_single_street_page.find_all(href=re.compile(r'Parcel\.aspx')):
                single_property_page = urllib2.urlopen("http://gis.vgsi.com/hanovernh/{}".format(property_url['href']))
                parsed_single_property_page = BeautifulSoup(single_property_page, 'html.parser')
                # Write the property address and owner, then the cells of the
                # current assessment row, as tab-separated fields.
                outfile.write(parsed_single_property_page.find(id="MainContent_lblLocation").contents[0] + "\t")
                outfile.write(parsed_single_property_page.find(id="MainContent_lblGenOwner").contents[0] + "\t")
                table = parsed_single_property_page.find(id="MainContent_grdCurrentValueAsmt")
                for n in table.find(class_="RowStyle").find_all('td'):
                    outfile.write(n.contents[0] + "\t")
                outfile.write('\n')
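
The script above targets Python 2 (urllib2, string.replace). For reference, here is a minimal, untested sketch of the same crawl in Python 3, assuming the VGSI page structure is unchanged from the original (the Streets.aspx letter index, per-street Parcel.aspx links, and the MainContent_* element ids all come from the script above).

# Minimal Python 3 port of the same crawl; urllib.request replaces urllib2.
import re
import urllib.request
from string import ascii_uppercase

from bs4 import BeautifulSoup

BASE = 'http://gis.vgsi.com/hanovernh/{}'

def fetch(path):
    # Percent-encode spaces (e.g. in multi-word street names) before requesting.
    with urllib.request.urlopen(BASE.format(path.replace(' ', '%20'))) as resp:
        return BeautifulSoup(resp.read(), 'html.parser')

with open('results.tsv', 'w') as outfile:
    for letter in ascii_uppercase:
        streets = fetch('Streets.aspx?Letter={}'.format(letter))
        for street_url in streets.find_all(href=re.compile(r'Streets\.aspx\?Name=')):
            street = fetch(street_url['href'])
            for property_url in street.find_all(href=re.compile(r'Parcel\.aspx')):
                parcel = fetch(property_url['href'])
                # Address and owner, followed by the current assessment row.
                row = [
                    parcel.find(id='MainContent_lblLocation').contents[0],
                    parcel.find(id='MainContent_lblGenOwner').contents[0],
                ]
                table = parcel.find(id='MainContent_grdCurrentValueAsmt')
                row += [td.contents[0] for td in table.find(class_='RowStyle').find_all('td')]
                outfile.write('\t'.join(str(cell) for cell in row) + '\n')

The port only swaps the HTTP layer and collects each record into a list before writing, so a request failure mid-record cannot leave a partial line in results.tsv.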