Scraper for VGSI-style property tax database
from bs4 import BeautifulSoup
import urllib2
import re
from string import ascii_uppercase, replace

with open('results.tsv', 'w') as outfile:
    # Walk the alphabetical street index, one letter at a time.
    for letter in ascii_uppercase:
        street_listing_page = urllib2.urlopen('http://gis.vgsi.com/hanovernh/Streets.aspx?Letter={}'.format(letter))
        parsed_street_listing_page = BeautifulSoup(street_listing_page, 'html.parser')
        # Follow each street link on the letter's index page.
        for street_url in parsed_street_listing_page.find_all(href=re.compile(r'Streets\.aspx\?Name=')):
            req_url = "http://gis.vgsi.com/hanovernh/{}".format(street_url['href'])
            # Percent-encode spaces in multi-word street names before requesting.
            single_street_page = urllib2.urlopen(replace(req_url, " ", "%20"))
            parsed_single_street_page = BeautifulSoup(single_street_page, 'html.parser')
            # Follow each parcel link on the street's page.
            for property_url in parsed_single_street_page.find_all(href=re.compile(r'Parcel\.aspx')):
                single_property_page = urllib2.urlopen("http://gis.vgsi.com/hanovernh/{}".format(property_url['href']))
                parsed_single_property_page = BeautifulSoup(single_property_page, 'html.parser')
                # Write the property address and owner, then the cells of the
                # current assessment row, as tab-separated fields.
                outfile.write(parsed_single_property_page.find(id="MainContent_lblLocation").contents[0] + "\t")
                outfile.write(parsed_single_property_page.find(id="MainContent_lblGenOwner").contents[0] + "\t")
                table = parsed_single_property_page.find(id="MainContent_grdCurrentValueAsmt")
                for n in table.find(class_="RowStyle").find_all('td'):
                    outfile.write(n.contents[0] + "\t")
                outfile.write('\n')
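
The script above targets Python 2 (urllib2, string.replace). For reference, here is a minimal, untested sketch of the same crawl in Python 3, assuming the VGSI page structure is unchanged from the original (the Streets.aspx letter index, per-street Parcel.aspx links, and the MainContent_* element ids all come from the script above).

# Minimal Python 3 port of the same crawl; urllib.request replaces urllib2.
import re
import urllib.request
from string import ascii_uppercase

from bs4 import BeautifulSoup

BASE = 'http://gis.vgsi.com/hanovernh/{}'

def fetch(path):
    # Percent-encode spaces (e.g. in multi-word street names) before requesting.
    with urllib.request.urlopen(BASE.format(path.replace(' ', '%20'))) as resp:
        return BeautifulSoup(resp.read(), 'html.parser')

with open('results.tsv', 'w') as outfile:
    for letter in ascii_uppercase:
        streets = fetch('Streets.aspx?Letter={}'.format(letter))
        for street_url in streets.find_all(href=re.compile(r'Streets\.aspx\?Name=')):
            street = fetch(street_url['href'])
            for property_url in street.find_all(href=re.compile(r'Parcel\.aspx')):
                parcel = fetch(property_url['href'])
                # Address and owner, followed by the current assessment row.
                row = [
                    parcel.find(id='MainContent_lblLocation').contents[0],
                    parcel.find(id='MainContent_lblGenOwner').contents[0],
                ]
                table = parcel.find(id='MainContent_grdCurrentValueAsmt')
                row += [td.contents[0] for td in table.find(class_='RowStyle').find_all('td')]
                outfile.write('\t'.join(str(cell) for cell in row) + '\n')

The port only swaps the HTTP layer and collects each record into a list before writing, so a request failure mid-record cannot leave a partial line in results.tsv.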