Created
June 3, 2014 13:11
-
-
Save gitabites/ff6ae9fa25573ebfb28b to your computer and use it in GitHub Desktop.
Scraping PERAC for RFPS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# modified from http://brianabelson.com/open-news/2013/12/17/scrape-the-gibson.html | |
# Standard library
import csv
from pprint import pprint

# urljoin moved from urlparse (Python 2) to urllib.parse (Python 3);
# try the legacy location first so either interpreter works.
try:
    from urlparse import urljoin
except ImportError:
    from urllib.parse import urljoin

# Third-party
import requests
from bs4 import BeautifulSoup
import dataset
import pandas as pd
# Base URL for the PERAC "what's new" page, which lists current RFPs.
BASE_URL = "http://www.mass.gov/perac/new.htm"
def scrape_perac():
    """Scrape the PERAC "what's new" page and save all RFP links to CSV.

    Fetches BASE_URL, pulls the first <a> tag out of every table row,
    skips mailto: links, resolves relative hrefs against BASE_URL, and
    writes the resulting absolute URLs to 'perac_rfp.csv' (one per line,
    no index column).
    """
    response = requests.get(BASE_URL)
    # Fail fast on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    # Name the parser explicitly so results don't depend on which
    # optional parsers happen to be installed (and to avoid the bs4
    # "no parser specified" warning).
    soup = BeautifulSoup(response.content, "html.parser")
    # Each RFP entry lives in a table row; collect the link (if any).
    urls = []
    for row in soup.find_all("tr"):
        anchor = row.find("a")
        if anchor is None:
            continue
        # .get avoids a KeyError on anchors that have no href attribute.
        link = anchor.attrs.get("href")
        if not link or link.startswith("mailto:"):
            continue
        # Join the relative link with the base URL to make it absolute.
        urls.append(urljoin(BASE_URL, link))
    # One URL per row; index=False keeps the row numbers out of the file.
    pd.Series(urls).to_csv("perac_rfp.csv", index=False)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    scrape_perac()
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.