Skip to content

Instantly share code, notes, and snippets.

@gitabites
Created June 3, 2014 13:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gitabites/ff6ae9fa25573ebfb28b to your computer and use it in GitHub Desktop.
Save gitabites/ff6ae9fa25573ebfb28b to your computer and use it in GitHub Desktop.
Scraping PERAC for RFPs
# modified from http://brianabelson.com/open-news/2013/12/17/scrape-the-gibson.html
import requests
from bs4 import BeautifulSoup
from pprint import pprint
from urlparse import urljoin
import dataset
import csv
import pandas as pd
# the base url for PERA rfps
BASE_URL = "http://www.mass.gov/perac/new.htm"
def scrape_perac():
    """Scrape the PERAC "what's new" page and save all RFP links to a CSV.

    Fetches BASE_URL, collects the href of the first <a> in every <tr>
    (skipping mailto: contact links), resolves each relative link against
    BASE_URL, and writes the absolute URLs to 'perac_rfp.csv'.
    """
    response = requests.get(BASE_URL)
    # fail fast on HTTP errors instead of silently parsing an error page
    response.raise_for_status()
    # name the parser explicitly so the result does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed
    soup = BeautifulSoup(response.content, "html.parser")
    # each RFP listing is a table row; its link is the row's first <a>
    urls = []
    for row in soup.find_all("tr"):
        anchor = row.find("a")
        if anchor is None:
            continue
        # .get() instead of ['href']: an anchor without href (e.g. a named
        # target) should be skipped, not raise KeyError
        link = anchor.attrs.get("href")
        # skip rows whose only link is a contact e-mail address
        if link and not link.startswith("mailto:"):
            # join the relative link with the base URL to make it absolute
            urls.append(urljoin(BASE_URL, link))
    pd.Series(urls).to_csv("perac_rfp.csv", index=False)
# Run the scrape only when executed as a script, not when imported.
if __name__ == '__main__':
    scrape_perac()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment