Created
June 3, 2014 13:11
-
-
Save gitabites/ff6ae9fa25573ebfb28b to your computer and use it in GitHub Desktop.
Scraping PERAC for RFPS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# modified from http://brianabelson.com/open-news/2013/12/17/scrape-the-gibson.html | |
# Standard library
import csv
from pprint import pprint

# urljoin moved from urlparse (Python 2) to urllib.parse (Python 3);
# try the legacy location first so either interpreter works.
try:
    from urlparse import urljoin
except ImportError:
    from urllib.parse import urljoin

# Third-party
import requests
from bs4 import BeautifulSoup
import dataset
import pandas as pd
# Base URL for the PERAC "what's new" page, which lists current RFPs.
BASE_URL = "http://www.mass.gov/perac/new.htm"
def scrape_perac():
    """Scrape the PERAC "what's new" page and save all RFP links to CSV.

    Fetches BASE_URL, pulls the first <a> tag out of every table row,
    skips mailto: links, resolves relative hrefs against BASE_URL, and
    writes the resulting absolute URLs to 'perac_rfp.csv' (one per line,
    no index column).
    """
    response = requests.get(BASE_URL)
    # Fail fast on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    # Name the parser explicitly so results don't depend on which
    # optional parsers happen to be installed (and to avoid the bs4
    # "no parser specified" warning).
    soup = BeautifulSoup(response.content, "html.parser")
    # Each RFP entry lives in a table row; collect the link (if any).
    urls = []
    for row in soup.find_all("tr"):
        anchor = row.find("a")
        if anchor is None:
            continue
        # .get avoids a KeyError on anchors that have no href attribute.
        link = anchor.attrs.get("href")
        if not link or link.startswith("mailto:"):
            continue
        # Join the relative link with the base URL to make it absolute.
        urls.append(urljoin(BASE_URL, link))
    # One URL per row; index=False keeps the row numbers out of the file.
    pd.Series(urls).to_csv("perac_rfp.csv", index=False)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    scrape_perac()
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.