Skip to content

Instantly share code, notes, and snippets.

@jroo
Created March 22, 2009 21:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jroo/83306 to your computer and use it in GitHub Desktop.
Save jroo/83306 to your computer and use it in GitHub Desktop.
returns list of GAO reports for past year
import datetime, time
import urllib2
from BeautifulSoup import BeautifulSoup
# Returns a list of GAO reports for the past year.
def grab_gao():
    """Return the GAO reports listed on gao.gov's "past year" page.

    Downloads http://gao.gov/docsearch/pastyear.html and treats every
    <dl> element in the page as one report. Within each <dl>, the first
    <dt> supplies the title and category, and the first <a> inside the
    first <dd> supplies the URL and report id; the text node right after
    that link is parsed as the release date.

    Returns:
        A list of dicts in page order, one per <dl>, with the keys:
          'doc_type'     -- always "GAO"
          'gov_id'       -- stripped text of the link
          'release_date' -- the date reformatted as 'YYYY-MM-DD'
          'title'        -- <dt> contents with <strong> tags removed
          'category'     -- contents of the first <strong> in the <dt>
          'original_url' -- href of the link

    Raises:
        urllib2.URLError: if the page cannot be fetched.
        Nothing is caught inside the loop: a <dl> that lacks the expected
        elements, or a date not in 'Month D, YYYY' form, makes the whole
        call raise instead of skipping that entry.
    """
    doc_type = "GAO"

    page = urllib2.urlopen("http://gao.gov/docsearch/pastyear.html")
    try:
        # BeautifulSoup reads the whole response while constructing the
        # tree, so the connection can be released right afterwards.
        soup = BeautifulSoup(page)
    finally:
        page.close()

    report_list = []
    for report in soup('dl'):
        # Look up the heading and the link once instead of re-querying the
        # tree for every field.
        heading = report('dt')[0]
        link = report('dd')[0].find('a')

        title = heading.renderContents().replace('<strong>', '').replace('</strong>', '')
        category = heading('strong')[0].renderContents()
        original_url = link['href']
        gov_id = link.string.strip()

        # The date follows the link as ", Month D, YYYY". Drop the leading
        # separator, and any trailing whitespace, which strptime rejects
        # as unconverted data.
        date_str = link.nextSibling.lstrip(', ').rstrip()
        release_date = time.strftime('%Y-%m-%d', time.strptime(date_str, '%B %d, %Y'))

        report_list.append({
            'doc_type': doc_type,
            'gov_id': gov_id,
            'release_date': release_date,
            'title': title,
            'category': category,
            'original_url': original_url,
        })
    return report_list
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment