Skip to content

Instantly share code, notes, and snippets.

@bycoffe
Created April 20, 2011 13:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bycoffe/931340 to your computer and use it in GitHub Desktop.
Save bycoffe/931340 to your computer and use it in GitHub Desktop.
First pass at scraping the FAPIIS site, just to see if it's possible.
"""
First pass at scraping the FAPIIS site, just to see if it's possible.
This seems to work, but it's impossible to know whether the data it's
returning will be accurate since FAPIIS doesn't currently contain any
data.
This method requires knowing the company's DUNS number, though it's
likely possible to back this up a step to allow for searching by name.
"""
import urllib
import urllib2
import lxml.html
def get_summary_page(duns):
    """Fetch the FAPIIS adverse-report summary page for one contractor.

    Submits the search form as a POST request, with ``duns`` in the
    ``rctrID`` field and ``action`` set to ``showReportsSummary``.

    Args:
        duns: DUNS number of the contractor to look up.

    Returns:
        The raw response body, exactly as read from the HTTP response.

    Raises:
        Any error raised by ``urllib2.urlopen`` (e.g. network or HTTP
        failures) propagates to the caller unchanged.
    """
    url = 'https://www.fapiis.gov/fapiis/fapiis/govt/adversereportsearch.do'
    # Every form field is sent, even when empty; only rctrID carries
    # real input.
    data = {
        'org.apache.struts.taglib.html.TOKEN': '',
        'action': 'showReportsSummary',
        'rctrID': duns,  # DUNS number
        'rctrName': '',
        'nameOption': '',
        # Placeholder text; the value itself says the content is arbitrary.
        'searchctrName': 'This can really be anything',
        'searchduns': '',
        'cageCode': '',
        'sequence': '',
        'inputsequence': '',
    }
    # Supplying a data payload makes urllib2 issue a POST instead of a GET.
    req = urllib2.Request(url, data=urllib.urlencode(data))
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Release the connection even if read() fails; the urllib2
        # response object is not a context manager, hence try/finally.
        response.close()
def parse_summary_page(page):
    """Print the link, report type and count for each summary-table row.

    Looks up the element with id ``listdata`` in the parsed HTML and walks
    its ``tr`` rows, skipping the first two (presumably header rows --
    TODO confirm against a live page). For each remaining row it prints
    the ``onclick`` attribute of the row's first ``input`` element, the
    text of the second cell, the text of the third cell, and a blank line.

    Rows missing any of these pieces are skipped silently.

    Args:
        page: HTML source of the summary page.

    Returns:
        None. Returns without printing anything when the page has no
        ``#listdata`` element.
    """
    doc = lxml.html.fromstring(page)
    tables = doc.cssselect('#listdata')
    if not tables:
        # No results table on the page; nothing to report.
        return

    for row in tables[0].cssselect('tr')[2:]:
        cells = row.cssselect('td')
        try:
            link = row.cssselect('input')[0].attrib['onclick'].strip()
            report_type = cells[1].text_content().strip()
            count = cells[2].text_content().strip()
        except (IndexError, KeyError):
            # IndexError: the row has no <input> or too few cells.
            # KeyError: the <input> has no onclick attribute. Either way
            # the row is malformed, so skip it rather than abort the parse.
            continue
        # Parenthesised single-argument prints produce the same output
        # under Python 2's print statement.
        print(link)
        print(report_type)
        print(count)
        print('')
def _main():
    """Fetch and print the summary for one hard-coded DUNS number."""
    sample_duns = 192835515
    parse_summary_page(get_summary_page(sample_duns))


# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    _main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment