bycoffe/fapiis_scraper.py

## fapiis_scraper.py
"""
First pass at scraping the FAPIIS site, just to see if it's possible.
This seems to work, but it's impossible to know whether the data it's
returning will be accurate since FAPIIS doesn't currently contain any
data.

This method requires knowing the company's DUNS number, though it's
likely possible to back this up a step to allow for searching by name.
"""
import urllib
import urllib2

import lxml.html


def get_summary_page(duns):
    """Fetch the FAPIIS report-summary page for a single DUNS number.

    The search form is submitted to the ``adversereportsearch.do`` endpoint
    with ``action=showReportsSummary``. Because a request body is supplied,
    urllib2 sends the request as a POST.

    duns -- the company's DUNS number, sent as the ``rctrID`` form field.

    Returns the response body exactly as read from the server. Errors
    raised by ``urllib2.urlopen`` are not caught here and propagate to the
    caller.
    """
    url = 'https://www.fapiis.gov/fapiis/fapiis/govt/adversereportsearch.do'
    data = {'org.apache.struts.taglib.html.TOKEN': '',
            'action': 'showReportsSummary',
            'rctrID': duns,  # DUNS number
            'rctrName': '',
            'nameOption': '',
            # Placeholder text; per the original author the value is arbitrary.
            'searchctrName': 'This can really be anything',
            'searchduns': '',
            'cageCode': '',
            'sequence': '',
            'inputsequence': '',
            }
    req = urllib2.Request(url, data=urllib.urlencode(data))
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Always release the underlying connection, even if read() raises.
        response.close()


def parse_summary_page(page):
    """Print the link, report type and count for each summary-table row.

    page -- HTML text of a summary page (e.g. the value returned by
            get_summary_page()).

    The element with id ``listdata`` is looked up; if the page has none,
    the function returns without printing anything. The first two ``tr``
    rows of that table are skipped (presumably header rows -- confirm
    against a live page). For every remaining row, three values are
    printed, followed by a blank line:

      * the ``onclick`` attribute of the row's first ``input`` element
      * the text of the row's second ``td`` cell (report type)
      * the text of the row's third ``td`` cell (count)

    Rows that lack any of these pieces are skipped. Always returns None.
    """
    doc = lxml.html.fromstring(page)
    tables = doc.cssselect('#listdata')
    if not tables:
        return

    for row in tables[0].cssselect('tr')[2:]:
        cells = row.cssselect('td')
        try:
            link = row.cssselect('input')[0].attrib['onclick'].strip()
            report_type = cells[1].text_content().strip()
            count = cells[2].text_content().strip()
        except (IndexError, KeyError):
            # IndexError: the row has no <input> or fewer than three cells.
            # KeyError: the row's first <input> has no onclick attribute.
            continue

        # Single-argument print(...) produces the same output whether it is
        # parsed as a print statement (Python 2) or a function call.
        print(link)
        print(report_type)
        print(count)
        print('')


def _main():
    """Fetch and print the report summary for one hard-coded DUNS number."""
    parse_summary_page(get_summary_page(192835515))


# Run the demo scrape only when executed as a script, not when imported.
if __name__ == "__main__":
    _main()
	"""
	First pass at scraping the FAPIIS site, just to see if it's possible.
	This seems to work, but it's impossible to know whether the data it's
	returning will be accurate since FAPIIS doesn't currently contain any
	data.

	This method requires knowing the company's DUNS number, though it's
	likely possible to back this up a step to allow for searching by name.
	"""
	import urllib
	import urllib2

	import lxml.html


	def get_summary_page(duns):
		"""Fetch the FAPIIS report-summary page for a single DUNS number.

		The search form is submitted to the ``adversereportsearch.do``
		endpoint with ``action=showReportsSummary``. Because a request body
		is supplied, urllib2 sends the request as a POST.

		duns -- the company's DUNS number, sent as the ``rctrID`` form field.

		Returns the response body exactly as read from the server. Errors
		raised by ``urllib2.urlopen`` are not caught here and propagate to
		the caller.
		"""
		url = 'https://www.fapiis.gov/fapiis/fapiis/govt/adversereportsearch.do'
		data = {
			'org.apache.struts.taglib.html.TOKEN': '',
			'action': 'showReportsSummary',
			'rctrID': duns,  # DUNS number
			'rctrName': '',
			'nameOption': '',
			# Placeholder text; per the original author the value is arbitrary.
			'searchctrName': 'This can really be anything',
			'searchduns': '',
			'cageCode': '',
			'sequence': '',
			'inputsequence': '',
		}
		req = urllib2.Request(url, data=urllib.urlencode(data))
		response = urllib2.urlopen(req)
		try:
			return response.read()
		finally:
			# Always release the underlying connection, even if read() raises.
			response.close()


	def parse_summary_page(page):
		"""Print the link, report type and count for each summary-table row.

		page -- HTML text of a summary page (e.g. the value returned by
		        get_summary_page()).

		The element with id ``listdata`` is looked up; if the page has none,
		the function returns without printing anything. The first two ``tr``
		rows of that table are skipped (presumably header rows -- confirm
		against a live page). For every remaining row, the ``onclick``
		attribute of the row's first ``input``, the text of its second
		``td`` cell (report type) and the text of its third ``td`` cell
		(count) are printed, followed by a blank line.

		Rows that lack any of these pieces are skipped. Always returns None.
		"""
		doc = lxml.html.fromstring(page)
		tables = doc.cssselect('#listdata')
		if not tables:
			return

		for row in tables[0].cssselect('tr')[2:]:
			cells = row.cssselect('td')
			try:
				link = row.cssselect('input')[0].attrib['onclick'].strip()
				report_type = cells[1].text_content().strip()
				count = cells[2].text_content().strip()
			except (IndexError, KeyError):
				# IndexError: the row has no <input> or fewer than three cells.
				# KeyError: the row's first <input> has no onclick attribute.
				continue

			# Single-argument print(...) produces the same output whether it
			# is parsed as a print statement (Python 2) or a function call.
			print(link)
			print(report_type)
			print(count)
			print('')


	def _main():
		"""Fetch and print the report summary for one hard-coded DUNS number."""
		duns = 192835515
		page = get_summary_page(duns)
		parse_summary_page(page)


	# Run the demo scrape only when executed as a script, not when imported.
	if __name__ == '__main__':
		_main()