Skip to content

Instantly share code, notes, and snippets.

@seanherron
Created July 15, 2013 03:16
Show Gist options
  • Save seanherron/5997278 to your computer and use it in GitHub Desktop.
Save seanherron/5997278 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
#!/usr/bin/env python
import hashlib
import md5
import string

import lxml.etree
import lxml.html
import requests
import scraperwiki
# The uppercase ASCII letters 'A'..'Z'. The crawl loop below iterates over
# them and substitutes each one into the DrugInitial query parameter of the
# browse URL.
index_categories = string.ascii_uppercase
def DataTableScrape(descriptor, keyword, root=None):
    """Return the values of the table cell that follows the *keyword* label.

    Selects the first ``<td>`` that immediately follows a ``<td>`` whose text
    contains *keyword*, removes bullet characters (U+2022) and the
    ``\\r\\n\\t\\t`` / ``\\r\\n\\t`` runs from its text, and splits what is left
    on ``;``.

    Args:
        descriptor: Unused. The original implementation overwrote it before
            reading it; it is kept so existing positional and keyword calls
            keep working.
        keyword: Label text to look for, interpolated verbatim into the CSS
            ``:contains("...")`` selector.
        root: Parsed document exposing ``cssselect()``. When omitted, the
            module-level ``drug_page_root`` is used, which is what the
            two-argument form has always relied on.

    Returns:
        A list of the ``;``-separated values with surrounding whitespace
        removed and empty entries dropped. An empty list when no matching
        cell exists.
    """
    if root is None:
        # Backward-compatible fallback: the crawl loop rebinds this global
        # for every drug page before calling the function.
        root = drug_page_root

    cells = root.cssselect('td:contains("%s") + td' % keyword)
    if not cells:
        return []

    text = cells[0].text_content().strip()
    # Order matters: the two-tab run has to go before its one-tab prefix,
    # otherwise a stray tab would be left behind.
    for noise in (u"\u2022", u"\r\n\t\t", u"\r\n\t"):
        text = text.replace(noise, "")

    # Strip each value individually: stripping only the whole cell leaves
    # leading whitespace on every value that follows a "; " separator.
    values = [part.strip() for part in text.split(';')]
    return [value for value in values if value]
# Crawl script: walk the A-Z browse listings, open every drug detail page
# linked from them and collect a few fields per drug.

# Base address that every URL below is built from.
BASE_URL = 'http://www.accessdata.fda.gov/scripts/cder/drugsatfda/'
BROWSE_URL = BASE_URL + 'index.cfm?fuseaction=Search.SearchResults_Browse&DrugInitial=%s'

# Key column(s) for the save call at the bottom (currently disabled).
# Loop-invariant, so it is defined once instead of once per drug.
unique_keys = ['id']

# Set a session up and hit one index page first; per the original author's
# note this is what gives the session the cookies later requests need.
session = requests.Session()
session.get(BROWSE_URL % 'A')

# Loop through the letters and open each listing of drugs.
for letter in index_categories:
    # StartRow/StepSize are passed exactly as in the original request;
    # presumably the huge StepSize returns the whole listing in one page.
    page = session.get((BROWSE_URL + '&StartRow=1&StepSize=1000000') % letter)

    # Use lxml to grab all the links to drug detail pages.
    root = lxml.html.fromstring(page.content)
    links = root.cssselect('td.product_table li a')

    for link in links:
        href = link.get('href')
        if not href:
            # An anchor without href cannot be turned into a detail URL;
            # concatenating None below would raise TypeError.
            continue

        url = BASE_URL + href
        drug_page = session.get(url)
        # DataTableScrape reads this module-level name when it is called
        # with two arguments, so it must be rebound before the calls below.
        drug_page_root = lxml.html.fromstring(drug_page.content)

        # Row key: hex MD5 of the detail URL. The hex form is plain ASCII
        # text, unlike the 16 raw bytes returned by digest().
        key = hashlib.md5(url.encode('utf-8')).hexdigest()

        # Drug name
        name = DataTableScrape(descriptor="name", keyword="Drug Name")
        # Active ingredients
        active_ingredients = DataTableScrape(descriptor="active_ingredient", keyword="Active Ingredient")
        # Company
        company = DataTableScrape(descriptor="company", keyword="Company")
        # The FDA NDA (application number)
        application_num = DataTableScrape(descriptor="application_num", keyword="FDA Application")

        data = {
            'id': key,
            'name': name,
            'active_ingredients': active_ingredients,
            'company': company,
            'application_num': application_num,
        }

        # Debug output: show the type of every collected value. Iterating
        # the dict directly yields only its keys, which are always str.
        for field, value in sorted(data.items()):
            print("%s: %s" % (field, type(value)))

        # NOTE: persisting is disabled in the original script; left as-is.
        #scraperwiki.sql.save(unique_keys, data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment