Skip to content

Instantly share code, notes, and snippets.

@whichlight
Created June 25, 2011 01:03
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save whichlight/1045974 to your computer and use it in GitHub Desktop.
Save whichlight/1045974 to your computer and use it in GitHub Desktop.
ScraperWiki scraper for the Access to Medicine Index
import scraperwiki
import lxml.html
import re
# Scrape the 2010 Access to Medicine Index table and store one record per
# table row in the ScraperWiki datastore, keyed by the row's name.
#
# The cell containing an element with CSS class "naam" supplies the name; the
# grandchild elements of each cell are expected to carry a title attribute of
# the form "<label>: <value>", which becomes one column of the saved record.
#
# NOTE(review): the nesting below was reconstructed from a copy of the script
# that had lost its indentation -- confirm that saving once per row is the
# intended placement.


def _split_title(title):
    """Split a "<label>: <value>" title attribute into a (label, value) pair.

    Returns None when the title is missing, has no colon, or has an empty
    label, so the caller can skip elements that carry no labelled value.
    The value is returned as a whitespace-stripped string.
    """
    if not title or ":" not in title:
        return None
    # Split on the last colon: the pattern this replaces ("title=.*:") was
    # greedy, so the label ran up to the final colon. Colons left inside the
    # label are dropped, as before.
    label, _, value = title.rpartition(":")
    # "R&D" is stored as "RD", exactly as the original did -- presumably to
    # keep "&" out of the datastore column name.
    label = label.replace(":", "").replace("R&D", "RD").strip()
    if not label:
        return None
    return label, value.strip()


html = scraperwiki.scrape("http://www.accesstomedicineindex.org/content/index-2010-0")
root = lxml.html.fromstring(html)
table = root.cssselect("tbody")[0]
for tr in table.cssselect("tr"):
    # Start every row from a clean slate so a record never inherits the name
    # or the values of the row before it.
    name = None
    data = {}
    for td in tr:
        # pull name
        if td.find_class("naam"):
            name = td.text_content()
        for div in td:
            for d in div:
                # Read the decoded attribute directly rather than
                # regex-matching the serialized markup: this tolerates
                # elements without a title and does not depend on how the
                # serializer orders or escapes attributes.
                parsed = _split_title(d.get("title"))
                if parsed is None:
                    continue
                label, value = parsed
                data[label] = value
    if name is None:
        # Without a name there is no unique key to save the row under.
        continue
    data["name"] = name
    scraperwiki.sqlite.save(unique_keys=['name'], data=data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment