Skip to content

Instantly share code, notes, and snippets.

@redapple
Last active December 19, 2015 03:39
Show Gist options
  • Save redapple/5891624 to your computer and use it in GitHub Desktop.
Save redapple/5891624 to your computer and use it in GitHub Desktop.
Parsing XML with parslepy
import lxml.etree
import parslepy
import urllib2
import pprint
# Download the iTunes "top albums" XML feed named in `url` and parse it
# with lxml; `root` ends up bound to the root element of the document.
xml_parser = lxml.etree.XMLParser()
url = 'https://itunes.apple.com/us/rss/topalbums/limit=10/explicit=true/xml'
req = urllib2.Request(url)
# Close the HTTP response explicitly once parsing has finished (or failed)
# rather than leaving the connection open until garbage collection.
# try/finally is used instead of a with-statement because Python 2's
# urllib2 response objects do not implement the context-manager protocol.
response = urllib2.urlopen(req)
try:
    root = lxml.etree.parse(response, parser=xml_parser).getroot()
finally:
    response.close()
# Build parslepy's XPath selector handler with a prefix -> namespace-URI
# map, so selectors can be written with the "atom:" and "im:" prefixes.
xsh = parslepy.XPathSelectorHandler(
    namespaces=dict(
        atom='http://www.w3.org/2005/Atom',
        im='http://itunes.apple.com/rss',
    ),
)
# Extraction rules handed to parslepy.Parselet. Keys name the output
# fields; values are XPath selectors. Per parslepy's rule syntax, a key
# written as "name(selector)" applies the nested rules relative to the
# elements matched by that selector, and wrapping the nested rules in a
# list collects one result per match.

# Fields taken from each <im:artist> element.
_artist_rules = {
    "name": ".",
    "href": "@href",
}

# Fields taken from each <im:image> element.
_image_rules = {
    "height": "@height",
    "url": ".",
}

# Fields taken from each <atom:entry> element of the feed.
_entry_rules = {
    "title": "atom:title",
    "name": "im:name",
    "id": "atom:id/@im:id",
    "artist(im:artist)": _artist_rules,
    "images(im:image)": [_image_rules],
    "releasedate": "im:releaseDate",
}

rules = {
    "entries(//atom:feed/atom:entry)": [_entry_rules],
}
# Compile the rules into a Parselet bound to the namespace-aware selector
# handler, run it against the parsed feed root, and pretty-print the result.
parselet = parslepy.Parselet(rules, selector_handler=xsh)
extracted = parselet.extract(root)
pprint.pprint(extracted)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment