Skip to content

Instantly share code, notes, and snippets.

@michaeljoseph
Last active December 25, 2015 01:59
Show Gist options
  • Save michaeljoseph/6899354 to your computer and use it in GitHub Desktop.
Save michaeljoseph/6899354 to your computer and use it in GitHub Desktop.
Use pyquery (pq) to parse XML
→ python parse_xml.py
['__class__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__doc__', '__format__', '__getattribute__', '__getitem__', '__hash__', '__init__', '__iter__', '__len__', '__new__', '__nonzero__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', '_init', 'addnext', 'addprevious', 'append', 'attrib', 'base', 'clear', 'extend', 'find', 'findall', 'findtext', 'get', 'getchildren', 'getiterator', 'getnext', 'getparent', 'getprevious', 'getroottree', 'index', 'insert', 'items', 'iter', 'iterancestors', 'iterchildren', 'iterdescendants', 'iterfind', 'itersiblings', 'itertext', 'keys', 'makeelement', 'nsmap', 'prefix', 'remove', 'replace', 'set', 'sourceline', 'tag', 'tail', 'text', 'values', 'xpath']
http://badsite.com/baduri.jsp
# Use pyquery (imported as pq) to parse an XML document and print
# every <url> element found in it.
from pyquery import PyQuery as pq

# Sample report document used as the parser input.
xml_string = """
<report xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="https://report.cybertip.org/ispws/xsd">
<incidentSummary>
<incidentType>Child Pornography (possession, manufacture, and distribution)</incidentType>
<incidentDateTime>2012-10-15T08:00:00-07:00</incidentDateTime>
</incidentSummary>
<internetDetails>
<webPageIncident>
<url>http://badsite.com/baduri.jsp</url>
</webPageIncident>
</internetDetails>
<reporter>
<reportingPerson>
<firstName>John</firstName>
<lastName>Smith</lastName>
</reportingPerson>
</reporter>
</report>
"""

# Ask pyquery for its XML parser (parser='xml'), then select every
# <url> element in the parsed document.
urls = pq(xml_string, parser='xml')('url')
for url in urls:
    # Each item is an element object; dir() lists the API it exposes
    # (tag, text, attrib, xpath, ...), which is handy for exploration.
    print(dir(url))
    # The element's text content, i.e. the URL itself.
    print(url.text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment