Skip to content

Instantly share code, notes, and snippets.

@b1
Last active December 14, 2015 20:09
Show Gist options
  • Save b1/5141901 to your computer and use it in GitHub Desktop.
Save b1/5141901 to your computer and use it in GitHub Desktop.
Test script. parse VS iterparse On google.com/sitemap.xml
#!/usr/bin/python
# coding: utf-8
"""Basic example of parsing an XML file with lxml: parse vs. iterparse."""
from StringIO import StringIO
from datetime import datetime
from lxml import etree
import urllib2
# Banner: what this script compares and where its source lives.
print("""
Test script.
parse VS iterparse
On google.com/sitemap.xml
https://gist.github.com/b1/5141901/raw/google_sitemap_example.py
""")

# Download the sitemap once and keep the raw payload in `sitemap`.
# The response is closed explicitly in a `finally` clause so the connection
# is released even if read() fails (Python 2 urllib2 responses cannot be
# used in a `with` statement).
response = urllib2.urlopen('http://google.com/sitemap.xml', timeout=10)
try:
    sitemap = response.read()
finally:
    response.close()
# Prefix -> namespace URI map passed to the XPath queries.
# These URIs were copied by hand from the sitemap XML file.
NS = {
    'x': 'http://www.sitemaps.org/schemas/sitemap/0.9',
    'x2': 'http://www.google.com/schemas/sitemap-mobile/1.0',
}
# Benchmark 1: build the complete tree with etree.parse, then query it.
res = []
print('>>parse')
start_time = datetime.now()
tree = etree.parse(StringIO(sitemap))
urls = tree.xpath('//*/x:url', namespaces=NS)
for url in urls:
    # Each row holds the text of the <loc> and <priority> descendants,
    # followed by a boolean telling whether an <x2:mobile> element exists.
    row = url.xpath('.//x:loc/text() | .//x:priority/text()', namespaces=NS)
    row.append(url.xpath('boolean(.//x2:mobile)', namespaces=NS))
    res.append(row)
print('Time: %s' % (datetime.now() - start_time))
print('Test:')
# Spot-check a single row, but do not crash with IndexError when the
# sitemap contains fewer than 5001 <url> entries.
if len(res) > 5000:
    print(res[5000])
else:
    print('only %d entries found, nothing at index 5000' % len(res))
print('------')
# Benchmark 2: stream the document with etree.iterparse, receiving only the
# sitemap <url> elements (the tag filter is built from the NS map so the
# namespace URI is not duplicated here).
res = []
print('>>iterparse')
start_time = datetime.now()
urls = etree.iterparse(StringIO(sitemap), tag='{%s}url' % NS['x'])
for _event, url in urls:
    # Each row holds the text of the <loc> and <priority> descendants,
    # followed by a boolean telling whether an <x2:mobile> element exists.
    row = url.xpath('.//x:loc/text() | .//x:priority/text()', namespaces=NS)
    row.append(url.xpath('boolean(.//x2:mobile)', namespaces=NS))
    res.append(row)
    # Drop the processed element's content and the already-handled siblings
    # before it; without this iterparse still keeps the whole tree in
    # memory, which defeats the purpose of streaming.
    url.clear()
    while url.getprevious() is not None:
        del url.getparent()[0]
print('Time: %s' % (datetime.now() - start_time))
print('Test:')
# Spot-check a single row, but do not crash with IndexError when the
# sitemap contains fewer than 5001 <url> entries.
if len(res) > 5000:
    print(res[5000])
else:
    print('only %d entries found, nothing at index 5000' % len(res))
print('------')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment