Skip to content

Instantly share code, notes, and snippets.

@kravietz
Created February 19, 2015 12:43
Show Gist options
  • Save kravietz/f2eec690ecb4dde84206 to your computer and use it in GitHub Desktop.
Save kravietz/f2eec690ecb4dde84206 to your computer and use it in GitHub Desktop.
XML sitemap split into 50k chunks
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import print_function

__author__ = 'Paweł Krawczyk'

import gzip
import os
import sys
from gzip import GzipFile

from lxml import etree
# Maximum number of <url> entries written to a single output file.
URL_NUM_LIMIT = 50000
SITEMAP_NS = 'http://www.sitemaps.org/schemas/sitemap/0.9'
# lxml addresses namespaced tags as '{namespace}tag', see
# http://lxml.de/tutorial.html#namespaces
URL_TAG = '{%s}url' % SITEMAP_NS


def is_gzipped(file_name):
    """Return True when *file_name* carries the ``.gz`` suffix."""
    return file_name.endswith('.gz')


def chunk_file_name(input_file_name, index):
    """Return the gzipped output file name for chunk number *index*.

    The index is inserted between the base name and its extension:
    sitemap20.xml -> sitemap200.xml.gz, sitemap201.xml.gz, ...
    A trailing ``.gz`` on the input name is dropped first so that
    sitemap.xml.gz -> sitemap0.xml.gz.  Dots in directory names or in the
    base name itself are left alone, and a name without any extension
    simply yields ``<name><index>.gz``.
    """
    base_name = input_file_name
    if is_gzipped(base_name):
        base_name = base_name[:-len('.gz')]
    stem, extension = os.path.splitext(base_name)
    return '{}{}{}.gz'.format(stem, index, extension)


def iter_chunks(items, chunk_size):
    """Yield consecutive lists of at most *chunk_size* elements of *items*.

    Only non-empty lists are produced: an empty input yields nothing and an
    input whose length is an exact multiple of *chunk_size* does not end
    with an empty chunk.

    Raises:
        ValueError: if *chunk_size* is smaller than 1 (raised when the
            iteration starts, because this is a generator).
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be at least 1')
    chunk = []
    for item in items:
        chunk.append(item)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk


def write_chunk(file_name, urls):
    """Write *urls* under a new <urlset> root into the gzip file *file_name*."""
    url_set = etree.Element('urlset', nsmap={None: SITEMAP_NS})
    # Per the lxml tutorial an element lives in exactly one tree, so this
    # moves the <url> elements out of the input tree instead of copying.
    url_set.extend(urls)
    # The context manager closes the file even if serialization fails.
    with GzipFile(file_name, 'wb') as output_gzip:
        output_gzip.write(etree.tostring(url_set))


def split_sitemap(input_file_name, url_limit=URL_NUM_LIMIT):
    """Split a sitemap into gzipped files of at most *url_limit* URLs each.

    Args:
        input_file_name: path of the input sitemap, gzipped (``.gz``
            suffix) or uncompressed XML.
        url_limit: maximum number of <url> elements per output file.

    Returns:
        The total number of URLs written.  Zero means the input contained
        no <url> element and no output file was created.
    """
    # open input file supporting gzipped and uncompressed XML
    opener = gzip.open if is_gzipped(input_file_name) else open
    with opener(input_file_name, 'rb') as input_file:
        # parse the input XML; the whole tree is held in memory afterwards
        input_root = etree.parse(input_file).getroot()
        print('Input {} root {}'.format(input_file, input_root))

    total_urls_seen = 0
    chunks = iter_chunks(input_root.iterfind(URL_TAG), url_limit)
    for iteration, urls in enumerate(chunks):
        current_output_filename = chunk_file_name(input_file_name, iteration)
        print('Writing', current_output_filename,
              'current position in input file is', total_urls_seen)
        write_chunk(current_output_filename, urls)
        total_urls_seen += len(urls)
        print('Closed', current_output_filename, 'with', len(urls),
              'URLs saved', total_urls_seen, 'total')

    if total_urls_seen == 0:
        print('No URLs found in', input_file_name)
    return total_urls_seen


def main(argv):
    """Command-line entry point; return the process exit status.

    Args:
        argv: full argument vector, program name first (like ``sys.argv``).
    """
    if len(argv) < 2:
        print('Usage: {} input.xml'.format(argv[0]), file=sys.stderr)
        return 1
    split_sitemap(argv[1])
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment