XML sitemap split into 50k chunks
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import print_function

import gzip
import os
import sys
from gzip import GzipFile

from lxml import etree

__author__ = 'Paweł Krawczyk'
# Maximum number of <url> entries written to a single output sitemap.
URL_NUM_LIMIT = 50000
# http://lxml.de/tutorial.html#namespaces
URL_TAG = '{http://www.sitemaps.org/schemas/sitemap/0.9}url'
# Default namespace declared on every generated <urlset> root element.
SITEMAP_NAMESPACE = 'http://www.sitemaps.org/schemas/sitemap/0.9'


def is_gzipped(file_name):
    """Return True when *file_name* ends with the ``.gz`` suffix."""
    # A suffix test avoids treating names such as "my.gzip.xml" as gzipped.
    return file_name.endswith('.gz')


def chunk_filename(input_file_name, index):
    """Build the output file name for chunk number *index*.

    The chunk index is inserted between the stem and the extension of the
    input name, and the result always ends in ``.gz``:
    sitemap20.xml -> sitemap200.xml.gz, sitemap201.xml.gz...

    Only the final path component is rewritten, so dots in the directory
    part (for example ``./sitemap.xml``) are left alone.
    """
    directory, base_name = os.path.split(input_file_name)
    if is_gzipped(base_name):
        # Drop the compression suffix; it is re-added below.
        base_name = base_name[:-len('.gz')]
    stem, extension = os.path.splitext(base_name)
    return os.path.join(
        directory, '{}{}{}.gz'.format(stem, index, extension))


def write_urlset(output_filename, output_set):
    """Serialize *output_set* and write it gzip-compressed to *output_filename*."""
    # The context manager closes the file even if serialization fails.
    with GzipFile(output_filename, 'wb') as output_gzip:
        output_gzip.write(etree.tostring(output_set))


def main(argv=None):
    """Split the sitemap named in *argv* into chunks of URL_NUM_LIMIT URLs.

    *argv* defaults to ``sys.argv``; ``argv[1]`` is the input sitemap, either
    plain XML or gzip-compressed. Returns the process exit status: 0 on
    success, 1 when the input file argument is missing.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print('Usage: {} input.xml'.format(argv[0]))
        return 1
    input_file_name = argv[1]

    # open input file supporting gzipped and uncompressed XML
    opener = gzip.open if is_gzipped(input_file_name) else open
    with opener(input_file_name, 'rb') as input_file:
        # parse the input XML
        input_root = etree.parse(input_file).getroot()
        print('Input {} root {}'.format(input_file, input_root))

    # start processing through the input file urls
    current_urls_seen = 0
    total_urls_seen = 0
    iteration = 0
    output_set = None  # <urlset> being filled; None between chunks
    current_output_filename = None
    for url in input_root.iterfind(URL_TAG):
        # start a new output set for the current chunk
        if output_set is None:
            current_output_filename = chunk_filename(input_file_name, iteration)
            output_set = etree.Element(
                'urlset', nsmap={None: SITEMAP_NAMESPACE})
            print('Writing', current_output_filename,
                  'current position in input file is', total_urls_seen)

        # append the current URL to the new output set
        # (per the lxml tutorial, append() moves the element out of the input tree)
        output_set.append(url)
        current_urls_seen += 1
        total_urls_seen += 1

        # write out the current set once it holds URL_NUM_LIMIT entries
        if current_urls_seen == URL_NUM_LIMIT:
            write_urlset(current_output_filename, output_set)
            print('Closed', current_output_filename, 'with', current_urls_seen,
                  'URLs saved', total_urls_seen, 'total')
            # reset/update counters
            output_set = None
            current_urls_seen = 0
            iteration += 1

    # Write the trailing partial chunk, if any. Skipping this when the last
    # chunk was exactly full (or the input had no URLs) avoids touching an
    # already-closed or never-opened output file.
    if output_set is not None:
        write_urlset(current_output_filename, output_set)
        print('Closed', current_output_filename, 'with', current_urls_seen,
              'URLs saved', total_urls_seen, 'total')
    return 0


if __name__ == '__main__':
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment