kravietz/sitemap-split.py

## sitemap-split.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import print_function

from gzip import GzipFile
import gzip
import sys

__author__ = 'Paweł Krawczyk'

from lxml import etree

# Maximum number of <url> entries written to a single output sitemap.
URL_NUM_LIMIT = 50000
# http://lxml.de/tutorial.html#namespaces
SITEMAP_NS = 'http://www.sitemaps.org/schemas/sitemap/0.9'
URL_TAG = '{' + SITEMAP_NS + '}url'


def _chunk_filename(input_file_name, iteration):
    """Return the gzipped output file name for chunk number *iteration*.

    The chunk number is inserted just before the extension and the result
    always ends in ``.gz``:
    sitemap20.xml -> sitemap200.xml.gz, sitemap201.xml.gz, ...

    A trailing ``.gz`` on the input name is dropped first, and only the last
    path component is inspected for an extension, so names such as
    ``./sitemap.xml`` or ``/srv/www.example.com/sitemap.xml.gz`` are handled.
    """
    import os  # local import: only this helper needs it

    name = input_file_name
    if name.endswith('.gz'):
        name = name[:-len('.gz')]
    stem, extension = os.path.splitext(name)
    return '{}{}{}.gz'.format(stem, iteration, extension)


def _read_root(input_file_name):
    """Parse *input_file_name* (plain or gzipped XML) and return its root."""
    # Decide by the actual suffix, not by '.gz' appearing somewhere in the path.
    if input_file_name.endswith('.gz'):
        input_file = gzip.open(input_file_name, 'rb')
    else:
        input_file = open(input_file_name, 'rb')
    try:
        return etree.parse(input_file).getroot()
    finally:
        input_file.close()


def _write_chunk(output_filename, output_set):
    """Serialize *output_set* into the gzip file *output_filename*."""
    output_gzip = GzipFile(output_filename, 'wb')
    try:
        output_gzip.write(etree.tostring(output_set))
    finally:
        output_gzip.close()


def main(argv=None):
    """Split a sitemap into gzipped chunks of at most URL_NUM_LIMIT URLs.

    :param argv: argument list in ``sys.argv`` layout (program name first);
        defaults to ``sys.argv``.
    :return: process exit status, 0 on success and 1 on a usage error.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 2:
        program = argv[0] if argv else 'sitemap-split.py'
        print('Usage: {} input.xml'.format(program))
        return 1

    input_file_name = argv[1]

    # parse the input XML
    input_root = _read_root(input_file_name)
    print('Input {} root {}'.format(input_file_name, input_root))

    # start processing through the input file urls
    current_urls_seen = 0
    total_urls_seen = 0
    iteration = 0
    output_set = None
    current_output_filename = None

    for url in input_root.iterfind(URL_TAG):

        # start a new output set for the current chunk
        if output_set is None:
            current_output_filename = _chunk_filename(input_file_name, iteration)
            output_set = etree.Element('urlset', nsmap={None: SITEMAP_NS})
            print('Writing', current_output_filename,
                  'current position in input file is', total_urls_seen)

        # append the current URL to the new output set
        # (lxml moves the element out of the input tree)
        output_set.append(url)
        current_urls_seen += 1
        total_urls_seen += 1

        # flush the current set once it is full
        if current_urls_seen == URL_NUM_LIMIT:
            _write_chunk(current_output_filename, output_set)
            print('Closed', current_output_filename, 'with', current_urls_seen,
                  'URLs saved', total_urls_seen, 'total')

            # reset/update counters
            output_set = None
            current_urls_seen = 0
            iteration += 1

    # Write the last, partially filled chunk. Nothing is pending when the
    # input had no URLs or the count was an exact multiple of the limit.
    if output_set is not None:
        _write_chunk(current_output_filename, output_set)
        print('Closed', current_output_filename, 'with', current_urls_seen,
              'URLs saved', total_urls_seen, 'total')

    return 0


if __name__ == '__main__':
    sys.exit(main())
	#!/usr/bin/python
	# -- coding: utf-8 --
	from __future__ import print_function

	from gzip import GzipFile
	import gzip
	import sys

	__author__ = 'Paweł Krawczyk'

	from lxml import etree

	# Maximum number of <url> entries written to a single output sitemap.
	URL_NUM_LIMIT = 50000
	# http://lxml.de/tutorial.html#namespaces
	SITEMAP_NS = 'http://www.sitemaps.org/schemas/sitemap/0.9'
	URL_TAG = '{' + SITEMAP_NS + '}url'


	def _chunk_filename(input_file_name, iteration):
	    """Return the gzipped output file name for chunk number *iteration*.

	    The chunk number is inserted just before the extension and the result
	    always ends in ``.gz``:
	    sitemap20.xml -> sitemap200.xml.gz, sitemap201.xml.gz, ...

	    A trailing ``.gz`` on the input name is dropped first, and only the last
	    path component is inspected for an extension, so names such as
	    ``./sitemap.xml`` or ``/srv/www.example.com/sitemap.xml.gz`` are handled.
	    """
	    import os  # local import: only this helper needs it

	    name = input_file_name
	    if name.endswith('.gz'):
	        name = name[:-len('.gz')]
	    stem, extension = os.path.splitext(name)
	    return '{}{}{}.gz'.format(stem, iteration, extension)


	def _read_root(input_file_name):
	    """Parse *input_file_name* (plain or gzipped XML) and return its root."""
	    # Decide by the actual suffix, not by '.gz' appearing somewhere in the path.
	    if input_file_name.endswith('.gz'):
	        input_file = gzip.open(input_file_name, 'rb')
	    else:
	        input_file = open(input_file_name, 'rb')
	    try:
	        return etree.parse(input_file).getroot()
	    finally:
	        input_file.close()


	def _write_chunk(output_filename, output_set):
	    """Serialize *output_set* into the gzip file *output_filename*."""
	    output_gzip = GzipFile(output_filename, 'wb')
	    try:
	        output_gzip.write(etree.tostring(output_set))
	    finally:
	        output_gzip.close()


	def main(argv=None):
	    """Split a sitemap into gzipped chunks of at most URL_NUM_LIMIT URLs.

	    :param argv: argument list in ``sys.argv`` layout (program name first);
	        defaults to ``sys.argv``.
	    :return: process exit status, 0 on success and 1 on a usage error.
	    """
	    if argv is None:
	        argv = sys.argv

	    if len(argv) < 2:
	        program = argv[0] if argv else 'sitemap-split.py'
	        print('Usage: {} input.xml'.format(program))
	        return 1

	    input_file_name = argv[1]

	    # parse the input XML
	    input_root = _read_root(input_file_name)
	    print('Input {} root {}'.format(input_file_name, input_root))

	    # start processing through the input file urls
	    current_urls_seen = 0
	    total_urls_seen = 0
	    iteration = 0
	    output_set = None
	    current_output_filename = None

	    for url in input_root.iterfind(URL_TAG):

	        # start a new output set for the current chunk
	        if output_set is None:
	            current_output_filename = _chunk_filename(input_file_name, iteration)
	            output_set = etree.Element('urlset', nsmap={None: SITEMAP_NS})
	            print('Writing', current_output_filename,
	                  'current position in input file is', total_urls_seen)

	        # append the current URL to the new output set
	        # (lxml moves the element out of the input tree)
	        output_set.append(url)
	        current_urls_seen += 1
	        total_urls_seen += 1

	        # flush the current set once it is full
	        if current_urls_seen == URL_NUM_LIMIT:
	            _write_chunk(current_output_filename, output_set)
	            print('Closed', current_output_filename, 'with', current_urls_seen,
	                  'URLs saved', total_urls_seen, 'total')

	            # reset/update counters
	            output_set = None
	            current_urls_seen = 0
	            iteration += 1

	    # Write the last, partially filled chunk. Nothing is pending when the
	    # input had no URLs or the count was an exact multiple of the limit.
	    if output_set is not None:
	        _write_chunk(current_output_filename, output_set)
	        print('Closed', current_output_filename, 'with', current_urls_seen,
	              'URLs saved', total_urls_seen, 'total')

	    return 0


	if __name__ == '__main__':
	    sys.exit(main())