# ian-holden/update_sitemap_ecwid.py

## update_sitemap_ecwid.py
# create a sitemap for an Ecwid shop site.
#
# This script will create product and category page URLs in one of three forms to suit your Ecwid setup:
# default     - the original, default URLs e.g. http://shop.com/store#!/product_name/p/12345
# clean       - clean Ecwid URLs e.g. http://shop.com/store/product_name-p12345
# querystring - querystring URLs e.g. http://shop.com/store?store-page=product_name-p12345
#
# Steps:
# 1. read the current sitemap.xml which has possibly been generated by the site, or manually
# 2. read the Ecwid shop product and category details using the Ecwid v1 API and create sitemap url entries
# 3. merge in the Ecwid product and category url entries into the sitemap
# 4. write the new sitemap to STDOUT
#
# Usage:
# Set the 5 constants below, then run the script e.g.
#
# python update_sitemap_ecwid.py > sitemap.xml
#
# Notes:
#
# The existing sitemap must exist at the URL and contain at least one url entry for the shop page
#
# All url entries for page URLs that start with STORE_PAGE_URL will be removed
# and replaced with URL entries for the STORE_PAGE_URL and all the store products and categories
#
# This script should be compatible with Python 2 and 3
#
# gzipped sitemap files are not supported
#
from __future__ import print_function


# Set the following 5 constants for your site and Ecwid store.
STORE_ID = '1003'  # Ecwid store id; interpolated into the Ecwid v1 API base URL (ECWID_APIV1)
URL_TYPE = 'clean'  # 'querystring' or 'clean'; any other value (e.g. 'default') leaves Ecwid's original #!/ URLs untouched - set this to the type of Ecwid URLs you are using on the site
STORE_PAGE_URL = 'https://www.example.com/shop' # the URL of your store page. Sitemap entries whose URL starts with this (under http or https) are assumed to be in the Ecwid store
SITEMAP_SOURCE_URL = 'https://www.example.com/sitemap.xml' # URL of the site's existing sitemap.xml file
FORCE_HTTPS = False # True to force any urls for Ecwid pages to be https://... in the sitemap. False to leave the protocol as Ecwid provided

import json
import requests
import re
import time
import sys


# Date written into every generated <lastmod> element (YYYY-MM-DD).
TODAYS_DATE = time.strftime("%Y-%m-%d")
# Base URL of the Ecwid v1 API for this store; the page type is appended to it.
ECWID_APIV1 = 'https://app.ecwid.com/api/v1/%s/' % STORE_ID
# Flipped to True by replace_sitemap_url() once the store entries have been emitted.
have_inserted_sitemap_store = False

# Build the store page URL under the other protocol (http <-> https) so that
# sitemap entries using either scheme are recognised as store pages.
if STORE_PAGE_URL.startswith('http:'):
    STORE_PAGE_URL_ALT = 'https:' + STORE_PAGE_URL[len('http:'):]
else:
    # anything else is treated as starting with 'https:'
    STORE_PAGE_URL_ALT = 'http:' + STORE_PAGE_URL[len('https:'):]


def get_sitemap(sitemap_store):
    """Return the site's sitemap with the Ecwid store entries merged in.

    Downloads SITEMAP_SOURCE_URL, makes sure the ``image`` XML namespace is
    declared, then passes every ``<url>`` element through
    replace_sitemap_url(), which swaps the store page entries for
    *sitemap_store*.

    sitemap_store -- text of the <url> elements for the store's categories
                     and products.

    Logs to stderr and exits with status 1 if the sitemap request does not
    return HTTP 200.
    """
    response = requests.get(SITEMAP_SOURCE_URL)
    if response.status_code != 200:
        eprint("ERROR reading existing sitemap. Code: %s, response: %s" % (response.status_code, response.text))
        # sys.exit rather than exit(): the latter is only injected by the site module
        sys.exit(1)

    sitemap = response.text

    # make sure we have the image namespace defined
    #  xmlns:image="http://www.google.com/schemas/sitemap-image/1.1"
    if 'xmlns:image=' not in sitemap:
        sitemap = sitemap.replace(
            'xmlns=', 'xmlns:image="http://www.google.com/schemas/sitemap-image/1.1" xmlns=', 1)

    # Rewrite each <url> element. count/flags are passed by keyword because
    # passing them positionally to re.sub is deprecated.
    return re.sub(r'<url>(.*?)<loc>(.*?)</loc>(.*?)</url>\s*',
                  lambda match: replace_sitemap_url(match, sitemap_store),
                  sitemap, count=0, flags=re.DOTALL)

def replace_sitemap_url(match, sitemap_store):
    """Replacement callback for the <url> regex used in get_sitemap().

    match         -- regex match whose groups are (text before <loc>,
                     loc text, text after </loc>).
    sitemap_store -- text of the store's <url> elements.

    A non-store entry is returned rebuilt, with its loc text stripped.
    The first store entry found is replaced by an entry for STORE_PAGE_URL
    followed by *sitemap_store*; any later store entry is dropped (empty
    string). Sets the global have_inserted_sitemap_store when the store
    entries are emitted.
    """
    global have_inserted_sitemap_store

    page_url = match.group(2).strip()

    if not is_store_page_url(page_url):
        # an ordinary site page: keep it
        replacement = '<url>%s<loc>%s</loc>%s</url>' % (match.group(1), page_url, match.group(3))
    elif have_inserted_sitemap_store:
        # store entries were already written once; skip this one
        return ''
    else:
        # store page entry first, then all product and category entries
        replacement = "\n" + make_sitemap_entry(STORE_PAGE_URL) + "\n"
        replacement += sitemap_store.rstrip()
        have_inserted_sitemap_store = True

    return replacement + "\n   "


def is_store_page_url(url):
    """Return True if *url* is the store page or a page under it.

    The check is a plain prefix match against STORE_PAGE_URL and its
    other-protocol twin STORE_PAGE_URL_ALT, so both the http and https
    forms of the URL are recognised.
    """
    return url.startswith((STORE_PAGE_URL, STORE_PAGE_URL_ALT))


def get_store_data(page_type):
    """Fetch store data from the Ecwid v1 API.

    page_type -- API resource name appended to ECWID_APIV1; the script
                 calls this with 'categories' and 'products'.

    Returns the decoded JSON body (presumably a list of dicts - the caller
    iterates it and looks up 'url' in each item). Logs to stderr and exits
    with status 1 if the request does not return HTTP 200.
    """
    response = requests.get(ECWID_APIV1 + page_type)
    if response.status_code != 200:
        # page type, status code and body each go into their own labelled slot
        eprint("ERROR getting Ecwid %s data. Code: %s, response: %s"
               % (page_type, response.status_code, response.text))
        sys.exit(1)
    return json.loads(response.text)


def clean_url(url, url_type=None):
    """Convert a standard Ecwid ``#!/name/p/123`` URL to the configured form.

    url      -- page URL as supplied by Ecwid.
    url_type -- 'querystring', 'clean', or anything else to leave the URL
                form unchanged. Defaults to the module constant URL_TYPE.

    Returns the converted URL with ``&``, ``>`` and ``<`` percent-encoded
    so that it can be placed in the sitemap XML.
    """
    if url_type is None:
        url_type = URL_TYPE

    if url_type == 'querystring':
        url = re.sub(r'^(.*)#!/(.*)/(c|p)/(\d+)$', r'\1?store-page=\2-\3\4', url)
    elif url_type == 'clean':
        url = re.sub(r'^(.*)#!/(.*)/(c|p)/(\d+)$', r'\1/\2-\3\4', url)

    # characters that are special in XML are changed to their hex escapes
    return url.replace('&', '%26').replace('>', '%3E').replace('<', '%3C')


def check_url_protocol(url, force_https=None):
    """Return *url*, upgraded from http: to https: when forcing is enabled.

    url         -- URL to check.
    force_https -- True to rewrite a leading 'http:' as 'https:'. Defaults
                   to the module constant FORCE_HTTPS.
    """
    if force_https is None:
        force_https = FORCE_HTTPS
    # plain prefix test: no regex (and no positional re.sub count) needed
    if force_https and url.startswith('http:'):
        url = 'https:' + url[len('http:'):]
    return url


def make_sitemap_entry(p_url, i_url=''):
    """Make a sitemap <url> element for a page URL and optional image URL.

    p_url -- page URL, written to <loc>.
    i_url -- image URL, written to <image:image>; the image element is
             omitted when this is empty or None.

    Both URLs pass through check_url_protocol() and are XML-escaped.
    <lastmod> is TODAYS_DATE; changefreq and priority are fixed. The
    result is indented by three spaces, e.g.

       <url>
          <loc>
             https://www.pumpshoppro.com/shop?store-page=Pressure-Test-Pump-c18494815
         </loc>
          <lastmod>2017-09-20</lastmod>
          <changefreq>weekly</changefreq>
          <priority>0.5</priority>
          <image:image>
             <image:loc>https://dqzrr9k4bjpzk.cloudfront.net/images/9113287/380217779.jpg</image:loc>
          </image:image>
       </url>
    """
    # local import keeps the block self-contained; stdlib on Python 2 and 3
    from xml.sax.saxutils import escape

    # escape() makes &, < and > safe for XML; page URLs that already went
    # through clean_url() contain none of them and are left unchanged
    sme = "<url>\n   <loc>\n      %s\n  </loc>\n   <lastmod>%s</lastmod>" % (
        escape(check_url_protocol(p_url)), TODAYS_DATE)
    sme += "\n   <changefreq>weekly</changefreq>\n   <priority>0.5</priority>"
    if i_url:
        sme += "\n   <image:image>\n      <image:loc>%s</image:loc>\n   </image:image>" % escape(
            check_url_protocol(i_url))
    sme += "\n</url>"

    # indent the whole element by three spaces
    return "   " + sme.replace("\n", "\n   ")


def get_sitemap_for_store_data(data, page_type):
    """Build the sitemap <url> elements for data returned by the Ecwid v1 API.

    data      -- iterable of items decoded from the API response; each
                 item is looked up by the keys 'url' and 'thumbnailUrl'.
    page_type -- 'products' or 'categories'; only used in the error message.

    Returns the elements as one string, each followed by a newline. Items
    without a 'url' key are reported on stderr and skipped.
    """
    entries = []
    for item in data:
        if 'url' not in item:
            eprint("ERROR no 'url' key in one of the %s items. ignoring it" % page_type)
            continue
        # a missing, null or empty thumbnail means "no image element"
        image_url = item.get('thumbnailUrl') or ''
        entries.append(make_sitemap_entry(clean_url(item['url']), image_url) + "\n")
    return ''.join(entries)


def eprint(*args, **kwargs):
    """Print to stderr; takes the same arguments as print() except ``file``."""
    err_stream = sys.stderr
    print(*args, file=err_stream, **kwargs)


# Collect the <url> elements for every store category, then every product.
sitemap_store = ''
for page_type in ('categories', 'products'):
    sitemap_store += get_sitemap_for_store_data(get_store_data(page_type), page_type)

# Merge them into the site's existing sitemap.
sitemap_text = get_sitemap(sitemap_store)

# On Python 2 we must encode as utf8 to avoid possible UnicodeEncodeError
if sys.version_info[0] < 3:
    sitemap_text = sitemap_text.encode('utf-8')

print(sitemap_text)

# warn if we didn't find the store page
if not have_inserted_sitemap_store:
    for warning in (
            "ERROR! the sitemap got from '%s'\ndid not contain any store page urls like '%s*'" % (SITEMAP_SOURCE_URL, STORE_PAGE_URL),
            "So no store pages have been inserted into the sitemap.",
            "Check that the store_page setting is correct."):
        eprint(warning)
	# create a sitemap for an Ecwid shop site.
	#
	# This script will create product and category page URLs in one of three forms to suit your Ecwid setup:
	# default - the original, default URLs e.g. http://shop.com/store#!/product_name/p/12345
	# clean - clean Ecwid URLs e.g. http://shop.com/store/product_name-p12345
	# querystring - querystring URLs e.g. http://shop.com/store?store-page=product_name-p12345
	#
	# Steps:
	# 1. read the current sitemap.xml which has possibly been generated by the site, or manually
	# 2. read the Ecwid shop product and category details using the Ecwid v1 API and create sitemap url entries
	# 3. merge in the Ecwid product and category url entries into the sitemap
	# 4. write the new sitemap to STDOUT
	#
	# Usage:
	# Set the 5 constants below, then run the script e.g.
	#
	# python update_sitemap_ecwid.py > sitemap.xml
	#
	# Notes:
	#
	# The existing sitemap must exist at the URL and contain at least one url entry for the shop page
	#
	# All url entries for page URLs that start with STORE_PAGE_URL will be removed
	# and replaced with URL entries for the STORE_PAGE_URL and all the store products and categories
	#
	# This script should be compatible with Python 2 and 3
	#
	# gzipped sitemap files are not supported
	#
	from __future__ import print_function


	# Set the following 5 constants for your site and Ecwid store.
	STORE_ID = '1003' # Ecwid store id; interpolated into the Ecwid v1 API base URL (ECWID_APIV1)
	URL_TYPE = 'clean' # 'querystring' or 'clean'; any other value (e.g. 'default') leaves Ecwid's original #!/ URLs untouched - set this to the type of Ecwid URLs you are using on the site
	STORE_PAGE_URL = 'https://www.example.com/shop' # the URL of your store page. Sitemap entries whose URL starts with this (under http or https) are assumed to be in the Ecwid store
	SITEMAP_SOURCE_URL = 'https://www.example.com/sitemap.xml' # URL of the site's existing sitemap.xml file
	FORCE_HTTPS = False # True to force any urls for Ecwid pages to be https://... in the sitemap. False to leave the protocol as Ecwid provided

	import json
	import requests
	import re
	import time
	import sys


	# Date written into every generated <lastmod> element (YYYY-MM-DD).
	TODAYS_DATE = time.strftime("%Y-%m-%d")
	# Base URL of the Ecwid v1 API for this store; the page type is appended to it.
	ECWID_APIV1 = 'https://app.ecwid.com/api/v1/%s/' % STORE_ID
	# Flipped to True by replace_sitemap_url() once the store entries have been emitted.
	have_inserted_sitemap_store = False

	# Build the store page URL under the other protocol (http <-> https) so that
	# sitemap entries using either scheme are recognised as store pages.
	if STORE_PAGE_URL.startswith('http:'):
		STORE_PAGE_URL_ALT = 'https:' + STORE_PAGE_URL[len('http:'):]
	else:
		# anything else is treated as starting with 'https:'
		STORE_PAGE_URL_ALT = 'http:' + STORE_PAGE_URL[len('https:'):]


	def get_sitemap(sitemap_store):
		"""Return the site's sitemap with the Ecwid store entries merged in.

		Downloads SITEMAP_SOURCE_URL, makes sure the ``image`` XML namespace is
		declared, then passes every ``<url>`` element through
		replace_sitemap_url(), which swaps the store page entries for
		*sitemap_store*.

		sitemap_store -- text of the <url> elements for the store's categories
		                 and products.

		Logs to stderr and exits with status 1 if the sitemap request does not
		return HTTP 200.
		"""
		response = requests.get(SITEMAP_SOURCE_URL)
		if response.status_code != 200:
			eprint("ERROR reading existing sitemap. Code: %s, response: %s" % (response.status_code, response.text))
			# sys.exit rather than exit(): the latter is only injected by the site module
			sys.exit(1)

		sitemap = response.text

		# make sure we have the image namespace defined
		# xmlns:image="http://www.google.com/schemas/sitemap-image/1.1"
		if 'xmlns:image=' not in sitemap:
			sitemap = sitemap.replace(
				'xmlns=', 'xmlns:image="http://www.google.com/schemas/sitemap-image/1.1" xmlns=', 1)

		# Rewrite each <url> element. The groups must be non-greedy "any text"
		# matches (.*?) so that a whole element is captured; count/flags are
		# passed by keyword because passing them positionally is deprecated.
		return re.sub(r'<url>(.*?)<loc>(.*?)</loc>(.*?)</url>\s*',
			lambda match: replace_sitemap_url(match, sitemap_store),
			sitemap, count=0, flags=re.DOTALL)

	def replace_sitemap_url(match, sitemap_store):
		"""Replacement callback for the <url> regex used in get_sitemap().

		match         -- regex match whose groups are (text before <loc>,
		                 loc text, text after </loc>).
		sitemap_store -- text of the store's <url> elements.

		A non-store entry is returned rebuilt, with its loc text stripped.
		The first store entry found is replaced by an entry for STORE_PAGE_URL
		followed by *sitemap_store*; any later store entry is dropped (empty
		string). Sets the global have_inserted_sitemap_store when the store
		entries are emitted.
		"""
		global have_inserted_sitemap_store

		page_url = match.group(2).strip()

		if not is_store_page_url(page_url):
			# an ordinary site page: keep it
			replacement = '<url>%s<loc>%s</loc>%s</url>' % (match.group(1), page_url, match.group(3))
		elif have_inserted_sitemap_store:
			# store entries were already written once; skip this one
			return ''
		else:
			# store page entry first, then all product and category entries
			replacement = "\n" + make_sitemap_entry(STORE_PAGE_URL) + "\n"
			replacement += sitemap_store.rstrip()
			have_inserted_sitemap_store = True

		# newline plus three spaces so the next element lines up
		return replacement + "\n   "


	def is_store_page_url(url):
		"""Return True if *url* is the store page or a page under it.

		The check is a plain prefix match against STORE_PAGE_URL and its
		other-protocol twin STORE_PAGE_URL_ALT, so both the http and https
		forms of the URL are recognised.
		"""
		return url.startswith((STORE_PAGE_URL, STORE_PAGE_URL_ALT))


	def get_store_data(page_type):
		"""Fetch store data from the Ecwid v1 API.

		page_type -- API resource name appended to ECWID_APIV1; the script
		             calls this with 'categories' and 'products'.

		Returns the decoded JSON body (presumably a list of dicts - the caller
		iterates it and looks up 'url' in each item). Logs to stderr and exits
		with status 1 if the request does not return HTTP 200.
		"""
		response = requests.get(ECWID_APIV1 + page_type)
		if response.status_code != 200:
			# page type, status code and body each go into their own labelled slot
			eprint("ERROR getting Ecwid %s data. Code: %s, response: %s"
				% (page_type, response.status_code, response.text))
			sys.exit(1)
		return json.loads(response.text)


	def clean_url(url, url_type=None):
		"""Convert a standard Ecwid ``#!/name/p/123`` URL to the configured form.

		url      -- page URL as supplied by Ecwid.
		url_type -- 'querystring', 'clean', or anything else to leave the URL
		            form unchanged. Defaults to the module constant URL_TYPE.

		Returns the converted URL with ``&``, ``>`` and ``<`` percent-encoded
		so that it can be placed in the sitemap XML.
		"""
		if url_type is None:
			url_type = URL_TYPE

		# (.*) must match any run of text and (c|p) is an alternation between
		# category and product pages
		if url_type == 'querystring':
			url = re.sub(r'^(.*)#!/(.*)/(c|p)/(\d+)$', r'\1?store-page=\2-\3\4', url)
		elif url_type == 'clean':
			url = re.sub(r'^(.*)#!/(.*)/(c|p)/(\d+)$', r'\1/\2-\3\4', url)

		# characters that are special in XML are changed to their hex escapes
		return url.replace('&', '%26').replace('>', '%3E').replace('<', '%3C')


	def check_url_protocol(url, force_https=None):
		"""Return *url*, upgraded from http: to https: when forcing is enabled.

		url         -- URL to check.
		force_https -- True to rewrite a leading 'http:' as 'https:'. Defaults
		               to the module constant FORCE_HTTPS.
		"""
		if force_https is None:
			force_https = FORCE_HTTPS
		# plain prefix test: no regex (and no positional re.sub count) needed
		if force_https and url.startswith('http:'):
			url = 'https:' + url[len('http:'):]
		return url


	def make_sitemap_entry(p_url, i_url=''):
		"""Make a sitemap <url> element for a page URL and optional image URL.

		p_url -- page URL, written to <loc>.
		i_url -- image URL, written to <image:image>; the image element is
		         omitted when this is empty or None.

		Both URLs pass through check_url_protocol() and are XML-escaped.
		<lastmod> is TODAYS_DATE; changefreq and priority are fixed. The
		element has the form (indentation omitted here):

		<url>
		<loc>
		https://www.pumpshoppro.com/shop?store-page=Pressure-Test-Pump-c18494815
		</loc>
		<lastmod>2017-09-20</lastmod>
		<changefreq>weekly</changefreq>
		<priority>0.5</priority>
		<image:image>
		<image:loc>https://dqzrr9k4bjpzk.cloudfront.net/images/9113287/380217779.jpg</image:loc>
		</image:image>
		</url>
		"""
		# local import keeps the block self-contained; stdlib on Python 2 and 3
		from xml.sax.saxutils import escape

		# escape() makes &, < and > safe for XML; page URLs that already went
		# through clean_url() contain none of them and are left unchanged
		sme = "<url>\n   <loc>\n      %s\n  </loc>\n   <lastmod>%s</lastmod>" % (
			escape(check_url_protocol(p_url)), TODAYS_DATE)
		sme += "\n   <changefreq>weekly</changefreq>\n   <priority>0.5</priority>"
		if i_url:
			sme += "\n   <image:image>\n      <image:loc>%s</image:loc>\n   </image:image>" % escape(
				check_url_protocol(i_url))
		sme += "\n</url>"

		# indent the whole element by three spaces
		return "   " + sme.replace("\n", "\n   ")


	def get_sitemap_for_store_data(data, page_type):
		"""Build the sitemap <url> elements for data returned by the Ecwid v1 API.

		data      -- iterable of items decoded from the API response; each
		             item is looked up by the keys 'url' and 'thumbnailUrl'.
		page_type -- 'products' or 'categories'; only used in the error message.

		Returns the elements as one string, each followed by a newline. Items
		without a 'url' key are reported on stderr and skipped.
		"""
		entries = []
		for item in data:
			if 'url' not in item:
				eprint("ERROR no 'url' key in one of the %s items. ignoring it" % page_type)
				continue
			# a missing, null or empty thumbnail means "no image element"
			image_url = item.get('thumbnailUrl') or ''
			entries.append(make_sitemap_entry(clean_url(item['url']), image_url) + "\n")
		return ''.join(entries)


	def eprint(*args, **kwargs):
		"""Print to stderr; takes the same arguments as print() except ``file``."""
		# *args / **kwargs forward every positional and keyword argument to print()
		print(*args, file=sys.stderr, **kwargs)


	# Collect the <url> elements for every store category, then every product.
	sitemap_store = ''

	for page_type in ['categories', 'products']:
		d = get_store_data(page_type)
		sitemap_store += get_sitemap_for_store_data(d, page_type)

	# Merge them into the site's existing sitemap.
	o = get_sitemap(sitemap_store)

	# On Python 2 we must encode as utf8 to avoid possible UnicodeEncodeError
	if sys.version_info[0] < 3:
		o = o.encode('utf-8')

	print(o)

	# warn if we didn't find the store page
	if not have_inserted_sitemap_store:
		eprint("ERROR! the sitemap got from '%s'\ndid not contain any store page urls like '%s*'" % (SITEMAP_SOURCE_URL, STORE_PAGE_URL))
		eprint("So no store pages have been inserted into the sitemap.")
		eprint("Check that the store_page setting is correct.")