Skip to content

Instantly share code, notes, and snippets.

@ian-holden
Last active September 23, 2017 11:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ian-holden/5ca5460b69efd11b04f0067b4074f9c3 to your computer and use it in GitHub Desktop.
Save ian-holden/5ca5460b69efd11b04f0067b4074f9c3 to your computer and use it in GitHub Desktop.
Update an existing site sitemap.xml with entries for all Ecwid products and categories in a store
# create a sitemap for an Ecwid shop site.
#
# This script will create product and category page URLs in one of three forms to suit your Ecwid setup:
# default - the original, default URLs e.g. http://shop.com/store#!/product_name/p/12345
# clean - clean Ecwid URLs e.g. http://shop.com/store/product_name-p12345
# querystring - querystring URLs e.g. http://shop.com/store?store-page=product_name-p12345
#
# Steps:
# 1. read the current sitemap.xml which has possibly been generated by the site, or manually
# 2. read the Ecwid shop product and category details using the Ecwid v1 API and create sitemap url entries
# 3. merge in the Ecwid product and category url entries into the sitemap
# 4. write the new sitemap to STDOUT
#
# Usage:
# Set the 5 constants below, then run the script e.g.
#
# python update_sitemap_ecwid.py > sitemap.xml
#
# Notes:
#
# The existing sitemap must exist at the URL and contain at least one url entry for the shop page
#
# All url entries for page URLs that start with STORE_PAGE_URL will be removed
# and replaced with URL entries for the STORE_PAGE_URL and all the store products and categories
#
# This script should be compatible with Python 2 and 3
#
# gzipped sitemap files are not supported
#
from __future__ import print_function
# Set the following 5 constants for your site and Ecwid store.
# Ecwid store ID; it is substituted into the Ecwid v1 API base URL (ECWID_APIV1).
STORE_ID = '1003'
# Form of the Ecwid page URLs used on the site: 'querystring' or 'clean'.
# Any other value leaves the URLs in the default '#!/' form returned by Ecwid.
URL_TYPE = 'clean'
# The URL of your store page. Sitemap entries whose URL starts with this are assumed to be in the Ecwid store.
STORE_PAGE_URL = 'https://www.example.com/shop'
# URL of the site's existing sitemap.xml file.
SITEMAP_SOURCE_URL = 'https://www.example.com/sitemap.xml'
# True to force any URLs for Ecwid pages to be https://... in the sitemap.
# False to leave the protocol as Ecwid provided it.
FORCE_HTTPS = False
import json
import requests
import re
import time
import sys
# Today's date as YYYY-MM-DD; written as the <lastmod> value of every generated store entry.
TODAYS_DATE = time.strftime("%Y-%m-%d")
# Base URL of the Ecwid v1 API for this store; 'products' or 'categories' is appended to it.
ECWID_APIV1 = 'https://app.ecwid.com/api/v1/%s/' % STORE_ID
# Set to True by replace_sitemap_url once the store entries have been inserted into the sitemap.
have_inserted_sitemap_store = False
# make an alternative STORE_PAGE_URL using the other protocol (http/https) from the one specified, so that we can check for it too
# Swap the scheme prefix: 'http:' is 5 characters long and 'https:' is 6,
# which is why the two branches slice at different offsets.
STORE_PAGE_URL_ALT = (
    'https:' + STORE_PAGE_URL[5:]
    if STORE_PAGE_URL.startswith('http:')
    else 'http:' + STORE_PAGE_URL[6:]
)
# get the new sitemap referencing all the site pages and the Ecwid store pages
def get_sitemap(sitemap_store):
    """Download the existing sitemap and return it with the store entries merged in.

    The sitemap is fetched from SITEMAP_SOURCE_URL. If it does not declare
    the image namespace, the declaration is added in front of the first
    ``xmlns=`` attribute. Every ``<url>`` element is then passed through
    replace_sitemap_url, which swaps the store page entries for
    ``sitemap_store``.

    sitemap_store -- the sitemap url elements for all the store pages
    Returns the new sitemap XML as a string.
    Exits the script with status 1 if the sitemap cannot be fetched.
    """
    try:
        # a timeout stops the script hanging forever on an unresponsive server
        response = requests.get(SITEMAP_SOURCE_URL, timeout=30)
    except requests.RequestException as err:
        eprint("ERROR reading existing sitemap: %s" % err)
        sys.exit(1)
    if response.status_code != 200:
        eprint("ERROR reading existing sitemap. Code: %s, response: %s" % (response.status_code, response.text))
        sys.exit(1)
    sitemap = response.text
    # make sure we have the image namespace defined
    # xmlns:image="http://www.google.com/schemas/sitemap-image/1.1"
    if 'xmlns:image=' not in sitemap:
        sitemap = sitemap.replace(
            'xmlns=', 'xmlns:image="http://www.google.com/schemas/sitemap-image/1.1" xmlns=', 1)
    # rewrite every url element; DOTALL lets an element span several lines
    return re.sub(
        r'<url>(.*?)<loc>(.*?)</loc>(.*?)</url>\s*',
        lambda match: replace_sitemap_url(match, sitemap_store),
        sitemap,
        flags=re.DOTALL
    )
# A callback used by re.sub (it is wrapped in a lambda by get_sitemap)
# takes a url element and returns the same element, or the store page url elements (only once)
# when we find the store url. Any further url elements for store pages are ignored
#
def replace_sitemap_url(match, sitemap_store):
    """Return the replacement text for one url element of the existing sitemap.

    match -- the regex match for the element: group 2 is the page URL and
             groups 1 and 3 are the text before and after its <loc> element
    sitemap_store -- the sitemap url elements for all the store pages

    An element that is not a store page is returned as it was, with its URL
    stripped of surrounding whitespace. The first store page element found
    is replaced by an entry for STORE_PAGE_URL followed by sitemap_store.
    Any later store page element is dropped by returning ''.
    """
    global have_inserted_sitemap_store
    page_url = match.group(2).strip()

    if not is_store_page_url(page_url):
        kept = '<url>%s<loc>%s</loc>%s</url>' % (match.group(1), page_url, match.group(3))
        return kept + "\n "

    if have_inserted_sitemap_store:
        # the store entries are already in the sitemap, so skip this one
        return ''

    store_block = "\n" + make_sitemap_entry(STORE_PAGE_URL) + "\n"  # the store page itself
    store_block += sitemap_store.rstrip()  # all the product and category pages
    have_inserted_sitemap_store = True  # only insert the store entries once
    return store_block + "\n "
# is this a store page url? check both the http and https protocol versions of the url specified
def is_store_page_url(url):
    """Return True if url is the store page or some page under it.

    The url is compared against the start of both STORE_PAGE_URL and
    STORE_PAGE_URL_ALT, so the http and https forms are both recognised.
    """
    return url.startswith((STORE_PAGE_URL, STORE_PAGE_URL_ALT))
# get store data from Ecwid for 'products' or 'categories' using the v1 API
def get_store_data(page_type):
    """Fetch and decode the store data for one page type from the Ecwid v1 API.

    page_type -- 'products' or 'categories'; appended to ECWID_APIV1
    Returns the decoded JSON response.
    Exits the script with status 1 if the request fails or the response
    status is not 200.
    """
    try:
        # a timeout stops the script hanging forever on an unresponsive server
        response = requests.get(ECWID_APIV1 + page_type, timeout=30)
    except requests.RequestException as err:
        eprint("ERROR getting Ecwid %s data: %s" % (page_type, err))
        sys.exit(1)
    if response.status_code != 200:
        # the page type goes in the description so that "code:" labels the status code
        eprint("ERROR getting Ecwid %s data. code: %s, response: %s" % (page_type, response.status_code, response.text))
        sys.exit(1)
    return json.loads(response.text)
# convert a standard Ecwid URL into a clean URL based on URL_TYPE
def clean_url(url, url_type=None):
    """Convert a standard Ecwid URL into the form selected by url_type.

    url -- an Ecwid page URL, e.g. http://shop.com/store#!/product_name/p/12345
    url_type -- 'querystring' or 'clean'; any other value leaves the URL form
                unchanged. Defaults to the module setting URL_TYPE.

    Returns the converted URL with any '&', '>' and '<' characters
    percent-encoded. A URL that does not match the '#!/name/(c|p)/id' pattern
    is only percent-encoded.
    """
    if url_type is None:
        url_type = URL_TYPE
    if url_type == 'querystring':
        url = re.sub(r'^(.*)#!/(.*)/(c|p)/(\d+)$', r'\1?store-page=\2-\3\4', url)
    elif url_type == 'clean':
        url = re.sub(r'^(.*)#!/(.*)/(c|p)/(\d+)$', r'\1/\2-\3\4', url)
    # these characters are percent-encoded: & -> %26, > -> %3E, < -> %3C
    return url.replace('&', '%26').replace('>', '%3E').replace('<', '%3C')
# check url protocol and force https if required
def check_url_protocol(url, force_https=None):
    """Return url with its protocol changed to https when that is required.

    url -- the URL to check
    force_https -- True to rewrite a leading 'http:' as 'https:'; False to
                   return url as it is. Defaults to the module setting
                   FORCE_HTTPS.
    """
    if force_https is None:
        force_https = FORCE_HTTPS
    if force_https and url.startswith('http:'):
        return 'https:' + url[len('http:'):]
    return url
# Make a sitemap url element for a page URL and optional image URL
#
# e.g.
#
# <url>
# <loc>
# https://www.pumpshoppro.com/shop?store-page=Pressure-Test-Pump-c18494815
# </loc>
# <lastmod>2017-09-20</lastmod>
# <changefreq>weekly</changefreq>
# <image:image>
# <image:loc>
# https://dqzrr9k4bjpzk.cloudfront.net/images/9113287/380217779.jpg
# </image:loc>
# </image:image>
# </url>
def make_sitemap_entry(p_url, i_url=''):
    """Make a sitemap url element for a page URL and an optional image URL.

    p_url -- the page URL, written into the <loc> element
    i_url -- the image URL; when empty or None no <image:image> element is added

    Both URLs go through check_url_protocol and are then XML-escaped, so a
    URL containing '&', '<' or '>' cannot produce an invalid sitemap.
    The entry gets TODAYS_DATE as <lastmod>, a weekly <changefreq> and a
    <priority> of 0.5. Returns the element as a string.
    """
    # imported here so that this function does not depend on the file's import block
    from xml.sax.saxutils import escape

    parts = [
        "<url>\n <loc>\n %s\n </loc>\n <lastmod>%s</lastmod>" % (escape(check_url_protocol(p_url)), TODAYS_DATE),
        "\n <changefreq>weekly</changefreq>\n <priority>0.5</priority>",
    ]
    if i_url:
        parts.append("\n <image:image>\n <image:loc>%s</image:loc>\n </image:image>" % escape(check_url_protocol(i_url)))
    parts.append("\n</url>")
    # shift every line after the first one along by one space
    return " " + "".join(parts).replace("\n", "\n ")
# get the sitemap url elements for 'products' or 'categories from the data returned by Ecwid v1 API
def get_sitemap_for_store_data(data, page_type):
    """Build the sitemap url elements for the items returned by the Ecwid v1 API.

    data -- the list of item dicts returned for 'products' or 'categories'
    page_type -- 'products' or 'categories'; only used in the error message

    An item with no 'url' key is reported on stderr and skipped. An item's
    'thumbnailUrl', when present and not empty or null, becomes the image URL
    of its entry.
    Returns the entries as one string, each followed by a newline.
    """
    entries = []
    for item in data:
        if 'url' not in item:
            eprint("ERROR no 'url' key in one of the %s items. ignoring it" % page_type)
            continue
        # a missing or null thumbnail means the entry gets no image element
        image_url = item.get('thumbnailUrl') or ''
        entries.append(make_sitemap_entry(clean_url(item['url']), image_url) + "\n")
    return ''.join(entries)
# print to stderr
def eprint(*args, **kwargs):
    """Print to stderr; takes the same arguments as print() apart from 'file'."""
    print(*args, file=sys.stderr, **kwargs)
# collect the sitemap entries for all the store categories, then all the products
store_entries = []
for page_type in ('categories', 'products'):
    store_entries.append(get_sitemap_for_store_data(get_store_data(page_type), page_type))
sitemap_store = ''.join(store_entries)

# merge the store entries into the site's existing sitemap
o = get_sitemap(sitemap_store)

# On Python 2 we must encode as utf8 to avoid possible UnicodeEncodeError
if sys.version_info < (3,):
    o = o.encode('utf-8')
print(o)

# warn if we didn't find the store page
if not have_inserted_sitemap_store:
    for message in (
        "ERROR! the sitemap got from '%s'\ndid not contain any store page urls like '%s*'" % (SITEMAP_SOURCE_URL, STORE_PAGE_URL),
        "So no store pages have been inserted into the sitemap.",
        "Check that the store_page setting is correct.",
    ):
        eprint(message)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment