Last active
September 23, 2017 11:25
-
-
Save ian-holden/5ca5460b69efd11b04f0067b4074f9c3 to your computer and use it in GitHub Desktop.
Update an existing site sitemap.xml with entries for all Ecwid products and categories in a store
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# create a sitemap for an Ecwid shop site. | |
# | |
# This script will create product and category page URLs in one of three forms to suit your Ecwid setup: | |
# default - the original Ecwid URLs, left unchanged (used for any URL_TYPE other than 'clean' or 'querystring') e.g. http://shop.com/store#!/product_name/p/12345
# clean - clean Ecwid URLs e.g. http://shop.com/store/product_name-p12345 | |
# querystring - querystring URLs e.g. http://shop.com/store?store-page=product_name-p12345 | |
# | |
# Steps: | |
# 1. read the current sitemap.xml which has possibly been generated by the site, or manually | |
# 2. read the Ecwid shop product and category details using the Ecwid v1 API and create sitemap url entries | |
# 3. merge in the Ecwid product and category url entries into the sitemap | |
# 4. write the new sitemap to STDOUT | |
# | |
# Usage: | |
# Set the 5 constants below, then run the script e.g. | |
# | |
# python update_sitemap_ecwid.py > sitemap.xml | |
# | |
# Notes: | |
# | |
# A sitemap must already exist at SITEMAP_SOURCE_URL and contain at least one url entry for the store page (or a page under it)
# | |
# All url entries for page URLs that start with STORE_PAGE_URL will be removed | |
# and replaced with URL entries for the STORE_PAGE_URL and all the store products and categories | |
# | |
# This script should be compatible with Python 2 and 3 | |
# | |
# gzipped sitemap files are not supported | |
# | |
from __future__ import print_function

# Set the following 5 constants for your site and Ecwid store.

# Ecwid store ID; it is inserted into the Ecwid v1 API base URL.
STORE_ID = '1003'
# Type of Ecwid URLs used on the site: 'querystring' or 'clean'.
# Any other value (e.g. 'normal') leaves the URLs exactly as Ecwid provided them.
URL_TYPE = 'clean'
# The URL of your store page. Sitemap pages whose URL starts with this are
# assumed to be in the Ecwid store.
STORE_PAGE_URL = 'https://www.example.com/shop'
# URL of the site's existing sitemap.xml file, which is downloaded and merged into.
SITEMAP_SOURCE_URL = 'https://www.example.com/sitemap.xml'
# True to force any urls for Ecwid pages to be https://... in the sitemap.
# False to leave the protocol as Ecwid provided.
FORCE_HTTPS = False
import json
import re
import sys
import time
from xml.sax.saxutils import escape

import requests
# Date stamp (YYYY-MM-DD) written into the <lastmod> of every generated entry
TODAYS_DATE = time.strftime("%Y-%m-%d")

# Base URL of the Ecwid v1 API for this store
ECWID_APIV1 = 'https://app.ecwid.com/api/v1/{0}/'.format(STORE_ID)

# Set to True by replace_sitemap_url() once the store entries have been inserted
have_inserted_sitemap_store = False

# The store page URL under the other protocol (http <-> https) from the one
# specified, so that store pages are recognised whichever protocol they use
STORE_PAGE_URL_ALT = (
    'https:' + STORE_PAGE_URL[5:]
    if STORE_PAGE_URL.startswith('http:')
    else 'http:' + STORE_PAGE_URL[6:]
)
def get_sitemap(sitemap_store):
    """Return the new sitemap referencing all the site pages and the Ecwid store pages.

    Downloads the existing sitemap from SITEMAP_SOURCE_URL, makes sure the
    image namespace is declared, then passes every <url> element through
    replace_sitemap_url() so that store page entries are replaced by
    sitemap_store and all other entries are kept.

    sitemap_store -- string holding the <url> elements for the store's
                     category and product pages
    Returns the merged sitemap XML as a string.
    Exits the script with status 1 if the download does not return HTTP 200.
    """
    response = requests.get(SITEMAP_SOURCE_URL)
    if response.status_code != 200:
        eprint("ERROR reading existing sitemap. Code: %s, response: %s" % (response.status_code, response.text))
        # sys.exit() rather than exit(): the latter is injected by the site
        # module for interactive use and is not guaranteed to be available
        sys.exit(1)

    sitemap = response.text

    # make sure we have the image namespace defined
    # xmlns:image="http://www.google.com/schemas/sitemap-image/1.1"
    if 'xmlns:image=' not in sitemap:
        # count is passed by keyword: positional count/flags are deprecated in re.sub
        sitemap = re.sub(r'xmlns=', 'xmlns:image="http://www.google.com/schemas/sitemap-image/1.1" xmlns=',
                         sitemap, count=1)

    # rewrite each <url> element; DOTALL lets an element span several lines
    return re.sub(r'<url>(.*?)<loc>(.*?)</loc>(.*?)</url>\s*',
                  lambda x: replace_sitemap_url(x, sitemap_store),
                  sitemap, count=0, flags=re.DOTALL)
def replace_sitemap_url(match, sitemap_store):
    """Replacement callback used by re.sub() in get_sitemap().

    match         -- regex match for one <url> element; group 2 is the <loc> text
    sitemap_store -- string holding the <url> elements for the store pages

    Returns the same element for a non-store page. For the first store page
    found it returns an entry for STORE_PAGE_URL followed by sitemap_store;
    any further store page elements are dropped (empty string returned).
    """
    global have_inserted_sitemap_store

    separator = "\n "
    page_url = match.group(2).strip()

    # not a store page: hand the element back (with the <loc> text stripped)
    if not is_store_page_url(page_url):
        element = '<url>%s<loc>%s</loc>%s</url>' % (match.group(1), page_url, match.group(3))
        return element + separator

    # the store entries are already in the output, so skip this element
    if have_inserted_sitemap_store:
        return ''

    # first store page found: emit the store page entry and then all the
    # product and category page entries
    store_page_entry = make_sitemap_entry(STORE_PAGE_URL)
    store_entries = sitemap_store.rstrip()
    have_inserted_sitemap_store = True  # only do this once
    return "\n" + store_page_entry + "\n" + store_entries + separator
def is_store_page_url(url):
    """Return True if url is the store page or some page under it.

    Both the http and https versions of the store page URL are checked
    (STORE_PAGE_URL and STORE_PAGE_URL_ALT); the test is a plain prefix match.
    """
    return url.startswith((STORE_PAGE_URL, STORE_PAGE_URL_ALT))
def get_store_data(page_type):
    """Get store data from Ecwid for 'products' or 'categories' using the v1 API.

    page_type -- resource name appended to ECWID_APIV1 to form the request URL
    Returns the decoded JSON response.
    Exits the script with status 1 if the API does not return HTTP 200.
    """
    response = requests.get(ECWID_APIV1 + page_type)
    if response.status_code != 200:
        eprint("ERROR getting Ecwid data. code: %s: %s, response: %s" % (page_type, response.status_code, response.text))
        # sys.exit() rather than exit(): the latter is injected by the site
        # module for interactive use and is not guaranteed to be available
        sys.exit(1)
    return json.loads(response.text)
def clean_url(url):
    """Convert a standard Ecwid URL into the form selected by URL_TYPE.

    'querystring' and 'clean' rewrite the '#!/name/(c|p)/id' tail of the URL;
    any other URL_TYPE leaves it alone. In every case '&', '>' and '<' are
    then changed to their hex (percent-encoded) form.
    """
    # replacement template for each URL_TYPE that needs the URL rewritten
    templates = {
        'querystring': r'\1?store-page=\2-\3\4',
        'clean': r'\1/\2-\3\4',
    }
    result = url
    if URL_TYPE in templates:
        result = re.sub(r'^(.*)#!/(.*)/(c|p)/(\d+)$', templates[URL_TYPE], result)

    # &, > and < need to be changed to hex
    for char, hex_code in (('&', '%26'), ('>', '%3E'), ('<', '%3C')):
        result = result.replace(char, hex_code)
    return result
def check_url_protocol(url):
    """Return url with its protocol forced to https when FORCE_HTTPS is set.

    Only a leading 'http:' is changed; every other URL, and every URL when
    FORCE_HTTPS is false, is returned unchanged.
    """
    # a plain prefix test replaces re.sub(..., url, 1), whose positional
    # count argument is deprecated
    if FORCE_HTTPS and url.startswith('http:'):
        return 'https:' + url[len('http:'):]
    return url
def make_sitemap_entry(p_url, i_url=''):
    """Make a sitemap url element for a page URL and optional image URL.

    p_url -- page URL written to <loc>
    i_url -- image URL written to <image:loc>; the image element is left out
             when this is empty or None
    Returns the <url> element as a string, every line indented by one space.

    Both URLs are passed through check_url_protocol() and then XML-escaped,
    so that characters such as '&' cannot produce an invalid sitemap.

    e.g.

    <url>
     <loc>
     https://www.pumpshoppro.com/shop?store-page=Pressure-Test-Pump-c18494815
     </loc>
     <lastmod>2017-09-20</lastmod>
     <changefreq>weekly</changefreq>
     <priority>0.5</priority>
     <image:image>
     <image:loc>https://dqzrr9k4bjpzk.cloudfront.net/images/9113287/380217779.jpg</image:loc>
     </image:image>
    </url>
    """
    lines = [
        "<url>",
        " <loc>",
        " %s" % escape(check_url_protocol(p_url)),
        " </loc>",
        " <lastmod>%s</lastmod>" % TODAYS_DATE,
        " <changefreq>weekly</changefreq>",
        " <priority>0.5</priority>",
    ]
    # truthiness test so that a missing thumbnail given as None is skipped too
    if i_url:
        lines.append(" <image:image>")
        lines.append(" <image:loc>%s</image:loc>" % escape(check_url_protocol(i_url)))
        lines.append(" </image:image>")
    lines.append("</url>")

    # indent the whole element by one space
    sme = "\n".join(lines).replace("\n", "\n ")
    return " " + sme
def get_sitemap_for_store_data(data, page_type):
    """Build the sitemap url elements for the items returned by the Ecwid v1 API.

    data      -- iterable of items, each expected to have a 'url' key and
                 optionally a 'thumbnailUrl' key
    page_type -- 'products' or 'categories'; only used in the error message
    Returns all the url elements as one string, each followed by a newline.
    Items without a 'url' key are reported on stderr and ignored.
    """
    entries = []
    for item in data:
        if 'url' not in item:
            eprint("ERROR no 'url' key in one of the %s items. ignoring it" % page_type)
            continue
        page_url = clean_url(item['url'])
        image_url = item['thumbnailUrl'] if 'thumbnailUrl' in item else ''
        entries.append(make_sitemap_entry(page_url, image_url) + "\n")
    return ''.join(entries)
def eprint(*args, **kwargs):
    """Print to stderr.

    Takes the same arguments as print(), except that ``file`` must not be
    passed because it is always set to sys.stderr here.
    """
    print(*args, file=sys.stderr, **kwargs)
# build the url elements for all the Ecwid category and product pages
sitemap_store = ''
for page_type in ['categories', 'products']:
    d = get_store_data(page_type)
    sitemap_store += get_sitemap_for_store_data(d, page_type)

# merge them into the site's existing sitemap
o = get_sitemap(sitemap_store)

# output in utf8
if sys.version_info[0] < 3:
    # On Python 2 we must encode as utf8 to avoid possible UnicodeEncodeError
    o = o.encode('utf-8')
    print(o)
elif getattr(sys.stdout, 'buffer', None) is not None:
    # On Python 3 print() would use the locale's encoding, which may not be
    # utf8 (and may not be able to encode every character), so write the
    # encoded bytes straight to the underlying byte stream
    sys.stdout.buffer.write((o + "\n").encode('utf-8'))
    sys.stdout.flush()
else:
    # stdout has been replaced by an object with no byte stream
    print(o)

# warn if we didn't find the store page
if not have_inserted_sitemap_store:
    eprint("ERROR! the sitemap got from '%s'\ndid not contain any store page urls like '%s*'" % (SITEMAP_SOURCE_URL, STORE_PAGE_URL))
    eprint("So no store pages have been inserted into the sitemap.")
    eprint("Check that the STORE_PAGE_URL setting is correct.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment