Skip to content

Instantly share code, notes, and snippets.

@iurisilvio
Last active August 29, 2015 14:15
Show Gist options
  • Save iurisilvio/217ecc660dbf10e4d075 to your computer and use it in GitHub Desktop.
Save iurisilvio/217ecc660dbf10e4d075 to your computer and use it in GitHub Desktop.

Usage: python parser.py --input /your/input/directory --output /your/output/directory --sleep 5 --step 500

Output example:

---
layout: 'single-product'
categories: 'SaksFifthAvenue-UK Kids Toys-and-Books'
merchantName: 'Saks Fifth Avenue - UK'
manufacturer_name: 'Janod'
sku_number: '0405148128850'
product_id: '110013244684007288214612306030'
name: 'Barbecue Trolley'
primary: 'Kids'
secondary: 'Toys and Books'
product: 'http://click.linksynergy.com/link?id=v3EaLjWOvJQ&offerid=268285.110013244684007288214612306030&type=15&murl=http%3A%2F%2Fwww.saksfifthavenue.com%2Fmain%2FProductDetail.jsp%3FFOLDER%253C%253Efolder_id%3D2534374306439561%26PRODUCT%253C%253Eprd_id%3D845524446623895'
productImage: 'http://image.s5a.com/is/image/saks/0405148128850_396x528.jpg'
short: 'Your budding chef will cook up a storm on this rolling barbecue trolley, complete with one magnetic spatula, one magnetic barbecue fork, one piece of pork, two sausages, one fish, three tomatoes and one piece of beef.;Wheeled bottom;12.8" X 12.8" X 17.3";Recommended for ages 18 months and up;Assembly required;Wood;Wipe clean;Imported'
long: 'Your budding chef will cook up a storm on this rolling barbecue trolley, complete with one magnetic spatula, one magnetic barbecue fork, one piece of pork, two sausages, one fish, three tomatoes and one piece of beef.;Wheeled bottom;12.8" X 12.8" X 17.3";Recommended for ages 18 months and up;Assembly required;Wood;Wipe clean;Imported'
currency: 'GBP'
type: 'amount'
sale: '65.91'
retail: '65.91'
brand: 'Janod'
information: '5 - 14 business days'
availability: 'in stock'
keywords: 'Janod'
pixel: 'http://ad.linksynergy.com/fs-bin/show?id=v3EaLjWOvJQ&bids=268285.110013244684007288214612306030&type=15&subid=0'
class_id: '60'
Misc: 'No'
Age: 'Adult'
---
from __future__ import unicode_literals

import io
import os
import re
import time
import unicodedata
from collections import OrderedDict
from datetime import date
from optparse import OptionParser
from xml.dom.pulldom import START_ELEMENT, parse
def slugify(value):
    """
    Reduce ``value`` to a plain-ASCII string usable in file names and
    category lists.

    The text is NFKD-normalized and anything that cannot be represented in
    ASCII is dropped (so accented letters lose their accents).  Every
    character other than ASCII letters, digits, underscores, whitespace and
    hyphens is then removed and surrounding whitespace is stripped.

    Case and inner whitespace are preserved: callers that want hyphens
    instead of spaces must replace them before calling this function.

    ``value`` must be a text (unicode) string; the result is a text string
    on both Python 2 and Python 3.
    """
    # Decode straight back to text so the regex operates on the same string
    # type on every Python version (no Python-2-only ``unicode`` builtin).
    ascii_text = (
        unicodedata.normalize('NFKD', value)
        .encode('ascii', 'ignore')
        .decode('ascii')
    )
    return re.sub(r'[^\w\s-]', '', ascii_text).strip()
def traverse(node, properties):
    """
    Flatten a DOM element subtree into the ``properties`` mapping.

    Every attribute found on ``node`` or any descendant element is stored
    under the attribute name, and every element that directly contains
    non-blank text is stored under its tag name with that text as value.
    ``properties`` is modified in place; nothing is returned.

    An element's own text is ignored when its tag name is already present
    in ``properties`` at the time the text is reached, so a value recorded
    earlier (for example an attribute with the same name) is not replaced.
    """
    if node.attributes is not None:
        for k, v in node.attributes.items():
            properties[k] = v

    text_types = (node.TEXT_NODE, node.CDATA_SECTION_NODE)
    data = []
    for child in node.childNodes:
        if child.nodeType in text_types:
            # Checked per child on purpose: a nested element processed in
            # between may already have claimed this tag name.
            if node.tagName not in properties:
                data.append(child.data)
        elif child.nodeType == child.ELEMENT_NODE:
            traverse(child, properties)
        # Comments and processing instructions carry ``data`` too, but they
        # are not part of the element's value, so they are skipped.

    text = "".join(data)
    # Indentation between child elements of a pretty-printed feed must not
    # become a value (it would overwrite values stored by nested elements).
    if text.strip():
        properties[node.tagName] = text
def parse_xml(filename):
    """
    Stream the products contained in an XML feed.

    The file is read incrementally with ``xml.dom.pulldom``.  The text of
    the most recent ``merchantName`` element is remembered and applied to
    every ``product`` element that follows it.

    Yields one tuple per ``product`` element:
    ``(merchant_name, product_name, categories, attributes)`` where

    * ``merchant_name`` is the merchant text ("" if none was seen yet),
    * ``product_name`` is the product's ``name`` property,
    * ``categories`` is a list of slugified category labels (merchant name
      without spaces, then the ``primary`` and ``secondary`` properties
      with spaces turned into hyphens; missing/empty parts are omitted),
    * ``attributes`` is a list of ``(key, value)`` pairs in document order.

    Raises ``KeyError`` if a product has no ``name`` property.
    """
    doc = parse(filename)
    merchant_name = ""
    for event, node in doc:
        if event != START_ELEMENT:
            continue

        if node.localName == "merchantName":
            doc.expandNode(node)
            # Join the raw character data of all text children: the parser
            # may split text into several nodes, and serializing with
            # toxml() would re-escape characters such as "&".
            merchant_name = "".join(
                child.data
                for child in node.childNodes
                if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE)
            )
        elif node.localName == "product":
            properties = OrderedDict()
            doc.expandNode(node)
            traverse(node, properties)
            if "manufacturer_name" not in properties:
                properties["manufacturer_name"] = merchant_name

            categories = []
            if merchant_name:
                categories.append(merchant_name.replace(" ", ""))
            for level in ("primary", "secondary"):
                label = properties.get(level)
                if label:
                    categories.append(label.replace(" ", "-"))
            categories = [slugify(c) for c in categories]

            # list() keeps the yielded value a plain list on Python 3 too.
            yield merchant_name, properties["name"], categories, list(properties.items())
def main(input_dir, output_dir, step=None, sleep=None, quiet=False):
    """
    Convert every ``*.xml`` feed in ``input_dir`` into one markdown file per
    product inside ``output_dir``.

    Each generated file is named ``<YYYY-MM-DD>-<slugified product name>.markdown``
    and contains a front-matter block (delimited by ``---`` lines) with the
    layout, categories, merchant name and all product properties.  A product
    whose output file already exists is skipped.

    Parameters:
        input_dir:  directory scanned (non-recursively) for ``.xml`` files.
        output_dir: destination directory; created if it does not exist.
        step:       after every ``step`` generated files a progress message
                    is logged (and the optional pause is taken).  Falsy
                    values disable both.
        sleep:      seconds to pause at every ``step`` boundary, if truthy.
        quiet:      suppress all progress output when true.
    """
    def _log(message):
        if not quiet:
            print(message)

    def _w(f, key, *values):
        # Single quotes are removed so a value can never terminate the
        # single-quoted scalar it is written into.
        cleaned = " ".join(v.replace("'", "") for v in values)
        f.write("%s: '%s'\n" % (key, cleaned))

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Computed once so every file produced by this run shares one prefix.
    date_prefix = date.today().strftime("%Y-%m-%d")

    # Sorted for a deterministic processing order: when two products map to
    # the same output name, the first one processed wins.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(".xml"):
            continue
        _log("Parsing %s..." % filename)
        n = 0
        for merchant, product, categories, attributes in parse_xml(os.path.join(input_dir, filename)):
            output_filename = os.path.join(
                output_dir,
                "%s-%s.markdown" % (date_prefix, slugify(product)))
            if os.path.exists(output_filename):
                # skip if file already exists
                continue
            n += 1
            # io.open gives a text-mode file with explicit UTF-8 encoding on
            # both Python 2 and Python 3.
            with io.open(output_filename, "w", encoding="utf-8") as f:
                f.write("---\n")
                _w(f, "layout", "single-product")
                _w(f, "categories", *categories)
                _w(f, "merchantName", merchant)
                for k, v in attributes:
                    _w(f, k, v)
                f.write("---\n")
            if step and n % step == 0:
                _log("%d products generated" % n)
                if sleep:
                    time.sleep(sleep)
        _log("OK")
# Command-line entry point: parse the options and run the conversion.
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("-i", "--input", dest="input",
                      help="directory containing the .xml feeds to convert")
    parser.add_option("-o", "--output", dest="output",
                      help="directory where the .markdown files are written")
    parser.add_option("--sleep", dest="sleep", type="int",
                      help="seconds to pause after every STEP generated files")
    parser.add_option("--step", dest="step", type="int", default=100,
                      help="report progress every STEP generated files (default: 100)")
    parser.add_option("-q", "--quiet", dest="quiet", action="store_true", default=False,
                      help="do not print progress messages")
    options, args = parser.parse_args()
    # Fail with a usage message instead of passing None into main().
    if not options.input or not options.output:
        parser.error("both --input and --output are required")
    main(options.input, options.output, options.step, options.sleep, options.quiet)
@iurisilvio
Copy link
Author

Fixed.

categories: 'SaksFifthAvenue-UK Kids Toys-and-Books'

@jcyin1
Copy link

jcyin1 commented Feb 22, 2015

I get

File "parser.py", line 67
f.write("'\n")
^
SyntaxError: invalid syntax

@iurisilvio
Copy link
Author

Oops, I pasted the wrong code. :) Fixed now.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment