Skip to content

Instantly share code, notes, and snippets.

@stephenmcd
Created April 22, 2010 23:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stephenmcd/375986 to your computer and use it in GitHub Desktop.
Save stephenmcd/375986 to your computer and use it in GitHub Desktop.
Concurrent product data generation from Google Base / Flickr
"""
Stand-alone data generation routine that uses the ecommerce taxonomy found on
Google Base to generate a significant amount of category and product data, as
well as using the Flickr API to retrieve images for the products. The
multiprocessing module is also used for parallelization.
The Django models and environment used here are specific to the Cartridge
project but the approach could easily be reused with any ecommerce database.
"""
from deploy.environment import setup
setup()
from multiprocessing import Process, Queue
from os.path import exists, join
from shutil import move
from sys import exit
from urllib import urlopen, urlretrieve
from django.conf import settings
from django.contrib.webdesign.lorem_ipsum import paragraph
from django.db import connection
from django.db.models import F
from shop.models import Category, Product
from shop.settings import PRODUCT_OPTIONS
try:
import flickr
except ImportError:
print "flickr.py must be installed from http://code.google.com/p/flickrpy/"
exit()
# Number of worker processes started by the main script to consume the queue.
WORKERS = 10
# Directory the product images are downloaded to ("product" under MEDIA_ROOT).
image_dir = join(settings.MEDIA_ROOT, "product")
# Work queue shared with the workers: the main script puts the split taxonomy
# lines on it, followed by one ``None`` sentinel per worker.
queue = Queue()
def create_products(queue):
    """
    Worker loop: take items off ``queue`` until a ``None`` sentinel is
    received. For each item, download an image from Flickr for the product
    and, if an image is available (downloaded now or on a previous run),
    create the applicable product records.

    Each queue item is a sequence whose first element is the main category
    title, whose second element is the sub category title and whose last
    element is the product title.
    """
    # Imported locally so this function does not depend on extra file-level
    # imports.
    from os import getpid, remove

    # Close the connection for this process to avoid the issue discussed here:
    # http://groups.google.com/group/django-users/browse_thread/thread/2c7421cdb9b99e48
    connection.close()
    while True:
        # Get next set of data from queue.
        data = queue.get()
        if data is None:
            break
        main_category, sub_category, title = data[0], data[1], data[-1]
        # Try and download a product image from Flickr.
        image = join(image_dir, "%s.jpg" % title)
        if exists(image):
            message = "Using already downloaded image for %s" % (data,)
        else:
            # Download to a per-process temporary file and only move it into
            # place once complete, so an interrupted download never leaves a
            # truncated file that would later be mistaken for a valid image.
            partial = "%s.%s.part" % (image, getpid())
            try:
                images = flickr.photos_search(tags=[title], per_page=1)
                if not images:
                    raise Exception("No images found")
                url = images[0].getURL(size="Large", urlType="source")
                urlretrieve(url, partial)
                move(partial, image)
            except Exception as e:
                message = "Error [%s] for %s" % (e, data)
                if exists(partial):
                    try:
                        remove(partial)
                    except OSError:
                        # Best effort cleanup only; the failure itself has
                        # already been reported in the message above.
                        pass
            else:
                message = "Successfully downloaded image for %s" % (data,)
        # Queue.qsize() is not implemented on every platform (e.g. Mac OS X),
        # and the progress figure is informational only, so don't let it
        # take the worker down.
        try:
            remaining = "%s remaining" % queue.qsize()
        except NotImplementedError:
            remaining = "? remaining"
        print("%s %s" % (remaining.ljust(20, "."), message))
        # Create database records for the product.
        if exists(image):
            category = Category.objects.get(parent__title=main_category,
                title=sub_category)
            product = category.products.create(title=title, available=True,
                active=True, description=paragraph())
            product.images.create(file="product/%s.jpg" % product.title)
            product.variations.create_from_options(PRODUCT_OPTIONS)
            product.variations.manage_empty()
            # Derive a unit price from each variation's id in two steps:
            # (id + 10000) and then / 1000.0.
            product.variations.update(unit_price=F("id") + "10000")
            product.variations.update(unit_price=F("unit_price") / "1000.0")
            product.copy_default_variation()
if __name__ == "__main__":
    # Imported locally; only the script entry point needs it.
    from os import makedirs

    # Load the Google Base data.
    category_url = "http://www.google.com/basepages/producttype/taxonomy.txt"
    try:
        category_data = urlopen(category_url).read()
    except Exception as e:
        print("Failed to load category data: %s" % e)
        # Non-zero status so callers can tell the run failed.
        exit(1)

    # Clear out the database, moving the product images to a temp location and
    # restoring them so that they're not deleted.
    print("Deleting categories")
    Category.objects.all().delete()
    # On a first run there is no image directory yet, so there is nothing to
    # back up and move() would fail.
    have_images = exists(image_dir)
    if have_images:
        print("Backing up images")
        move(image_dir, "tmp_products")
    try:
        print("Deleting products")
        Product.objects.all().delete()
    finally:
        # Always put the images back, even if deleting the products failed,
        # so they are never left stranded in the temp location.
        if have_images:
            print("Restoring images")
            move("tmp_products", image_dir)
    # The workers download straight into this directory, so make sure it
    # exists before they start.
    if not exists(image_dir):
        makedirs(image_dir)

    # Parse the category data into triples of main category, sub category and
    # product, create the categories and put the triples onto the queue. The
    # categories must be created here in a single process due to the non-atomic
    # nature of Django's Model.objects.get_or_create()
    print("Creating categories")
    for line in category_data.splitlines():
        # Strip so that stray whitespace or "\r" from "\r\n" line endings
        # doesn't end up in the last title on the line.
        parts = line.strip().split(" > ")
        if len(parts) < 3:
            continue
        if len(parts) == 3:
            main_category, created = Category.objects.get_or_create(
                title=parts[0], active=True)
            sub_category, created = main_category.children.get_or_create(
                title=parts[1], active=True)
        queue.put(parts)

    # Create worker processes and run the main function in them. One ``None``
    # sentinel is queued per worker so that each of them stops once the real
    # work items ahead of the sentinels have been consumed.
    workers = []
    for _ in range(WORKERS):
        queue.put(None)
        workers.append(Process(target=create_products, args=(queue,)))
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment