ogrisel/.gitignore

## .gitignore
*.swp
*.pyc
*.png
data/*
build

## __init__.py
"""Neural nets-based utility to build low dimensional codes or/and sparse codes

- low dimensional projections are typically used for semantic mapping and
visualization of high dim vector data while preserving nearest neighboors

- sparse codes are typically used for scalable semantic indexing of high
dimensional vectors such as the term frequency vectors of documents

Author: olivier.grisel@ensta.org
License: MIT http://www.opensource.org/licenses/mit-license.php
"""


## README.rst

      
    Raw
  

              README.rst
            
          
    codemaker

Python utilities based on theano and pynnet and scikitis.learn to learn
vector encoders that map vector data to either:


dense codes in low dimensional space, useful for semantic mapping and
visualization
sparse codes in medium to high dimensional space, useful for semantic
indexing, denoising and compression


In both cases, care is taken to preserve nearest neighboors relationships.

Project status

This is experimental code. Nothinh is expected to work as advertised yet :)

Licensing

MIT: http://www.opensource.org/licenses/mit-license.php

Hacking

Download the source distrib of the afore mentionned dependencies, untar them in
the parent folder of codemaker, build `scikits.learn`_ in local mode with
python setup build_ext -i and setup the dev environment with:
$ . ./activate.sh

You should now be able to fire you favorite python shell and import
codemaker:
>>> import codemaker
>>> help(codemaker)

Run the tests with the nosetests command.

Examples

TODO


## activate.sh
#!/bin/sh
export PYTHONPATH="../theano:../pynnet:../scikit-learn:./src:$PYTHONPATH"

## evaluation.py
"""Functions to naively compute nearest neighbors"""

import numpy as np
# TODO: move the matplotlib dependent code somewhere else
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

def find_nearest(ref, vectors, n=5, max_dist=None, norm='l2', mean=True):
    """Find the n vector indices whose components match ref"""
    if norm == 'l2':
        distances = np.sqrt((vectors - ref) ** 2)
    elif norm == 'l1':
        distances = np.abs(vectors - ref)
    else:
        raise ValueError("unknown norm: " + norm)
    if mean:
        distances = distances.mean(axis=1)
    else:
        distances = distances.sum(axis=1)

    return [(i, distances[i]) for i in distances.argsort()[:n]
            if max_dist is None or distances[i] < max_dist]


def preview(filenames, rows=1):
    for i, f in enumerate(filenames):
        sp = plt.subplot(rows, len(filenames), i + 1)
        sp.imshow(np.flipud(mpimg.imread(f)))
    plt.show()


def preview_nearest(ref_idx, codes, filenames, n=5):
    if not isinstance(codes, list):
        codes = [codes]
    for i, code_set in enumerate(codes):
        nearest = find_nearest(code_set[ref_idx], code_set, n=n)
        for j, f in enumerate([filenames[k] for k, _ in nearest]):
            columns = len(nearest) + 1
            rows = len(codes)
            sp = plt.subplot(rows, columns, (i * columns) + j + 1)
            sp.imshow(np.flipud(mpimg.imread(f)))


## image.py
#!/usr/bin/env python
"""Script to build image classification dataset from commons.wikipedia.org

Author: olivier.grisel@ensta.org
License: MIT - http://www.opensource.org/licenses/mit-license.php
"""

import os
import sys
import cgi
import random
from gzip import GzipFile
from PIL import Image
from cStringIO import StringIO
from restkit import Resource
from restkit import ConnectionPool
from lxml import etree
import numpy as np
import leargist

CONNECTION_POOL = ConnectionPool(max_connections=4)

BASE_SERVER_URL = "http://commons.wikimedia.org"
BASE_CATEGORY_URL = "/wiki/Category:"

# the magic single command to do gray level & crop / extent transforms
CONVERT_GRAY_CMD = ("convert {input} -colorspace Gray"
                    " -resize {w}x{h}^ -gravity center"
                    " -extent {w}x{h} {output}")

CONVERT_COLOR_CMD = ("convert {input}"
                    " -resize {w}x{h}^ -gravity center"
                    " -extent {w}x{h} {output}")

COLLECTIONS = (
    ("portraits", [
        "Portrait_photographs_of_men",
        "Portrait_photographs_of_women",
    ]),
    ("landscapes", [
        "Landscapes",
        "Coasts",
        "Landscape_of_France",
        "Snowy_landscapes",
    ]),
    ("diagrams", [
        "Diagrams",
    ]),
    ("paintings", [
        "Impressionist_paintings",
        "Post-Impressionist_paintings",
        "Realist_paintings",
    ]),
    ("drawings", [
        "Sepia_drawings",
        "Pencil_drawings",
        "Charcoal_drawings",
        "Pastel_drawings",
    ]),
    ("buildings", [
        "Columns",
        "Building_construction",
        "Building_interiors",
    ]),
)

def microthumbs(filenames, sizes=[2, 3, 4]):
    """Aggregate pixel values of thumbnails of various micro sizes

    This can be used as a poors man replacement for GIST features for image
    clustering.
    """
    result = np.zeros((len(filenames), 3 * sum(s ** 2 for s in sizes)))
    for i, f in enumerate(filenames):
        vectors = []
        for s in sizes:
            img = Image.open(f).convert('RGB').resize((s, s))
            vectors.append(np.asarray(img, dtype=np.double).flatten() / 128)
        result[i][:] = np.concatenate(vectors)
    return result


#TODO: drop this an replace me by scipy.ndimage instead
def img_to_array(image, mode='L', w=32, h=32, dtype=np.float32):
    """Convert a PIL Image into a numpy array"""
    if image.mode != mode:
        image = image.convert(mode=mode)
    if image.size != (w, h):
        image = image.resize((w, h), Image.ANTIALIAS)

    data = np.array(image.getdata(), dtype)
    rescaled_data = data / 128. - 1
    return rescaled_data


def array_to_img(data, w=32, h=32):
    """Convert brain ouput to an Image with the same format as the input"""
    rescaled = ((data + 1.) * 128).clip(0, 255)
    image = Image.new('L', (w, h))
    image.putdata(list(rescaled.flat))
    return image


def urldecode(url):
    """Helper to convert query parameters as a dict"""
    if "?" in url:
        path, query = url.split("?", 1)
        return path, dict(cgi.parse_qsl(query))
    else:
        return url, {}


def find_image_urls(category, server_url=BASE_SERVER_URL,
                    category_prefix=BASE_CATEGORY_URL,
                    resource=None, pool=CONNECTION_POOL):
    """Scrap the mediawiki category pages to identify thumbs to download"""
    parser = etree.HTMLParser()
    if resource is None:
        resource = Resource(server_url, pool_instance=pool)
    collected_urls = []
    page = category_prefix + category
    parameters = {}
    while page is not None:
        response = resource.get(page, **parameters)
        tree = etree.parse(StringIO(response.body), parser)
        thumb_tags = tree.xpath(
            '//div[@class="thumb"]//img[contains(@src, "/thumb/")]')
        collected_urls.extend(tag.get('src') for tag in thumb_tags)
        links = tree.xpath('//a[contains(text(), "next 200")]')
        if links:
            page, parameters = urldecode(links[0].get('href'))
        else:
            page = None
    return collected_urls


def collect(collections, folder):
    """Collect categorized thumbnails from commons.mediawiki.org"""
    resource = Resource(BASE_SERVER_URL, pool_instance=CONNECTION_POOL)
    for category_name, subcategories in collections:
        category_folder = os.path.join(folder, category_name)
        if not os.path.exists(category_folder):
            os.makedirs(category_folder)
        for subcategory in subcategories:
            for url in find_image_urls(subcategory, resource=resource):
                _, filename = url.rsplit("/", 1)
                if len(filename) > 200:
                    print "skipping file: ", filename
                    continue
                filepath = os.path.join(category_folder, filename)
                if os.path.exists(filepath):
                    print "skipping", url
                    continue
                print "downloading", url
                with file(filepath, 'wb') as f:
                    payload = Resource(
                        url, pool_instance=CONNECTION_POOL).get().body
                    f.write(payload)


def preprocess(input_folder, output_folder, w=32, h=32, gray_levels=True):
    """Convert images to fixed dimensional squared graylevel jpegs"""
    for category in os.listdir(input_folder):
        category_input_folder = os.path.join(input_folder, category)
        category_output_folder = os.path.join(output_folder, category)
        if not os.path.isdir(category_input_folder):
            continue
        if not os.path.exists(category_output_folder):
            os.makedirs(category_output_folder)

        for filename in os.listdir(category_input_folder):
            input_path = os.path.join(category_input_folder, filename)
            output_path = os.path.join(
                category_output_folder, filename + '-preprocessed.jpeg')
            if os.path.exists(output_path):
                print "skipping", filename
                continue
            if gray_levels:
                cmd = CONVERT_GRAY_CMD.format(
                    input=input_path, output=output_path, w=w, h=h)
            else:
                cmd = CONVERT_COLOR_CMD.format(
                    input=input_path, output=output_path, w=w, h=h)

            print "processing", filename
            os.system(cmd)


def pack(input_folder, index_file, data_output_file, label_output_file=None,
         seed=42, dtype=np.float32, transform=None):
    """Pack picture files as numpy arrays with category folder as labels"""
    all_filenames = [(category, os.path.join(input_folder, category, filename))
                     for category in os.listdir(input_folder)
                     for filename in os.listdir(
                         os.path.join(input_folder, category))]
    random.seed(seed)
    random.shuffle(all_filenames)

    # count the number of picture by category and drop excedent so that all
    # categories have equal number of samples

    counts = dict()
    for c, _ in all_filenames:
        if c in counts:
            counts[c] += 1
        else:
            counts[c] = 1
    limit = min(counts.values())

    resampled_filenames = []
    counts = dict()
    for c, fn in all_filenames:
        if c in counts:
            counts[c] += 1
        else:
            counts[c] = 1
        if counts[c] <= limit:
            resampled_filenames.append(fn)

    with file(index_file, "wb") as f:
        f.write("\n".join(resampled_filenames))
        f.write('\n')

    reference = Image.open(resampled_filenames[0])

    w, h = reference.size

    dim = w * h
    if transform == 'gist':
        dim = 960 # hardcoded for now

    # TODO: add microthumb transform as baseline too

    data_array = np.zeros((len(resampled_filenames), dim), dtype=dtype)
    for i, filepath in enumerate(resampled_filenames):
        im = Image.open(filepath)
        if transform == 'gist':
            data_array[i,:] = leargist.color_gist(im)
        else:
            data_array[i,:] = img_to_array(im, w=w, h=h, dtype=dtype).flatten()

    if data_output_file.endswith('.gz'):
        np.save(GzipFile(data_output_file, 'wb'), data_array)
    else:
        np.save(file(data_output_file, 'wb'), data_array)

    # TODO: deal with label data at some point


if __name__ == "__main__":
    # TODO: use optparser to create a real CLI handler
    if sys.argv[1] == "collect":
        collect(COLLECTIONS, "collected-images")
    elif sys.argv[1] == "preprocess":
        preprocess("collected-images", "preprocessed-images")
    elif sys.argv[1] == "pack":
        pack("preprocessed-images", "image_filenames.txt", "images.npy.gz")
    elif sys.argv[1] == "crop-color":
        preprocess("collected-images", "cropped-images", h=200, w=200,
                   gray_levels=False)
    elif sys.argv[1] == "gist":
        pack("cropped-images", "gist_image_filenames.txt", "gist.npy.gz",
             transform='gist')


## setup.py
from distutils.core import setup, Extension
import sys, os

version = file('VERSION.txt').read().strip()


setup(name='codemaker',
      version=version,
      description="Embedding and sparse coding of dense and/or high dimensional data",
      long_description=file('README.rst').read(),
      classifiers=[], # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers
      keywords=('machine-learning artificial-intelligence scientific numerical'
                'artificial-neural-networks'),
      author='Olivier Grisel',
      author_email='olivier.grisel@ensta.org',
      url='http://github.com/ogrisel/codemaker',
      license='MIT',
      package_dir={'': 'src'},
      packages=['codemaker'])

## sparse.py
import numpy as np
from scikits.learn.glm.coordinate_descent import ElasticNet

def sparse_encode(D, data, alpha=1, tol=1e-3, callback=None):
    """Given dictionary D, finc sparse encoding of vectors in data

    Encoding is performed using coordinate descent of the elastic net
    regularized least squares.
    """
    D = np.asanyarray(D, dtype=np.double)
    data = np.asanyarray(data, dtype=np.double)
    data = np.atleast_2d(data)
    encoded = np.zeros((data.shape[0], D.shape[1]), dtype=np.double)

    for i, code in enumerate(data):
        # TODO: parallelize me with multiprocessing!
        encoded[i][:] = ElasticNet(alpha=alpha).fit(D, code, tol=tol).w
        if callback is not None:
            callback(i)
    return encoded


## test_sparsecode.py
from codemaker.sparse import sparse_encode
import numpy as np
from nose.tools import *

def test_random_sparsecode():

    n_samples = 100
    n_features = 50
    n_basis = 30

    np.random.seed(0)

    # sample data from centered unit variance guaussian process
    x = np.random.normal(
        loc=0.0, scale=1.0, size=n_samples * n_features).reshape(
            n_samples, n_features)

    # lets take the first samples as dictionary
    D = x[:n_basis].T

    # encode all samples according to D
    encoded = sparse_encode(D, x, alpha=10)
    assert_equal(encoded.shape, (n_samples, n_basis))

    # check that the first sample are encoded using a single component (trivial
    # sparse code)
    avg_density = (encoded[:n_basis] != 0).sum(axis=1).mean()
    assert_almost_equal(avg_density, 1.0, 0.01)

    for i in xrange(n_basis):
        for j in xrange(n_basis):
            if i == j:
                assert_almost_equal(encoded[i][j], 1.0, 0.05)
            else:
                assert_almost_equal(encoded[i][j], 0.0, 0.01)

    # check that the remaining samples could also be encoding with a sparse code
    avg_density = (encoded[n_basis:] != 0).sum(axis=1).mean()
    assert_almost_equal(avg_density, 3.6, 0.1)


## utils.py
"""This file contains different utility functions that are not connected
in anyway to the networks presented in the tutorials, but rather help in
processing the outputs into a more understandable way.

For example ``tile_raster_images`` helps in generating a easy to grasp
image from a set of samples or weights.

Credit: excerpt from the http://deeplearning.net/tutorial
"""


import numpy


def scale_to_unit_interval(ndar,eps=1e-8):
    """ Scales all values in the ndarray ndar to be between 0 and 1 """
    ndar = ndar.copy()
    ndar -= ndar.min()
    ndar *= 1.0 / (ndar.max()+eps)
    return ndar


def tile_raster_images(X, img_shape, tile_shape,tile_spacing = (0,0),
              scale_rows_to_unit_interval = True, output_pixel_vals = True):
    """
    Transform an array with one flattened image per row, into an array in
    which images are reshaped and layed out like tiles on a floor.

    This function is useful for visualizing datasets whose rows are images,
    and also columns of matrices for transforming those rows
    (such as the first layer of a neural net).

    :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
    be 2-D ndarrays or None;
    :param X: a 2-D array in which every row is a flattened image.

    :type img_shape: tuple; (height, width)
    :param img_shape: the original shape of each image

    :type tile_shape: tuple; (rows, cols)
    :param tile_shape: the number of images to tile (rows, cols)

    :param output_pixel_vals: if output should be pixel values (i.e. int8
    values) or floats

    :param scale_rows_to_unit_interval: if the values need to be scaled before
    being plotted to [0,1] or not


    :returns: array suitable for viewing as an image.
    (See:`PIL.Image.fromarray`.)
    :rtype: a 2-d array with same dtype as X.

    """

    assert len(img_shape) == 2
    assert len(tile_shape) == 2
    assert len(tile_spacing) == 2

    # The expression below can be re-written in a more C style as
    # follows :
    #
    # out_shape    = [0,0]
    # out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] -
    #                tile_spacing[0]
    # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] -
    #                tile_spacing[1]
    out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp
                        in zip(img_shape, tile_shape, tile_spacing)]

    if isinstance(X, tuple):
        assert len(X) == 4
        # Create an output numpy ndarray to store the image
        if output_pixel_vals:
            out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype='uint8')
        else:
            out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype=X.dtype)

        #colors default to 0, alpha defaults to 1 (opaque)
        if output_pixel_vals:
            channel_defaults = [0,0,0,255]
        else:
            channel_defaults = [0.,0.,0.,1.]

        for i in xrange(4):
            if X[i] is None:
                # if channel is None, fill it with zeros of the correct
                # dtype
                out_array[:,:,i] = numpy.zeros(out_shape,
                        dtype='uint8' if output_pixel_vals else out_array.dtype
                        )+channel_defaults[i]
            else:
                # use a recurrent call to compute the channel and store it
                # in the output
                out_array[:,:,i] = tile_raster_images(X[i], img_shape, tile_shape, tile_spacing, scale_rows_to_unit_interval, output_pixel_vals)
        return out_array

    else:
        # if we are dealing with only one channel
        H, W = img_shape
        Hs, Ws = tile_spacing

        # generate a matrix to store the output
        out_array = numpy.zeros(out_shape, dtype='uint8' if output_pixel_vals else X.dtype)


        for tile_row in xrange(tile_shape[0]):
            for tile_col in xrange(tile_shape[1]):
                if tile_row * tile_shape[1] + tile_col < X.shape[0]:
                    if scale_rows_to_unit_interval:
                        # if we should scale values to be between 0 and 1
                        # do this by calling the `scale_to_unit_interval`
                        # function
                        this_img = scale_to_unit_interval(X[tile_row * tile_shape[1] + tile_col].reshape(img_shape))
                    else:
                        this_img = X[tile_row * tile_shape[1] + tile_col].reshape(img_shape)
                    # add the slice to the corresponding position in the
                    # output array
                    out_array[
                        tile_row * (H+Hs):tile_row*(H+Hs)+H,
                        tile_col * (W+Ws):tile_col*(W+Ws)+W
                        ] \
                        = this_img * (255 if output_pixel_vals else 1)
        return out_array


## VERSION.txt
0.1.0-git
	"""Neural nets-based utility to build low dimensional codes or/and sparse codes

	- low dimensional projections are typically used for semantic mapping and
	visualization of high dim vector data while preserving nearest neighboors

	- sparse codes are typically used for scalable semantic indexing of high
	dimensional vectors such as the term frequency vectors of documents

	Author: olivier.grisel@ensta.org
	License: MIT http://www.opensource.org/licenses/mit-license.php
	"""
	#!/bin/sh
	export PYTHONPATH="../theano:../pynnet:../scikit-learn:./src:$PYTHONPATH"
	"""Functions to naively compute nearest neighbors"""

	import numpy as np
	# TODO: move the matplotlib dependent code somewhere else
	import matplotlib.pyplot as plt
	import matplotlib.image as mpimg

	def find_nearest(ref, vectors, n=5, max_dist=None, norm='l2', mean=True):
	"""Find the n vector indices whose components match ref"""
	if norm == 'l2':
	distances = np.sqrt((vectors - ref) ** 2)
	elif norm == 'l1':
	distances = np.abs(vectors - ref)
	else:
	raise ValueError("unknown norm: " + norm)
	if mean:
	distances = distances.mean(axis=1)
	else:
	distances = distances.sum(axis=1)

	return [(i, distances[i]) for i in distances.argsort()[:n]
	if max_dist is None or distances[i] < max_dist]


	def preview(filenames, rows=1):
	for i, f in enumerate(filenames):
	sp = plt.subplot(rows, len(filenames), i + 1)
	sp.imshow(np.flipud(mpimg.imread(f)))
	plt.show()


	def preview_nearest(ref_idx, codes, filenames, n=5):
	if not isinstance(codes, list):
	codes = [codes]
	for i, code_set in enumerate(codes):
	nearest = find_nearest(code_set[ref_idx], code_set, n=n)
	for j, f in enumerate([filenames[k] for k, _ in nearest]):
	columns = len(nearest) + 1
	rows = len(codes)
	sp = plt.subplot(rows, columns, (i * columns) + j + 1)
	sp.imshow(np.flipud(mpimg.imread(f)))
	#!/usr/bin/env python
	"""Script to build image classification dataset from commons.wikipedia.org

	Author: olivier.grisel@ensta.org
	License: MIT - http://www.opensource.org/licenses/mit-license.php
	"""

	import os
	import sys
	import cgi
	import random
	from gzip import GzipFile
	from PIL import Image
	from cStringIO import StringIO
	from restkit import Resource
	from restkit import ConnectionPool
	from lxml import etree
	import numpy as np
	import leargist

	CONNECTION_POOL = ConnectionPool(max_connections=4)

	BASE_SERVER_URL = "http://commons.wikimedia.org"
	BASE_CATEGORY_URL = "/wiki/Category:"

	# the magic single command to do gray level & crop / extent transforms
	CONVERT_GRAY_CMD = ("convert {input} -colorspace Gray"
	" -resize {w}x{h}^ -gravity center"
	" -extent {w}x{h} {output}")

	CONVERT_COLOR_CMD = ("convert {input}"
	" -resize {w}x{h}^ -gravity center"
	" -extent {w}x{h} {output}")

	COLLECTIONS = (
	("portraits", [
	"Portrait_photographs_of_men",
	"Portrait_photographs_of_women",
	]),
	("landscapes", [
	"Landscapes",
	"Coasts",
	"Landscape_of_France",
	"Snowy_landscapes",
	]),
	("diagrams", [
	"Diagrams",
	]),
	("paintings", [
	"Impressionist_paintings",
	"Post-Impressionist_paintings",
	"Realist_paintings",
	]),
	("drawings", [
	"Sepia_drawings",
	"Pencil_drawings",
	"Charcoal_drawings",
	"Pastel_drawings",
	]),
	("buildings", [
	"Columns",
	"Building_construction",
	"Building_interiors",
	]),
	)

	def microthumbs(filenames, sizes=[2, 3, 4]):
	"""Aggregate pixel values of thumbnails of various micro sizes

	This can be used as a poors man replacement for GIST features for image
	clustering.
	"""
	result = np.zeros((len(filenames), 3 * sum(s ** 2 for s in sizes)))
	for i, f in enumerate(filenames):
	vectors = []
	for s in sizes:
	img = Image.open(f).convert('RGB').resize((s, s))
	vectors.append(np.asarray(img, dtype=np.double).flatten() / 128)
	result[i][:] = np.concatenate(vectors)
	return result


	#TODO: drop this an replace me by scipy.ndimage instead
	def img_to_array(image, mode='L', w=32, h=32, dtype=np.float32):
	"""Convert a PIL Image into a numpy array"""
	if image.mode != mode:
	image = image.convert(mode=mode)
	if image.size != (w, h):
	image = image.resize((w, h), Image.ANTIALIAS)

	data = np.array(image.getdata(), dtype)
	rescaled_data = data / 128. - 1
	return rescaled_data


	def array_to_img(data, w=32, h=32):
	"""Convert brain ouput to an Image with the same format as the input"""
	rescaled = ((data + 1.) * 128).clip(0, 255)
	image = Image.new('L', (w, h))
	image.putdata(list(rescaled.flat))
	return image


	def urldecode(url):
	"""Helper to convert query parameters as a dict"""
	if "?" in url:
	path, query = url.split("?", 1)
	return path, dict(cgi.parse_qsl(query))
	else:
	return url, {}


	def find_image_urls(category, server_url=BASE_SERVER_URL,
	category_prefix=BASE_CATEGORY_URL,
	resource=None, pool=CONNECTION_POOL):
	"""Scrap the mediawiki category pages to identify thumbs to download"""
	parser = etree.HTMLParser()
	if resource is None:
	resource = Resource(server_url, pool_instance=pool)
	collected_urls = []
	page = category_prefix + category
	parameters = {}
	while page is not None:
	response = resource.get(page, **parameters)
	tree = etree.parse(StringIO(response.body), parser)
	thumb_tags = tree.xpath(
	'//div[@class="thumb"]//img[contains(@src, "/thumb/")]')
	collected_urls.extend(tag.get('src') for tag in thumb_tags)
	links = tree.xpath('//a[contains(text(), "next 200")]')
	if links:
	page, parameters = urldecode(links[0].get('href'))
	else:
	page = None
	return collected_urls


	def collect(collections, folder):
	"""Collect categorized thumbnails from commons.mediawiki.org"""
	resource = Resource(BASE_SERVER_URL, pool_instance=CONNECTION_POOL)
	for category_name, subcategories in collections:
	category_folder = os.path.join(folder, category_name)
	if not os.path.exists(category_folder):
	os.makedirs(category_folder)
	for subcategory in subcategories:
	for url in find_image_urls(subcategory, resource=resource):
	_, filename = url.rsplit("/", 1)
	if len(filename) > 200:
	print "skipping file: ", filename
	continue
	filepath = os.path.join(category_folder, filename)
	if os.path.exists(filepath):
	print "skipping", url
	continue
	print "downloading", url
	with file(filepath, 'wb') as f:
	payload = Resource(
	url, pool_instance=CONNECTION_POOL).get().body
	f.write(payload)


	def preprocess(input_folder, output_folder, w=32, h=32, gray_levels=True):
	"""Convert images to fixed dimensional squared graylevel jpegs"""
	for category in os.listdir(input_folder):
	category_input_folder = os.path.join(input_folder, category)
	category_output_folder = os.path.join(output_folder, category)
	if not os.path.isdir(category_input_folder):
	continue
	if not os.path.exists(category_output_folder):
	os.makedirs(category_output_folder)

	for filename in os.listdir(category_input_folder):
	input_path = os.path.join(category_input_folder, filename)
	output_path = os.path.join(
	category_output_folder, filename + '-preprocessed.jpeg')
	if os.path.exists(output_path):
	print "skipping", filename
	continue
	if gray_levels:
	cmd = CONVERT_GRAY_CMD.format(
	input=input_path, output=output_path, w=w, h=h)
	else:
	cmd = CONVERT_COLOR_CMD.format(
	input=input_path, output=output_path, w=w, h=h)

	print "processing", filename
	os.system(cmd)


	def pack(input_folder, index_file, data_output_file, label_output_file=None,
	seed=42, dtype=np.float32, transform=None):
	"""Pack picture files as numpy arrays with category folder as labels"""
	all_filenames = [(category, os.path.join(input_folder, category, filename))
	for category in os.listdir(input_folder)
	for filename in os.listdir(
	os.path.join(input_folder, category))]
	random.seed(seed)
	random.shuffle(all_filenames)

	# count the number of picture by category and drop excedent so that all
	# categories have equal number of samples

	counts = dict()
	for c, _ in all_filenames:
	if c in counts:
	counts[c] += 1
	else:
	counts[c] = 1
	limit = min(counts.values())

	resampled_filenames = []
	counts = dict()
	for c, fn in all_filenames:
	if c in counts:
	counts[c] += 1
	else:
	counts[c] = 1
	if counts[c] <= limit:
	resampled_filenames.append(fn)

	with file(index_file, "wb") as f:
	f.write("\n".join(resampled_filenames))
	f.write('\n')

	reference = Image.open(resampled_filenames[0])

	w, h = reference.size

	dim = w * h
	if transform == 'gist':
	dim = 960 # hardcoded for now

	# TODO: add microthumb transform as baseline too

	data_array = np.zeros((len(resampled_filenames), dim), dtype=dtype)
	for i, filepath in enumerate(resampled_filenames):
	im = Image.open(filepath)
	if transform == 'gist':
	data_array[i,:] = leargist.color_gist(im)
	else:
	data_array[i,:] = img_to_array(im, w=w, h=h, dtype=dtype).flatten()

	if data_output_file.endswith('.gz'):
	np.save(GzipFile(data_output_file, 'wb'), data_array)
	else:
	np.save(file(data_output_file, 'wb'), data_array)

	# TODO: deal with label data at some point


	if __name__ == "__main__":
	# TODO: use optparser to create a real CLI handler
	if sys.argv[1] == "collect":
	collect(COLLECTIONS, "collected-images")
	elif sys.argv[1] == "preprocess":
	preprocess("collected-images", "preprocessed-images")
	elif sys.argv[1] == "pack":
	pack("preprocessed-images", "image_filenames.txt", "images.npy.gz")
	elif sys.argv[1] == "crop-color":
	preprocess("collected-images", "cropped-images", h=200, w=200,
	gray_levels=False)
	elif sys.argv[1] == "gist":
	pack("cropped-images", "gist_image_filenames.txt", "gist.npy.gz",
	transform='gist')
	from distutils.core import setup, Extension
	import sys, os

	version = file('VERSION.txt').read().strip()


	setup(name='codemaker',
	version=version,
	description="Embedding and sparse coding of dense and/or high dimensional data",
	long_description=file('README.rst').read(),
	classifiers=[], # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers
	keywords=('machine-learning artificial-intelligence scientific numerical'
	'artificial-neural-networks'),
	author='Olivier Grisel',
	author_email='olivier.grisel@ensta.org',
	url='http://github.com/ogrisel/codemaker',
	license='MIT',
	package_dir={'': 'src'},
	packages=['codemaker'])
	import numpy as np
	from scikits.learn.glm.coordinate_descent import ElasticNet

	def sparse_encode(D, data, alpha=1, tol=1e-3, callback=None):
	"""Given dictionary D, finc sparse encoding of vectors in data

	Encoding is performed using coordinate descent of the elastic net
	regularized least squares.
	"""
	D = np.asanyarray(D, dtype=np.double)
	data = np.asanyarray(data, dtype=np.double)
	data = np.atleast_2d(data)
	encoded = np.zeros((data.shape[0], D.shape[1]), dtype=np.double)

	for i, code in enumerate(data):
	# TODO: parallelize me with multiprocessing!
	encoded[i][:] = ElasticNet(alpha=alpha).fit(D, code, tol=tol).w
	if callback is not None:
	callback(i)
	return encoded
	from codemaker.sparse import sparse_encode
	import numpy as np
	from nose.tools import *

	def test_random_sparsecode():

	n_samples = 100
	n_features = 50
	n_basis = 30

	np.random.seed(0)

	# sample data from centered unit variance guaussian process
	x = np.random.normal(
	loc=0.0, scale=1.0, size=n_samples * n_features).reshape(
	n_samples, n_features)

	# lets take the first samples as dictionary
	D = x[:n_basis].T

	# encode all samples according to D
	encoded = sparse_encode(D, x, alpha=10)
	assert_equal(encoded.shape, (n_samples, n_basis))

	# check that the first sample are encoded using a single component (trivial
	# sparse code)
	avg_density = (encoded[:n_basis] != 0).sum(axis=1).mean()
	assert_almost_equal(avg_density, 1.0, 0.01)

	for i in xrange(n_basis):
	for j in xrange(n_basis):
	if i == j:
	assert_almost_equal(encoded[i][j], 1.0, 0.05)
	else:
	assert_almost_equal(encoded[i][j], 0.0, 0.01)

	# check that the remaining samples could also be encoding with a sparse code
	avg_density = (encoded[n_basis:] != 0).sum(axis=1).mean()
	assert_almost_equal(avg_density, 3.6, 0.1)
	"""This file contains different utility functions that are not connected
	in anyway to the networks presented in the tutorials, but rather help in
	processing the outputs into a more understandable way.

	For example ``tile_raster_images`` helps in generating a easy to grasp
	image from a set of samples or weights.

	Credit: excerpt from the http://deeplearning.net/tutorial
	"""


	import numpy


	def scale_to_unit_interval(ndar,eps=1e-8):
	""" Scales all values in the ndarray ndar to be between 0 and 1 """
	ndar = ndar.copy()
	ndar -= ndar.min()
	ndar *= 1.0 / (ndar.max()+eps)
	return ndar


	def tile_raster_images(X, img_shape, tile_shape,tile_spacing = (0,0),
	scale_rows_to_unit_interval = True, output_pixel_vals = True):
	"""
	Transform an array with one flattened image per row, into an array in
	which images are reshaped and layed out like tiles on a floor.

	This function is useful for visualizing datasets whose rows are images,
	and also columns of matrices for transforming those rows
	(such as the first layer of a neural net).

	:type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
	be 2-D ndarrays or None;
	:param X: a 2-D array in which every row is a flattened image.

	:type img_shape: tuple; (height, width)
	:param img_shape: the original shape of each image

	:type tile_shape: tuple; (rows, cols)
	:param tile_shape: the number of images to tile (rows, cols)

	:param output_pixel_vals: if output should be pixel values (i.e. int8
	values) or floats

	:param scale_rows_to_unit_interval: if the values need to be scaled before
	being plotted to [0,1] or not


	:returns: array suitable for viewing as an image.
	(See:`PIL.Image.fromarray`.)
	:rtype: a 2-d array with same dtype as X.

	"""

	assert len(img_shape) == 2
	assert len(tile_shape) == 2
	assert len(tile_spacing) == 2

	# The expression below can be re-written in a more C style as
	# follows :
	#
	# out_shape = [0,0]
	# out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] -
	# tile_spacing[0]
	# out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] -
	# tile_spacing[1]
	out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp
	in zip(img_shape, tile_shape, tile_spacing)]

	if isinstance(X, tuple):
	assert len(X) == 4
	# Create an output numpy ndarray to store the image
	if output_pixel_vals:
	out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype='uint8')
	else:
	out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype=X.dtype)

	#colors default to 0, alpha defaults to 1 (opaque)
	if output_pixel_vals:
	channel_defaults = [0,0,0,255]
	else:
	channel_defaults = [0.,0.,0.,1.]

	for i in xrange(4):
	if X[i] is None:
	# if channel is None, fill it with zeros of the correct
	# dtype
	out_array[:,:,i] = numpy.zeros(out_shape,
	dtype='uint8' if output_pixel_vals else out_array.dtype
	)+channel_defaults[i]
	else:
	# use a recurrent call to compute the channel and store it
	# in the output
	out_array[:,:,i] = tile_raster_images(X[i], img_shape, tile_shape, tile_spacing, scale_rows_to_unit_interval, output_pixel_vals)
	return out_array

	else:
	# if we are dealing with only one channel
	H, W = img_shape
	Hs, Ws = tile_spacing

	# generate a matrix to store the output
	out_array = numpy.zeros(out_shape, dtype='uint8' if output_pixel_vals else X.dtype)


	for tile_row in xrange(tile_shape[0]):
	for tile_col in xrange(tile_shape[1]):
	if tile_row * tile_shape[1] + tile_col < X.shape[0]:
	if scale_rows_to_unit_interval:
	# if we should scale values to be between 0 and 1
	# do this by calling the `scale_to_unit_interval`
	# function
	this_img = scale_to_unit_interval(X[tile_row * tile_shape[1] + tile_col].reshape(img_shape))
	else:
	this_img = X[tile_row * tile_shape[1] + tile_col].reshape(img_shape)
	# add the slice to the corresponding position in the
	# output array
	out_array[
	tile_row * (H+Hs):tile_row*(H+Hs)+H,
	tile_col * (W+Ws):tile_col*(W+Ws)+W
	] \
	= this_img * (255 if output_pixel_vals else 1)
	return out_array