Skip to content

Instantly share code, notes, and snippets.

@ogrisel
Created March 11, 2010 18:23
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ogrisel/329477 to your computer and use it in GitHub Desktop.
Save ogrisel/329477 to your computer and use it in GitHub Desktop.
Image fetching and clustering / semantic coding
*.swp
*.pyc
*.png
data/*
build
"""Neural nets-based utility to build low dimensional codes or/and sparse codes
- low dimensional projections are typically used for semantic mapping and
visualization of high dim vector data while preserving nearest neighboors
- sparse codes are typically used for scalable semantic indexing of high
dimensional vectors such as the term frequency vectors of documents
Author: olivier.grisel@ensta.org
License: MIT http://www.opensource.org/licenses/mit-license.php
"""

codemaker

Python utilities based on theano and pynnet and scikitis.learn to learn vector encoders that map vector data to either:

  • dense codes in low dimensional space, useful for semantic mapping and visualization
  • sparse codes in medium to high dimensional space, useful for semantic indexing, denoising and compression

In both cases, care is taken to preserve nearest neighboors relationships.

Project status

This is experimental code. Nothinh is expected to work as advertised yet :)

Licensing

MIT: http://www.opensource.org/licenses/mit-license.php

Hacking

Download the source distrib of the afore mentionned dependencies, untar them in the parent folder of codemaker, build scikits.learn in local mode with python setup build_ext -i and setup the dev environment with:

$ . ./activate.sh

You should now be able to fire you favorite python shell and import `codemaker`:

>>> import codemaker
>>> help(codemaker)

Run the tests with the nosetests command.

Examples

TODO

#!/bin/sh
export PYTHONPATH="../theano:../pynnet:../scikit-learn:./src:$PYTHONPATH"
"""Functions to naively compute nearest neighbors"""
import numpy as np
# TODO: move the matplotlib dependent code somewhere else
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
def find_nearest(ref, vectors, n=5, max_dist=None, norm='l2', mean=True):
"""Find the n vector indices whose components match ref"""
if norm == 'l2':
distances = np.sqrt((vectors - ref) ** 2)
elif norm == 'l1':
distances = np.abs(vectors - ref)
else:
raise ValueError("unknown norm: " + norm)
if mean:
distances = distances.mean(axis=1)
else:
distances = distances.sum(axis=1)
return [(i, distances[i]) for i in distances.argsort()[:n]
if max_dist is None or distances[i] < max_dist]
def preview(filenames, rows=1):
for i, f in enumerate(filenames):
sp = plt.subplot(rows, len(filenames), i + 1)
sp.imshow(np.flipud(mpimg.imread(f)))
plt.show()
def preview_nearest(ref_idx, codes, filenames, n=5):
if not isinstance(codes, list):
codes = [codes]
for i, code_set in enumerate(codes):
nearest = find_nearest(code_set[ref_idx], code_set, n=n)
for j, f in enumerate([filenames[k] for k, _ in nearest]):
columns = len(nearest) + 1
rows = len(codes)
sp = plt.subplot(rows, columns, (i * columns) + j + 1)
sp.imshow(np.flipud(mpimg.imread(f)))
#!/usr/bin/env python
"""Script to build image classification dataset from commons.wikipedia.org
Author: olivier.grisel@ensta.org
License: MIT - http://www.opensource.org/licenses/mit-license.php
"""
import os
import sys
import cgi
import random
from gzip import GzipFile
from PIL import Image
from cStringIO import StringIO
from restkit import Resource
from restkit import ConnectionPool
from lxml import etree
import numpy as np
import leargist
CONNECTION_POOL = ConnectionPool(max_connections=4)
BASE_SERVER_URL = "http://commons.wikimedia.org"
BASE_CATEGORY_URL = "/wiki/Category:"
# the magic single command to do gray level & crop / extent transforms
CONVERT_GRAY_CMD = ("convert {input} -colorspace Gray"
" -resize {w}x{h}^ -gravity center"
" -extent {w}x{h} {output}")
CONVERT_COLOR_CMD = ("convert {input}"
" -resize {w}x{h}^ -gravity center"
" -extent {w}x{h} {output}")
COLLECTIONS = (
("portraits", [
"Portrait_photographs_of_men",
"Portrait_photographs_of_women",
]),
("landscapes", [
"Landscapes",
"Coasts",
"Landscape_of_France",
"Snowy_landscapes",
]),
("diagrams", [
"Diagrams",
]),
("paintings", [
"Impressionist_paintings",
"Post-Impressionist_paintings",
"Realist_paintings",
]),
("drawings", [
"Sepia_drawings",
"Pencil_drawings",
"Charcoal_drawings",
"Pastel_drawings",
]),
("buildings", [
"Columns",
"Building_construction",
"Building_interiors",
]),
)
def microthumbs(filenames, sizes=[2, 3, 4]):
"""Aggregate pixel values of thumbnails of various micro sizes
This can be used as a poors man replacement for GIST features for image
clustering.
"""
result = np.zeros((len(filenames), 3 * sum(s ** 2 for s in sizes)))
for i, f in enumerate(filenames):
vectors = []
for s in sizes:
img = Image.open(f).convert('RGB').resize((s, s))
vectors.append(np.asarray(img, dtype=np.double).flatten() / 128)
result[i][:] = np.concatenate(vectors)
return result
#TODO: drop this an replace me by scipy.ndimage instead
def img_to_array(image, mode='L', w=32, h=32, dtype=np.float32):
"""Convert a PIL Image into a numpy array"""
if image.mode != mode:
image = image.convert(mode=mode)
if image.size != (w, h):
image = image.resize((w, h), Image.ANTIALIAS)
data = np.array(image.getdata(), dtype)
rescaled_data = data / 128. - 1
return rescaled_data
def array_to_img(data, w=32, h=32):
"""Convert brain ouput to an Image with the same format as the input"""
rescaled = ((data + 1.) * 128).clip(0, 255)
image = Image.new('L', (w, h))
image.putdata(list(rescaled.flat))
return image
def urldecode(url):
"""Helper to convert query parameters as a dict"""
if "?" in url:
path, query = url.split("?", 1)
return path, dict(cgi.parse_qsl(query))
else:
return url, {}
def find_image_urls(category, server_url=BASE_SERVER_URL,
category_prefix=BASE_CATEGORY_URL,
resource=None, pool=CONNECTION_POOL):
"""Scrap the mediawiki category pages to identify thumbs to download"""
parser = etree.HTMLParser()
if resource is None:
resource = Resource(server_url, pool_instance=pool)
collected_urls = []
page = category_prefix + category
parameters = {}
while page is not None:
response = resource.get(page, **parameters)
tree = etree.parse(StringIO(response.body), parser)
thumb_tags = tree.xpath(
'//div[@class="thumb"]//img[contains(@src, "/thumb/")]')
collected_urls.extend(tag.get('src') for tag in thumb_tags)
links = tree.xpath('//a[contains(text(), "next 200")]')
if links:
page, parameters = urldecode(links[0].get('href'))
else:
page = None
return collected_urls
def collect(collections, folder):
"""Collect categorized thumbnails from commons.mediawiki.org"""
resource = Resource(BASE_SERVER_URL, pool_instance=CONNECTION_POOL)
for category_name, subcategories in collections:
category_folder = os.path.join(folder, category_name)
if not os.path.exists(category_folder):
os.makedirs(category_folder)
for subcategory in subcategories:
for url in find_image_urls(subcategory, resource=resource):
_, filename = url.rsplit("/", 1)
if len(filename) > 200:
print "skipping file: ", filename
continue
filepath = os.path.join(category_folder, filename)
if os.path.exists(filepath):
print "skipping", url
continue
print "downloading", url
with file(filepath, 'wb') as f:
payload = Resource(
url, pool_instance=CONNECTION_POOL).get().body
f.write(payload)
def preprocess(input_folder, output_folder, w=32, h=32, gray_levels=True):
"""Convert images to fixed dimensional squared graylevel jpegs"""
for category in os.listdir(input_folder):
category_input_folder = os.path.join(input_folder, category)
category_output_folder = os.path.join(output_folder, category)
if not os.path.isdir(category_input_folder):
continue
if not os.path.exists(category_output_folder):
os.makedirs(category_output_folder)
for filename in os.listdir(category_input_folder):
input_path = os.path.join(category_input_folder, filename)
output_path = os.path.join(
category_output_folder, filename + '-preprocessed.jpeg')
if os.path.exists(output_path):
print "skipping", filename
continue
if gray_levels:
cmd = CONVERT_GRAY_CMD.format(
input=input_path, output=output_path, w=w, h=h)
else:
cmd = CONVERT_COLOR_CMD.format(
input=input_path, output=output_path, w=w, h=h)
print "processing", filename
os.system(cmd)
def pack(input_folder, index_file, data_output_file, label_output_file=None,
seed=42, dtype=np.float32, transform=None):
"""Pack picture files as numpy arrays with category folder as labels"""
all_filenames = [(category, os.path.join(input_folder, category, filename))
for category in os.listdir(input_folder)
for filename in os.listdir(
os.path.join(input_folder, category))]
random.seed(seed)
random.shuffle(all_filenames)
# count the number of picture by category and drop excedent so that all
# categories have equal number of samples
counts = dict()
for c, _ in all_filenames:
if c in counts:
counts[c] += 1
else:
counts[c] = 1
limit = min(counts.values())
resampled_filenames = []
counts = dict()
for c, fn in all_filenames:
if c in counts:
counts[c] += 1
else:
counts[c] = 1
if counts[c] <= limit:
resampled_filenames.append(fn)
with file(index_file, "wb") as f:
f.write("\n".join(resampled_filenames))
f.write('\n')
reference = Image.open(resampled_filenames[0])
w, h = reference.size
dim = w * h
if transform == 'gist':
dim = 960 # hardcoded for now
# TODO: add microthumb transform as baseline too
data_array = np.zeros((len(resampled_filenames), dim), dtype=dtype)
for i, filepath in enumerate(resampled_filenames):
im = Image.open(filepath)
if transform == 'gist':
data_array[i,:] = leargist.color_gist(im)
else:
data_array[i,:] = img_to_array(im, w=w, h=h, dtype=dtype).flatten()
if data_output_file.endswith('.gz'):
np.save(GzipFile(data_output_file, 'wb'), data_array)
else:
np.save(file(data_output_file, 'wb'), data_array)
# TODO: deal with label data at some point
if __name__ == "__main__":
# TODO: use optparser to create a real CLI handler
if sys.argv[1] == "collect":
collect(COLLECTIONS, "collected-images")
elif sys.argv[1] == "preprocess":
preprocess("collected-images", "preprocessed-images")
elif sys.argv[1] == "pack":
pack("preprocessed-images", "image_filenames.txt", "images.npy.gz")
elif sys.argv[1] == "crop-color":
preprocess("collected-images", "cropped-images", h=200, w=200,
gray_levels=False)
elif sys.argv[1] == "gist":
pack("cropped-images", "gist_image_filenames.txt", "gist.npy.gz",
transform='gist')
from distutils.core import setup, Extension
import sys, os
version = file('VERSION.txt').read().strip()
setup(name='codemaker',
version=version,
description="Embedding and sparse coding of dense and/or high dimensional data",
long_description=file('README.rst').read(),
classifiers=[], # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers
keywords=('machine-learning artificial-intelligence scientific numerical'
'artificial-neural-networks'),
author='Olivier Grisel',
author_email='olivier.grisel@ensta.org',
url='http://github.com/ogrisel/codemaker',
license='MIT',
package_dir={'': 'src'},
packages=['codemaker'])
import numpy as np
from scikits.learn.glm.coordinate_descent import ElasticNet
def sparse_encode(D, data, alpha=1, tol=1e-3, callback=None):
"""Given dictionary D, finc sparse encoding of vectors in data
Encoding is performed using coordinate descent of the elastic net
regularized least squares.
"""
D = np.asanyarray(D, dtype=np.double)
data = np.asanyarray(data, dtype=np.double)
data = np.atleast_2d(data)
encoded = np.zeros((data.shape[0], D.shape[1]), dtype=np.double)
for i, code in enumerate(data):
# TODO: parallelize me with multiprocessing!
encoded[i][:] = ElasticNet(alpha=alpha).fit(D, code, tol=tol).w
if callback is not None:
callback(i)
return encoded
from codemaker.sparse import sparse_encode
import numpy as np
from nose.tools import *
def test_random_sparsecode():
n_samples = 100
n_features = 50
n_basis = 30
np.random.seed(0)
# sample data from centered unit variance guaussian process
x = np.random.normal(
loc=0.0, scale=1.0, size=n_samples * n_features).reshape(
n_samples, n_features)
# lets take the first samples as dictionary
D = x[:n_basis].T
# encode all samples according to D
encoded = sparse_encode(D, x, alpha=10)
assert_equal(encoded.shape, (n_samples, n_basis))
# check that the first sample are encoded using a single component (trivial
# sparse code)
avg_density = (encoded[:n_basis] != 0).sum(axis=1).mean()
assert_almost_equal(avg_density, 1.0, 0.01)
for i in xrange(n_basis):
for j in xrange(n_basis):
if i == j:
assert_almost_equal(encoded[i][j], 1.0, 0.05)
else:
assert_almost_equal(encoded[i][j], 0.0, 0.01)
# check that the remaining samples could also be encoding with a sparse code
avg_density = (encoded[n_basis:] != 0).sum(axis=1).mean()
assert_almost_equal(avg_density, 3.6, 0.1)
"""This file contains different utility functions that are not connected
in anyway to the networks presented in the tutorials, but rather help in
processing the outputs into a more understandable way.
For example ``tile_raster_images`` helps in generating a easy to grasp
image from a set of samples or weights.
Credit: excerpt from the http://deeplearning.net/tutorial
"""
import numpy
def scale_to_unit_interval(ndar,eps=1e-8):
""" Scales all values in the ndarray ndar to be between 0 and 1 """
ndar = ndar.copy()
ndar -= ndar.min()
ndar *= 1.0 / (ndar.max()+eps)
return ndar
def tile_raster_images(X, img_shape, tile_shape,tile_spacing = (0,0),
scale_rows_to_unit_interval = True, output_pixel_vals = True):
"""
Transform an array with one flattened image per row, into an array in
which images are reshaped and layed out like tiles on a floor.
This function is useful for visualizing datasets whose rows are images,
and also columns of matrices for transforming those rows
(such as the first layer of a neural net).
:type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
be 2-D ndarrays or None;
:param X: a 2-D array in which every row is a flattened image.
:type img_shape: tuple; (height, width)
:param img_shape: the original shape of each image
:type tile_shape: tuple; (rows, cols)
:param tile_shape: the number of images to tile (rows, cols)
:param output_pixel_vals: if output should be pixel values (i.e. int8
values) or floats
:param scale_rows_to_unit_interval: if the values need to be scaled before
being plotted to [0,1] or not
:returns: array suitable for viewing as an image.
(See:`PIL.Image.fromarray`.)
:rtype: a 2-d array with same dtype as X.
"""
assert len(img_shape) == 2
assert len(tile_shape) == 2
assert len(tile_spacing) == 2
# The expression below can be re-written in a more C style as
# follows :
#
# out_shape = [0,0]
# out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] -
# tile_spacing[0]
# out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] -
# tile_spacing[1]
out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp
in zip(img_shape, tile_shape, tile_spacing)]
if isinstance(X, tuple):
assert len(X) == 4
# Create an output numpy ndarray to store the image
if output_pixel_vals:
out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype='uint8')
else:
out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype=X.dtype)
#colors default to 0, alpha defaults to 1 (opaque)
if output_pixel_vals:
channel_defaults = [0,0,0,255]
else:
channel_defaults = [0.,0.,0.,1.]
for i in xrange(4):
if X[i] is None:
# if channel is None, fill it with zeros of the correct
# dtype
out_array[:,:,i] = numpy.zeros(out_shape,
dtype='uint8' if output_pixel_vals else out_array.dtype
)+channel_defaults[i]
else:
# use a recurrent call to compute the channel and store it
# in the output
out_array[:,:,i] = tile_raster_images(X[i], img_shape, tile_shape, tile_spacing, scale_rows_to_unit_interval, output_pixel_vals)
return out_array
else:
# if we are dealing with only one channel
H, W = img_shape
Hs, Ws = tile_spacing
# generate a matrix to store the output
out_array = numpy.zeros(out_shape, dtype='uint8' if output_pixel_vals else X.dtype)
for tile_row in xrange(tile_shape[0]):
for tile_col in xrange(tile_shape[1]):
if tile_row * tile_shape[1] + tile_col < X.shape[0]:
if scale_rows_to_unit_interval:
# if we should scale values to be between 0 and 1
# do this by calling the `scale_to_unit_interval`
# function
this_img = scale_to_unit_interval(X[tile_row * tile_shape[1] + tile_col].reshape(img_shape))
else:
this_img = X[tile_row * tile_shape[1] + tile_col].reshape(img_shape)
# add the slice to the corresponding position in the
# output array
out_array[
tile_row * (H+Hs):tile_row*(H+Hs)+H,
tile_col * (W+Ws):tile_col*(W+Ws)+W
] \
= this_img * (255 if output_pixel_vals else 1)
return out_array
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment