Skip to content

Instantly share code, notes, and snippets.

@stuarteberg
Last active December 21, 2015 10:49
Show Gist options
  • Save stuarteberg/6294491 to your computer and use it in GitHub Desktop.
Save stuarteberg/6294491 to your computer and use it in GitHub Desktop.
Benchmarking alternate implementations for relabeling a label image with a mapping specified as a dict.
import numpy
# Module-level inputs read by every relabeling function below.
# Both start as None and are assigned further down the script (a label
# array and a dict) before any of the functions is called.
original_labels = None
mapping = None
# TODO: Ensure that every label in original_labels is a key of the mapping
# dict, because none of these functions work otherwise.
def using_index_array():
    """Relabel ``original_labels`` through a lookup table indexed by key rank.

    Reads the module-level ``mapping`` dict and ``original_labels`` array.
    Each label is first replaced by its position among the sorted mapping
    keys (via ``numpy.searchsorted``); that position then indexes an array
    holding the mapping values in the same key order.

    Every label in ``original_labels`` must be a key of ``mapping``.  This
    is not checked: ``searchsorted`` yields an insertion position for a
    missing label, so such a label silently picks up a neighbouring key's
    value (or triggers an IndexError) instead of raising KeyError.

    Returns:
        An array with the shape of ``original_labels`` holding the mapped
        values.
    """
    # Sort the keys once and reuse that order for both the search and the
    # value table, instead of sorting the keys and the items separately.
    sorted_keys = sorted(mapping)
    consecutivized_labels = numpy.searchsorted(sorted_keys, original_labels)
    index_array = numpy.array([mapping[key] for key in sorted_keys])
    return index_array[consecutivized_labels]
def using_frompyfunc():
    """Relabel ``original_labels`` with a ufunc built from the dict lookup.

    Reads the module-level ``mapping`` and ``original_labels``.  The dict's
    bound ``__getitem__`` is handed to ``numpy.frompyfunc`` directly, so no
    extra Python-level wrapper is called per element.

    Returns:
        The result of applying that ufunc to ``original_labels``.
        NOTE(review): numpy documents frompyfunc ufuncs as returning
        object-dtype arrays -- confirm if the result dtype matters.
    """
    lookup = mapping.__getitem__
    # One input, one output per call.
    relabel = numpy.frompyfunc(lookup, 1, 1)
    return relabel(original_labels)
def using_frompyfunc_with_lambda():
    """Relabel ``original_labels`` with a ufunc wrapping a lambda lookup.

    Reads the module-level ``mapping`` and ``original_labels``.  The lookup
    deliberately goes through a lambda so the benchmark can show the cost
    of that extra Python function call per element.

    Returns:
        The result of applying the ufunc to ``original_labels``.
    """
    # One input, one output per call.
    relabel = numpy.frompyfunc(lambda label: mapping[label], 1, 1)
    relabeled = relabel(original_labels)
    return relabeled
def using_vectorize():
    """Relabel ``original_labels`` using ``numpy.vectorize`` on the dict lookup.

    Reads the module-level ``mapping`` and ``original_labels``.  The dict's
    bound ``__getitem__`` is vectorized directly, without a wrapper
    function.

    Returns:
        The result of applying the vectorized lookup to ``original_labels``.
    """
    lookup = mapping.__getitem__
    relabel = numpy.vectorize(lookup)
    return relabel(original_labels)
def using_vectorize_with_lambda():
    """Relabel ``original_labels`` using ``numpy.vectorize`` on a lambda lookup.

    Reads the module-level ``mapping`` and ``original_labels``.  The lookup
    deliberately goes through a lambda so the benchmark can show the cost
    of that extra Python function call per element.

    Returns:
        The result of applying the vectorized lambda to ``original_labels``.
    """
    relabel = numpy.vectorize(lambda label: mapping[label])
    relabeled = relabel(original_labels)
    return relabeled
def using_plain_forloop():
    """Relabel ``original_labels`` one element at a time in a Python loop.

    Reads the module-level ``mapping`` and ``original_labels``.  Serves as
    the naive baseline for the vectorized implementations.

    Returns:
        A new array with the same shape and dtype as ``original_labels``
        whose elements are ``mapping[label]`` for each original label.

    Raises:
        KeyError: if a label is not a key of ``mapping``.
    """
    # This turns out to be ridiculously slow.
    result = numpy.empty(original_labels.shape, dtype=original_labels.dtype)
    result_flat = result.flat
    # enumerate() instead of an xrange index loop: one flat read per element,
    # and it runs unchanged on both Python 2 and Python 3.
    for i, label in enumerate(original_labels.flat):
        result_flat[i] = mapping[label]
    return result
# Sanity check on a small image: each implementation must reproduce the
# known-good relabeling before anything is timed.  One assert per
# implementation, so a failing line identifies the culprit.
original_labels = (numpy.random.random((100, 100)) * 100).astype(numpy.uint32)
mapping = {label: label + 99 for label in range(100)}
expected = original_labels + 99
assert (using_index_array() == expected).all()
assert (using_vectorize_with_lambda() == expected).all()
assert (using_frompyfunc_with_lambda() == expected).all()
assert (using_frompyfunc() == expected).all()
assert (using_vectorize() == expected).all()
assert (using_plain_forloop() == expected).all()
import timeit

# Benchmark driver.  Guarded so that merely importing this module does not
# allocate 10000x10000 images and start a multi-minute timing run.
if __name__ == "__main__":
    _all_implementations = (
        using_vectorize_with_lambda,
        using_frompyfunc_with_lambda,
        using_vectorize,
        using_frompyfunc,
        using_index_array,
        # using_plain_forloop is intentionally left out: it is far too slow
        # to time on an image of this size.
    )
    # Only these two are timed against the very large mappings.
    _large_map_implementations = (using_index_array, using_frompyfunc)

    # Each entry: (heading, number of distinct labels in the image,
    #              number of entries in the mapping, implementations to time)
    _benchmarks = (
        ("With 100 labels:\n", 100, 100, _all_implementations),
        ("With 10000 labels:\n", 10000, 10000, _all_implementations),
        ("With 30,000 labels using 1M entry map:\n",
         30000, 1000000, _large_map_implementations),
        ("With 30,000 labels using 10M entry map:\n",
         30000, 10000000, _large_map_implementations),
    )

    for _title, _label_count, _mapping_size, _implementations in _benchmarks:
        # The implementations read these two module-level names.
        original_labels = (
            _label_count * numpy.random.random((10000, 10000))
        ).astype(numpy.uint32)
        mapping = {k: k + 99 for k in range(_mapping_size)}

        # Single-argument print(...) emits the same text on Python 2 and 3.
        print(_title)
        for _implementation in _implementations:
            # Passing the callable avoids the "from __main__ import" setup.
            _seconds = timeit.timeit(_implementation, number=1)
            print("%s %s" % (_implementation.__name__, _seconds))
        # Blank separator after every section.
        print("")
@stuarteberg
Copy link
Author

Example results on my MacBook Pro:

$ python relabel_benchmarks.py
With 100 labels:

using_vectorize_with_lambda 30.7045938969
using_frompyfunc_with_lambda 22.6237640381
using_vectorize 20.1000521183
using_frompyfunc 13.0213320255
using_index_array 6.54346990585

With 10000 labels:

using_vectorize_with_lambda 33.634442091
using_frompyfunc_with_lambda 24.5189290047
using_vectorize 21.9436271191
using_frompyfunc 14.8590140343
using_index_array 11.5993220806

Observations:

  • The index array technique is the fastest option, despite needing to loop over the whole image twice.
  • The index array technique really shines when there are relatively few labels in the map. With many more labels, it slows down somewhat relative to the other techniques, probably due to extra time spent in numpy.searchsorted().
  • The index array technique becomes prohibitively slow if the mapping is very large (results not shown), whereas the other techniques are not sensitive to the mapping size.
  • The frompyfunc/vectorize technique is significantly sped up if the extra function call of the lambda can be avoided by using dict.__getitem__ directly.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment