Skip to content

Instantly share code, notes, and snippets.

@synapticarbors
Created March 5, 2014 15:34
Show Gist options
  • Save synapticarbors/9369567 to your computer and use it in GitHub Desktop.
Save synapticarbors/9369567 to your computer and use it in GitHub Desktop.
Small optimizations to example found at https://github.com/sauliusl/python-optimisation-example/
#cython: boundscheck=False
#cython: wraparound=False
import numpy as np # Import python numpy bindings
cimport numpy as np # Import cpython numpy bindings
DEF MATCH_SCORE = 1
DEF MISMATCH_SCORE = 0
DEF DELETION_SCORE = 0
DEF INSERTION_SCORE = 0
def align(left, right):
# Precompute lengths as ints
cdef unsigned int len_left, len_right
len_left = len(left)
len_right = len(right)
# Define matrix to be appropriate 2-dimensional array
#cdef np.ndarray[np.float_t, ndim=2] matrix
cdef np.float_t[:,::1] matrix
cdef object[:,::1] traceback
matrix = np.empty((len_left+1, len_right+1), np.float)
traceback = np.empty((len_left+1, len_right+1), np.object_)
# Initialise
matrix[0, 0] = 0
# Define iterators as ints so loops run in native C
cdef unsigned int i, j
for i in xrange(len_left):
# This should be using optimised lookups for numpy matrix as all indices ints
matrix[i+1, 0] = (i+1) * DELETION_SCORE
traceback[i+1, 0] = i, 0
for i in xrange(len_right):
matrix[0, i+1] = (i+1) * INSERTION_SCORE
traceback[0, i+1] = 0, i
# Define the variables as doubles
cdef double max_score, match_score, deletion, insertion
for i, symbol_left in enumerate(left):
for j, symbol_right in enumerate(right):
# Matrix lookups below should be optimised
match_score = matrix[i, j] + MATCH_SCORE if symbol_left == symbol_right else MISMATCH_SCORE
deletion = matrix[i, j+1] + DELETION_SCORE
insertion = matrix[i+1, j] + INSERTION_SCORE
if match_score >= deletion and match_score >= insertion:
max_score = match_score
traceback_pos = i, j
elif deletion > match_score and deletion >= insertion:
max_score = deletion
traceback_pos = i, j+1
else:
max_score = insertion
traceback_pos = i+1, j
matrix[i+1, j+1] = max_score
traceback[i+1, j+1] = traceback_pos
return matrix[len(left), len(right)], np.asarray(traceback)

I made a few optimizations to the cython code. The timings on my machine are:

from sequences import P53_HUMAN, P53_MOUSE
from alignment import align as py_align
from align_numpy import align as cy_align
from align_numpy2 import align as cy_align2
 
%timeit py_align(P53_HUMAN, P53_MOUSE)
1 loops, best of 3: 442 ms per loop
 
%timeit cy_align(P53_HUMAN, P53_MOUSE)
10 loops, best of 3: 178 ms per loop
 
# With optimizations in align_numpy2
%timeit cy_align2(P53_HUMAN, P53_MOUSE)
10 loops, best of 3: 22.6 ms per loop

I've validated that the results are the same as the original code. Most of the speedup was ditching the dict representation of traceback for a numpy array of objects.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment