kylemcdonald/csa.c

## csa.c
#include "csa.h"
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>

#define sort_insert(best, size, a, a_prc, nsize) \
\
{\
unsigned si_i, si_j;\
\
if (size == 0)\
  best[0] = a;\
else\
  {\
  si_j = size;\
  for (si_i = 0; si_i < size; si_i++)\
    if (a_prc < best[si_i]->c - best[si_i]->head->p)\
      {\
      si_j = si_i;\
      for (si_i = nsize - 1; si_i > si_j; si_i--)\
 best[si_i] = best[si_i - 1];\
      break;\
      }\
  best[si_j] = a;\
  }\
}

/* ------------------------- Problem size variables -------------------- */
unsigned	n, m;

/* --------------- Data structures describing the problem -------------- */
lhs_ptr	head_lhs_node, tail_lhs_node;
rhs_ptr	head_rhs_node, tail_rhs_node;
lr_aptr	head_lr_arc, tail_lr_arc;

/* ------------------------- Tunable variables ------------------------- */
/*
 Cost threshhold for pricing out: used even when price-outs are
 switched off to make bounding reduced-cost differences easier in
 double_push(). In principle this is not necessary, but it makes the
 code better.
 */
double		po_cost_thresh;
double		scale_factor;	/* scaling factor */

/*
 Processing variables.
 */
double		epsilon;	/* scaling parameter */
double		min_epsilon;	/* snap to this value when epsilon small */
unsigned	total_e;	/* total excess */
ACTIVE_TYPE	active;		/* list of active nodes */

char *nomem_msg = "Insufficient memory.\n";

void st_reset(csa_stack s) {
    s->top = s->bottom;
}

char *st_pop(csa_stack s) {
    s->top--;
    return(*(s->top));
}

csa_stack st_create(unsigned size) {
    csa_stack s;

    s = (csa_stack) malloc(sizeof(struct csa_stack_st));
    if (s == NULL) {
        (void) printf("%s", nomem_msg);
        exit(9);
    }
    s->bottom = (char **) malloc(size * sizeof(char *));
    if (s->bottom == NULL) {
        (void) printf("%s", nomem_msg);
        exit(9);
    }
    s->top = s->bottom;
    return(s);
}

void best_build(lhs_ptr v) {
    unsigned i;
    lr_aptr  a, a_stop;
    double  red_cost, save_max;

    for (i = 0, a = v->first; i < NUM_BEST; i++, a++)
    {
        red_cost = a->c - a->head->p;
        sort_insert(v->best, i, a, red_cost, i + 1);
    }
    /*
     Calculate initial next_best by looking at the next arc in the
     adjacency list.
     */
    if ((v->next_best = a->c - a->head->p) <
        (red_cost = v->best[NUM_BEST - 1]->c -
         v->best[NUM_BEST - 1]->head->p))
    {
        sort_insert(v->best, NUM_BEST, a, v->next_best, NUM_BEST);
        v->next_best = red_cost;
    }
    a++;
    /*
     Now go through remaining arcs in adjacency list and place each one
     at the appropriate place in best[], if any.
     */
    a_stop = (v+1)->priced_out;
    for (; a != a_stop; a++)
    {
        if ((red_cost = a->c - a->head->p) < v->next_best)
        {
            if (red_cost < (save_max = v->best[NUM_BEST - 1]->c -
                            v->best[NUM_BEST - 1]->head->p))
            {
                sort_insert(v->best, NUM_BEST, a, red_cost, NUM_BEST);
                v->next_best = save_max;
            }
            else
                v->next_best = red_cost;
        }
    }
}

/* Assume v has excess (is unassigned) and do a double push from v. */
void double_push(lhs_ptr v) {
    double v_pref, v_second, red_cost, adm_gap;
    lr_aptr a, a_stop, adm;
    rhs_ptr w;
    lhs_ptr u;
    unsigned i;
    lr_aptr  *check_arc;

    /*
     Begin part I: Compute the following:
     o adm, the minimum-reduced-cost arc incident to v,
     o adm_gap, the amount by which the reduced cost of adm must be
     increased to make it equal in reduced cost to another arc incident
     to v, or enough to price the arc out if it is the only incident
     arc.
     */

    if (v->node_info.few_arcs)
    {

        /*
         If the input problem is feasible, it is never the case that
         (a_stop == a) after the following two lines because we never get
         excess at a node with no incident arcs.
         */
        a_stop = (v+1)->priced_out;
        a = v->first;
        v_pref = a->c - a->head->p;
        v_second = v_pref + epsilon * (po_cost_thresh + 1.0);
        adm = a;
        /*
         After this loop, v_pref is the minimum reduced cost of an edge out of
         v, and v_second is the second-to-minimum such reduced cost.
         */
        for (a++; a != a_stop; a++)
            if (v_pref > (red_cost = a->c - a->head->p))
            {
                v_second = v_pref;
                v_pref = red_cost;
                adm = a;
            }
            else if (v_second > red_cost)
                v_second = red_cost;

    }
    else
    {
        /*
         Find the minimum and second-minimum edges listed in the node's
         best[] array, and check whether their present partial reduced
         costs are below the node's bound as stored in next_best. If they
         are, we calculate adm_gap and are done with part I. If not, we
         rebuild the best[] array and the next_best bound, and calculate the
         required information.
         */
        adm = v->best[1];
        v_second = adm->c - adm->head->p;
        adm = v->best[0];
        v_pref = adm->c - adm->head->p;
        if (v_pref > v_second)
        {
            adm = v->best[1];
            red_cost = v_second;
            v_second = v_pref;
            v_pref = red_cost;
        }
        for (i = NUM_BEST - 2, check_arc = &v->best[2]; i > 0; i--, check_arc++)
        {
            a = *check_arc;
            if (v_pref > (red_cost = a->c - a->head->p))
            {
                v_second = v_pref;
                v_pref = red_cost;
                adm = a;
            }
            else if (v_second > red_cost)
                v_second = red_cost;
        }
        if (v_second > v->next_best)
        {
            /*
             Rebuild the best[] array and recalculate next_best.
             !v->node_info.few_arcs, so we know there are enough incident arcs
             to fill up best[] initially and have one left over for next_best.
             */
            best_build(v);
            adm = v->best[1];
            v_second = adm->c - adm->head->p;
            adm = v->best[0];
            v_pref = adm->c - adm->head->p;
        }
    }

    adm_gap = v_second - v_pref;

    /*
     Begin part II: Using the information computed in part I,
     o match v to w, adm's head, and
     o unmatch the node (if any) formerly matched to w.
     In the case where w's current matching arc is priced out, we do not
     change the matching, but we reset the value of adm_gap so that the
     (v, w) arc will be priced out.
     */
    w = adm->head;
    if ((u = w->matched))
    /*
     If w's matched arc is priced in, go ahead and unmatch (u, w) and
     match (v, w). If w's matched arc is priced out, abort the double
     push and relabel w so v no longer prefers w.
     */
        if (w->node_info.priced_in)
        {
            u->matched = NULL;
            make_active(u);
            v->matched = adm;
            w->matched = v;
        }
        else
        {
            adm_gap = epsilon * po_cost_thresh;
            make_active(v);
        }
        else
        {
            total_e--;
            v->matched = adm;
            w->matched = v;
        }

    /*
     Relabel w: v's price is chosen to make the implicit reduced cost of
     v's new preferred arc (v_pref + adm_gap) equal to zero. Then w's price
     is chosen so that the arc just matched has implicit reduced cost
     -epsilon.
     */
    w->p -= adm_gap + epsilon;
}

void refine() {
    lhs_ptr v;

    /*
     Saturate all negative arcs: Negative arcs are exactly those
     right-to-left matching arcs with negative reduced cost, and there is
     an interpretation of the implicit price function on the left that
     admits all right-to-left matching arcs. This interpretation is
     always consistent with the stored prices of lhs nodes in the case
     of EXPLICIT_PRICES.
     */
    total_e = 0;
    for (v = head_lhs_node; v != tail_lhs_node; v++)
    {
        if (v->matched && v->matched->head->node_info.priced_in)
        {
            v->matched->head->matched = NULL;
            v->matched = NULL;
        }
        if (v->matched == NULL)
        {
            total_e++;
            make_active(v);
        }
    }

    while (total_e > EXCESS_THRESH)
    {
        get_active_node(v);
        double_push(v);
    }
}

int update_epsilon() {
    int    fix_in = FALSE;
    epsilon /= scale_factor;
    if (epsilon < min_epsilon) {
        epsilon = min_epsilon;
    }
    return(!fix_in);
}

double parse_arcs(csa_arc* arcs, unsigned int arc_count, unsigned int lhs_n, unsigned int rhs_n) {
    unsigned int tail, head, tmp, i;
    long *lhs_degree;
    double cost, max_cost = 0;

    lr_aptr a;
    ta_ptr temp_a, temp_arcs;
    lhs_ptr l_v;
    rhs_ptr r_v;

    n = lhs_n + rhs_n;
    m = arc_count;

    printf("Parsing %u arcs...\n", arc_count);

    head_lr_arc = (lr_aptr) malloc((m + 1) * sizeof(struct lr_arc));
    tail_lr_arc = head_lr_arc + m;

    bool swap = false;
    if (lhs_n > n - lhs_n) {
        lhs_n = n - lhs_n;
        swap = true;
    }
    head_lhs_node = (lhs_ptr) malloc((lhs_n + 1) * sizeof(struct lhs_node));
    tail_lhs_node = head_lhs_node + lhs_n;
    head_rhs_node = (rhs_ptr) malloc((n - lhs_n + 1) * sizeof(struct rhs_node));
    tail_rhs_node = head_rhs_node + n - lhs_n;
    lhs_degree = (long *) malloc(lhs_n * sizeof(long));

    temp_arcs = (ta_ptr) malloc(m * sizeof(struct temp_arc));
    if ((head_lhs_node == NULL) || (head_lr_arc == NULL) ||
        (lhs_degree == NULL) || (temp_arcs == NULL)) {
        printf("%s", nomem_msg);
        return 0;
    }
    temp_a = temp_arcs;
    for (tail = 0; tail < lhs_n; tail++)
        lhs_degree[tail] = 0;

    for(i = 0; i < m; i++) {
    	csa_arc* arc = &arcs[i];
        tail = arc->tail;
        head = arc->head;
        cost = arc->cost;

        if (swap) {
        	tmp = head;
        	head = tail;
        	tail = tmp;
        }

        if ((tail >= lhs_n) || (head >= n - lhs_n)) {
            printf("Out of range arc: %u=>%u but lhs_n=%u rhs_n=%u\n",
            	arc->head, arc->tail, lhs_n, rhs_n);
            return 0;
        }

        temp_a->head = head_rhs_node + head;
        temp_a->tail = head_lhs_node + tail;
        temp_a->cost = cost;
        if (cost > max_cost) {
            max_cost = cost;
        }
        temp_a++;
        lhs_degree[tail]++;

    }

    a = head_lr_arc;
    for (tail = 0, l_v = head_lhs_node; l_v != tail_lhs_node; l_v++, tail++) {
        l_v->priced_out = l_v->first = a;
        l_v->matched = NULL;
        a += lhs_degree[tail];
        if (lhs_degree[tail] < NUM_BEST + 1) {
            l_v->node_info.few_arcs = true;
        } else {
            l_v->node_info.few_arcs = false;
        }
    }
    tail_lhs_node->priced_out = a;

    for (r_v = head_rhs_node; r_v != tail_rhs_node; r_v++) {
        r_v->node_info.priced_in = true;
        r_v->matched = NULL;
        r_v->p = 0.0;
    }

    for (temp_a--; temp_a != temp_arcs - 1; temp_a--) {
        a = temp_a->tail->first + (--lhs_degree[temp_a->tail - head_lhs_node]);
        a->head = temp_a->head;
        a->c = (double) temp_a->cost;
    }

    (void) free((char *) temp_arcs);
    (void) free((char *) lhs_degree);

    printf("Max cost (epsilon) is %f\n", max_cost);

    return max_cost;
}

static double csa(double* cost, unsigned int n, unsigned int* lhs_sol, unsigned int* rhs_sol) {
	// build temporary arcs from the dense cost matrix
    // (ideally we can do this directly to save time, but for now
    // this makes it easier to write the code that does sparse solving)
    unsigned int lhs_n = n;
    unsigned int rhs_n = n;
    unsigned int arc_count = lhs_n * rhs_n;
    printf("Building %u temporary arcs\n", arc_count);
    csa_arc* arcs = malloc(arc_count * sizeof(struct csa_arc_st));
    csa_arc* arcs_itr = arcs;
    double* cost_itr = cost;
    unsigned int tail = 0;
    for(unsigned int row = 0; row < lhs_n; row++) {
        unsigned int head = 0;
        for(unsigned int col = 0; col < rhs_n; col++) {
            csa_arc* arc = arcs_itr++;
            arc->tail = tail;
            arc->head = head;
            arc->cost = *(cost_itr++);
            head++;
        }
        tail++;
    }

    // setup everything for csa from the arcs
    epsilon = parse_arcs(arcs, arc_count, lhs_n, rhs_n);

    // free our temporary arcs
    (void) free((char *) arcs); // should we be freeing with char* or..?

    scale_factor = DEFAULT_SCALE_FACTOR;
    po_cost_thresh = 2.0 * (double) n * (scale_factor + 1);

    printf("Finding best_build...\n");
    create_active(n);
    lhs_ptr	l_v;
    for (l_v = head_lhs_node; l_v != tail_lhs_node; l_v++)
        if (!l_v->node_info.few_arcs)
            best_build(l_v);

    printf("Refining...\n");
    min_epsilon = 2.0 / (double) (n + 1);
    while (epsilon > min_epsilon) {
        (void) update_epsilon();
        printf("\tepsilon: %f min_epsilon: %f\n", epsilon, min_epsilon);
        refine();
    }

    printf("Copying solution...\n");
    double cost_sol = 0;
    for (lhs_ptr v = head_lhs_node; v != tail_lhs_node; v++) {
        unsigned int lhs = v - head_lhs_node;
        unsigned int rhs = v->matched->head - head_rhs_node + tail_lhs_node - head_lhs_node - lhs_n;
        double cost = v->matched->c;
        lhs_sol[lhs] = rhs;
        rhs_sol[rhs] = lhs;
        cost_sol += cost;
    }

    return cost_sol;
}

## csa.h
#define    TRUE    1
#define    FALSE    0
#define    DEFAULT_SCALE_FACTOR    10
#define    EXCESS_THRESH    0
#define    ACTIVE_TYPE    csa_stack
#define    create_active(size)    active = st_create(size)
#define    make_active(v)        st_push(active, (char *) v)
#define    get_active_node(v)    v = (lhs_ptr) st_pop(active)

#define    st_push(s, el) \
{\
*(s->top) = (char *) el;\
s->top++;\
}

#define    st_empty(s)    (s->top == s->bottom)

#define    enq(q, el) \
{\
*(q->tail) = el;\
if (q->tail == q->end) q->tail = q->storage;\
else q->tail++;\
}

#define q_empty(q) (q->head == q->tail ? 1 : 0)

#define    insert_list(node, head) \
{\
node->next = (*(head));\
(*(head))->prev = node;\
(*(head)) = node;\
node->prev = tail_rhs_node;\
}

#define    delete_list(node, head) \
{\
if (node->prev == tail_rhs_node) (*(head)) = node->next;\
node->prev->next = node->next;\
node->next->prev = node->prev;\
}

#define	NUM_BEST	3

typedef	struct	lhs_node	{
    struct	{
        /*
         flag used to indicate to
         double_push() that so few arcs
         are incident that best[] is
         useless.
         */
        unsigned	few_arcs : 1;
    }	node_info;
    /*
     list of arcs to consider first in
     calculating the minimum-reduced-cost
     incident arc; if we find it here, we
     need look no further.
     */
    struct	lr_arc	*best[NUM_BEST];
    /*
     bound on the reduced cost of an arc we
     can be certain still belongs among
     those in best[].
     */
    double	next_best;

    /*
     first arc in the arc array associated
     with this node.
     */
    struct	lr_arc	*priced_out;
    /*
     first priced-in arc in the arc array
     associated with this node.
     */
    struct	lr_arc	*first;
    /*
     matching arc (if any) associated with
     this node; NULL if this node is
     unmatched.
     */
    struct	lr_arc	*matched;
}	*lhs_ptr;

typedef	struct	rhs_node	{
    struct	{
        /*
         flag to indicate this node's
         matching arc (if any) is
         priced in.
         */
        unsigned	priced_in : 1;
    }	node_info;
    /*
     lhs node this rhs node is matched to.
     */
    lhs_ptr	matched;
    /*
     price of this node.
     */
    double	p;
}	*rhs_ptr;

typedef	struct	lr_arc		{
    /*
     rhs node associated with this arc.
     */
    rhs_ptr	head;
    /*
     arc cost.
     */
    double	c;
}	*lr_aptr;

typedef	struct	csa_stack_st	{
    /*
     Sometimes csa_stacks have lhs nodes, and
     other times they have rhs nodes. So
     there's a little type clash;
     everything gets cast to (char *) so we
     can use the same structure for both.
     */
    char	**bottom;
    char	**top;
}	*csa_stack;

typedef	struct	csa_queue_st	{
    /*
     Sometimes csa_queues have lhs nodes, and
     other times they have rhs nodes. So
     there's a little type clash;
     everything gets cast to (char *) so we
     can use the same structure for both.
     */
    char		**head;
    char		**tail;
    char		**storage;
    char		**end;
    unsigned	max_size;
}	*csa_queue;

typedef	struct	temp_arc	{
    lhs_ptr	tail;
    rhs_ptr	head;
    double	cost; // seems strange this is long, shouldn't it be double?
}	*ta_ptr;

typedef struct csa_arc_st {
	/*
	We define our own arc format for ease of use.
	*/
	unsigned int tail;
	unsigned int head;
	double cost;
} csa_arc;

## lapcsa.pyx
import numpy as np
from scipy.spatial.distance import cdist

cdef extern from "csa.c":
    double csa(double* cost, int n, unsigned int* lhs_sol, unsigned int* rhs_sol)

# what about nogil?
# what about avoiding memory copies?
# need to clean up after all the mallocs somehow
def lapcsa(lhs, rhs):
    dim = len(lhs)
    print('Calculating distances...')
    dist = cdist(lhs, rhs, 'euclidean')
    print('Done.')
    lhs_sol = np.ndarray(dim, dtype=np.uint32)
    rhs_sol = np.ndarray(dim, dtype=np.uint32)
    cdef double[:,::1] dist_view = dist
    cdef unsigned int[::1] lhs_sol_view = lhs_sol
    cdef unsigned int[::1] rhs_sol_view = rhs_sol
    cost_sol = csa(&dist_view[0,0], dist.shape[0], &lhs_sol_view[0], &rhs_sol_view[0])
    return cost_sol, lhs_sol, rhs_sol

## setup.py
# run:
# python setup.py build_ext --inplace
# python test.py

from distutils.core import setup, Extension
from Cython.Build import cythonize

example_module = Extension('lapcsa',
    sources=['lapcsa.pyx'],
    extra_compile_args=['-O3'],
    language='c')

setup(
    name='lapcsa',
    packages=['lapcsa'],
    ext_modules=cythonize(example_module, language_level=3)
)

## test.py
import numpy as np
import lapcsa

side = 250
n = side * side

lhs = np.random.uniform(low=-1, high=+1, size=(n, 2))
lhs = np.cumsum(lhs, axis=0)
lhs -= lhs.min(axis=0)
lhs /= lhs.max(axis=0)

xv, yv = np.meshgrid(np.linspace(0, 1, side), np.linspace(0, 1, side))
rhs = np.dstack((xv, yv)).reshape(-1, 2)

cost_csa, rows_csa, cols_csa = lapcsa.lapcsa(lhs, rhs)
	#include "csa.h"
	#include <stdlib.h>
	#include <stdio.h>
	#include <stdbool.h>

	#define sort_insert(best, size, a, a_prc, nsize) \
	\
	{\
	unsigned si_i, si_j;\
	\
	if (size == 0)\
	best[0] = a;\
	else\
	{\
	si_j = size;\
	for (si_i = 0; si_i < size; si_i++)\
	if (a_prc < best[si_i]->c - best[si_i]->head->p)\
	{\
	si_j = si_i;\
	for (si_i = nsize - 1; si_i > si_j; si_i--)\
	best[si_i] = best[si_i - 1];\
	break;\
	}\
	best[si_j] = a;\
	}\
	}

	/* ------------------------- Problem size variables -------------------- */
	unsigned n, m;

	/* --------------- Data structures describing the problem -------------- */
	lhs_ptr head_lhs_node, tail_lhs_node;
	rhs_ptr head_rhs_node, tail_rhs_node;
	lr_aptr head_lr_arc, tail_lr_arc;

	/* ------------------------- Tunable variables ------------------------- */
	/*
	Cost threshhold for pricing out: used even when price-outs are
	switched off to make bounding reduced-cost differences easier in
	double_push(). In principle this is not necessary, but it makes the
	code better.
	*/
	double po_cost_thresh;
	double scale_factor; /* scaling factor */

	/*
	Processing variables.
	*/
	double epsilon; /* scaling parameter */
	double min_epsilon; /* snap to this value when epsilon small */
	unsigned total_e; /* total excess */
	ACTIVE_TYPE active; /* list of active nodes */

	char *nomem_msg = "Insufficient memory.\n";

	void st_reset(csa_stack s) {
	s->top = s->bottom;
	}

	char *st_pop(csa_stack s) {
	s->top--;
	return(*(s->top));
	}

	csa_stack st_create(unsigned size) {
	csa_stack s;

	s = (csa_stack) malloc(sizeof(struct csa_stack_st));
	if (s == NULL) {
	(void) printf("%s", nomem_msg);
	exit(9);
	}
	s->bottom = (char *) malloc(size sizeof(char *));
	if (s->bottom == NULL) {
	(void) printf("%s", nomem_msg);
	exit(9);
	}
	s->top = s->bottom;
	return(s);
	}

	void best_build(lhs_ptr v) {
	unsigned i;
	lr_aptr a, a_stop;
	double red_cost, save_max;

	for (i = 0, a = v->first; i < NUM_BEST; i++, a++)
	{
	red_cost = a->c - a->head->p;
	sort_insert(v->best, i, a, red_cost, i + 1);
	}
	/*
	Calculate initial next_best by looking at the next arc in the
	adjacency list.
	*/
	if ((v->next_best = a->c - a->head->p) <
	(red_cost = v->best[NUM_BEST - 1]->c -
	v->best[NUM_BEST - 1]->head->p))
	{
	sort_insert(v->best, NUM_BEST, a, v->next_best, NUM_BEST);
	v->next_best = red_cost;
	}
	a++;
	/*
	Now go through remaining arcs in adjacency list and place each one
	at the appropriate place in best[], if any.
	*/
	a_stop = (v+1)->priced_out;
	for (; a != a_stop; a++)
	{
	if ((red_cost = a->c - a->head->p) < v->next_best)
	{
	if (red_cost < (save_max = v->best[NUM_BEST - 1]->c -
	v->best[NUM_BEST - 1]->head->p))
	{
	sort_insert(v->best, NUM_BEST, a, red_cost, NUM_BEST);
	v->next_best = save_max;
	}
	else
	v->next_best = red_cost;
	}
	}
	}

	/* Assume v has excess (is unassigned) and do a double push from v. */
	void double_push(lhs_ptr v) {
	double v_pref, v_second, red_cost, adm_gap;
	lr_aptr a, a_stop, adm;
	rhs_ptr w;
	lhs_ptr u;
	unsigned i;
	lr_aptr *check_arc;

	/*
	Begin part I: Compute the following:
	o adm, the minimum-reduced-cost arc incident to v,
	o adm_gap, the amount by which the reduced cost of adm must be
	increased to make it equal in reduced cost to another arc incident
	to v, or enough to price the arc out if it is the only incident
	arc.
	*/

	if (v->node_info.few_arcs)
	{

	/*
	If the input problem is feasible, it is never the case that
	(a_stop == a) after the following two lines because we never get
	excess at a node with no incident arcs.
	*/
	a_stop = (v+1)->priced_out;
	a = v->first;
	v_pref = a->c - a->head->p;
	v_second = v_pref + epsilon * (po_cost_thresh + 1.0);
	adm = a;
	/*
	After this loop, v_pref is the minimum reduced cost of an edge out of
	v, and v_second is the second-to-minimum such reduced cost.
	*/
	for (a++; a != a_stop; a++)
	if (v_pref > (red_cost = a->c - a->head->p))
	{
	v_second = v_pref;
	v_pref = red_cost;
	adm = a;
	}
	else if (v_second > red_cost)
	v_second = red_cost;

	}
	else
	{
	/*
	Find the minimum and second-minimum edges listed in the node's
	best[] array, and check whether their present partial reduced
	costs are below the node's bound as stored in next_best. If they
	are, we calculate adm_gap and are done with part I. If not, we
	rebuild the best[] array and the next_best bound, and calculate the
	required information.
	*/
	adm = v->best[1];
	v_second = adm->c - adm->head->p;
	adm = v->best[0];
	v_pref = adm->c - adm->head->p;
	if (v_pref > v_second)
	{
	adm = v->best[1];
	red_cost = v_second;
	v_second = v_pref;
	v_pref = red_cost;
	}
	for (i = NUM_BEST - 2, check_arc = &v->best[2]; i > 0; i--, check_arc++)
	{
	a = *check_arc;
	if (v_pref > (red_cost = a->c - a->head->p))
	{
	v_second = v_pref;
	v_pref = red_cost;
	adm = a;
	}
	else if (v_second > red_cost)
	v_second = red_cost;
	}
	if (v_second > v->next_best)
	{
	/*
	Rebuild the best[] array and recalculate next_best.
	!v->node_info.few_arcs, so we know there are enough incident arcs
	to fill up best[] initially and have one left over for next_best.
	*/
	best_build(v);
	adm = v->best[1];
	v_second = adm->c - adm->head->p;
	adm = v->best[0];
	v_pref = adm->c - adm->head->p;
	}
	}

	adm_gap = v_second - v_pref;

	/*
	Begin part II: Using the information computed in part I,
	o match v to w, adm's head, and
	o unmatch the node (if any) formerly matched to w.
	In the case where w's current matching arc is priced out, we do not
	change the matching, but we reset the value of adm_gap so that the
	(v, w) arc will be priced out.
	*/
	w = adm->head;
	if ((u = w->matched))
	/*
	If w's matched arc is priced in, go ahead and unmatch (u, w) and
	match (v, w). If w's matched arc is priced out, abort the double
	push and relabel w so v no longer prefers w.
	*/
	if (w->node_info.priced_in)
	{
	u->matched = NULL;
	make_active(u);
	v->matched = adm;
	w->matched = v;
	}
	else
	{
	adm_gap = epsilon * po_cost_thresh;
	make_active(v);
	}
	else
	{
	total_e--;
	v->matched = adm;
	w->matched = v;
	}

	/*
	Relabel w: v's price is chosen to make the implicit reduced cost of
	v's new preferred arc (v_pref + adm_gap) equal to zero. Then w's price
	is chosen so that the arc just matched has implicit reduced cost
	-epsilon.
	*/
	w->p -= adm_gap + epsilon;
	}

	void refine() {
	lhs_ptr v;

	/*
	Saturate all negative arcs: Negative arcs are exactly those
	right-to-left matching arcs with negative reduced cost, and there is
	an interpretation of the implicit price function on the left that
	admits all right-to-left matching arcs. This interpretation is
	always consistent with the stored prices of lhs nodes in the case
	of EXPLICIT_PRICES.
	*/
	total_e = 0;
	for (v = head_lhs_node; v != tail_lhs_node; v++)
	{
	if (v->matched && v->matched->head->node_info.priced_in)
	{
	v->matched->head->matched = NULL;
	v->matched = NULL;
	}
	if (v->matched == NULL)
	{
	total_e++;
	make_active(v);
	}
	}

	while (total_e > EXCESS_THRESH)
	{
	get_active_node(v);
	double_push(v);
	}
	}

	int update_epsilon() {
	int fix_in = FALSE;
	epsilon /= scale_factor;
	if (epsilon < min_epsilon) {
	epsilon = min_epsilon;
	}
	return(!fix_in);
	}

	double parse_arcs(csa_arc* arcs, unsigned int arc_count, unsigned int lhs_n, unsigned int rhs_n) {
	unsigned int tail, head, tmp, i;
	long *lhs_degree;
	double cost, max_cost = 0;

	lr_aptr a;
	ta_ptr temp_a, temp_arcs;
	lhs_ptr l_v;
	rhs_ptr r_v;

	n = lhs_n + rhs_n;
	m = arc_count;

	printf("Parsing %u arcs...\n", arc_count);

	head_lr_arc = (lr_aptr) malloc((m + 1) * sizeof(struct lr_arc));
	tail_lr_arc = head_lr_arc + m;

	bool swap = false;
	if (lhs_n > n - lhs_n) {
	lhs_n = n - lhs_n;
	swap = true;
	}
	head_lhs_node = (lhs_ptr) malloc((lhs_n + 1) * sizeof(struct lhs_node));
	tail_lhs_node = head_lhs_node + lhs_n;
	head_rhs_node = (rhs_ptr) malloc((n - lhs_n + 1) * sizeof(struct rhs_node));
	tail_rhs_node = head_rhs_node + n - lhs_n;
	lhs_degree = (long ) malloc(lhs_n sizeof(long));

	temp_arcs = (ta_ptr) malloc(m * sizeof(struct temp_arc));
	if ((head_lhs_node == NULL) \|\| (head_lr_arc == NULL) \|\|
	(lhs_degree == NULL) \|\| (temp_arcs == NULL)) {
	printf("%s", nomem_msg);
	return 0;
	}
	temp_a = temp_arcs;
	for (tail = 0; tail < lhs_n; tail++)
	lhs_degree[tail] = 0;

	for(i = 0; i < m; i++) {
	csa_arc* arc = &arcs[i];
	tail = arc->tail;
	head = arc->head;
	cost = arc->cost;

	if (swap) {
	tmp = head;
	head = tail;
	tail = tmp;
	}

	if ((tail >= lhs_n) \|\| (head >= n - lhs_n)) {
	printf("Out of range arc: %u=>%u but lhs_n=%u rhs_n=%u\n",
	arc->head, arc->tail, lhs_n, rhs_n);
	return 0;
	}

	temp_a->head = head_rhs_node + head;
	temp_a->tail = head_lhs_node + tail;
	temp_a->cost = cost;
	if (cost > max_cost) {
	max_cost = cost;
	}
	temp_a++;
	lhs_degree[tail]++;

	}

	a = head_lr_arc;
	for (tail = 0, l_v = head_lhs_node; l_v != tail_lhs_node; l_v++, tail++) {
	l_v->priced_out = l_v->first = a;
	l_v->matched = NULL;
	a += lhs_degree[tail];
	if (lhs_degree[tail] < NUM_BEST + 1) {
	l_v->node_info.few_arcs = true;
	} else {
	l_v->node_info.few_arcs = false;
	}
	}
	tail_lhs_node->priced_out = a;

	for (r_v = head_rhs_node; r_v != tail_rhs_node; r_v++) {
	r_v->node_info.priced_in = true;
	r_v->matched = NULL;
	r_v->p = 0.0;
	}

	for (temp_a--; temp_a != temp_arcs - 1; temp_a--) {
	a = temp_a->tail->first + (--lhs_degree[temp_a->tail - head_lhs_node]);
	a->head = temp_a->head;
	a->c = (double) temp_a->cost;
	}

	(void) free((char *) temp_arcs);
	(void) free((char *) lhs_degree);

	printf("Max cost (epsilon) is %f\n", max_cost);

	return max_cost;
	}

	static double csa(double* cost, unsigned int n, unsigned int* lhs_sol, unsigned int* rhs_sol) {
	// build temporary arcs from the dense cost matrix
	// (ideally we can do this directly to save time, but for now
	// this makes it easier to write the code that does sparse solving)
	unsigned int lhs_n = n;
	unsigned int rhs_n = n;
	unsigned int arc_count = lhs_n * rhs_n;
	printf("Building %u temporary arcs\n", arc_count);
	csa_arc* arcs = malloc(arc_count * sizeof(struct csa_arc_st));
	csa_arc* arcs_itr = arcs;
	double* cost_itr = cost;
	unsigned int tail = 0;
	for(unsigned int row = 0; row < lhs_n; row++) {
	unsigned int head = 0;
	for(unsigned int col = 0; col < rhs_n; col++) {
	csa_arc* arc = arcs_itr++;
	arc->tail = tail;
	arc->head = head;
	arc->cost = *(cost_itr++);
	head++;
	}
	tail++;
	}

	// setup everything for csa from the arcs
	epsilon = parse_arcs(arcs, arc_count, lhs_n, rhs_n);

	// free our temporary arcs
	(void) free((char ) arcs); // should we be freeing with char or..?

	scale_factor = DEFAULT_SCALE_FACTOR;
	po_cost_thresh = 2.0 * (double) n * (scale_factor + 1);

	printf("Finding best_build...\n");
	create_active(n);
	lhs_ptr l_v;
	for (l_v = head_lhs_node; l_v != tail_lhs_node; l_v++)
	if (!l_v->node_info.few_arcs)
	best_build(l_v);

	printf("Refining...\n");
	min_epsilon = 2.0 / (double) (n + 1);
	while (epsilon > min_epsilon) {
	(void) update_epsilon();
	printf("\tepsilon: %f min_epsilon: %f\n", epsilon, min_epsilon);
	refine();
	}

	printf("Copying solution...\n");
	double cost_sol = 0;
	for (lhs_ptr v = head_lhs_node; v != tail_lhs_node; v++) {
	unsigned int lhs = v - head_lhs_node;
	unsigned int rhs = v->matched->head - head_rhs_node + tail_lhs_node - head_lhs_node - lhs_n;
	double cost = v->matched->c;
	lhs_sol[lhs] = rhs;
	rhs_sol[rhs] = lhs;
	cost_sol += cost;
	}

	return cost_sol;
	}
	#define TRUE 1
	#define FALSE 0
	#define DEFAULT_SCALE_FACTOR 10
	#define EXCESS_THRESH 0
	#define ACTIVE_TYPE csa_stack
	#define create_active(size) active = st_create(size)
	#define make_active(v) st_push(active, (char *) v)
	#define get_active_node(v) v = (lhs_ptr) st_pop(active)

	#define st_push(s, el) \
	{\
	(s->top) = (char ) el;\
	s->top++;\
	}

	#define st_empty(s) (s->top == s->bottom)

	#define enq(q, el) \
	{\
	*(q->tail) = el;\
	if (q->tail == q->end) q->tail = q->storage;\
	else q->tail++;\
	}

	#define q_empty(q) (q->head == q->tail ? 1 : 0)

	#define insert_list(node, head) \
	{\
	node->next = (*(head));\
	(*(head))->prev = node;\
	(*(head)) = node;\
	node->prev = tail_rhs_node;\
	}

	#define delete_list(node, head) \
	{\
	if (node->prev == tail_rhs_node) (*(head)) = node->next;\
	node->prev->next = node->next;\
	node->next->prev = node->prev;\
	}

	#define NUM_BEST 3

	typedef struct lhs_node {
	struct {
	/*
	flag used to indicate to
	double_push() that so few arcs
	are incident that best[] is
	useless.
	*/
	unsigned few_arcs : 1;
	} node_info;
	/*
	list of arcs to consider first in
	calculating the minimum-reduced-cost
	incident arc; if we find it here, we
	need look no further.
	*/
	struct lr_arc *best[NUM_BEST];
	/*
	bound on the reduced cost of an arc we
	can be certain still belongs among
	those in best[].
	*/
	double next_best;

	/*
	first arc in the arc array associated
	with this node.
	*/
	struct lr_arc *priced_out;
	/*
	first priced-in arc in the arc array
	associated with this node.
	*/
	struct lr_arc *first;
	/*
	matching arc (if any) associated with
	this node; NULL if this node is
	unmatched.
	*/
	struct lr_arc *matched;
	} *lhs_ptr;

	typedef struct rhs_node {
	struct {
	/*
	flag to indicate this node's
	matching arc (if any) is
	priced in.
	*/
	unsigned priced_in : 1;
	} node_info;
	/*
	lhs node this rhs node is matched to.
	*/
	lhs_ptr matched;
	/*
	price of this node.
	*/
	double p;
	} *rhs_ptr;

	typedef struct lr_arc {
	/*
	rhs node associated with this arc.
	*/
	rhs_ptr head;
	/*
	arc cost.
	*/
	double c;
	} *lr_aptr;

	typedef struct csa_stack_st {
	/*
	Sometimes csa_stacks have lhs nodes, and
	other times they have rhs nodes. So
	there's a little type clash;
	everything gets cast to (char *) so we
	can use the same structure for both.
	*/
	char **bottom;
	char **top;
	} *csa_stack;

	typedef struct csa_queue_st {
	/*
	Sometimes csa_queues have lhs nodes, and
	other times they have rhs nodes. So
	there's a little type clash;
	everything gets cast to (char *) so we
	can use the same structure for both.
	*/
	char **head;
	char **tail;
	char **storage;
	char **end;
	unsigned max_size;
	} *csa_queue;

	typedef struct temp_arc {
	lhs_ptr tail;
	rhs_ptr head;
	double cost; // seems strange this is long, shouldn't it be double?
	} *ta_ptr;

	typedef struct csa_arc_st {
	/*
	We define our own arc format for ease of use.
	*/
	unsigned int tail;
	unsigned int head;
	double cost;
	} csa_arc;
	import numpy as np
	from scipy.spatial.distance import cdist

	cdef extern from "csa.c":
	double csa(double* cost, int n, unsigned int* lhs_sol, unsigned int* rhs_sol)

	# what about nogil?
	# what about avoiding memory copies?
	# need to clean up after all the mallocs somehow
	def lapcsa(lhs, rhs):
	dim = len(lhs)
	print('Calculating distances...')
	dist = cdist(lhs, rhs, 'euclidean')
	print('Done.')
	lhs_sol = np.ndarray(dim, dtype=np.uint32)
	rhs_sol = np.ndarray(dim, dtype=np.uint32)
	cdef double[:,::1] dist_view = dist
	cdef unsigned int[::1] lhs_sol_view = lhs_sol
	cdef unsigned int[::1] rhs_sol_view = rhs_sol
	cost_sol = csa(&dist_view[0,0], dist.shape[0], &lhs_sol_view[0], &rhs_sol_view[0])
	return cost_sol, lhs_sol, rhs_sol
	# run:
	# python setup.py build_ext --inplace
	# python test.py

	from distutils.core import setup, Extension
	from Cython.Build import cythonize

	example_module = Extension('lapcsa',
	sources=['lapcsa.pyx'],
	extra_compile_args=['-O3'],
	language='c')

	setup(
	name='lapcsa',
	packages=['lapcsa'],
	ext_modules=cythonize(example_module, language_level=3)
	)
	import numpy as np
	import lapcsa

	side = 250
	n = side * side

	lhs = np.random.uniform(low=-1, high=+1, size=(n, 2))
	lhs = np.cumsum(lhs, axis=0)
	lhs -= lhs.min(axis=0)
	lhs /= lhs.max(axis=0)

	xv, yv = np.meshgrid(np.linspace(0, 1, side), np.linspace(0, 1, side))
	rhs = np.dstack((xv, yv)).reshape(-1, 2)

	cost_csa, rows_csa, cols_csa = lapcsa.lapcsa(lhs, rhs)