manual gatherv + derived types
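A manual gather of a block-decomposed 2-D grid onto rank 0, built from MPI_Isend/MPI_Irecv and nested hvector derived datatypes rather than MPI_Gatherv. The driver script test-gather.sh (at the bottom) runs the program under Open MPI 2.1.0, MPICH 3.2, and Intel MPI 2017 Update 2; every configuration passes except 256x256 on 3 ranks, which hangs under MPICH and Intel MPI.

mpi-gather.cpp: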
#include <cstdio>
#include <cstdlib>
#include <cstdint>
#include <cstring>
#include <cassert>
#include <cmath>
#include <algorithm>
#include <limits>
#include <vector>
#include <mpi.h>
// Trial-division prime factorization: returns the prime factors of num
// in nondecreasing order.
static std::vector<int> get_factors(int num)
{
    std::vector<int> res;
    while(num > 1)
    {
        if((num % 2) == 0)
        {
            num /= 2;
            res.push_back(2);
        }
        else
        {
            bool divisor = false;
            for(int i = 3; i <= std::sqrt(num); i += 2)
                if((num % i) == 0)
                {
                    num /= i;
                    res.push_back(i);
                    divisor = true;
                    break;
                }
            if(!divisor)
            {
                // num is prime; it is its own last factor.
                res.push_back(num);
                num = 1;
            }
        }
    }
    return res;
}
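// Example (illustrative): get_factors(12) returns {2, 2, 3}.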
// Sum of the elements of *set selected by the bits of num.
static double nth_subset_sum(unsigned long long num, const std::vector<double> *set)
{
    double res = 0.0;
    for(size_t i = 0; i < set->size(); ++i)
        if(num & (1ULL << i))
            res += (*set)[i];
    return res;
}
// Product of the elements of *set selected by the bits of num.
static int nth_subset_prod(unsigned long long num, const std::vector<int> *set)
{
    int res = 1;
    for(size_t i = 0; i < set->size(); ++i)
        if(num & (1ULL << i))
            res *= (*set)[i];
    return res;
}
// Find the nonempty, proper subset of *ln_factors whose sum is closest to
// split; the subset is returned as a bitmask over the factor indices. If
// there is only one factor, the loop body never runs and argbest stays
// -1ULL, which selects every factor downstream.
static unsigned long long best_subset(const double split, const std::vector<double> *ln_factors)
{
    double best = std::numeric_limits<double>::max();
    unsigned long long argbest = -1ULL;
    for(unsigned long long i = 1; i < (1ULL << ln_factors->size()) - 1; ++i)
    {
        const double prod = std::abs(nth_subset_sum(i, ln_factors) - split);
        if(prod < best)
        {
            best = prod;
            argbest = i;
        }
    }
    return argbest;
}
// Decompose num into `groups` factors whose logarithms are as balanced as
// possible, i.e. a nearly square process grid. Returns the factors in
// ascending order.
std::vector<int> best_decomp(int num, int groups)
{
    const double split = std::log((double)num)/groups;
    std::vector<int> res(groups);
    int todiv = num;
    for(int i = 0; i < groups-1; ++i)
    {
        std::vector<int> factors = get_factors(todiv);
        std::vector<double> ln_factors(factors.size());
        for(size_t j = 0; j < ln_factors.size(); ++j)
            ln_factors[j] = std::log(factors[j]);
        const unsigned long long set_idx = best_subset(split, &ln_factors);
        res[i] = nth_subset_prod(set_idx, &factors);
        todiv /= res[i];
    }
    res[groups-1] = todiv;
    std::sort(res.begin(), res.end());
    return res;
}
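// Example (illustrative): best_decomp(12, 2) -> {3, 4}. split = ln(12)/2
// ~= 1.242; among the subsets of the factors {2, 2, 3}, {2, 2} (log sum
// ~= 1.386) and {3} (~= 1.099) are equally close to split, the first one
// found wins, and sorting yields {3, 4}.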
// Split nitems into nchunks nearly equal contiguous ranges; the first
// `remainder` chunks get one extra item. Writes the half-open range
// [*start, *end) for chunk chunkno.
void divvy(int *start, int *end, const int nitems, int chunkno, int nchunks)
{
    const int items_per_chunk = nitems/nchunks;
    const int remainder = nitems - nchunks*items_per_chunk;
    *start = chunkno*items_per_chunk + std::min(chunkno, remainder);
    *end = (chunkno+1)*items_per_chunk + std::min(chunkno+1, remainder);
}
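// Example (illustrative): divvy with nitems = 10, nchunks = 3 yields the
// ranges [0, 4), [4, 7), [7, 10) for chunks 0, 1, 2.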
// A 2-D grid of 4 variables with a halo of width halow on each side.
struct grid2d
{
    double *q;          // field data, indexed q[i + j*ystride + v*varstride]
    int ystride;        // x-extent including halos
    int varstride;      // elements per variable: ystride*(N[1] + 2*halow)
    int N[2];           // interior extents of this rank's block
    int MPI_offset[2];  // global index of this block's first interior cell
    int MPI_decomp[2];  // process-grid dimensions
};
// Map a rank to its (x, y) position in the process grid and compute its
// half-open index ranges along each axis.
void get_rank_decomp(int start[2], int end[2], int position[2], const int decomp[2], int rank, const int n[2])
{
    position[0] = rank % decomp[0];
    position[1] = rank / decomp[0];
    divvy(start+0, end+0, n[0], position[0], decomp[0]);
    divvy(start+1, end+1, n[1], position[1], decomp[1]);
}
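// Example (illustrative): with decomp = {2, 2}, rank 3 sits at position
// (1, 1) and owns the upper-right quadrant of the n[0] x n[1] domain.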
static const int halow = 2; // halo width on each side of a block
// Allocate and zero a rank's block (including halos) of the global grid.
void grid2d_init(grid2d *h, int rank, int nranks, const int global_N[2])
{
    std::vector<int> decomp(best_decomp(nranks, 2));
    h->MPI_decomp[0] = decomp[0];
    h->MPI_decomp[1] = decomp[1];
    if(rank == 0)
    {
        printf("Decomp: %d %d\n", h->MPI_decomp[0], h->MPI_decomp[1]);
    }
    int end[2];
    int MPI_position[2];
    get_rank_decomp(h->MPI_offset, end, MPI_position, h->MPI_decomp, rank, global_N);
    h->N[0] = end[0]-h->MPI_offset[0];
    h->N[1] = end[1]-h->MPI_offset[1];
    h->ystride = h->N[0] + halow*2;
    h->varstride = h->ystride * (h->N[1] + halow*2);
    h->q = (double*)malloc(sizeof(double)*h->varstride*4); // 4 variables
    assert(h->q);
    memset(h->q, 0, sizeof(double)*h->varstride*4);
    printf("rank % 4d: (% 3d, % 3d) [% 5d, % 5d]x[% 5d, % 5d]\n", rank, MPI_position[0], MPI_position[1], h->MPI_offset[0], end[0], h->MPI_offset[1], end[1]);
}
// Gather the interior of every sending rank's block into dest on rank 0.
// Each transfer is described by a derived datatype: a row of doubles,
// nested in an hvector of the block's rows, nested in an hvector over the
// 4 variables. xfer_mask selects which ranks participate (bit r = rank r).
void mpi_gather(grid2d *dest, const grid2d *src, uint64_t xfer_mask)
{
    int rank, nranks;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
    assert(nranks <= 64); // one mask bit per rank
    const uint64_t rank_mask = (1ULL << nranks) - 1ULL;
    const uint64_t effective_mask = rank_mask & xfer_mask;
    printf("R%d GATHER START: xfer_mask %llu, rank_mask: %llu, effective_mask: %llu\n", rank,
           (unsigned long long)xfer_mask, (unsigned long long)rank_mask, (unsigned long long)effective_mask);
    const int nrecvs = rank == 0 ? __builtin_popcountll(effective_mask) : 0;
    const int sending = ((1ULL << rank) & effective_mask) != 0ULL;
    const int num_types = nrecvs + sending;
    const int num_comms = num_types;
    int used_types = 0;
    int used_comms = 0;
    MPI_Datatype grid_types[num_types];
    MPI_Request reqs[num_comms];
    if(sending)
    {
        // Row of N[0] doubles -> N[1] rows ystride apart -> 4 variables
        // varstride apart.
        MPI_Datatype tmp[2];
        MPI_Type_create_hvector(src->N[0], 1, sizeof(double)*1, MPI_DOUBLE, tmp+0);
        MPI_Type_create_hvector(src->N[1], 1, sizeof(double)*src->ystride, tmp[0], tmp+1);
        MPI_Type_create_hvector(4, 1, sizeof(double)*src->varstride, tmp[1], grid_types+used_types);
        MPI_Type_commit(grid_types+used_types);
        MPI_Type_free(tmp+0); // intermediates can be freed once nested
        MPI_Type_free(tmp+1);
        // Send from the first interior cell, skipping the halo.
        MPI_Isend(src->q+halow+halow*src->ystride, 1, grid_types[used_types], 0, rank, MPI_COMM_WORLD, reqs+used_comms);
        ++used_types;
        ++used_comms;
    }
    if(rank == 0)
    {
        for(int r = 0; r < nranks; ++r)
        {
            if(((1ULL << r) & effective_mask) == 0ULL)
            {
                continue;
            }
            // Recompute rank r's block within the destination grid.
            int pos[2], start[2], end[2];
            get_rank_decomp(start, end, pos, src->MPI_decomp, r, dest->N);
            int n[2];
            n[0] = end[0]-start[0];
            n[1] = end[1]-start[1];
            // Same nesting as the send type, but with dest's strides.
            MPI_Datatype tmp[2];
            MPI_Type_create_hvector(n[0], 1, sizeof(double)*1, MPI_DOUBLE, tmp+0);
            MPI_Type_create_hvector(n[1], 1, sizeof(double)*dest->ystride, tmp[0], tmp+1);
            MPI_Type_create_hvector(4, 1, sizeof(double)*dest->varstride, tmp[1], grid_types+used_types);
            MPI_Type_commit(grid_types+used_types);
            MPI_Type_free(tmp+0);
            MPI_Type_free(tmp+1);
            MPI_Irecv(dest->q + halow + start[0] + dest->ystride*(halow+start[1]), 1, grid_types[used_types], r, r, MPI_COMM_WORLD, reqs+used_comms);
            ++used_types;
            ++used_comms;
        }
    }
    if(sending || rank == 0)
    {
        MPI_Status stats[used_comms];
        //MPI_Waitall(used_comms, reqs, stats);
        // Drain the requests with MPI_Waitsome, logging progress as we go.
        int out_comms = 0;
        int out_indices[used_comms];
        int req_finished[used_comms];
        const char *status_str[2] = {"not ", ""};
        memset(req_finished, 0, sizeof(int)*used_comms);
        do
        {
            printf("R%d %d items:\n", rank, used_comms);
            for(int i = 0; i < out_comms; ++i)
            {
                req_finished[out_indices[i]] = 1;
            }
            for(int i = 0; i < used_comms; ++i)
            {
                printf("R%d: req: %d %sfinished\n", rank, i, status_str[req_finished[i]]);
            }
            MPI_Waitsome(used_comms, reqs, &out_comms, out_indices, stats);
        }
        while(out_comms != MPI_UNDEFINED);
        printf("R%d all finished\n", rank);
        for(int r = 0; r < used_types; ++r)
        {
            MPI_Type_free(grid_types + r);
        }
    }
    printf("R%d GATHER END: xfer_mask %llu, rank_mask: %llu, effective_mask: %llu\n", rank,
           (unsigned long long)xfer_mask, (unsigned long long)rank_mask, (unsigned long long)effective_mask);
}
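// Aside (not used by the test): the three nested hvectors above describe
// the same memory pattern as a single 3-D subarray over (variable, y, x).
// A minimal sketch under that assumption; make_block_type is a name
// introduced here for illustration, not part of the original gist.
static MPI_Datatype make_block_type(const grid2d *g, const int start[2], const int n[2])
{
    const int sizes[3]    = {4, g->N[1] + 2*halow, g->ystride};  // full array extents
    const int subsizes[3] = {4, n[1], n[0]};                     // block to transfer
    const int starts[3]   = {0, halow + start[1], halow + start[0]};
    MPI_Datatype t;
    MPI_Type_create_subarray(3, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE, &t);
    MPI_Type_commit(&t);
    return t; // send/recv g->q itself with count 1 and this type
}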
// Verify that rank 0's gathered grid holds the global linear pattern in
// its interior and zeros in the halo.
int check_grid(grid2d *g)
{
    for(int v = 0; v < 4; ++v)
    {
        for(int j = 0; j < g->N[1]+2*halow; ++j)
        {
            for(int i = 0; i < g->N[0]+2*halow; ++i)
            {
                double expected;
                if(i >= halow && j >= halow && i < g->N[0]+halow && j < g->N[1]+halow)
                {
                    expected = i-halow + (j-halow)*g->N[0] + v*g->N[0]*g->N[1];
                }
                else
                {
                    expected = 0;
                }
                const double found = g->q[i+j*g->ystride+v*g->varstride];
                if(found != expected)
                {
                    printf("Gather failed !!! %d, %d, %d expected %lg, got %lg\n", i, j, v, expected, found);
                    return EXIT_FAILURE;
                }
            }
        }
    }
    printf("Gather okay\n");
    return EXIT_SUCCESS;
}
// Build the distributed grid, fill each rank's interior with the global
// linear pattern, gather everything to rank 0, and check the result.
int gather_test(int global_N[2])
{
    int rank;
    int nranks;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
    MPI_Bcast(global_N, 2, MPI_INT, 0, MPI_COMM_WORLD);
    grid2d global_grid;
    if(rank == 0)
    {
        // Rank 0 also holds the full grid as a 1-rank "decomposition".
        grid2d_init(&global_grid, 0, 1, global_N);
    }
    grid2d local_grid;
    grid2d_init(&local_grid, rank, nranks, global_N);
    for(int v = 0; v < 4; ++v)
    {
        for(int j = 0; j < local_grid.N[1]; ++j)
        {
            for(int i = 0; i < local_grid.N[0]; ++i)
            {
                local_grid.q[i+halow + (j+halow)*local_grid.ystride + v*local_grid.varstride] =
                    (i+local_grid.MPI_offset[0]) + (j+local_grid.MPI_offset[1])*global_N[0] + v*global_N[0]*global_N[1];
            }
        }
    }
    mpi_gather(&global_grid, &local_grid, -1ULL); // all ranks participate
    int retval = 0;
    if(rank == 0)
    {
        retval = check_grid(&global_grid);
    }
    MPI_Bcast(&retval, 1, MPI_INT, 0, MPI_COMM_WORLD);
    free(local_grid.q);
    if(rank == 0)
    {
        free(global_grid.q);
    }
    return retval;
}
int main(int argc, char* argv[])
{
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int global_N[2];
    if(rank == 0)
    {
        // Usage: mpi-gather <nx> <ny>; the sizes are broadcast to the
        // other ranks inside gather_test.
        assert(argc == 3);
        global_N[0] = atoi(argv[1]);
        global_N[1] = atoi(argv[2]);
    }
    int ret = gather_test(global_N);
    MPI_Finalize();
    return ret;
}
$ sh test-gather.sh
mpi-gather.cpp(245): warning #3656: variable "out_indices" may be used before its value is set
req_finished[out_indices[i]] = 1;
^
remark #11074: Inlining inhibited by limit max-size
remark #11074: Inlining inhibited by limit max-total-size
remark #11076: To get full report use -qopt-report=4 -qopt-report-phase ipo
openmpi --version:
mpirun (Open MPI) 2.1.0
Report bugs to http://www.open-mpi.org/community/help/
mpich --version:
HYDRA build details:
Version: 3.2
Release Date: Wed Nov 11 22:06:48 CST 2015
CC: icc
CXX: icpc
F77: ifort
F90: ifort
Configure options: '--disable-option-checking' '--prefix=/home/jsewall/local/mpich' '--cache-file=/dev/null' '--srcdir=.' 'CC=icc' 'CFLAGS= -O2' 'LDFLAGS=' 'LIBS=-lpthread ' 'CPPFLAGS= -I/home/jsewall/src/mpich-3.2/src/mpl/include -I/home/jsewall/src/mpich-3.2/src/mpl/include -I/home/jsewall/src/mpich-3.2/src/openpa/src -I/home/jsewall/src/mpich-3.2/src/openpa/src -D_REENTRANT -I/home/jsewall/src/mpich-3.2/src/mpi/romio/include'
Process Manager: pmi
Launchers available: ssh rsh fork slurm ll lsf sge manual persist
Topology libraries available: hwloc
Resource management kernels available: user slurm ll lsf sge pbs cobalt
Checkpointing libraries available:
Demux engines available: poll select
impi --version:
Intel(R) MPI Library for Linux* OS, Version 2017 Update 2 Build 20170125 (id: 16752)
Copyright (C) 2003-2017, Intel Corporation. All rights reserved.
openmpi -np 1 ./mpi-gather 64 64 => pass
mpich -np 1 ./mpi-gather 64 64 => pass
impi -np 1 ./mpi-gather 64 64 => pass
openmpi -np 2 ./mpi-gather 64 64 => pass
mpich -np 2 ./mpi-gather 64 64 => pass
impi -np 2 ./mpi-gather 64 64 => pass
openmpi -np 3 ./mpi-gather 64 64 => pass
mpich -np 3 ./mpi-gather 64 64 => pass
impi -np 3 ./mpi-gather 64 64 => pass
openmpi -np 1 ./mpi-gather 64 128 => pass
mpich -np 1 ./mpi-gather 64 128 => pass
impi -np 1 ./mpi-gather 64 128 => pass
openmpi -np 2 ./mpi-gather 64 128 => pass
mpich -np 2 ./mpi-gather 64 128 => pass
impi -np 2 ./mpi-gather 64 128 => pass
openmpi -np 3 ./mpi-gather 64 128 => pass
mpich -np 3 ./mpi-gather 64 128 => pass
impi -np 3 ./mpi-gather 64 128 => pass
openmpi -np 1 ./mpi-gather 64 256 => pass
mpich -np 1 ./mpi-gather 64 256 => pass
impi -np 1 ./mpi-gather 64 256 => pass
openmpi -np 2 ./mpi-gather 64 256 => pass
mpich -np 2 ./mpi-gather 64 256 => pass
impi -np 2 ./mpi-gather 64 256 => pass
openmpi -np 3 ./mpi-gather 64 256 => pass
mpich -np 3 ./mpi-gather 64 256 => pass
impi -np 3 ./mpi-gather 64 256 => pass
openmpi -np 1 ./mpi-gather 128 64 => pass
mpich -np 1 ./mpi-gather 128 64 => pass
impi -np 1 ./mpi-gather 128 64 => pass
openmpi -np 2 ./mpi-gather 128 64 => pass
mpich -np 2 ./mpi-gather 128 64 => pass
impi -np 2 ./mpi-gather 128 64 => pass
openmpi -np 3 ./mpi-gather 128 64 => pass
mpich -np 3 ./mpi-gather 128 64 => pass
impi -np 3 ./mpi-gather 128 64 => pass
openmpi -np 1 ./mpi-gather 128 128 => pass
mpich -np 1 ./mpi-gather 128 128 => pass
impi -np 1 ./mpi-gather 128 128 => pass
openmpi -np 2 ./mpi-gather 128 128 => pass
mpich -np 2 ./mpi-gather 128 128 => pass
impi -np 2 ./mpi-gather 128 128 => pass
openmpi -np 3 ./mpi-gather 128 128 => pass
mpich -np 3 ./mpi-gather 128 128 => pass
impi -np 3 ./mpi-gather 128 128 => pass
openmpi -np 1 ./mpi-gather 128 256 => pass
mpich -np 1 ./mpi-gather 128 256 => pass
impi -np 1 ./mpi-gather 128 256 => pass
openmpi -np 2 ./mpi-gather 128 256 => pass
mpich -np 2 ./mpi-gather 128 256 => pass
impi -np 2 ./mpi-gather 128 256 => pass
openmpi -np 3 ./mpi-gather 128 256 => pass
mpich -np 3 ./mpi-gather 128 256 => pass
impi -np 3 ./mpi-gather 128 256 => pass
openmpi -np 1 ./mpi-gather 256 64 => pass
mpich -np 1 ./mpi-gather 256 64 => pass
impi -np 1 ./mpi-gather 256 64 => pass
openmpi -np 2 ./mpi-gather 256 64 => pass
mpich -np 2 ./mpi-gather 256 64 => pass
impi -np 2 ./mpi-gather 256 64 => pass
openmpi -np 3 ./mpi-gather 256 64 => pass
mpich -np 3 ./mpi-gather 256 64 => pass
impi -np 3 ./mpi-gather 256 64 => pass
openmpi -np 1 ./mpi-gather 256 128 => pass
mpich -np 1 ./mpi-gather 256 128 => pass
impi -np 1 ./mpi-gather 256 128 => pass
openmpi -np 2 ./mpi-gather 256 128 => pass
mpich -np 2 ./mpi-gather 256 128 => pass
impi -np 2 ./mpi-gather 256 128 => pass
openmpi -np 3 ./mpi-gather 256 128 => pass
mpich -np 3 ./mpi-gather 256 128 => pass
impi -np 3 ./mpi-gather 256 128 => pass
openmpi -np 1 ./mpi-gather 256 256 => pass
mpich -np 1 ./mpi-gather 256 256 => pass
impi -np 1 ./mpi-gather 256 256 => pass
openmpi -np 2 ./mpi-gather 256 256 => pass
mpich -np 2 ./mpi-gather 256 256 => pass
impi -np 2 ./mpi-gather 256 256 => pass
openmpi -np 3 ./mpi-gather 256 256 => pass
Expected to lock up. Issue user interrupt.
mpich -np 3 ./mpi-gather 256 256 => ^Cbroken
Expected to lock up. Issue user interrupt.
impi -np 3 ./mpi-gather 256 256 => ^Cbroken
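The driver script, test-gather.sh: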
#!/bin/bash
# Build with the Intel MPI compiler wrapper, then run the gather test over
# every (size x size x procs x MPI implementation) combination.
mpiicpc mpi-gather.cpp -o mpi-gather -Wall -Wextra
declare -A mpilibs
OPENMPI=/home/jsewall/local/openmpi/bin/mpirun
MPICH=/home/jsewall/local/mpich/bin/mpirun
IMPI=/opt/intel/compilers_and_libraries_2017.2.174/linux/mpi/intel64/bin/mpirun
mpilibs=( ["openmpi"]="$OPENMPI" ["mpich"]="$MPICH" ["impi"]="$IMPI" )
SIZES="64 128 256"
PROCS="1 2 3"
MPIEXECS="openmpi mpich impi"
for e in $MPIEXECS
do
    echo "$e --version:"
    ${mpilibs[$e]} --version
done
# Record Ctrl-C so a hung run can be interrupted and reported as "broken".
inthandle()
{
    INTERRUPTED=1
}
trap inthandle INT
for sx in $SIZES
do
    for sy in $SIZES
    do
        for p in $PROCS
        do
            for e in $MPIEXECS
            do
                INTERRUPTED=0
                if [[ ( ${sx} -ge 256 ) && ( ${sy} -ge 256 ) && ( $p -ge 3 ) && ( $e != "openmpi" ) ]]
                then
                    echo "Expected to lock up. Issue user interrupt."
                fi
                echo -n "$e -np $p ./mpi-gather $sx $sy => "
                ${mpilibs[$e]} -np $p ./mpi-gather $sx $sy > /dev/null
                EXITSTATUS=$?
                if [ ${INTERRUPTED} -ne 0 ]
                then
                    echo "broken"
                else
                    if [ ${EXITSTATUS} -eq 0 ]
                    then
                        echo "pass"
                    else
                        echo "fail"
                    fi
                fi
            done
            echo ""
        done
    done
done