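/*
 * Conway's Game of Life on a 2D torus, parallelized with MPI.
 *
 * Process 0 (MASTER) owns the full (N+2) x (N+2) board; the remaining
 * k*k worker processes each simulate an n x n block (N = k * n) arranged
 * in a periodic cartesian grid, so the process count must be k*k + 1.
 *
 * A typical build/run sketch (assuming the file is saved as life.c and
 * the usual MPI wrappers mpicc/mpirun are available):
 *   mpicc -O2 life.c -o life -lm
 *   mpirun -np 5 ./life 32 100   # 2x2 worker grid, 64x64 board, 100 steps
 */
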
#include <mpi.h>    /* MPI, duh... */
#include <stdio.h>  /* printf, fprintf, file-IO */
#include <stdint.h> /* fixed-size types */
#include <stdlib.h> /* malloc, atoi */
#include <string.h> /* memset, memcpy */
#include <math.h>   /* sqrtf */

#define MASTER 0
#define ZERO '.'
#define ONE 'o'

typedef int64_t s64;
typedef int32_t s32;
typedef int16_t s16;
typedef int8_t s8;
typedef uint64_t u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;
typedef float f32;
typedef double f64;
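
/*
 * Each block is stored as a flat, row-major (n+2) x (n+2) char array:
 * rows and columns 1..n are the live cells ('.' dead, 'o' alive) and the
 * outermost ring (row/column 0 and n+1) is a one-cell halo holding copies
 * of the neighboring blocks' border cells.
 */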
struct life_worker_block {
    MPI_Comm communicator;        /* periodic 2D cartesian communicator */
    MPI_Datatype bcol_dt;         /* first interior column (x == 1), full height */
    MPI_Datatype ecol_dt;         /* last interior column (x == n), full height */
    MPI_Datatype nowrap_dt;       /* n x n interior, without the halo */
    char *buffers[4];             /* receive staging: top row, bottom row, left col, right col */
    int coords[2];
    int rank;
    char *data;
    int n;                        /* interior block side */
};

struct life_master_block {
    MPI_Comm communicator;
    MPI_Datatype *subblock_types; /* one n x n subarray view per worker */
    int workers;
    char *data;
    int N;                        /* full interior side, N = n * grid_size */
    int n;
};

static inline int
get_rank(MPI_Comm comm)
{
    int rank;
    MPI_Comm_rank(comm, &rank);
    return(rank);
}

static inline int
get_size(MPI_Comm comm)
{
    int size;
    MPI_Comm_size(comm, &size);
    return(size);
}

static int
add_middle_glider(char *data, s32 N)
{
    if (N < 3) {
        return(1);
    }
    s32 center_x = (N + 2) / 2;
    s32 center_y = (N + 2) / 2;
    s32 glider_start_x = center_x - 1;
    s32 glider_start_y = center_y - 1;
    if (glider_start_x < 0) {
        glider_start_x = 0;
    }
    if (glider_start_y < 0) {
        glider_start_y = 0;
    }
    /* row 1 */
    data[glider_start_y * (N + 2) + glider_start_x + 1] = ONE;
    /* row 2 */
    data[(glider_start_y + 1) * (N + 2) + glider_start_x + 2] = ONE;
    /* row 3 */
    data[(glider_start_y + 2) * (N + 2) + glider_start_x + 0] = ONE;
    data[(glider_start_y + 2) * (N + 2) + glider_start_x + 1] = ONE;
    data[(glider_start_y + 2) * (N + 2) + glider_start_x + 2] = ONE;
    return(0);
}
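
/*
 * Note on the update rule below: the 3x3 sum includes the cell itself,
 * so for a live cell sum == live neighbors + 1. The thresholds are
 * shifted accordingly: birth at sum == 3 (a dead cell contributes 0),
 * survival at sum == 3 or 4 (i.e. 2 or 3 live neighbors).
 */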
static char
life_rule(char *data, s32 n, s32 x, s32 y)
{
    char sum = 0;
    char old = data[y * (n + 2) + x];
    /* NOTE: 3x3 sum over the Moore neighborhood plus the cell itself */
    if (data[(y - 1) * (n + 2) + x - 1] == ONE) { ++sum; }
    if (data[(y - 1) * (n + 2) + x + 0] == ONE) { ++sum; }
    if (data[(y - 1) * (n + 2) + x + 1] == ONE) { ++sum; }
    if (data[(y + 0) * (n + 2) + x - 1] == ONE) { ++sum; }
    if (data[(y + 0) * (n + 2) + x + 0] == ONE) { ++sum; }
    if (data[(y + 0) * (n + 2) + x + 1] == ONE) { ++sum; }
    if (data[(y + 1) * (n + 2) + x - 1] == ONE) { ++sum; }
    if (data[(y + 1) * (n + 2) + x + 0] == ONE) { ++sum; }
    if (data[(y + 1) * (n + 2) + x + 1] == ONE) { ++sum; }
    if (old == ZERO && sum == 3) {
        return(ONE);
    }
    if (old == ONE && (sum < 3 || sum > 4)) {
        return(ZERO);
    }
    return(old);
}
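
/*
 * Halo exchange: rows are swapped along dimension 0 and columns along
 * dimension 1 of the periodic grid. The exchanged rows and columns span
 * the full n+2 extent, but within one pass the columns are sent before
 * the freshly received ghost rows are written back, so the ghost corners
 * become valid only after a second pass; that is presumably why the main
 * loop calls this function twice per step.
 */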
static void
worker_exchange_boundary(struct life_worker_block state)
{
    int source;
    int dest;
    int n = state.n;
    char *data = state.data;
    MPI_Comm comm = state.communicator;
    MPI_Datatype bcol_dt = state.bcol_dt;
    MPI_Datatype ecol_dt = state.ecol_dt;
    /* Exchange rows: source is the block above (coords[0] - 1), dest the block below */
    char *first_row = data + (n + 2);
    char *last_row = data + n * (n + 2);
    MPI_Cart_shift(comm, 0, 1, &source, &dest);
    /* Send our top row up; receive the below block's top row for our bottom halo */
    MPI_Sendrecv(first_row, n + 2, MPI_CHAR, source, 0, state.buffers[1], n + 2, MPI_CHAR, dest,
                 MPI_ANY_TAG, comm, MPI_STATUS_IGNORE);
    /* Send our bottom row down; receive the above block's bottom row for our top halo */
    MPI_Sendrecv(last_row, n + 2, MPI_CHAR, dest, 0, state.buffers[0], n + 2, MPI_CHAR, source,
                 MPI_ANY_TAG, comm, MPI_STATUS_IGNORE);
    MPI_Barrier(comm);
    /* Exchange cols: source is the block to the left (coords[1] - 1), dest to the right */
    MPI_Cart_shift(comm, 1, 1, &source, &dest);
    /* Send our first column left; receive the right block's first column for our right halo */
    MPI_Sendrecv(data, 1, bcol_dt, source, 0, state.buffers[3], n + 2, MPI_CHAR, dest,
                 MPI_ANY_TAG, comm, MPI_STATUS_IGNORE);
    /* Send our last column right; receive the left block's last column for our left halo */
    MPI_Sendrecv(data, 1, ecol_dt, dest, 0, state.buffers[2], n + 2, MPI_CHAR, source,
                 MPI_ANY_TAG, comm, MPI_STATUS_IGNORE);
    /* Write back rows to data */
    for (s32 x = 0; x < n + 2; ++x) {
        data[0 * (n + 2) + x] = state.buffers[0][x];
        data[(n + 1) * (n + 2) + x] = state.buffers[1][x];
    }
    /* Write back cols to data */
    for (s32 y = 0; y < n + 2; ++y) {
        data[y * (n + 2) + 0] = state.buffers[2][y];
        data[y * (n + 2) + (n + 1)] = state.buffers[3][y];
    }
}
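
/*
 * The master never packs or unpacks blocks by hand: one committed
 * MPI_Type_create_subarray view per worker lets MPI_Isend/MPI_Irecv
 * scatter and gather the n x n subblocks directly out of, and back into,
 * the global (N+2) x (N+2) array.
 */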
static void
master_init(struct life_master_block *state)
{
    int N = state->N;
    int n = state->n;
    MPI_Comm comm = state->communicator;
    state->data = malloc((N + 2) * (N + 2) * sizeof(char));
    memset(state->data, ZERO, (N + 2) * (N + 2) * sizeof(char));
    int rc = add_middle_glider(state->data, N);
    if (rc != 0) {
        fprintf(stderr, "[ERROR] Total grid size (%d) is too small\n", N);
        MPI_Abort(comm, 1);
    }
    state->subblock_types = malloc(state->workers * sizeof(MPI_Datatype));
    int rank;
    int grid_size = N / n;
    for (s32 y = 0; y < grid_size; ++y) {
        for (s32 x = 0; x < grid_size; ++x) {
            /* Cart coords {x, y} map to subcomm rank x * grid_size + y
               (row-major); that worker's world rank is one higher */
            rank = x * grid_size + y;
            MPI_Datatype subblock_type;
            MPI_Type_create_subarray(2, (int []) { N + 2, N + 2 }, (int []) { n, n }, (int []) { 1 + x * n, 1 + y * n },
                                     MPI_ORDER_C, MPI_CHAR, &subblock_type);
            MPI_Type_commit(&subblock_type);
            state->subblock_types[rank] = subblock_type;
        }
    }
}

static void
worker_init(struct life_worker_block *state)
{
    int n = state->n;
    MPI_Comm comm = state->communicator;
    state->rank = get_rank(comm);
    state->data = malloc((n + 2) * (n + 2) * sizeof(char));
    MPI_Cart_coords(comm, state->rank, 2, state->coords);
    MPI_Type_create_subarray(2, (int []) { n + 2, n + 2 }, (int []) { n + 2, 1 }, (int []) { 0, 1 },
                             MPI_ORDER_C, MPI_CHAR, &state->bcol_dt);
    MPI_Type_create_subarray(2, (int []) { n + 2, n + 2 }, (int []) { n + 2, 1 }, (int []) { 0, n },
                             MPI_ORDER_C, MPI_CHAR, &state->ecol_dt);
    MPI_Type_create_subarray(2, (int []) { n + 2, n + 2 }, (int []) { n, n }, (int []) { 1, 1 },
                             MPI_ORDER_C, MPI_CHAR, &state->nowrap_dt);
    MPI_Type_commit(&state->bcol_dt);
    MPI_Type_commit(&state->ecol_dt);
    MPI_Type_commit(&state->nowrap_dt);
    state->buffers[0] = malloc((n + 2) * sizeof(char));
    state->buffers[1] = malloc((n + 2) * sizeof(char));
    state->buffers[2] = malloc((n + 2) * sizeof(char));
    state->buffers[3] = malloc((n + 2) * sizeof(char));
}

static void
master_send_blocks(struct life_master_block state)
{
    MPI_Request *reqs = malloc(state.workers * sizeof(MPI_Request));
    for (s32 p = 1; p < state.workers + 1; ++p) {
        MPI_Isend(state.data, 1, state.subblock_types[p - 1], p, 0, MPI_COMM_WORLD, reqs + p - 1);
    }
    MPI_Waitall(state.workers, reqs, MPI_STATUSES_IGNORE);
    free(reqs);
}

static void
worker_receive_blocks(struct life_worker_block state)
{
    MPI_Recv(state.data, 1, state.nowrap_dt, MASTER,
             MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}

static void
worker_update_data(struct life_worker_block *state)
{
    int n = state->n;
    char *updated_data = malloc((n + 2) * (n + 2) * sizeof(char));
    char *data = state->data;
    /* To preserve halo */
    memcpy(updated_data, data, (n + 2) * (n + 2) * sizeof(char));
    for (s32 y = 1; y < n + 1; ++y) {
        for (s32 x = 1; x < n + 1; ++x) {
            updated_data[y * (n + 2) + x] = life_rule(data, n, x, y);
        }
    }
    free(data);
    state->data = updated_data;
}

static void
worker_send_blocks(struct life_worker_block state)
{
    MPI_Send(state.data, 1, state.nowrap_dt, MASTER, 0, MPI_COMM_WORLD);
}

static void
master_collect_blocks(struct life_master_block state)
{
    int workers = state.workers;
    MPI_Request *reqs = malloc(workers * sizeof(MPI_Request));
    for (s32 p = 1; p < state.workers + 1; ++p) {
        MPI_Irecv(state.data, 1, state.subblock_types[p - 1], p, MPI_ANY_TAG, MPI_COMM_WORLD, reqs + p - 1);
    }
    MPI_Waitall(workers, reqs, MPI_STATUSES_IGNORE);
    free(reqs);
}

static void
master_dump_data(struct life_master_block state, s32 steps, f64 time)
{
    int N = state.N;
    FILE *f = fopen("output.txt", "wb");
    for (s32 y = 1; y < N + 1; ++y) {
        fwrite(state.data + y * (N + 2) + 1, N * sizeof(char), 1, f);
        fwrite("\n", 1, 1, f);
    }
    fclose(f);
    f = fopen("stat.txt", "wb");
    fprintf(f, "%d %d %d: %.3f\n", state.n, steps, state.workers + 1, time);
    fclose(f);
}
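
/*
 * Overall flow: rank 0 initializes the global board and scatters the
 * blocks; the workers iterate (update, then two halo exchanges) for the
 * requested number of steps; the master times the run, gathers the blocks
 * back, and writes output.txt (the board) and stat.txt (timing).
 */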
int
main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank = get_rank(MPI_COMM_WORLD);
    int size = get_size(MPI_COMM_WORLD);
    if (argc != 3) {
        fprintf(stderr, "[ERROR] Usage: %s block_size steps\n", argv[0]);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    /* Round to nearest: sqrtf of a perfect square may land just below the integer */
    s32 grid_size = (s32) (sqrtf((f32) (size - 1)) + 0.5f);
    s32 n = atoi(argv[1]);
    s32 steps = atoi(argv[2]);
    if (grid_size * grid_size != size - 1) {
        fprintf(stderr, "[ERROR] (Process count - 1) (%d) is not a perfect square\n", size - 1);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    int color = 1;
    if (rank == MASTER) { color = 0; }
    MPI_Comm worker_subcomm;
    MPI_Comm grid_communicator = MPI_COMM_NULL;
    MPI_Comm_split(MPI_COMM_WORLD, color, rank, &worker_subcomm);
    if (rank != MASTER) {
        MPI_Cart_create(worker_subcomm, 2, (int []) { grid_size, grid_size }, (int []) { 1, 1 }, 0, &grid_communicator);
        if (grid_communicator == MPI_COMM_NULL) {
            fprintf(stderr, "[ERROR] Could not create a cartesian grid\n");
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
    }
    struct life_master_block master_state;
    struct life_worker_block worker_state;
    /* Init */
    if (rank == MASTER) {
        /* The master is not part of the cartesian grid, so give it
           MPI_COMM_WORLD for error reporting instead of the (null)
           grid communicator */
        master_state.communicator = MPI_COMM_WORLD;
        master_state.N = n * grid_size;
        master_state.n = n;
        master_state.workers = size - 1;
        master_init(&master_state);
    } else {
        worker_state.communicator = grid_communicator;
        worker_state.n = n;
        worker_init(&worker_state);
    }
    /* Distribute blocks */
    if (rank == MASTER) {
        master_send_blocks(master_state);
    } else {
        worker_receive_blocks(worker_state);
    }
    /* Perform simulation */
    f64 before;
    f64 after;
    if (rank == MASTER) {
        before = MPI_Wtime();
    } else {
        /* Two passes so the ghost corners are valid as well
           (see the note above worker_exchange_boundary) */
        worker_exchange_boundary(worker_state);
        worker_exchange_boundary(worker_state);
    }
    for (s32 t = 0; t < steps; ++t) {
        if (rank != MASTER) {
            worker_update_data(&worker_state);
            worker_exchange_boundary(worker_state);
            worker_exchange_boundary(worker_state);
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);
    if (rank == MASTER) {
        after = MPI_Wtime();
    }
    /* Collect blocks */
    if (rank == MASTER) {
        master_collect_blocks(master_state);
        master_dump_data(master_state, steps, after - before);
    } else {
        worker_send_blocks(worker_state);
    }
    return(MPI_Finalize());
}