Skip to content

Instantly share code, notes, and snippets.

@laanwj
Created July 13, 2017 11:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save laanwj/3eb85f8a4193855de2a2f88d5984a278 to your computer and use it in GitHub Desktop.
Save laanwj/3eb85f8a4193855de2a2f88d5984a278 to your computer and use it in GitHub Desktop.
Vivante tiling experiments in NEON
/* Vivante tiling experiments in NEON
W.J. van der Laan 2017, MIT license
*/
/* gcc neontile.c -o neontile -O2 -mfpu=neon */
#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <etnaviv_drmif.h>
/* DRM render node opened by main() for the etnaviv GPU-buffer benchmarks */
const char *drm_device_name="/dev/dri/renderD128";
/*** Reference implementation for testing and performance baseline */
#define TEX_TILE_WIDTH (4)
#define TEX_TILE_HEIGHT (4)
#define TEX_TILE_WORDS (TEX_TILE_WIDTH * TEX_TILE_HEIGHT)
/* Copy a width x height rectangle from linear `src` into the 4x4-tiled
 * `dest`, offset by (basex, basey) in the destination. Expands with `type`
 * as the element type; expects src, dest, basex, basey, src_stride,
 * dst_stride, width and height to be in scope. NOTE: mutates src_stride
 * and dst_stride, so expand at most once per function. */
#define DO_TILE(type) \
    src_stride /= sizeof(type); \
    dst_stride = (dst_stride * TEX_TILE_HEIGHT) / sizeof(type); \
    for (unsigned srcy = 0; srcy < height; ++srcy) { \
        unsigned dsty = basey + srcy; \
        unsigned ty = (dsty / TEX_TILE_HEIGHT) * dst_stride + \
                      (dsty % TEX_TILE_HEIGHT) * TEX_TILE_WIDTH; \
        for (unsigned srcx = 0; srcx < width; ++srcx) { \
            unsigned dstx = basex + srcx; \
            ((type *)dest)[ty + (dstx / TEX_TILE_WIDTH) * TEX_TILE_WORDS + \
                           (dstx % TEX_TILE_WIDTH)] = \
                ((type *)src)[srcy * src_stride + srcx]; \
        } \
    }
/* Inverse of DO_TILE: copy a rectangle from tiled `src` back to linear
 * `dest`. Same expansion requirements; also mutates the stride variables. */
#define DO_UNTILE(type) \
    src_stride = (src_stride * TEX_TILE_HEIGHT) / sizeof(type); \
    dst_stride /= sizeof(type); \
    for (unsigned dsty = 0; dsty < height; ++dsty) { \
        unsigned srcy = basey + dsty; \
        unsigned sy = (srcy / TEX_TILE_HEIGHT) * src_stride + \
                      (srcy % TEX_TILE_HEIGHT) * TEX_TILE_WIDTH; \
        for (unsigned dstx = 0; dstx < width; ++dstx) { \
            unsigned srcx = basex + dstx; \
            ((type *)dest)[dsty * dst_stride + dstx] = \
                ((type *)src)[sy + (srcx / TEX_TILE_WIDTH) * TEX_TILE_WORDS + \
                              (srcx % TEX_TILE_WIDTH)]; \
        } \
    }
/*
 * Tile a width x height rectangle of elmtsize-byte (1, 2 or 4) elements
 * from linear `src` (src_stride in bytes) into the 4x4-tiled `dest` at
 * offset (basex, basey). dst_stride follows DO_TILE's convention (it is
 * multiplied by TEX_TILE_HEIGHT internally); the base_* wrappers below
 * pass the byte stride of a tile row divided by 4.
 */
void
etna_texture_tile(void *dest, const void *src, unsigned basex, unsigned basey,
                  unsigned dst_stride, unsigned width, unsigned height,
                  unsigned src_stride, unsigned elmtsize)
{
    if (elmtsize == 4) {
        DO_TILE(uint32_t)
    } else if (elmtsize == 2) {
        DO_TILE(uint16_t)
    } else if (elmtsize == 1) {
        DO_TILE(uint8_t)
    } else {
        /* %u matches the unsigned argument (was %i); errors go to stderr */
        fprintf(stderr, "etna_texture_tile: unhandled element size %u\n", elmtsize);
    }
}
/*
 * Inverse of etna_texture_tile: copy a width x height rectangle at offset
 * (basex, basey) of the tiled `src` back into the linear `dest`
 * (dst_stride in bytes). src_stride follows DO_UNTILE's convention.
 */
void
etna_texture_untile(void *dest, const void *src, unsigned basex, unsigned basey,
                    unsigned src_stride, unsigned width, unsigned height,
                    unsigned dst_stride, unsigned elmtsize)
{
    if (elmtsize == 4) {
        DO_UNTILE(uint32_t);
    } else if (elmtsize == 2) {
        DO_UNTILE(uint16_t);
    } else if (elmtsize == 1) {
        DO_UNTILE(uint8_t);
    } else {
        /* was misreporting itself as "etna_texture_tile"; %i -> %u */
        fprintf(stderr, "etna_texture_untile: unhandled element size %u\n", elmtsize);
    }
}
/* Tile an 8bpp image with the C reference implementation. gpu_stride is
 * divided by 4 because DO_TILE multiplies dst_stride by TEX_TILE_HEIGHT (4). */
void base_tile8(void *gpu, const void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height)
{
etna_texture_tile(gpu, cpu, 0, 0, gpu_stride/4, width, height, cpu_stride, 1);
}
/* Untile an 8bpp image with the C reference implementation (same /4 stride
 * convention, compensating DO_UNTILE's internal multiply). */
void base_untile8(const void *gpu, void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height)
{
etna_texture_untile(cpu, gpu, 0, 0, gpu_stride/4, width, height, cpu_stride, 1);
}
/* Tile a 16bpp image with the C reference implementation (gpu_stride/4
 * compensates DO_TILE's internal multiply by TEX_TILE_HEIGHT). */
void base_tile16(void *gpu, const void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height)
{
etna_texture_tile(gpu, cpu, 0, 0, gpu_stride/4, width, height, cpu_stride, 2);
}
/* Untile a 16bpp image with the C reference implementation. */
void base_untile16(const void *gpu, void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height)
{
etna_texture_untile(cpu, gpu, 0, 0, gpu_stride/4, width, height, cpu_stride, 2);
}
/* Tile a 32bpp image with the C reference implementation (gpu_stride/4
 * compensates DO_TILE's internal multiply by TEX_TILE_HEIGHT). */
void base_tile32(void *gpu, const void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height)
{
etna_texture_tile(gpu, cpu, 0, 0, gpu_stride/4, width, height, cpu_stride, 4);
}
/* Untile a 32bpp image with the C reference implementation. */
void base_untile32(const void *gpu, void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height)
{
etna_texture_untile(cpu, gpu, 0, 0, gpu_stride/4, width, height, cpu_stride, 4);
}
/** NEON specializations */
/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==4.
 * Loads four 16-byte rows from the linear `cpu` image (post-incrementing
 * by cpu_stride) and stores one 4x4 tile contiguously at `gpu`.
 * `static` added: plain C99 `inline` has no out-of-line definition and
 * fails to link when not inlined (e.g. at -O0). */
static inline void tile32_1x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        "vld1.8 {d0,d1}, [%0], %r2;\n"
        "vld1.8 {d2,d3}, [%0], %r2;\n"
        "vld1.8 {d4,d5}, [%0], %r2;\n"
        "vld1.8 {d6,d7}, [%0], %r2;\n"
        "vstm %1, {q0, q1, q2, q3};\n"
        : "=r"(cpu) /* advanced by the post-incrementing loads */
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory": the asm reads *cpu and writes *gpu; without this
         * clobber the compiler may reorder or elide memory accesses
         * around the asm statement */
        : "q0", "q1", "q2", "q3", "memory");
}
/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==4,
 * two consecutive tiles: loads four rows of two adjacent tiles and stores
 * both tiles with a single vstm. */
static inline void tile32_2x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    /* second tile starts 16 bytes (4 pixels) to the right */
    const void *cpunext = cpu + 16;
    __asm__ volatile (
        "vld1.8 {d0,d1}, [%0], %r3;\n"
        "vld1.8 {d8,d9}, [%1], %r3;\n"
        "vld1.8 {d2,d3}, [%0], %r3;\n"
        "vld1.8 {d10,d11}, [%1], %r3;\n"
        "vld1.8 {d4,d5}, [%0], %r3;\n"
        "vld1.8 {d12,d13}, [%1], %r3;\n"
        "vld1.8 {d6,d7}, [%0], %r3;\n"
        "vld1.8 {d14,d15}, [%1], %r3;\n"
        "vstm %2, {q0, q1, q2, q3, q4, q5, q6, q7};\n"
        : "=r"(cpu), "=r"(cpunext)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu), "1"(cpunext)
        /* "memory" clobber: asm reads *cpu/*cpunext and writes *gpu */
        : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
}
/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==4,
 * two consecutive tiles. Alt implementation: loads 32 bytes per row and
 * does the tile shuffling in registers (vswp) instead of via two memory
 * streams. */
static inline void tile32_2x_alt_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        "vld1.8 {d0,d1,d2,d3}, [%0], %r2;\n"
        "vld1.8 {d8,d9,d10,d11}, [%0], %r2;\n"
        "vld1.8 {d4,d5,d6,d7}, [%0], %r2;\n"
        "vld1.8 {d12,d13,d14,d15}, [%0], %r2;\n"
        /* d0 d1 d8 d9 */
        /* d2 d3 d10 d11 */
        /* d4 d5 d12 d13 */
        /* d6 d7 d14 d15 */
        "vswp d2, d8\n"
        "vswp d3, d9\n"
        "vswp d12, d6\n"
        "vswp d13, d7\n"
        /* d0 d1 d2 d3 */
        /* d8 d9 d10 d11 */
        /* d4 d5 d6 d7 */
        /* d12 d13 d14 d15 */
        "vstm %1, {q0, q1, q2, q3, q4, q5, q6, q7};\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory" clobber: asm reads *cpu and writes *gpu */
        : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
}
/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==2.
 * Loads four 8-byte rows from linear memory and stores one tile with vstm. */
static inline void tile16_1x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        "vld1.8 {d0}, [%0], %r2;\n"
        "vld1.8 {d1}, [%0], %r2;\n"
        "vld1.8 {d2}, [%0], %r2;\n"
        "vld1.8 {d3}, [%0], %r2;\n"
        "vstm %1, {q0, q1};\n"
        : "=r"(cpu) /* advanced by the post-incrementing loads */
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory" clobber: asm reads *cpu and writes *gpu */
        : "q0", "q1", "memory");
}
/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==2,
 * two tiles (in X) at once. */
static inline void tile16_2x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    /* second tile starts 8 bytes (4 pixels) to the right */
    const void *cpunext = cpu + 8;
    __asm__ volatile (
        /* load two adjacent tiles from untiled */
        "vld1.8 {d0}, [%0], %r3;\n"
        "vld1.8 {d4}, [%1], %r3;\n"
        "vld1.8 {d1}, [%0], %r3;\n"
        "vld1.8 {d5}, [%1], %r3;\n"
        "vld1.8 {d2}, [%0], %r3;\n"
        "vld1.8 {d6}, [%1], %r3;\n"
        "vld1.8 {d3}, [%0], %r3;\n"
        "vld1.8 {d7}, [%1], %r3;\n"
        /* store two adjacent tiles, tiled */
        "vstm %2, {q0, q1, q2, q3};\n"
        : "=r"(cpu), "=r"(cpunext)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu), "1"(cpunext)
        /* "memory" clobber: asm reads *cpu/*cpunext and writes *gpu */
        : "q0", "q1", "q2", "q3", "memory");
}
/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==1,
 * two tiles (in X) at once. Loads four 8-byte rows and transposes 32-bit
 * lanes in registers so each tile becomes contiguous before the store. */
static inline void tile8_2x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        /* load two adjacent tiles, from untiled */
        "vld1.8 {d0}, [%0], %r2;\n"
        "vld1.8 {d1}, [%0], %r2;\n"
        "vld1.8 {d2}, [%0], %r2;\n"
        "vld1.8 {d3}, [%0], %r2;\n"
        /* Transpose:
         * Start
         * [d0] x1 x0
         * [d1] x3 x2
         * [d2] x5 x4
         * [d3] x7 x6
         */
        "vtrn.32 d0, d1;\n"
        "vtrn.32 d2, d3;\n"
        /* [d0] x2 x0
         * [d1] x3 x1
         * [d2] x6 x4
         * [d3] x7 x5
         */
        "vswp d1, d2;\n"
        /* [d0] x2 x0
         * [d1] x6 x4
         * [d2] x3 x1
         * [d3] x7 x5
         */
        /* store two adjacent tiles, to tiled */
        "vstm %1, {d0-d3};\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory" clobber: asm reads *cpu and writes *gpu */
        : "q0", "q1", "memory");
}
/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==1,
 * four tiles (in X) at once. Loads four 16-byte rows and transposes 32-bit
 * lanes across q registers so the four tiles become contiguous. */
static inline void tile8_4x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        /* load four adjacent tiles, from untiled */
        "vld1.8 {d0,d1}, [%0], %r2;\n"
        "vld1.8 {d2,d3}, [%0], %r2;\n"
        "vld1.8 {d4,d5}, [%0], %r2;\n"
        "vld1.8 {d6,d7}, [%0], %r2;\n"
        /* Transpose:
         * Start
         * [q0] x3 x2 x1 x0
         * [q1] x7 x6 x5 x4
         * [q2] x11 x10 x9 x8
         * [q3] x15 x14 x13 x12
         */
        "vtrn.32 q0, q1;\n"
        "vtrn.32 q2, q3;\n"
        /* [q0] x6 x2 x4 x0
         * [q1] x7 x3 x5 x1
         * [q2] x14 x10 x12 x8
         * [q3] x15 x11 x13 x9
         */
        "vswp d1, d4;\n"
        "vswp d3, d6;\n"
        /* [q0] x12 x8 x4 x0
         * [q1] x13 x9 x5 x1
         * [q2] x14 x10 x6 x2
         * [q3] x15 x11 x7 x3
         */
        /* store four adjacent tiles, to tiled */
        "vstm %1, {q0, q1, q2, q3};\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory" clobber: asm reads *cpu and writes *gpu */
        : "q0", "q1", "q2", "q3", "memory");
}
/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==4.
 * Loads one tile contiguously from `gpu` and stores four 16-byte rows to
 * the linear `cpu` image (post-incrementing by cpu_stride). */
static inline void untile32_1x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        "vldm %1, {q0, q1, q2, q3};\n"
        "vst1.8 {d0,d1}, [%0], %r2;\n"
        "vst1.8 {d2,d3}, [%0], %r2;\n"
        "vst1.8 {d4,d5}, [%0], %r2;\n"
        "vst1.8 {d6,d7}, [%0], %r2;\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory" clobber: asm reads *gpu and writes *cpu */
        : "q0", "q1", "q2", "q3", "memory");
}
/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==4,
 * two consecutive tiles: loads both tiles with one vldm and scatters them
 * back as rows of two adjacent linear regions. */
static inline void untile32_2x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    /* second tile lands 16 bytes (4 pixels) to the right */
    void *cpunext = cpu + 16;
    __asm__ volatile (
        "vldm %2, {q0, q1, q2, q3, q4, q5, q6, q7};\n"
        "vst1.8 {d0,d1}, [%0], %r3;\n"
        "vst1.8 {d8,d9}, [%1], %r3;\n"
        "vst1.8 {d2,d3}, [%0], %r3;\n"
        "vst1.8 {d10,d11}, [%1], %r3;\n"
        "vst1.8 {d4,d5}, [%0], %r3;\n"
        "vst1.8 {d12,d13}, [%1], %r3;\n"
        "vst1.8 {d6,d7}, [%0], %r3;\n"
        "vst1.8 {d14,d15}, [%1], %r3;\n"
        : "=r"(cpu), "=r"(cpunext)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu), "1"(cpunext)
        /* "memory" clobber: asm reads *gpu and writes *cpu/*cpunext */
        : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
}
/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==4,
 * two consecutive tiles. Alt implementation: shuffles in registers (vswp)
 * and stores 32 bytes per linear row instead of two memory streams. */
static inline void untile32_2x_alt_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        "vldm %1, {q0, q1, q2, q3, q4, q5, q6, q7};\n"
        /* d0 d1 d8 d9 */
        /* d2 d3 d10 d11 */
        /* d4 d5 d12 d13 */
        /* d6 d7 d14 d15 */
        "vswp d2, d8\n"
        "vswp d3, d9\n"
        "vswp d12, d6\n"
        "vswp d13, d7\n"
        /* d0 d1 d2 d3 */
        /* d8 d9 d10 d11 */
        /* d4 d5 d6 d7 */
        /* d12 d13 d14 d15 */
        "vst1.8 {d0,d1,d2,d3}, [%0], %r2;\n"
        "vst1.8 {d8,d9,d10,d11}, [%0], %r2;\n"
        "vst1.8 {d4,d5,d6,d7}, [%0], %r2;\n"
        "vst1.8 {d12,d13,d14,d15}, [%0], %r2;\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory" clobber: asm reads *gpu and writes *cpu */
        : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
}
/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==2.
 * Loads one tile with vldm and stores four 8-byte rows to linear memory. */
static inline void untile16_1x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        "vldm %1, {q0, q1};\n"
        "vst1.8 {d0}, [%0], %r2;\n"
        "vst1.8 {d1}, [%0], %r2;\n"
        "vst1.8 {d2}, [%0], %r2;\n"
        "vst1.8 {d3}, [%0], %r2;\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory" clobber: asm reads *gpu and writes *cpu */
        : "q0", "q1", "memory");
}
/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==2,
 * two tiles (in X) at once. */
static inline void untile16_2x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    /* second tile lands 8 bytes (4 pixels) to the right */
    void *cpunext = cpu + 8;
    __asm__ volatile (
        /* load two adjacent tiles, tiled */
        "vldm %2, {q0, q1, q2, q3};\n"
        /* store two adjacent tiles, untiled */
        "vst1.8 {d0}, [%0], %r3;\n"
        "vst1.8 {d4}, [%1], %r3;\n"
        "vst1.8 {d1}, [%0], %r3;\n"
        "vst1.8 {d5}, [%1], %r3;\n"
        "vst1.8 {d2}, [%0], %r3;\n"
        "vst1.8 {d6}, [%1], %r3;\n"
        "vst1.8 {d3}, [%0], %r3;\n"
        "vst1.8 {d7}, [%1], %r3;\n"
        : "=r"(cpu), "=r"(cpunext)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu), "1"(cpunext)
        /* "memory" clobber: asm reads *gpu and writes *cpu/*cpunext */
        : "q0", "q1", "q2", "q3", "memory");
}
/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==1,
 * two tiles (in X) at once. Exact inverse of tile8_2x_impl: undoes the
 * register transpose before storing rows to linear memory. */
static inline void untile8_2x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        /* load two adjacent tiles, from tiled */
        "vldm %1, {d0-d3};\n"
        /* Transpose:
         * Start
         * [d0] x2 x0
         * [d1] x6 x4
         * [d2] x3 x1
         * [d3] x7 x5
         */
        "vswp d1, d2;\n"
        /* [d0] x2 x0
         * [d1] x3 x1
         * [d2] x6 x4
         * [d3] x7 x5
         */
        "vtrn.32 d0, d1;\n"
        "vtrn.32 d2, d3;\n"
        /* [d0] x1 x0
         * [d1] x3 x2
         * [d2] x5 x4
         * [d3] x7 x6
         */
        /* store two adjacent tiles, to untiled */
        "vst1.8 {d0}, [%0], %r2;\n"
        "vst1.8 {d1}, [%0], %r2;\n"
        "vst1.8 {d2}, [%0], %r2;\n"
        "vst1.8 {d3}, [%0], %r2;\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory" clobber: asm reads *gpu and writes *cpu */
        : "q0", "q1", "memory");
}
/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==1,
 * four tiles (in X) at once. Exact inverse of tile8_4x_impl: undoes the
 * register transpose before storing rows to linear memory. */
static inline void untile8_4x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        /* load four adjacent tiles, from tiled */
        "vldm %1, {q0, q1, q2, q3};\n"
        /* Transpose:
         * Start
         * [q0] x12 x8 x4 x0
         * [q1] x13 x9 x5 x1
         * [q2] x14 x10 x6 x2
         * [q3] x15 x11 x7 x3
         */
        "vswp d1, d4;\n"
        "vswp d3, d6;\n"
        /* [q0] x6 x2 x4 x0
         * [q1] x7 x3 x5 x1
         * [q2] x14 x10 x12 x8
         * [q3] x15 x11 x13 x9
         */
        "vtrn.32 q0, q1;\n"
        "vtrn.32 q2, q3;\n"
        /* [q0] x3 x2 x1 x0
         * [q1] x7 x6 x5 x4
         * [q2] x11 x10 x9 x8
         * [q3] x15 x14 x13 x12
         */
        /* store four adjacent tiles, to untiled */
        "vst1.8 {d0,d1}, [%0], %r2;\n"
        "vst1.8 {d2,d3}, [%0], %r2;\n"
        "vst1.8 {d4,d5}, [%0], %r2;\n"
        "vst1.8 {d6,d7}, [%0], %r2;\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory" clobber: asm reads *gpu and writes *cpu */
        : "q0", "q1", "q2", "q3", "memory");
}
/*** Tile visitor functions */
/* Generate a tiling entry point `func` that walks the image in 4-line
 * strips, calling func##_impl once per group of `htiles` horizontal 4x4
 * tiles. `elmtsize` is in bits. gpu_stride is the byte stride of one row
 * of tiles (4 pixel lines). Arithmetic on void* relies on the GCC
 * extension sizeof(void)==1. NOTE(review): no partial-tile handling —
 * appears to assume width%(htiles*4)==0 and height%4==0; confirm for new
 * callers. */
#define TILE_FUNC(elmtsize,htiles,func) \
void func(void *gpu, const void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height) \
{ \
for (uint32_t y=0; y<height; y+=4) { \
void *gpu_tile = gpu; \
const void *cpu_tile = cpu; \
for (uint32_t x=0; x<width; x+=htiles*4) { \
func##_impl(gpu_tile, cpu_tile, cpu_stride);\
gpu_tile += htiles*elmtsize/8*16; \
cpu_tile += htiles*elmtsize/8*4; \
} \
gpu += gpu_stride; \
cpu += cpu_stride*4; \
} \
}
/* Generate an untiling entry point `func`: same traversal as TILE_FUNC but
 * with gpu as the (const) source and cpu as the destination. `elmtsize` is
 * in bits; gpu_stride is the byte stride of one row of tiles. NOTE(review):
 * like TILE_FUNC, no partial-tile handling — appears to assume
 * width%(htiles*4)==0 and height%4==0. */
#define UNTILE_FUNC(elmtsize,htiles,func) \
void func(const void *gpu, void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height) \
{ \
for (uint32_t y=0; y<height; y+=4) { \
const void *gpu_tile = gpu; \
void *cpu_tile = cpu; \
for (uint32_t x=0; x<width; x+=htiles*4) { \
func##_impl(gpu_tile, cpu_tile, cpu_stride);\
gpu_tile += htiles*elmtsize/8*16; \
cpu_tile += htiles*elmtsize/8*4; \
} \
gpu += gpu_stride; \
cpu += cpu_stride*4; \
} \
}
/* Instantiate the NEON tiling/untiling entry points. Arguments are
 * (element size in bits, horizontal tiles per inner call, function name);
 * each expansion calls the matching name##_impl NEON helper above. */
TILE_FUNC(32, 1, tile32_1x);
TILE_FUNC(32, 2, tile32_2x);
TILE_FUNC(32, 2, tile32_2x_alt);
TILE_FUNC(16, 1, tile16_1x);
TILE_FUNC(16, 2, tile16_2x);
TILE_FUNC(8, 2, tile8_2x);
TILE_FUNC(8, 4, tile8_4x);
UNTILE_FUNC(32, 1, untile32_1x);
UNTILE_FUNC(32, 2, untile32_2x);
UNTILE_FUNC(32, 2, untile32_2x_alt);
UNTILE_FUNC(16, 1, untile16_1x);
UNTILE_FUNC(16, 2, untile16_2x);
UNTILE_FUNC(8, 2, untile8_2x);
UNTILE_FUNC(8, 4, untile8_4x);
/*** Test and benchmarking framework */
typedef void (*tilefunc_t)(void *gpu, const void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height);
typedef void (*untilefunc_t)(const void *gpu, void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height);
/* Dump a width x height image of elmtsize-bit (8/16/32) elements at `ptr`
 * as hex to stdout, one image row per output line. Read-only, so the
 * pointer is now const-qualified. */
static void printx(const void *ptr, uint32_t width, uint32_t height, uint32_t elmtsize)
{
    switch (elmtsize) {
    case 32:
        for (uint32_t y = 0; y < height; ++y) {
            for (uint32_t x = 0; x < width; ++x) {
                printf("%03x ", ((const uint32_t *)ptr)[y*width + x]);
            }
            printf("\n");
        }
        break;
    case 16:
        for (uint32_t y = 0; y < height; ++y) {
            for (uint32_t x = 0; x < width; ++x) {
                printf("%03x ", ((const uint16_t *)ptr)[y*width + x]);
            }
            printf("\n");
        }
        break;
    case 8:
        for (uint32_t y = 0; y < height; ++y) {
            for (uint32_t x = 0; x < width; ++x) {
                printf("%02x ", ((const uint8_t *)ptr)[y*width + x]);
            }
            printf("\n");
        }
        break; /* was missing: last case fell out of the switch implicitly */
    default:
        /* previously silent for unsupported sizes */
        printf("printx: unhandled element size %u\n", (unsigned)elmtsize);
        break;
    }
}
/* Round-trip test: tile the test pattern in `cpu` into `gpu` with
 * tile_func, untile it back into `compare` with untile_func, and verify
 * the result matches the input, dumping all three buffers on mismatch.
 * Returns 0 on success, 1 on mismatch (previously always returned 0, so
 * failures could only be spotted in the output). */
int tile_test(uint32_t width, uint32_t height, uint32_t elmtsize, tilefunc_t tile_func, untilefunc_t untile_func, const char *name)
{
    uint32_t size = width*height*elmtsize/8;
    void *gpu = NULL;
    void *cpu = NULL;
    void *compare = NULL;
    uint32_t cpu_stride = width*elmtsize/8;
    /* gpu buffer stride convention: one row of 4x4 tiles spans 4 lines;
     * the wrappers under test divide this by 4 again for DO_TILE */
    uint32_t gpu_stride = width*elmtsize/8*4;
    int rc = 0;
    if (posix_memalign(&gpu, 64, size) ||
        posix_memalign(&cpu, 64, size) ||
        posix_memalign(&compare, 64, size)) {
        abort();
    }
    /* test pattern: element value = linear index, truncated to its width */
    switch (elmtsize) {
    case 32:
        for (uint32_t y = 0; y < height; ++y) {
            for (uint32_t x = 0; x < width; ++x) {
                ((uint32_t*)cpu)[y*width + x] = y*width + x;
            }
        }
        break;
    case 16:
        for (uint32_t y = 0; y < height; ++y) {
            for (uint32_t x = 0; x < width; ++x) {
                ((uint16_t*)cpu)[y*width + x] = y*width + x;
            }
        }
        break;
    case 8:
        for (uint32_t y = 0; y < height; ++y) {
            for (uint32_t x = 0; x < width; ++x) {
                ((uint8_t*)cpu)[y*width + x] = y*width + x;
            }
        }
        break;
    default:
        break; /* callers only pass 8/16/32 */
    }
    tile_func(gpu, cpu, gpu_stride, cpu_stride, width, height);
    untile_func(gpu, compare, gpu_stride, cpu_stride, width, height);
    if (memcmp(compare, cpu, size) != 0) {
        rc = 1;
        printf("MISMATCH in %s\n", name);
        printx(cpu, width, height, elmtsize);
        printf("->\n");
        printx(gpu, width, height, elmtsize);
        printf("->\n");
        printx(compare, width, height, elmtsize);
    }
    free(cpu);
    free(gpu);
    free(compare);
    return rc;
}
/* Elapsed time from *b to *e, in seconds, as a double. */
double timediff(struct timespec *b, struct timespec *e)
{
    double seconds = (double)(e->tv_sec - b->tv_sec);
    double nanos = (double)(e->tv_nsec - b->tv_nsec);
    return seconds + nanos * 1e-9;
}
typedef void (*benchfunc_t)(void *gpu, void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height);
/* Benchmark bench_func moving a width x height image of elmtsize-bit
 * elements between a cached CPU buffer and an uncached etnaviv GPU buffer;
 * prints the mean wall-clock time over `reps` repetitions. Returns 0. */
int tile_bench(struct etna_device *dev, uint32_t width, uint32_t height, uint32_t elmtsize, benchfunc_t bench_func, const char *name)
{
    uint32_t size = width*height*elmtsize/8;
    void *gpu = NULL;
    void *cpu = NULL;
    uint32_t cpu_stride = width*elmtsize/8;
    uint32_t gpu_stride = width*elmtsize/8*4;
    struct etna_bo *bmp;
    if (posix_memalign(&cpu, 64, size)) {
        abort();
    }
    bmp = etna_bo_new(dev, size, DRM_ETNA_GEM_CACHE_UNCACHED);
    if (!bmp) { /* was unchecked: NULL here crashed inside etna_bo_map */
        fprintf(stderr, "tile_bench: unable to allocate GPU buffer\n");
        abort();
    }
    gpu = etna_bo_map(bmp);
    if (!gpu) { /* was unchecked: bench_func would deref NULL */
        fprintf(stderr, "tile_bench: unable to map GPU buffer\n");
        abort();
    }
#if 0
    memset(gpu, 0xaa, size);
    memset(cpu, 0x55, size);
    bench_func(gpu, cpu, gpu_stride, cpu_stride, width, height); /* warm up cache */
#endif
    const int reps = 20;
    struct timespec tvm_b, tvm_e;
    printf("[%-16s] ", name);
    clock_gettime(CLOCK_MONOTONIC, &tvm_b);
    for (int rep = 0; rep < reps; ++rep) {
        bench_func(gpu, cpu, gpu_stride, cpu_stride, width, height);
    }
    clock_gettime(CLOCK_MONOTONIC, &tvm_e);
    double mtime = timediff(&tvm_b, &tvm_e) / reps;
    printf("%.f us", mtime*1e6);
    printf("\n");
    free(cpu);
    etna_bo_del(bmp);
    return 0;
}
/*
 * Entry point: correctness-test each NEON specialization against the C
 * reference implementation (round-trip through the opposite direction),
 * then benchmark all implementations against an uncached etnaviv GPU
 * buffer. Error messages now go to stderr (were written to stdout).
 */
int main(void)
{
    /* test */
    uint32_t twidth = 128;
    uint32_t theight = 256;
    tile_test(twidth, theight, 32, base_tile32, untile32_1x, "untile32_1x");
    tile_test(twidth, theight, 32, tile32_1x, base_untile32, "tile32_1x");
    tile_test(twidth, theight, 32, tile32_2x, base_untile32, "tile32_2x");
    tile_test(twidth, theight, 32, tile32_2x_alt, base_untile32, "tile32_2x_alt");
    tile_test(twidth, theight, 32, base_tile32, untile32_2x, "untile32_2x");
    tile_test(twidth, theight, 32, base_tile32, untile32_2x_alt, "untile32_2x_alt");
    tile_test(twidth, theight, 16, base_tile16, untile16_1x, "untile16_1x");
    tile_test(twidth, theight, 16, tile16_1x, base_untile16, "tile16_1x");
    tile_test(twidth, theight, 16, base_tile16, untile16_2x, "untile16_2x");
    tile_test(twidth, theight, 16, tile16_2x, base_untile16, "tile16_2x");
    tile_test(twidth, theight, 8, base_tile8, untile8_2x, "untile8_2x");
    tile_test(twidth, theight, 8, tile8_2x, base_untile8, "tile8_2x");
    tile_test(twidth, theight, 8, base_tile8, untile8_4x, "untile8_4x");
    tile_test(twidth, theight, 8, tile8_4x, base_untile8, "tile8_4x");
    /* benchmark */
    uint32_t bwidth = 2048;
    uint32_t bheight = 1024;
    drmVersionPtr version;
    struct etna_device *dev;
    int fd = open(drm_device_name, O_RDWR);
    if (fd < 0) {
        fprintf(stderr, "Unable to open %s\n", drm_device_name);
        abort();
    }
    version = drmGetVersion(fd);
    if (version) {
        printf("Version: %d.%d.%d\n", version->version_major,
               version->version_minor, version->version_patchlevel);
        printf(" Name: %s\n", version->name);
        printf(" Date: %s\n", version->date);
        printf(" Description: %s\n", version->desc);
        drmFreeVersion(version);
    }
    dev = etna_device_new(fd);
    if (!dev) {
        fprintf(stderr, "Unable to create device\n");
        abort();
    }
    /* NOTE(review): casting functions whose pointer parameters differ only
     * in const-ness to benchfunc_t and calling through the cast is
     * technically undefined behavior; it works with this toolchain, but
     * trampoline wrappers would be strictly conforming. */
    tile_bench(dev, bwidth, bheight, 8, (benchfunc_t)base_tile8, "base_tile8");
    tile_bench(dev, bwidth, bheight, 8, (benchfunc_t)tile8_2x, "tile8_2x");
    tile_bench(dev, bwidth, bheight, 8, (benchfunc_t)tile8_4x, "tile8_4x");
    printf("\n");
    tile_bench(dev, bwidth, bheight, 16, (benchfunc_t)base_tile16, "base_tile16");
    tile_bench(dev, bwidth, bheight, 16, (benchfunc_t)tile16_1x, "tile16_1x");
    tile_bench(dev, bwidth, bheight, 16, (benchfunc_t)tile16_2x, "tile16_2x");
    printf("\n");
    tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)base_tile32, "base_tile32");
    tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)tile32_1x, "tile32_1x");
    tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)tile32_2x, "tile32_2x");
    tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)tile32_2x_alt, "tile32_2x_alt");
    printf("\n");
    tile_bench(dev, bwidth, bheight, 8, (benchfunc_t)base_untile8, "base_untile8");
    tile_bench(dev, bwidth, bheight, 8, (benchfunc_t)untile8_2x, "untile8_2x");
    tile_bench(dev, bwidth, bheight, 8, (benchfunc_t)untile8_4x, "untile8_4x");
    printf("\n");
    tile_bench(dev, bwidth, bheight, 16, (benchfunc_t)base_untile16, "base_untile16");
    tile_bench(dev, bwidth, bheight, 16, (benchfunc_t)untile16_1x, "untile16_1x");
    tile_bench(dev, bwidth, bheight, 16, (benchfunc_t)untile16_2x, "untile16_2x");
    printf("\n");
    tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)base_untile32, "base_untile32");
    tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)untile32_1x, "untile32_1x");
    tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)untile32_2x, "untile32_2x");
    tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)untile32_2x_alt, "untile32_2x_alt");
    printf("\n");
    etna_device_del(dev);
    close(fd);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment