Created
July 13, 2017 11:00
-
-
Save laanwj/3eb85f8a4193855de2a2f88d5984a278 to your computer and use it in GitHub Desktop.
Vivante tiling experiments in NEON
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Vivante tiling experiments in NEON | |
W.J. van der Laan 2017, MIT license | |
*/ | |
/* gcc neontile.c -o neontile -O2 -mfpu=neon */ | |
#define _POSIX_C_SOURCE 200112L | |
#include <stdio.h> | |
#include <stdint.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <time.h> | |
#include <fcntl.h> | |
#include <unistd.h> | |
#include <sys/types.h> | |
#include <sys/stat.h> | |
#include <etnaviv_drmif.h> | |
const char *drm_device_name="/dev/dri/renderD128"; | |
/*** Reference implementation for testing and performance baseline */ | |
/* Tiles are 4x4 elements; TEX_TILE_WORDS is the element count per tile. */
#define TEX_TILE_WIDTH (4)
#define TEX_TILE_HEIGHT (4)
#define TEX_TILE_WORDS (TEX_TILE_WIDTH * TEX_TILE_HEIGHT)
/* Scalar reference tiler: copy a width x height rectangle of `type`-sized
 * elements from linear `src` into 4x4-tiled `dest`, starting at destination
 * position (basex, basey).
 * Expands against the caller's locals: dest, src, basex, basey, width,
 * height, src_stride and dst_stride (both in bytes on entry).
 * NOTE: both strides are MODIFIED — rescaled below to element units. */
#define DO_TILE(type) \
    src_stride /= sizeof(type); \
    /* dst_stride becomes: elements per group of TEX_TILE_HEIGHT lines */ \
    dst_stride = (dst_stride * TEX_TILE_HEIGHT) / sizeof(type); \
    for (unsigned srcy = 0; srcy < height; ++srcy) { \
        unsigned dsty = basey + srcy; \
        /* ty: offset of the destination row-of-tiles plus row within tile */ \
        unsigned ty = (dsty / TEX_TILE_HEIGHT) * dst_stride + \
                      (dsty % TEX_TILE_HEIGHT) * TEX_TILE_WIDTH; \
        for (unsigned srcx = 0; srcx < width; ++srcx) { \
            unsigned dstx = basex + srcx; \
            ((type *)dest)[ty + (dstx / TEX_TILE_WIDTH) * TEX_TILE_WORDS + \
                           (dstx % TEX_TILE_WIDTH)] = \
                ((type *)src)[srcy * src_stride + srcx]; \
        } \
    }
/* Scalar reference untiler: inverse of DO_TILE — copy a width x height
 * rectangle starting at (basex, basey) from 4x4-tiled `src` into linear
 * `dest`, one `type`-sized element at a time.
 * Expands against the caller's locals: dest, src, basex, basey, width,
 * height, src_stride and dst_stride (both in bytes on entry).
 * NOTE: both strides are MODIFIED — rescaled below to element units. */
#define DO_UNTILE(type) \
    /* src_stride becomes: elements per group of TEX_TILE_HEIGHT lines */ \
    src_stride = (src_stride * TEX_TILE_HEIGHT) / sizeof(type); \
    dst_stride /= sizeof(type); \
    for (unsigned dsty = 0; dsty < height; ++dsty) { \
        unsigned srcy = basey + dsty; \
        /* sy: offset of the source row-of-tiles plus row within tile */ \
        unsigned sy = (srcy / TEX_TILE_HEIGHT) * src_stride + \
                      (srcy % TEX_TILE_HEIGHT) * TEX_TILE_WIDTH; \
        for (unsigned dstx = 0; dstx < width; ++dstx) { \
            unsigned srcx = basex + dstx; \
            ((type *)dest)[dsty * dst_stride + dstx] = \
                ((type *)src)[sy + (srcx / TEX_TILE_WIDTH) * TEX_TILE_WORDS + \
                              (srcx % TEX_TILE_WIDTH)]; \
        } \
    }
void | |
etna_texture_tile(void *dest, const void *src, unsigned basex, unsigned basey, | |
unsigned dst_stride, unsigned width, unsigned height, | |
unsigned src_stride, unsigned elmtsize) | |
{ | |
if (elmtsize == 4) { | |
DO_TILE(uint32_t) | |
} else if (elmtsize == 2) { | |
DO_TILE(uint16_t) | |
} else if (elmtsize == 1) { | |
DO_TILE(uint8_t) | |
} else { | |
printf("etna_texture_tile: unhandled element size %i\n", elmtsize); | |
} | |
} | |
void | |
etna_texture_untile(void *dest, const void *src, unsigned basex, unsigned basey, | |
unsigned src_stride, unsigned width, unsigned height, | |
unsigned dst_stride, unsigned elmtsize) | |
{ | |
if (elmtsize == 4) { | |
DO_UNTILE(uint32_t); | |
} else if (elmtsize == 2) { | |
DO_UNTILE(uint16_t); | |
} else if (elmtsize == 1) { | |
DO_UNTILE(uint8_t); | |
} else { | |
printf("etna_texture_tile: unhandled element size %i\n", elmtsize); | |
} | |
} | |
/* Convenience wrappers adapting the scalar reference implementation to the
 * benchmark-framework signature. The framework passes gpu_stride as the byte
 * stride of one row of tiles (4 lines); etna_texture_tile/untile expect a
 * single-line stride, hence the /4. The last argument is the element size in
 * BYTES (1/2/4), unlike the framework's elmtsize which is in bits. */
void base_tile8(void *gpu, const void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height)
{
    etna_texture_tile(gpu, cpu, 0, 0, gpu_stride/4, width, height, cpu_stride, 1);
}
void base_untile8(const void *gpu, void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height)
{
    etna_texture_untile(cpu, gpu, 0, 0, gpu_stride/4, width, height, cpu_stride, 1);
}
void base_tile16(void *gpu, const void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height)
{
    etna_texture_tile(gpu, cpu, 0, 0, gpu_stride/4, width, height, cpu_stride, 2);
}
void base_untile16(const void *gpu, void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height)
{
    etna_texture_untile(cpu, gpu, 0, 0, gpu_stride/4, width, height, cpu_stride, 2);
}
void base_tile32(void *gpu, const void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height)
{
    etna_texture_tile(gpu, cpu, 0, 0, gpu_stride/4, width, height, cpu_stride, 4);
}
void base_untile32(const void *gpu, void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height)
{
    etna_texture_untile(cpu, gpu, 0, 0, gpu_stride/4, width, height, cpu_stride, 4);
}
/** NEON specializations
 *
 * All *_impl kernels assume basex%4==0 and basey%4==0 and copy whole 4x4
 * tiles between a linear CPU surface and a tiled GPU surface.
 * Fixes applied to the originals:
 *  - `static inline` instead of plain `inline`: C99/C11 `inline` without
 *    `static`/`extern` emits no out-of-line definition, so the program fails
 *    to link whenever the compiler declines to inline (e.g. -O0).
 *  - a "memory" clobber on every asm: each kernel reads/writes memory that
 *    is not described by its register operands, which GCC must be told about
 *    or it may cache/reorder memory accesses around the asm.
 */

/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==4.
 * Loads four 16-byte lines from `cpu` (post-incrementing by cpu_stride) and
 * stores one 64-byte tile contiguously at `gpu`. */
static inline void tile32_1x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        "vld1.8 {d0,d1}, [%0], %r2;\n"
        "vld1.8 {d2,d3}, [%0], %r2;\n"
        "vld1.8 {d4,d5}, [%0], %r2;\n"
        "vld1.8 {d6,d7}, [%0], %r2;\n"
        "vstm %1, {q0, q1, q2, q3};\n"
        : "=r"(cpu) /* changed */
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        : "q0", "q1", "q2", "q3", "memory");
}

/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==4, two consecutive tiles */
static inline void tile32_2x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    /* second tile starts 4 pixels (16 bytes) to the right in the linear surface */
    const void *cpunext = cpu + 16;
    __asm__ volatile (
        "vld1.8 {d0,d1}, [%0], %r3;\n"
        "vld1.8 {d8,d9}, [%1], %r3;\n"
        "vld1.8 {d2,d3}, [%0], %r3;\n"
        "vld1.8 {d10,d11}, [%1], %r3;\n"
        "vld1.8 {d4,d5}, [%0], %r3;\n"
        "vld1.8 {d12,d13}, [%1], %r3;\n"
        "vld1.8 {d6,d7}, [%0], %r3;\n"
        "vld1.8 {d14,d15}, [%1], %r3;\n"
        "vstm %2, {q0, q1, q2, q3, q4, q5, q6, q7};\n"
        : "=r"(cpu), "=r"(cpunext)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu), "1"(cpunext)
        : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
}

/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==4, two consecutive tiles */
/* alt implementation does shuffling in registers instead of memory */
static inline void tile32_2x_alt_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        "vld1.8 {d0,d1,d2,d3}, [%0], %r2;\n"
        "vld1.8 {d8,d9,d10,d11}, [%0], %r2;\n"
        "vld1.8 {d4,d5,d6,d7}, [%0], %r2;\n"
        "vld1.8 {d12,d13,d14,d15}, [%0], %r2;\n"
        /* d0 d1 d8 d9 */
        /* d2 d3 d10 d11 */
        /* d4 d5 d12 d13 */
        /* d6 d7 d14 d15 */
        "vswp d2, d8\n"
        "vswp d3, d9\n"
        "vswp d12, d6\n"
        "vswp d13, d7\n"
        /* d0 d1 d2 d3 */
        /* d8 d9 d10 d11 */
        /* d4 d5 d6 d7 */
        /* d12 d13 d14 d15 */
        "vstm %1, {q0, q1, q2, q3, q4, q5, q6, q7};\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
}
/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==2.
 * One 4x4 tile of 16-bit elements: four 8-byte lines in, 32 bytes out.
 * static inline + "memory" clobber fixes as described for the 32-bit kernels. */
static inline void tile16_1x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        "vld1.8 {d0}, [%0], %r2;\n"
        "vld1.8 {d1}, [%0], %r2;\n"
        "vld1.8 {d2}, [%0], %r2;\n"
        "vld1.8 {d3}, [%0], %r2;\n"
        "vstm %1, {q0, q1};\n"
        : "=r"(cpu) /* changed */
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        : "q0", "q1", "memory");
}

/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==2, two tiles (in X) at once */
static inline void tile16_2x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    /* second tile starts 4 pixels (8 bytes) to the right in the linear surface */
    const void *cpunext = cpu + 8;
    __asm__ volatile (
        /* load two adjacent tiles from untiled */
        "vld1.8 {d0}, [%0], %r3;\n"
        "vld1.8 {d4}, [%1], %r3;\n"
        "vld1.8 {d1}, [%0], %r3;\n"
        "vld1.8 {d5}, [%1], %r3;\n"
        "vld1.8 {d2}, [%0], %r3;\n"
        "vld1.8 {d6}, [%1], %r3;\n"
        "vld1.8 {d3}, [%0], %r3;\n"
        "vld1.8 {d7}, [%1], %r3;\n"
        /* store two adjacent tiles, tiled */
        "vstm %2, {q0, q1, q2, q3};\n"
        : "=r"(cpu), "=r"(cpunext)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu), "1"(cpunext)
        : "q0", "q1", "q2", "q3", "memory");
}
/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==1, two tiles (in X) at once.
 * static inline + "memory" clobber fixes as described for the 32-bit kernels. */
static inline void tile8_2x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        /* load two adjacent tiles, from untiled */
        "vld1.8 {d0}, [%0], %r2;\n"
        "vld1.8 {d1}, [%0], %r2;\n"
        "vld1.8 {d2}, [%0], %r2;\n"
        "vld1.8 {d3}, [%0], %r2;\n"
        /* Transpose:
         * Start
         * [d0] x1 x0
         * [d1] x3 x2
         * [d2] x5 x4
         * [d3] x7 x6
         */
        "vtrn.32 d0, d1;\n"
        "vtrn.32 d2, d3;\n"
        /* [d0] x2 x0
         * [d1] x3 x1
         * [d2] x6 x4
         * [d3] x7 x5
         */
        "vswp d1, d2;\n"
        /* [d0] x2 x0
         * [d1] x6 x4
         * [d2] x3 x1
         * [d3] x7 x5
         */
        /* store two adjacent tiles, to tiled */
        "vstm %1, {d0-d3};\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        : "q0", "q1", "memory");
}

/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==1, four tiles (in X) at once */
static inline void tile8_4x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        /* load four adjacent tiles, from untiled */
        "vld1.8 {d0,d1}, [%0], %r2;\n"
        "vld1.8 {d2,d3}, [%0], %r2;\n"
        "vld1.8 {d4,d5}, [%0], %r2;\n"
        "vld1.8 {d6,d7}, [%0], %r2;\n"
        /* Transpose:
         * Start
         * [q0] x3 x2 x1 x0
         * [q1] x7 x6 x5 x4
         * [q2] x11 x10 x9 x8
         * [q3] x15 x14 x13 x12
         */
        "vtrn.32 q0, q1;\n"
        "vtrn.32 q2, q3;\n"
        /* [q0] x6 x2 x4 x0
         * [q1] x7 x3 x5 x1
         * [q2] x14 x10 x12 x8
         * [q3] x15 x11 x13 x9
         */
        "vswp d1, d4;\n"
        "vswp d3, d6;\n"
        /* [q0] x12 x8 x4 x0
         * [q1] x13 x9 x5 x1
         * [q2] x14 x10 x6 x2
         * [q3] x15 x11 x7 x3
         */
        /* store four adjacent tiles, to tiled */
        "vstm %1, {q0, q1, q2, q3};\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        : "q0", "q1", "q2", "q3", "memory");
}
/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==4.
 * Loads one 64-byte tile from `gpu` and stores four 16-byte lines to `cpu`.
 * static inline + "memory" clobber fixes as described for the tile kernels. */
static inline void untile32_1x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        "vldm %1, {q0, q1, q2, q3};\n"
        "vst1.8 {d0,d1}, [%0], %r2;\n"
        "vst1.8 {d2,d3}, [%0], %r2;\n"
        "vst1.8 {d4,d5}, [%0], %r2;\n"
        "vst1.8 {d6,d7}, [%0], %r2;\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        : "q0", "q1", "q2", "q3", "memory");
}

/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==4, two consecutive tiles */
static inline void untile32_2x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    /* second tile lands 4 pixels (16 bytes) to the right in the linear surface */
    void *cpunext = cpu + 16;
    __asm__ volatile (
        "vldm %2, {q0, q1, q2, q3, q4, q5, q6, q7};\n"
        "vst1.8 {d0,d1}, [%0], %r3;\n"
        "vst1.8 {d8,d9}, [%1], %r3;\n"
        "vst1.8 {d2,d3}, [%0], %r3;\n"
        "vst1.8 {d10,d11}, [%1], %r3;\n"
        "vst1.8 {d4,d5}, [%0], %r3;\n"
        "vst1.8 {d12,d13}, [%1], %r3;\n"
        "vst1.8 {d6,d7}, [%0], %r3;\n"
        "vst1.8 {d14,d15}, [%1], %r3;\n"
        : "=r"(cpu), "=r"(cpunext)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu), "1"(cpunext)
        : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
}

/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==4, two consecutive tiles */
/* alt implementation does shuffling in registers instead of memory */
static inline void untile32_2x_alt_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        "vldm %1, {q0, q1, q2, q3, q4, q5, q6, q7};\n"
        /* d0 d1 d8 d9 */
        /* d2 d3 d10 d11 */
        /* d4 d5 d12 d13 */
        /* d6 d7 d14 d15 */
        "vswp d2, d8\n"
        "vswp d3, d9\n"
        "vswp d12, d6\n"
        "vswp d13, d7\n"
        /* d0 d1 d2 d3 */
        /* d8 d9 d10 d11 */
        /* d4 d5 d6 d7 */
        /* d12 d13 d14 d15 */
        "vst1.8 {d0,d1,d2,d3}, [%0], %r2;\n"
        "vst1.8 {d8,d9,d10,d11}, [%0], %r2;\n"
        "vst1.8 {d4,d5,d6,d7}, [%0], %r2;\n"
        "vst1.8 {d12,d13,d14,d15}, [%0], %r2;\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
}
/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==2.
 * static inline + "memory" clobber fixes as described for the tile kernels. */
static inline void untile16_1x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        "vldm %1, {q0, q1};\n"
        "vst1.8 {d0}, [%0], %r2;\n"
        "vst1.8 {d1}, [%0], %r2;\n"
        "vst1.8 {d2}, [%0], %r2;\n"
        "vst1.8 {d3}, [%0], %r2;\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        : "q0", "q1", "memory");
}

/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==2, two tiles (in X) at once */
static inline void untile16_2x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    /* second tile lands 4 pixels (8 bytes) to the right in the linear surface */
    void *cpunext = cpu + 8;
    __asm__ volatile (
        /* load two adjacent tiles, tiled */
        "vldm %2, {q0, q1, q2, q3};\n"
        /* store two adjacent tiles, untiled */
        "vst1.8 {d0}, [%0], %r3;\n"
        "vst1.8 {d4}, [%1], %r3;\n"
        "vst1.8 {d1}, [%0], %r3;\n"
        "vst1.8 {d5}, [%1], %r3;\n"
        "vst1.8 {d2}, [%0], %r3;\n"
        "vst1.8 {d6}, [%1], %r3;\n"
        "vst1.8 {d3}, [%0], %r3;\n"
        "vst1.8 {d7}, [%1], %r3;\n"
        : "=r"(cpu), "=r"(cpunext)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu), "1"(cpunext)
        : "q0", "q1", "q2", "q3", "memory");
}
/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==1, two tiles (in X) at once.
 * static inline + "memory" clobber fixes as described for the tile kernels. */
static inline void untile8_2x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        /* load two adjacent tiles, from tiled */
        "vldm %1, {d0-d3};\n"
        /* Transpose:
         * Start
         * [d0] x2 x0
         * [d1] x6 x4
         * [d2] x3 x1
         * [d3] x7 x5
         */
        "vswp d1, d2;\n"
        /* [d0] x2 x0
         * [d1] x3 x1
         * [d2] x6 x4
         * [d3] x7 x5
         */
        "vtrn.32 d0, d1;\n"
        "vtrn.32 d2, d3;\n"
        /* [d0] x1 x0
         * [d1] x3 x2
         * [d2] x5 x4
         * [d3] x7 x6
         */
        /* store two adjacent tiles, to untiled */
        "vst1.8 {d0}, [%0], %r2;\n"
        "vst1.8 {d1}, [%0], %r2;\n"
        "vst1.8 {d2}, [%0], %r2;\n"
        "vst1.8 {d3}, [%0], %r2;\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        : "q0", "q1", "memory");
}

/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==1, four tiles (in X) at once */
static inline void untile8_4x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        /* load four adjacent tiles, from tiled */
        "vldm %1, {q0, q1, q2, q3};\n"
        /* Transpose:
         * Start
         * [q0] x12 x8 x4 x0
         * [q1] x13 x9 x5 x1
         * [q2] x14 x10 x6 x2
         * [q3] x15 x11 x7 x3
         */
        "vswp d1, d4;\n"
        "vswp d3, d6;\n"
        /* [q0] x6 x2 x4 x0
         * [q1] x7 x3 x5 x1
         * [q2] x14 x10 x12 x8
         * [q3] x15 x11 x13 x9
         */
        "vtrn.32 q0, q1;\n"
        "vtrn.32 q2, q3;\n"
        /* [q0] x3 x2 x1 x0
         * [q1] x7 x6 x5 x4
         * [q2] x11 x10 x9 x8
         * [q3] x15 x14 x13 x12
         */
        /* store four adjacent tiles, to untiled */
        "vst1.8 {d0,d1}, [%0], %r2;\n"
        "vst1.8 {d2,d3}, [%0], %r2;\n"
        "vst1.8 {d4,d5}, [%0], %r2;\n"
        "vst1.8 {d6,d7}, [%0], %r2;\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        : "q0", "q1", "q2", "q3", "memory");
}
/*** Tile visitor functions */
/* Generate a walker `func` that tiles a whole surface by calling func##_impl
 * on groups of `htiles` horizontally adjacent 4x4 tiles, one row of tiles
 * (4 lines) at a time. elmtsize is in BITS here. Assumes width is a multiple
 * of htiles*4 and height a multiple of 4. gpu_stride is the byte stride of
 * one row of tiles; cpu_stride of one line.
 * Note: arithmetic on void* is a GCC extension (treated as char*). */
#define TILE_FUNC(elmtsize,htiles,func) \
void func(void *gpu, const void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height) \
{ \
    for (uint32_t y=0; y<height; y+=4) { \
        void *gpu_tile = gpu; \
        const void *cpu_tile = cpu; \
        for (uint32_t x=0; x<width; x+=htiles*4) { \
            func##_impl(gpu_tile, cpu_tile, cpu_stride);\
            /* advance htiles tiles: tiled side is contiguous (16 elements \
             * per tile), linear side moves htiles*4 elements right */ \
            gpu_tile += htiles*elmtsize/8*16; \
            cpu_tile += htiles*elmtsize/8*4; \
        } \
        gpu += gpu_stride; \
        cpu += cpu_stride*4; \
    } \
}
/* Generate a walker `func` that untiles a whole surface by calling
 * func##_impl on groups of `htiles` horizontally adjacent 4x4 tiles, one row
 * of tiles (4 lines) at a time; mirror image of TILE_FUNC with the gpu side
 * as source. elmtsize is in BITS. Same divisibility assumptions apply. */
#define UNTILE_FUNC(elmtsize,htiles,func) \
void func(const void *gpu, void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height) \
{ \
    for (uint32_t y=0; y<height; y+=4) { \
        const void *gpu_tile = gpu; \
        void *cpu_tile = cpu; \
        for (uint32_t x=0; x<width; x+=htiles*4) { \
            func##_impl(gpu_tile, cpu_tile, cpu_stride);\
            gpu_tile += htiles*elmtsize/8*16; \
            cpu_tile += htiles*elmtsize/8*4; \
        } \
        gpu += gpu_stride; \
        cpu += cpu_stride*4; \
    } \
}
/* Instantiate a walker for each NEON kernel above:
 * (element size in bits, horizontal tiles per step, generated name). */
TILE_FUNC(32, 1, tile32_1x);
TILE_FUNC(32, 2, tile32_2x);
TILE_FUNC(32, 2, tile32_2x_alt);
TILE_FUNC(16, 1, tile16_1x);
TILE_FUNC(16, 2, tile16_2x);
TILE_FUNC(8, 2, tile8_2x);
TILE_FUNC(8, 4, tile8_4x);
UNTILE_FUNC(32, 1, untile32_1x);
UNTILE_FUNC(32, 2, untile32_2x);
UNTILE_FUNC(32, 2, untile32_2x_alt);
UNTILE_FUNC(16, 1, untile16_1x);
UNTILE_FUNC(16, 2, untile16_2x);
UNTILE_FUNC(8, 2, untile8_2x);
UNTILE_FUNC(8, 4, untile8_4x);
/*** Test and benchmarking framework */
typedef void (*tilefunc_t)(void *gpu, const void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height);
typedef void (*untilefunc_t)(const void *gpu, void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height);
/* Hex-dump a width x height buffer of elmtsize-bit (32/16/8) elements,
 * assuming a row stride of exactly `width` elements. elmtsize is in BITS. */
static void printx(void *ptr, uint32_t width, uint32_t height, uint32_t elmtsize)
{
    switch(elmtsize) {
    case 32:
        for (uint32_t y=0; y<height; ++y) {
            for (uint32_t x=0; x<width; ++x) {
                printf("%03x ", ((uint32_t*)ptr)[y*width + x]);
            }
            printf("\n");
        }
        break;
    case 16:
        for (uint32_t y=0; y<height; ++y) {
            for (uint32_t x=0; x<width; ++x) {
                printf("%03x ", ((uint16_t*)ptr)[y*width + x]);
            }
            printf("\n");
        }
        break;
    case 8:
        for (uint32_t y=0; y<height; ++y) {
            for (uint32_t x=0; x<width; ++x) {
                printf("%02x ", ((uint8_t*)ptr)[y*width + x]);
            }
            printf("\n");
        }
        break; /* was missing; fragile if another case is added below */
    default:
        /* was missing: silently printed nothing for unknown sizes */
        printf("printx: unhandled element size %u\n", elmtsize);
        break;
    }
}
/* Round-trip correctness test: tile `cpu` into `gpu` with tile_func, untile
 * `gpu` into `compare` with untile_func, and verify compare == cpu.
 * elmtsize is in BITS (8/16/32), unlike etna_texture_tile where it is bytes;
 * gpu_stride is the byte stride of one row of tiles (4 lines).
 * Returns 0 on success, 1 on mismatch (previously it always returned 0, so a
 * failing test could only be noticed in the printed output). */
int tile_test(uint32_t width, uint32_t height, uint32_t elmtsize, tilefunc_t tile_func, untilefunc_t untile_func, const char *name)
{
    uint32_t size = width*height*elmtsize/8;
    void *gpu = NULL;
    void *cpu = NULL;
    void *compare = NULL;
    int rc = 0;
    uint32_t cpu_stride = width*elmtsize/8;
    uint32_t gpu_stride = width*elmtsize/8*4; /* one row of tiles = 4 lines */
    /* 64-byte alignment gives the NEON kernels cacheline-aligned buffers */
    if (posix_memalign(&gpu, 64, size) ||
        posix_memalign(&cpu, 64, size) ||
        posix_memalign(&compare, 64, size)) {
        abort();
    }
    /* deterministic test pattern: element value = linear index (truncated) */
    switch (elmtsize) {
    case 32:
        for (uint32_t y=0; y<height; ++y) {
            for (uint32_t x=0; x<width; ++x) {
                ((uint32_t*)cpu)[y*width + x] = y*width + x;
            }
        }
        break;
    case 16:
        for (uint32_t y=0; y<height; ++y) {
            for (uint32_t x=0; x<width; ++x) {
                ((uint16_t*)cpu)[y*width + x] = y*width + x;
            }
        }
        break;
    case 8:
        for (uint32_t y=0; y<height; ++y) {
            for (uint32_t x=0; x<width; ++x) {
                ((uint8_t*)cpu)[y*width + x] = y*width + x;
            }
        }
        break;
    default:
        /* was missing: would have compared uninitialized buffers */
        abort();
    }
    tile_func(gpu, cpu, gpu_stride, cpu_stride, width, height);
    untile_func(gpu, compare, gpu_stride, cpu_stride, width, height);
    if (memcmp(compare,cpu,size) != 0) {
        rc = 1;
        printf("MISMATCH in %s\n", name);
        printx(cpu, width, height, elmtsize);
        printf("->\n");
        printx(gpu, width, height, elmtsize);
        printf("->\n");
        printx(compare, width, height, elmtsize);
    }
    free(cpu);
    free(gpu);
    free(compare);
    return rc;
}
/* Elapsed time e - b between two CLOCK_MONOTONIC samples, in seconds. */
double timediff(struct timespec *b, struct timespec *e)
{
    double whole_seconds = (double)(e->tv_sec - b->tv_sec);
    double frac_seconds  = (double)(e->tv_nsec - b->tv_nsec) * 1e-9;
    return whole_seconds + frac_seconds;
}
typedef void (*benchfunc_t)(void *gpu, void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height); | |
int tile_bench(struct etna_device *dev, uint32_t width, uint32_t height, uint32_t elmtsize, benchfunc_t bench_func, const char *name) | |
{ | |
uint32_t size = width*height*elmtsize/8; | |
void *gpu = NULL; | |
void *cpu = NULL; | |
uint32_t cpu_stride = width*elmtsize/8; | |
uint32_t gpu_stride = width*elmtsize/8*4; | |
struct etna_bo *bmp; | |
if (posix_memalign(&cpu, 64, size)) { | |
abort(); | |
} | |
bmp = etna_bo_new(dev, size, DRM_ETNA_GEM_CACHE_UNCACHED); | |
gpu = etna_bo_map(bmp); | |
#if 0 | |
memset(gpu, 0xaa, size); | |
memset(cpu, 0x55, size); | |
bench_func(gpu, cpu, gpu_stride, cpu_stride, width, height); /* warm up cache */ | |
#endif | |
const int reps=20; | |
struct timespec tvm_b, tvm_e; | |
printf("[%-16s] ", name); | |
clock_gettime(CLOCK_MONOTONIC, &tvm_b); | |
for (int rep=0; rep<reps; ++rep) { | |
bench_func(gpu, cpu, gpu_stride, cpu_stride, width, height); | |
} | |
clock_gettime(CLOCK_MONOTONIC, &tvm_e); | |
double mtime = timediff(&tvm_b, &tvm_e) / reps; | |
printf("%.f us", mtime*1e6); | |
printf("\n"); | |
free(cpu); | |
etna_bo_del(bmp); | |
return 0; | |
} | |
int main() | |
{ | |
/* test */ | |
uint32_t twidth = 128; | |
uint32_t theight = 256; | |
tile_test(twidth, theight, 32, base_tile32, untile32_1x, "untile32_1x"); | |
tile_test(twidth, theight, 32, tile32_1x, base_untile32, "tile32_1x"); | |
tile_test(twidth, theight, 32, tile32_2x, base_untile32, "tile32_2x"); | |
tile_test(twidth, theight, 32, tile32_2x_alt, base_untile32, "tile32_2x_alt"); | |
tile_test(twidth, theight, 32, base_tile32, untile32_2x, "untile32_2x"); | |
tile_test(twidth, theight, 32, base_tile32, untile32_2x_alt, "untile32_2x_alt"); | |
tile_test(twidth, theight, 16, base_tile16, untile16_1x, "untile16_1x"); | |
tile_test(twidth, theight, 16, tile16_1x, base_untile16, "tile16_1x"); | |
tile_test(twidth, theight, 16, base_tile16, untile16_2x, "untile16_2x"); | |
tile_test(twidth, theight, 16, tile16_2x, base_untile16, "tile16_2x"); | |
tile_test(twidth, theight, 8, base_tile8, untile8_2x, "untile8_2x"); | |
tile_test(twidth, theight, 8, tile8_2x, base_untile8, "tile8_2x"); | |
tile_test(twidth, theight, 8, base_tile8, untile8_4x, "untile8_4x"); | |
tile_test(twidth, theight, 8, tile8_4x, base_untile8, "tile8_4x"); | |
/* benchmark */ | |
uint32_t bwidth = 2048; | |
uint32_t bheight = 1024; | |
drmVersionPtr version; | |
struct etna_device *dev; | |
int fd = open(drm_device_name, O_RDWR); | |
if (fd < 0) { | |
fprintf(stdout, "Unable to open %s\n", drm_device_name); | |
abort(); | |
} | |
version = drmGetVersion(fd); | |
if (version) { | |
printf("Version: %d.%d.%d\n", version->version_major, | |
version->version_minor, version->version_patchlevel); | |
printf(" Name: %s\n", version->name); | |
printf(" Date: %s\n", version->date); | |
printf(" Description: %s\n", version->desc); | |
drmFreeVersion(version); | |
} | |
dev = etna_device_new(fd); | |
if (!dev) { | |
fprintf(stdout, "Unable to create device\n"); | |
abort(); | |
} | |
tile_bench(dev, bwidth, bheight, 8, (benchfunc_t)base_tile8, "base_tile8"); | |
tile_bench(dev, bwidth, bheight, 8, (benchfunc_t)tile8_2x, "tile8_2x"); | |
tile_bench(dev, bwidth, bheight, 8, (benchfunc_t)tile8_4x, "tile8_4x"); | |
printf("\n"); | |
tile_bench(dev, bwidth, bheight, 16, (benchfunc_t)base_tile16, "base_tile16"); | |
tile_bench(dev, bwidth, bheight, 16, (benchfunc_t)tile16_1x, "tile16_1x"); | |
tile_bench(dev, bwidth, bheight, 16, (benchfunc_t)tile16_2x, "tile16_2x"); | |
printf("\n"); | |
tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)base_tile32, "base_tile32"); | |
tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)tile32_1x, "tile32_1x"); | |
tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)tile32_2x, "tile32_2x"); | |
tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)tile32_2x_alt, "tile32_2x_alt"); | |
printf("\n"); | |
tile_bench(dev, bwidth, bheight, 8, (benchfunc_t)base_untile8, "base_untile8"); | |
tile_bench(dev, bwidth, bheight, 8, (benchfunc_t)untile8_2x, "untile8_2x"); | |
tile_bench(dev, bwidth, bheight, 8, (benchfunc_t)untile8_4x, "untile8_4x"); | |
printf("\n"); | |
tile_bench(dev, bwidth, bheight, 16, (benchfunc_t)base_untile16, "base_untile16"); | |
tile_bench(dev, bwidth, bheight, 16, (benchfunc_t)untile16_1x, "untile16_1x"); | |
tile_bench(dev, bwidth, bheight, 16, (benchfunc_t)untile16_2x, "untile16_2x"); | |
printf("\n"); | |
tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)base_untile32, "base_untile32"); | |
tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)untile32_1x, "untile32_1x"); | |
tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)untile32_2x, "untile32_2x"); | |
tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)untile32_2x_alt, "untile32_2x_alt"); | |
printf("\n"); | |
etna_device_del(dev); | |
close(fd); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment