Skip to content

Instantly share code, notes, and snippets.

@laanwj
Created July 13, 2017 11:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save laanwj/3eb85f8a4193855de2a2f88d5984a278 to your computer and use it in GitHub Desktop.
Save laanwj/3eb85f8a4193855de2a2f88d5984a278 to your computer and use it in GitHub Desktop.
Vivante tiling experiments in NEON
/* Vivante tiling experiments in NEON
W.J. van der Laan 2017, MIT license
*/
/* gcc neontile.c -o neontile -O2 -mfpu=neon */
#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <etnaviv_drmif.h>
/* DRM render node opened by main() for the etnaviv GPU-buffer benchmarks */
const char *drm_device_name="/dev/dri/renderD128";
/*** Reference implementation for testing and performance baseline */
#define TEX_TILE_WIDTH (4)
#define TEX_TILE_HEIGHT (4)
#define TEX_TILE_WORDS (TEX_TILE_WIDTH * TEX_TILE_HEIGHT)
/* Copy a width x height rectangle from linear `src` into the 4x4-tiled
 * `dest`, offset by (basex, basey) in the destination. Expands with `type`
 * as the element type; expects src, dest, basex, basey, src_stride,
 * dst_stride, width and height to be in scope. NOTE: mutates src_stride
 * and dst_stride, so expand at most once per function. */
#define DO_TILE(type) \
    src_stride /= sizeof(type); \
    dst_stride = (dst_stride * TEX_TILE_HEIGHT) / sizeof(type); \
    for (unsigned srcy = 0; srcy < height; ++srcy) { \
        unsigned dsty = basey + srcy; \
        unsigned ty = (dsty / TEX_TILE_HEIGHT) * dst_stride + \
                      (dsty % TEX_TILE_HEIGHT) * TEX_TILE_WIDTH; \
        for (unsigned srcx = 0; srcx < width; ++srcx) { \
            unsigned dstx = basex + srcx; \
            ((type *)dest)[ty + (dstx / TEX_TILE_WIDTH) * TEX_TILE_WORDS + \
                           (dstx % TEX_TILE_WIDTH)] = \
                ((type *)src)[srcy * src_stride + srcx]; \
        } \
    }
/* Inverse of DO_TILE: copy a rectangle from tiled `src` back to linear
 * `dest`. Same expansion requirements; also mutates the stride variables. */
#define DO_UNTILE(type) \
    src_stride = (src_stride * TEX_TILE_HEIGHT) / sizeof(type); \
    dst_stride /= sizeof(type); \
    for (unsigned dsty = 0; dsty < height; ++dsty) { \
        unsigned srcy = basey + dsty; \
        unsigned sy = (srcy / TEX_TILE_HEIGHT) * src_stride + \
                      (srcy % TEX_TILE_HEIGHT) * TEX_TILE_WIDTH; \
        for (unsigned dstx = 0; dstx < width; ++dstx) { \
            unsigned srcx = basex + dstx; \
            ((type *)dest)[dsty * dst_stride + dstx] = \
                ((type *)src)[sy + (srcx / TEX_TILE_WIDTH) * TEX_TILE_WORDS + \
                              (srcx % TEX_TILE_WIDTH)]; \
        } \
    }
/*
 * Tile a width x height rectangle of elmtsize-byte (1, 2 or 4) elements
 * from linear `src` (src_stride in bytes) into the 4x4-tiled `dest` at
 * offset (basex, basey). dst_stride follows DO_TILE's convention (it is
 * multiplied by TEX_TILE_HEIGHT internally); the base_* wrappers below
 * pass the byte stride of a tile row divided by 4.
 */
void
etna_texture_tile(void *dest, const void *src, unsigned basex, unsigned basey,
                  unsigned dst_stride, unsigned width, unsigned height,
                  unsigned src_stride, unsigned elmtsize)
{
    if (elmtsize == 4) {
        DO_TILE(uint32_t)
    } else if (elmtsize == 2) {
        DO_TILE(uint16_t)
    } else if (elmtsize == 1) {
        DO_TILE(uint8_t)
    } else {
        /* %u matches the unsigned argument (was %i); errors go to stderr */
        fprintf(stderr, "etna_texture_tile: unhandled element size %u\n", elmtsize);
    }
}
/*
 * Inverse of etna_texture_tile: copy a width x height rectangle at offset
 * (basex, basey) of the tiled `src` back into the linear `dest`
 * (dst_stride in bytes). src_stride follows DO_UNTILE's convention.
 */
void
etna_texture_untile(void *dest, const void *src, unsigned basex, unsigned basey,
                    unsigned src_stride, unsigned width, unsigned height,
                    unsigned dst_stride, unsigned elmtsize)
{
    if (elmtsize == 4) {
        DO_UNTILE(uint32_t);
    } else if (elmtsize == 2) {
        DO_UNTILE(uint16_t);
    } else if (elmtsize == 1) {
        DO_UNTILE(uint8_t);
    } else {
        /* was misreporting itself as "etna_texture_tile"; %i -> %u */
        fprintf(stderr, "etna_texture_untile: unhandled element size %u\n", elmtsize);
    }
}
/* Tile an 8bpp image with the C reference implementation. gpu_stride is
 * divided by 4 because DO_TILE multiplies dst_stride by TEX_TILE_HEIGHT (4). */
void base_tile8(void *gpu, const void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height)
{
etna_texture_tile(gpu, cpu, 0, 0, gpu_stride/4, width, height, cpu_stride, 1);
}
/* Untile an 8bpp image with the C reference implementation (same /4 stride
 * convention, compensating DO_UNTILE's internal multiply). */
void base_untile8(const void *gpu, void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height)
{
etna_texture_untile(cpu, gpu, 0, 0, gpu_stride/4, width, height, cpu_stride, 1);
}
/* Tile a 16bpp image with the C reference implementation (gpu_stride/4
 * compensates DO_TILE's internal multiply by TEX_TILE_HEIGHT). */
void base_tile16(void *gpu, const void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height)
{
etna_texture_tile(gpu, cpu, 0, 0, gpu_stride/4, width, height, cpu_stride, 2);
}
/* Untile a 16bpp image with the C reference implementation. */
void base_untile16(const void *gpu, void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height)
{
etna_texture_untile(cpu, gpu, 0, 0, gpu_stride/4, width, height, cpu_stride, 2);
}
/* Tile a 32bpp image with the C reference implementation (gpu_stride/4
 * compensates DO_TILE's internal multiply by TEX_TILE_HEIGHT). */
void base_tile32(void *gpu, const void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height)
{
etna_texture_tile(gpu, cpu, 0, 0, gpu_stride/4, width, height, cpu_stride, 4);
}
/* Untile a 32bpp image with the C reference implementation. */
void base_untile32(const void *gpu, void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height)
{
etna_texture_untile(cpu, gpu, 0, 0, gpu_stride/4, width, height, cpu_stride, 4);
}
/** NEON specializations */
/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==4.
 * Loads four 16-byte rows from the linear `cpu` image (post-incrementing
 * by cpu_stride) and stores one 4x4 tile contiguously at `gpu`.
 * `static` added: plain C99 `inline` has no out-of-line definition and
 * fails to link when not inlined (e.g. at -O0). */
static inline void tile32_1x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        "vld1.8 {d0,d1}, [%0], %r2;\n"
        "vld1.8 {d2,d3}, [%0], %r2;\n"
        "vld1.8 {d4,d5}, [%0], %r2;\n"
        "vld1.8 {d6,d7}, [%0], %r2;\n"
        "vstm %1, {q0, q1, q2, q3};\n"
        : "=r"(cpu) /* advanced by the post-incrementing loads */
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory": the asm reads *cpu and writes *gpu; without this
         * clobber the compiler may reorder or elide memory accesses
         * around the asm statement */
        : "q0", "q1", "q2", "q3", "memory");
}
/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==4,
 * two consecutive tiles: loads four rows of two adjacent tiles and stores
 * both tiles with a single vstm. */
static inline void tile32_2x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    /* second tile starts 16 bytes (4 pixels) to the right */
    const void *cpunext = cpu + 16;
    __asm__ volatile (
        "vld1.8 {d0,d1}, [%0], %r3;\n"
        "vld1.8 {d8,d9}, [%1], %r3;\n"
        "vld1.8 {d2,d3}, [%0], %r3;\n"
        "vld1.8 {d10,d11}, [%1], %r3;\n"
        "vld1.8 {d4,d5}, [%0], %r3;\n"
        "vld1.8 {d12,d13}, [%1], %r3;\n"
        "vld1.8 {d6,d7}, [%0], %r3;\n"
        "vld1.8 {d14,d15}, [%1], %r3;\n"
        "vstm %2, {q0, q1, q2, q3, q4, q5, q6, q7};\n"
        : "=r"(cpu), "=r"(cpunext)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu), "1"(cpunext)
        /* "memory" clobber: asm reads *cpu/*cpunext and writes *gpu */
        : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
}
/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==4,
 * two consecutive tiles. Alt implementation: loads 32 bytes per row and
 * does the tile shuffling in registers (vswp) instead of via two memory
 * streams. */
static inline void tile32_2x_alt_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        "vld1.8 {d0,d1,d2,d3}, [%0], %r2;\n"
        "vld1.8 {d8,d9,d10,d11}, [%0], %r2;\n"
        "vld1.8 {d4,d5,d6,d7}, [%0], %r2;\n"
        "vld1.8 {d12,d13,d14,d15}, [%0], %r2;\n"
        /* d0 d1 d8 d9 */
        /* d2 d3 d10 d11 */
        /* d4 d5 d12 d13 */
        /* d6 d7 d14 d15 */
        "vswp d2, d8\n"
        "vswp d3, d9\n"
        "vswp d12, d6\n"
        "vswp d13, d7\n"
        /* d0 d1 d2 d3 */
        /* d8 d9 d10 d11 */
        /* d4 d5 d6 d7 */
        /* d12 d13 d14 d15 */
        "vstm %1, {q0, q1, q2, q3, q4, q5, q6, q7};\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory" clobber: asm reads *cpu and writes *gpu */
        : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
}
/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==2.
 * Loads four 8-byte rows from linear memory and stores one tile with vstm. */
static inline void tile16_1x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        "vld1.8 {d0}, [%0], %r2;\n"
        "vld1.8 {d1}, [%0], %r2;\n"
        "vld1.8 {d2}, [%0], %r2;\n"
        "vld1.8 {d3}, [%0], %r2;\n"
        "vstm %1, {q0, q1};\n"
        : "=r"(cpu) /* advanced by the post-incrementing loads */
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory" clobber: asm reads *cpu and writes *gpu */
        : "q0", "q1", "memory");
}
/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==2,
 * two tiles (in X) at once. */
static inline void tile16_2x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    /* second tile starts 8 bytes (4 pixels) to the right */
    const void *cpunext = cpu + 8;
    __asm__ volatile (
        /* load two adjacent tiles from untiled */
        "vld1.8 {d0}, [%0], %r3;\n"
        "vld1.8 {d4}, [%1], %r3;\n"
        "vld1.8 {d1}, [%0], %r3;\n"
        "vld1.8 {d5}, [%1], %r3;\n"
        "vld1.8 {d2}, [%0], %r3;\n"
        "vld1.8 {d6}, [%1], %r3;\n"
        "vld1.8 {d3}, [%0], %r3;\n"
        "vld1.8 {d7}, [%1], %r3;\n"
        /* store two adjacent tiles, tiled */
        "vstm %2, {q0, q1, q2, q3};\n"
        : "=r"(cpu), "=r"(cpunext)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu), "1"(cpunext)
        /* "memory" clobber: asm reads *cpu/*cpunext and writes *gpu */
        : "q0", "q1", "q2", "q3", "memory");
}
/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==1,
 * two tiles (in X) at once. Loads four 8-byte rows and transposes 32-bit
 * lanes in registers so each tile becomes contiguous before the store. */
static inline void tile8_2x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        /* load two adjacent tiles, from untiled */
        "vld1.8 {d0}, [%0], %r2;\n"
        "vld1.8 {d1}, [%0], %r2;\n"
        "vld1.8 {d2}, [%0], %r2;\n"
        "vld1.8 {d3}, [%0], %r2;\n"
        /* Transpose:
         * Start
         * [d0] x1 x0
         * [d1] x3 x2
         * [d2] x5 x4
         * [d3] x7 x6
         */
        "vtrn.32 d0, d1;\n"
        "vtrn.32 d2, d3;\n"
        /* [d0] x2 x0
         * [d1] x3 x1
         * [d2] x6 x4
         * [d3] x7 x5
         */
        "vswp d1, d2;\n"
        /* [d0] x2 x0
         * [d1] x6 x4
         * [d2] x3 x1
         * [d3] x7 x5
         */
        /* store two adjacent tiles, to tiled */
        "vstm %1, {d0-d3};\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory" clobber: asm reads *cpu and writes *gpu */
        : "q0", "q1", "memory");
}
/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==1,
 * four tiles (in X) at once. Loads four 16-byte rows and transposes 32-bit
 * lanes across q registers so the four tiles become contiguous. */
static inline void tile8_4x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        /* load four adjacent tiles, from untiled */
        "vld1.8 {d0,d1}, [%0], %r2;\n"
        "vld1.8 {d2,d3}, [%0], %r2;\n"
        "vld1.8 {d4,d5}, [%0], %r2;\n"
        "vld1.8 {d6,d7}, [%0], %r2;\n"
        /* Transpose:
         * Start
         * [q0] x3 x2 x1 x0
         * [q1] x7 x6 x5 x4
         * [q2] x11 x10 x9 x8
         * [q3] x15 x14 x13 x12
         */
        "vtrn.32 q0, q1;\n"
        "vtrn.32 q2, q3;\n"
        /* [q0] x6 x2 x4 x0
         * [q1] x7 x3 x5 x1
         * [q2] x14 x10 x12 x8
         * [q3] x15 x11 x13 x9
         */
        "vswp d1, d4;\n"
        "vswp d3, d6;\n"
        /* [q0] x12 x8 x4 x0
         * [q1] x13 x9 x5 x1
         * [q2] x14 x10 x6 x2
         * [q3] x15 x11 x7 x3
         */
        /* store four adjacent tiles, to tiled */
        "vstm %1, {q0, q1, q2, q3};\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory" clobber: asm reads *cpu and writes *gpu */
        : "q0", "q1", "q2", "q3", "memory");
}
/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==4.
 * Loads one tile contiguously from `gpu` and stores four 16-byte rows to
 * the linear `cpu` image (post-incrementing by cpu_stride). */
static inline void untile32_1x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        "vldm %1, {q0, q1, q2, q3};\n"
        "vst1.8 {d0,d1}, [%0], %r2;\n"
        "vst1.8 {d2,d3}, [%0], %r2;\n"
        "vst1.8 {d4,d5}, [%0], %r2;\n"
        "vst1.8 {d6,d7}, [%0], %r2;\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory" clobber: asm reads *gpu and writes *cpu */
        : "q0", "q1", "q2", "q3", "memory");
}
/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==4,
 * two consecutive tiles: loads both tiles with one vldm and scatters them
 * back as rows of two adjacent linear regions. */
static inline void untile32_2x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    /* second tile lands 16 bytes (4 pixels) to the right */
    void *cpunext = cpu + 16;
    __asm__ volatile (
        "vldm %2, {q0, q1, q2, q3, q4, q5, q6, q7};\n"
        "vst1.8 {d0,d1}, [%0], %r3;\n"
        "vst1.8 {d8,d9}, [%1], %r3;\n"
        "vst1.8 {d2,d3}, [%0], %r3;\n"
        "vst1.8 {d10,d11}, [%1], %r3;\n"
        "vst1.8 {d4,d5}, [%0], %r3;\n"
        "vst1.8 {d12,d13}, [%1], %r3;\n"
        "vst1.8 {d6,d7}, [%0], %r3;\n"
        "vst1.8 {d14,d15}, [%1], %r3;\n"
        : "=r"(cpu), "=r"(cpunext)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu), "1"(cpunext)
        /* "memory" clobber: asm reads *gpu and writes *cpu/*cpunext */
        : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
}
/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==4,
 * two consecutive tiles. Alt implementation: shuffles in registers (vswp)
 * and stores 32 bytes per linear row instead of two memory streams. */
static inline void untile32_2x_alt_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        "vldm %1, {q0, q1, q2, q3, q4, q5, q6, q7};\n"
        /* d0 d1 d8 d9 */
        /* d2 d3 d10 d11 */
        /* d4 d5 d12 d13 */
        /* d6 d7 d14 d15 */
        "vswp d2, d8\n"
        "vswp d3, d9\n"
        "vswp d12, d6\n"
        "vswp d13, d7\n"
        /* d0 d1 d2 d3 */
        /* d8 d9 d10 d11 */
        /* d4 d5 d6 d7 */
        /* d12 d13 d14 d15 */
        "vst1.8 {d0,d1,d2,d3}, [%0], %r2;\n"
        "vst1.8 {d8,d9,d10,d11}, [%0], %r2;\n"
        "vst1.8 {d4,d5,d6,d7}, [%0], %r2;\n"
        "vst1.8 {d12,d13,d14,d15}, [%0], %r2;\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory" clobber: asm reads *gpu and writes *cpu */
        : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
}
/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==2.
 * Loads one tile with vldm and stores four 8-byte rows to linear memory. */
static inline void untile16_1x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        "vldm %1, {q0, q1};\n"
        "vst1.8 {d0}, [%0], %r2;\n"
        "vst1.8 {d1}, [%0], %r2;\n"
        "vst1.8 {d2}, [%0], %r2;\n"
        "vst1.8 {d3}, [%0], %r2;\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory" clobber: asm reads *gpu and writes *cpu */
        : "q0", "q1", "memory");
}
/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==2,
 * two tiles (in X) at once. */
static inline void untile16_2x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    /* second tile lands 8 bytes (4 pixels) to the right */
    void *cpunext = cpu + 8;
    __asm__ volatile (
        /* load two adjacent tiles, tiled */
        "vldm %2, {q0, q1, q2, q3};\n"
        /* store two adjacent tiles, untiled */
        "vst1.8 {d0}, [%0], %r3;\n"
        "vst1.8 {d4}, [%1], %r3;\n"
        "vst1.8 {d1}, [%0], %r3;\n"
        "vst1.8 {d5}, [%1], %r3;\n"
        "vst1.8 {d2}, [%0], %r3;\n"
        "vst1.8 {d6}, [%1], %r3;\n"
        "vst1.8 {d3}, [%0], %r3;\n"
        "vst1.8 {d7}, [%1], %r3;\n"
        : "=r"(cpu), "=r"(cpunext)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu), "1"(cpunext)
        /* "memory" clobber: asm reads *gpu and writes *cpu/*cpunext */
        : "q0", "q1", "q2", "q3", "memory");
}
/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==1,
 * two tiles (in X) at once. Exact inverse of tile8_2x_impl: undoes the
 * register transpose before storing rows to linear memory. */
static inline void untile8_2x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        /* load two adjacent tiles, from tiled */
        "vldm %1, {d0-d3};\n"
        /* Transpose:
         * Start
         * [d0] x2 x0
         * [d1] x6 x4
         * [d2] x3 x1
         * [d3] x7 x5
         */
        "vswp d1, d2;\n"
        /* [d0] x2 x0
         * [d1] x3 x1
         * [d2] x6 x4
         * [d3] x7 x5
         */
        "vtrn.32 d0, d1;\n"
        "vtrn.32 d2, d3;\n"
        /* [d0] x1 x0
         * [d1] x3 x2
         * [d2] x5 x4
         * [d3] x7 x6
         */
        /* store two adjacent tiles, to untiled */
        "vst1.8 {d0}, [%0], %r2;\n"
        "vst1.8 {d1}, [%0], %r2;\n"
        "vst1.8 {d2}, [%0], %r2;\n"
        "vst1.8 {d3}, [%0], %r2;\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory" clobber: asm reads *gpu and writes *cpu */
        : "q0", "q1", "memory");
}
/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==1,
 * four tiles (in X) at once. Exact inverse of tile8_4x_impl: undoes the
 * register transpose before storing rows to linear memory. */
static inline void untile8_4x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
{
    __asm__ volatile (
        /* load four adjacent tiles, from tiled */
        "vldm %1, {q0, q1, q2, q3};\n"
        /* Transpose:
         * Start
         * [q0] x12 x8 x4 x0
         * [q1] x13 x9 x5 x1
         * [q2] x14 x10 x6 x2
         * [q3] x15 x11 x7 x3
         */
        "vswp d1, d4;\n"
        "vswp d3, d6;\n"
        /* [q0] x6 x2 x4 x0
         * [q1] x7 x3 x5 x1
         * [q2] x14 x10 x12 x8
         * [q3] x15 x11 x13 x9
         */
        "vtrn.32 q0, q1;\n"
        "vtrn.32 q2, q3;\n"
        /* [q0] x3 x2 x1 x0
         * [q1] x7 x6 x5 x4
         * [q2] x11 x10 x9 x8
         * [q3] x15 x14 x13 x12
         */
        /* store four adjacent tiles, to untiled */
        "vst1.8 {d0,d1}, [%0], %r2;\n"
        "vst1.8 {d2,d3}, [%0], %r2;\n"
        "vst1.8 {d4,d5}, [%0], %r2;\n"
        "vst1.8 {d6,d7}, [%0], %r2;\n"
        : "=r"(cpu)
        : "r"(gpu), "r"(cpu_stride), "0"(cpu)
        /* "memory" clobber: asm reads *gpu and writes *cpu */
        : "q0", "q1", "q2", "q3", "memory");
}
/*** Tile visitor functions */
/* Generate a tiling entry point `func` that walks the image in 4-line
 * strips, calling func##_impl once per group of `htiles` horizontal 4x4
 * tiles. `elmtsize` is in bits. gpu_stride is the byte stride of one row
 * of tiles (4 pixel lines). Arithmetic on void* relies on the GCC
 * extension sizeof(void)==1. NOTE(review): no partial-tile handling —
 * appears to assume width%(htiles*4)==0 and height%4==0; confirm for new
 * callers. */
#define TILE_FUNC(elmtsize,htiles,func) \
void func(void *gpu, const void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height) \
{ \
for (uint32_t y=0; y<height; y+=4) { \
void *gpu_tile = gpu; \
const void *cpu_tile = cpu; \
for (uint32_t x=0; x<width; x+=htiles*4) { \
func##_impl(gpu_tile, cpu_tile, cpu_stride);\
gpu_tile += htiles*elmtsize/8*16; \
cpu_tile += htiles*elmtsize/8*4; \
} \
gpu += gpu_stride; \
cpu += cpu_stride*4; \
} \
}
/* Generate an untiling entry point `func`: same traversal as TILE_FUNC but
 * with gpu as the (const) source and cpu as the destination. `elmtsize` is
 * in bits; gpu_stride is the byte stride of one row of tiles. NOTE(review):
 * like TILE_FUNC, no partial-tile handling — appears to assume
 * width%(htiles*4)==0 and height%4==0. */
#define UNTILE_FUNC(elmtsize,htiles,func) \
void func(const void *gpu, void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height) \
{ \
for (uint32_t y=0; y<height; y+=4) { \
const void *gpu_tile = gpu; \
void *cpu_tile = cpu; \
for (uint32_t x=0; x<width; x+=htiles*4) { \
func##_impl(gpu_tile, cpu_tile, cpu_stride);\
gpu_tile += htiles*elmtsize/8*16; \
cpu_tile += htiles*elmtsize/8*4; \
} \
gpu += gpu_stride; \
cpu += cpu_stride*4; \
} \
}
/* Instantiate the NEON tiling/untiling entry points. Arguments are
 * (element size in bits, horizontal tiles per inner call, function name);
 * each expansion calls the matching name##_impl NEON helper above. */
TILE_FUNC(32, 1, tile32_1x);
TILE_FUNC(32, 2, tile32_2x);
TILE_FUNC(32, 2, tile32_2x_alt);
TILE_FUNC(16, 1, tile16_1x);
TILE_FUNC(16, 2, tile16_2x);
TILE_FUNC(8, 2, tile8_2x);
TILE_FUNC(8, 4, tile8_4x);
UNTILE_FUNC(32, 1, untile32_1x);
UNTILE_FUNC(32, 2, untile32_2x);
UNTILE_FUNC(32, 2, untile32_2x_alt);
UNTILE_FUNC(16, 1, untile16_1x);
UNTILE_FUNC(16, 2, untile16_2x);
UNTILE_FUNC(8, 2, untile8_2x);
UNTILE_FUNC(8, 4, untile8_4x);
/*** Test and benchmarking framework */
typedef void (*tilefunc_t)(void *gpu, const void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height);
typedef void (*untilefunc_t)(const void *gpu, void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height);
/* Dump a width x height image of elmtsize-bit (8/16/32) elements at `ptr`
 * as hex to stdout, one image row per output line. Read-only, so the
 * pointer is now const-qualified. */
static void printx(const void *ptr, uint32_t width, uint32_t height, uint32_t elmtsize)
{
    switch (elmtsize) {
    case 32:
        for (uint32_t y = 0; y < height; ++y) {
            for (uint32_t x = 0; x < width; ++x) {
                printf("%03x ", ((const uint32_t *)ptr)[y*width + x]);
            }
            printf("\n");
        }
        break;
    case 16:
        for (uint32_t y = 0; y < height; ++y) {
            for (uint32_t x = 0; x < width; ++x) {
                printf("%03x ", ((const uint16_t *)ptr)[y*width + x]);
            }
            printf("\n");
        }
        break;
    case 8:
        for (uint32_t y = 0; y < height; ++y) {
            for (uint32_t x = 0; x < width; ++x) {
                printf("%02x ", ((const uint8_t *)ptr)[y*width + x]);
            }
            printf("\n");
        }
        break; /* was missing: last case fell out of the switch implicitly */
    default:
        /* previously silent for unsupported sizes */
        printf("printx: unhandled element size %u\n", (unsigned)elmtsize);
        break;
    }
}
/* Round-trip test: tile the test pattern in `cpu` into `gpu` with
 * tile_func, untile it back into `compare` with untile_func, and verify
 * the result matches the input, dumping all three buffers on mismatch.
 * Returns 0 on success, 1 on mismatch (previously always returned 0, so
 * failures could only be spotted in the output). */
int tile_test(uint32_t width, uint32_t height, uint32_t elmtsize, tilefunc_t tile_func, untilefunc_t untile_func, const char *name)
{
    uint32_t size = width*height*elmtsize/8;
    void *gpu = NULL;
    void *cpu = NULL;
    void *compare = NULL;
    uint32_t cpu_stride = width*elmtsize/8;
    /* gpu buffer stride convention: one row of 4x4 tiles spans 4 lines;
     * the wrappers under test divide this by 4 again for DO_TILE */
    uint32_t gpu_stride = width*elmtsize/8*4;
    int rc = 0;
    if (posix_memalign(&gpu, 64, size) ||
        posix_memalign(&cpu, 64, size) ||
        posix_memalign(&compare, 64, size)) {
        abort();
    }
    /* test pattern: element value = linear index, truncated to its width */
    switch (elmtsize) {
    case 32:
        for (uint32_t y = 0; y < height; ++y) {
            for (uint32_t x = 0; x < width; ++x) {
                ((uint32_t*)cpu)[y*width + x] = y*width + x;
            }
        }
        break;
    case 16:
        for (uint32_t y = 0; y < height; ++y) {
            for (uint32_t x = 0; x < width; ++x) {
                ((uint16_t*)cpu)[y*width + x] = y*width + x;
            }
        }
        break;
    case 8:
        for (uint32_t y = 0; y < height; ++y) {
            for (uint32_t x = 0; x < width; ++x) {
                ((uint8_t*)cpu)[y*width + x] = y*width + x;
            }
        }
        break;
    default:
        break; /* callers only pass 8/16/32 */
    }
    tile_func(gpu, cpu, gpu_stride, cpu_stride, width, height);
    untile_func(gpu, compare, gpu_stride, cpu_stride, width, height);
    if (memcmp(compare, cpu, size) != 0) {
        rc = 1;
        printf("MISMATCH in %s\n", name);
        printx(cpu, width, height, elmtsize);
        printf("->\n");
        printx(gpu, width, height, elmtsize);
        printf("->\n");
        printx(compare, width, height, elmtsize);
    }
    free(cpu);
    free(gpu);
    free(compare);
    return rc;
}
/* Elapsed time from *b to *e, in seconds, as a double. */
double timediff(struct timespec *b, struct timespec *e)
{
    double seconds = (double)(e->tv_sec - b->tv_sec);
    double nanos = (double)(e->tv_nsec - b->tv_nsec);
    return seconds + nanos * 1e-9;
}
typedef void (*benchfunc_t)(void *gpu, void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height);
/* Benchmark bench_func moving a width x height image of elmtsize-bit
 * elements between a cached CPU buffer and an uncached etnaviv GPU buffer;
 * prints the mean wall-clock time over `reps` repetitions. Returns 0. */
int tile_bench(struct etna_device *dev, uint32_t width, uint32_t height, uint32_t elmtsize, benchfunc_t bench_func, const char *name)
{
    uint32_t size = width*height*elmtsize/8;
    void *gpu = NULL;
    void *cpu = NULL;
    uint32_t cpu_stride = width*elmtsize/8;
    uint32_t gpu_stride = width*elmtsize/8*4;
    struct etna_bo *bmp;
    if (posix_memalign(&cpu, 64, size)) {
        abort();
    }
    bmp = etna_bo_new(dev, size, DRM_ETNA_GEM_CACHE_UNCACHED);
    if (!bmp) { /* was unchecked: NULL here crashed inside etna_bo_map */
        fprintf(stderr, "tile_bench: unable to allocate GPU buffer\n");
        abort();
    }
    gpu = etna_bo_map(bmp);
    if (!gpu) { /* was unchecked: bench_func would deref NULL */
        fprintf(stderr, "tile_bench: unable to map GPU buffer\n");
        abort();
    }
#if 0
    memset(gpu, 0xaa, size);
    memset(cpu, 0x55, size);
    bench_func(gpu, cpu, gpu_stride, cpu_stride, width, height); /* warm up cache */
#endif
    const int reps = 20;
    struct timespec tvm_b, tvm_e;
    printf("[%-16s] ", name);
    clock_gettime(CLOCK_MONOTONIC, &tvm_b);
    for (int rep = 0; rep < reps; ++rep) {
        bench_func(gpu, cpu, gpu_stride, cpu_stride, width, height);
    }
    clock_gettime(CLOCK_MONOTONIC, &tvm_e);
    double mtime = timediff(&tvm_b, &tvm_e) / reps;
    printf("%.f us", mtime*1e6);
    printf("\n");
    free(cpu);
    etna_bo_del(bmp);
    return 0;
}
/*
 * Entry point: correctness-test each NEON specialization against the C
 * reference implementation (round-trip through the opposite direction),
 * then benchmark all implementations against an uncached etnaviv GPU
 * buffer. Error messages now go to stderr (were written to stdout).
 */
int main(void)
{
    /* test */
    uint32_t twidth = 128;
    uint32_t theight = 256;
    tile_test(twidth, theight, 32, base_tile32, untile32_1x, "untile32_1x");
    tile_test(twidth, theight, 32, tile32_1x, base_untile32, "tile32_1x");
    tile_test(twidth, theight, 32, tile32_2x, base_untile32, "tile32_2x");
    tile_test(twidth, theight, 32, tile32_2x_alt, base_untile32, "tile32_2x_alt");
    tile_test(twidth, theight, 32, base_tile32, untile32_2x, "untile32_2x");
    tile_test(twidth, theight, 32, base_tile32, untile32_2x_alt, "untile32_2x_alt");
    tile_test(twidth, theight, 16, base_tile16, untile16_1x, "untile16_1x");
    tile_test(twidth, theight, 16, tile16_1x, base_untile16, "tile16_1x");
    tile_test(twidth, theight, 16, base_tile16, untile16_2x, "untile16_2x");
    tile_test(twidth, theight, 16, tile16_2x, base_untile16, "tile16_2x");
    tile_test(twidth, theight, 8, base_tile8, untile8_2x, "untile8_2x");
    tile_test(twidth, theight, 8, tile8_2x, base_untile8, "tile8_2x");
    tile_test(twidth, theight, 8, base_tile8, untile8_4x, "untile8_4x");
    tile_test(twidth, theight, 8, tile8_4x, base_untile8, "tile8_4x");
    /* benchmark */
    uint32_t bwidth = 2048;
    uint32_t bheight = 1024;
    drmVersionPtr version;
    struct etna_device *dev;
    int fd = open(drm_device_name, O_RDWR);
    if (fd < 0) {
        fprintf(stderr, "Unable to open %s\n", drm_device_name);
        abort();
    }
    version = drmGetVersion(fd);
    if (version) {
        printf("Version: %d.%d.%d\n", version->version_major,
               version->version_minor, version->version_patchlevel);
        printf(" Name: %s\n", version->name);
        printf(" Date: %s\n", version->date);
        printf(" Description: %s\n", version->desc);
        drmFreeVersion(version);
    }
    dev = etna_device_new(fd);
    if (!dev) {
        fprintf(stderr, "Unable to create device\n");
        abort();
    }
    /* NOTE(review): casting functions whose pointer parameters differ only
     * in const-ness to benchfunc_t and calling through the cast is
     * technically undefined behavior; it works with this toolchain, but
     * trampoline wrappers would be strictly conforming. */
    tile_bench(dev, bwidth, bheight, 8, (benchfunc_t)base_tile8, "base_tile8");
    tile_bench(dev, bwidth, bheight, 8, (benchfunc_t)tile8_2x, "tile8_2x");
    tile_bench(dev, bwidth, bheight, 8, (benchfunc_t)tile8_4x, "tile8_4x");
    printf("\n");
    tile_bench(dev, bwidth, bheight, 16, (benchfunc_t)base_tile16, "base_tile16");
    tile_bench(dev, bwidth, bheight, 16, (benchfunc_t)tile16_1x, "tile16_1x");
    tile_bench(dev, bwidth, bheight, 16, (benchfunc_t)tile16_2x, "tile16_2x");
    printf("\n");
    tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)base_tile32, "base_tile32");
    tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)tile32_1x, "tile32_1x");
    tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)tile32_2x, "tile32_2x");
    tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)tile32_2x_alt, "tile32_2x_alt");
    printf("\n");
    tile_bench(dev, bwidth, bheight, 8, (benchfunc_t)base_untile8, "base_untile8");
    tile_bench(dev, bwidth, bheight, 8, (benchfunc_t)untile8_2x, "untile8_2x");
    tile_bench(dev, bwidth, bheight, 8, (benchfunc_t)untile8_4x, "untile8_4x");
    printf("\n");
    tile_bench(dev, bwidth, bheight, 16, (benchfunc_t)base_untile16, "base_untile16");
    tile_bench(dev, bwidth, bheight, 16, (benchfunc_t)untile16_1x, "untile16_1x");
    tile_bench(dev, bwidth, bheight, 16, (benchfunc_t)untile16_2x, "untile16_2x");
    printf("\n");
    tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)base_untile32, "base_untile32");
    tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)untile32_1x, "untile32_1x");
    tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)untile32_2x, "untile32_2x");
    tile_bench(dev, bwidth, bheight, 32, (benchfunc_t)untile32_2x_alt, "untile32_2x_alt");
    printf("\n");
    etna_device_del(dev);
    close(fd);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment