cleverca22/simple-test.cpp Secret

## simple-test.cpp
#include "vpu-support.h"
#include <stdio.h>
#include <stdint.h>

// Print a 16x16 window of the register file to stdout, labelled with `name`.
//
// T    element type to read back; sizes 1, 2 and 4 are printed as 2, 4 and 8
//      hex digits respectively, any other size prints no cells.
// x    first row of the window (used for the "row" labels).
// y    first column of the window (used for the "col" labels).
//
// The window is fetched with a single vst<T>() call: 16 horizontal vectors,
// x advancing after each one, packed 16 elements apart in a local buffer.
template <typename T, int x, int y> static void dump_matrix(const char *name) {
  enum { kDim = 16 };
  T cells[kDim * kDim];
  vst<T>(x, y, cells, true, false, kDim, true, kDim);

  // Pick the per-cell format once; it depends only on the element size.
  const char *cell_fmt = NULL;
  if (sizeof(T) == 1) {
    cell_fmt = "0x%02x ";
  } else if (sizeof(T) == 2) {
    cell_fmt = "0x%04x ";
  } else if (sizeof(T) == 4) {
    cell_fmt = "0x%08x ";
  }

  printf("%s\n", name);
  printf("col   ");
  for (int c = 0; c < kDim; ++c) printf("%5d", y + c);
  puts("");

  const T *cell = cells;
  for (int r = 0; r < kDim; ++r) {
    printf("row %2d:", x + r);
    for (int c = 0; c < kDim; ++c, ++cell) {
      if (cell_fmt) printf(cell_fmt, *cell);
    }
    puts("");
  }
}

// Smoke test: load the 16-bit values 1..16 as a horizontal vector at (0,0),
// multiply each lane by the immediate 0x1000 into the vector at (2,0), print
// the accumulated cycle counter, then dump the 16x16 window at (0,0) as
// 32-bit cells.
int main(int argc, char **argv) {
  (void)argc;
  (void)argv;

  uint16_t lanes[16];
  for (int i = 0; i < 16; i++) lanes[i] = (uint16_t)(i + 1);

  // A second operand vector (all 2s, loaded at (1,0)) was used for the
  // non-immediate multiply and is currently disabled.
  vld(0,0, lanes, false,false, 1, true, 16);
  vmul32uu_imm(2,0, 0,0, 0x1000);

  printf("clock cycles spent: %d\n", cycles_spent);
  dump_matrix<uint32_t,0,0>("multed");
  return 0;
}

## vpu-support-native.h
#pragma once
#include <assert.h>

// Cycle counter that the test program prints as "clock cycles spent".
// Nothing in the visible code of this native header modifies it.
// NOTE(review): this is a definition, not a declaration; including this header
// from more than one translation unit would define the symbol twice.
int cycles_spent = 0;

// Emit a single VPU vector-load instruction via inline assembly.
//
// T           element type; its width in bits (sizeof(T) * 8) is pasted into
//             the mnemonic and the register selector.
// x, y        register-file coordinates; passed as immediate ("i") operands,
//             so they must be compile-time constants once this is inlined.
// src         address of the data to load.
// rep         must be 1; repeated loads are not implemented and assert.
// xinc, yinc, horizontal, stride
//             accepted for signature parity with the other vld
//             implementation but ignored here.
template <typename T> static inline void vld(int x, int y, const T *src, bool xinc, bool yinc, int rep, bool horizontal, int stride) {
  (void)xinc;
  (void)yinc;
  (void)horizontal;
  (void)stride;
  if (rep == 1) { // no repetition
    // Pass the pointer value itself (not the address of the local parameter),
    // and clobber "memory" so the compiler has written the source data out
    // before the instruction reads it.
    asm volatile ("v%[width]ld H%[width](%[x], %[y]), (%[src])":
        :[width] "i" (sizeof(T) * 8)
        ,[x] "i" (x)
        ,[y] "i" (y)
        ,[src] "r"(src)
        :"memory");
  } else {
    assert(0);
  }
}

// Emit a VPU vector-store instruction via inline assembly.
//
// T           element type; its width in bits (sizeof(T) * 8) is pasted into
//             the mnemonic and the register selector.
// x, y        register-file coordinates; immediate ("i") operands, so they
//             must be compile-time constants once this is inlined.
// dst         address the data is stored to.
// rep         1 emits a plain store; any other value emits the REP<rep> form
//             (rep is then also an immediate operand).
// stride      element stride for the repeated form; converted to bytes
//             (stride * sizeof(T)) before being handed to the instruction.
// xinc, yinc, horizontal
//             accepted for signature parity with the other vst
//             implementation but ignored here.
template <typename T> static inline void vst(int x, int y, T *dst, bool xinc, bool yinc, int rep, bool horizontal, int stride) {
  (void)xinc;
  (void)yinc;
  (void)horizontal;
  // Both forms pass the pointer value itself (not the address of the local
  // parameter) and clobber "memory", because the instruction writes through
  // dst behind the compiler's back.
  if (rep == 1) { // no repetition
    asm volatile ("v%[width]st H%[width](%[x], %[y]), (%[dest])":
        :[width] "i" (sizeof(T) * 8)
        ,[x] "i" (x)
        ,[y] "i" (y)
        ,[dest] "r"(dst)
        :"memory");
  } else {
    asm volatile ("v%[width]st H%[width](%[x], %[y]), (%[dest]+=%[stride]) REP%[rep]":
        :[width] "i" (sizeof(T) * 8)
        ,[x] "i" (x)
        ,[y] "i" (y)
        ,[dest] "r"(dst)
        ,[rep] "i" (rep)
        ,[stride] "r" (stride * sizeof(T))
        :"memory");
  }
}

## vpu-support-purec.h
#pragma once

#include <stdint.h>
#include <stdbool.h>

// Emulated register file: a 64x64 grid of bytes, one instance per thread.
__thread uint8_t matrix[64][64];
// Per-thread running total of emulated cycles, incremented by the operations
// defined in this header.
__thread int cycles_spent = 0;

// Split a 16-bit / 32-bit value into bytes and store them in row x at columns
// y, y+16 (and y+32, y+48 for the 32-bit form), least-significant byte first.
// Arguments are parenthesized so expressions such as `a | b` are split
// correctly, and the body is wrapped in do/while(0) so the macro behaves as a
// single statement (safe in an unbraced if/else). Each argument is evaluated
// more than once, so avoid side effects in them.
#define MATRIX16_WRITE(x,y,value) do { matrix[(x)][(y)] = (value) & 0xff; matrix[(x)][(y)+16] = ((value) >> 8) & 0xff; } while (0)
#define MATRIX32_WRITE(x,y,value) do { matrix[(x)][(y)] = (value) & 0xff; matrix[(x)][(y)+16] = ((value) >> 8) & 0xff; matrix[(x)][(y)+32] = ((value) >> 16) & 0xff; matrix[(x)][(y)+48] = ((value) >> 24) & 0xff; } while (0)

// Store `value` into the emulated register file at (x, y).
// A one-byte T occupies the single cell matrix[x][y]; two- and four-byte
// values are split across columns by MATRIX16_WRITE / MATRIX32_WRITE.
// Any other sizeof(T) writes nothing.
template <typename T> static inline void matrix_write(int x, int y, T value) {
  const unsigned width = sizeof(value);
  if (width == 1) {
    matrix[x][y] = value;
  } else if (width == 2) {
    MATRIX16_WRITE(x,y,value);
  } else if (width == 4) {
    MATRIX32_WRITE(x,y,value);
  }
}

// Read a value of type T from the emulated register file at (x, y).
// A one-byte T is the single cell matrix[x][y]; wider values are reassembled
// from row x at columns y, y+16 (and y+32, y+48 for four bytes),
// least-significant byte first. Returns T() for any other sizeof(T).
template <typename T> static inline T matrix_read(int x, int y) {
  switch (sizeof(T)) {
    case 1:
      return matrix[x][y];
    case 2:
      return static_cast<uint32_t>(matrix[x][y])
           | (static_cast<uint32_t>(matrix[x][y+16]) << 8);
    case 4:
      // Widen each byte to uint32_t before shifting: a uint8_t promotes to a
      // signed int, and shifting a byte >= 0x80 left by 24 would overflow it.
      return static_cast<uint32_t>(matrix[x][y])
           | (static_cast<uint32_t>(matrix[x][y+16]) << 8)
           | (static_cast<uint32_t>(matrix[x][y+32]) << 16)
           | (static_cast<uint32_t>(matrix[x][y+48]) << 24);
  }
  // Unsupported element size: return a defined value instead of falling off
  // the end of a non-void function.
  return T();
}

// Emulated vector load: copy `rep` groups of 16 elements from `src` into the
// register file, starting at (x, y).
//
// horizontal  true: the 16 elements go to (x, y..y+15);
//             false: they go to (x..x+15, y).
// stride      distance in elements between consecutive groups in `src`.
// xinc, yinc  when set, x / y is advanced by one after each group.
//
// Adds 11 to cycles_spent per call plus 2 per group.
template <typename T> static inline void vld(int x, int y, const T *src, bool xinc, bool yinc, int rep, bool horizontal, int stride) {
  cycles_spent += 11;
  // Per-lane step: horizontal vectors walk along y, vertical ones along x.
  const int dx = horizontal ? 0 : 1;
  const int dy = horizontal ? 1 : 0;
  for (int pass = 0; pass < rep; ++pass) {
    cycles_spent += 2;
    const int base = stride * pass;
    for (int lane = 0; lane < 16; ++lane) {
      matrix_write(x + dx * lane, y + dy * lane, src[base + lane]);
    }
    if (xinc) ++x;
    if (yinc) ++y;
  }
}

// Emulated vector store: copy `rep` groups of 16 elements from the register
// file, starting at (x, y), into `dst`.
//
// horizontal  true: each group is read from (x, y..y+15);
//             false: it is read from (x..x+15, y).
// stride      distance in elements between consecutive groups in `dst`.
// xinc, yinc  when set, x / y is advanced by one after each group.
//
// Does not touch cycles_spent.
template <typename T> static inline void vst(int x, int y, T *dst, bool xinc, bool yinc, int rep, bool horizontal, int stride) {
  // Per-lane step: horizontal vectors walk along y, vertical ones along x.
  const int dx = horizontal ? 0 : 1;
  const int dy = horizontal ? 1 : 0;
  for (int pass = 0; pass < rep; ++pass) {
    const int base = stride * pass;
    for (int lane = 0; lane < 16; ++lane) {
      dst[base + lane] = matrix_read<T>(x + dx * lane, y + dy * lane);
    }
    if (xinc) ++x;
    if (yinc) ++y;
  }
}

// Emulated lane-wise multiply of two horizontal vectors.
// For each of the 16 lanes, the 16-bit values at (xA, yA+i) and (xB, yB+i)
// are read as unsigned, multiplied in 32 bits, and the product is written as
// a 32-bit value at (xD, yD+i). Adds 2 to cycles_spent.
static inline void vmul32uu(int xD, int yD, int xA, int yA, int xB, int yB) {
  int lane = 0;
  while (lane < 16) {
    const uint32_t lhs = matrix_read<uint16_t>(xA, yA + lane);
    const uint32_t rhs = matrix_read<uint16_t>(xB, yB + lane);
    matrix_write<uint32_t>(xD, yD + lane, lhs * rhs);
    ++lane;
  }
  cycles_spent += 2;
}

// Emulated lane-wise multiply of a horizontal vector by an immediate.
// For each of the 16 lanes, the 16-bit value at (xA, yA+i) is read as
// unsigned, multiplied by `b` in 32 bits, and the product is written as a
// 32-bit value at (xD, yD+i). Adds 2 to cycles_spent.
static inline void vmul32uu_imm(int xD, int yD, int xA, int yA, uint16_t b) {
  const uint32_t factor = b;
  int lane = 0;
  while (lane < 16) {
    const uint32_t lhs = matrix_read<uint16_t>(xA, yA + lane);
    matrix_write<uint32_t>(xD, yD + lane, lhs * factor);
    ++lane;
  }
  cycles_spent += 2;
}
	#include "vpu-support.h"
	#include <stdio.h>
	#include <stdint.h>

	// Print a 16x16 window of the register file to stdout, labelled with `name`.
	//
	// T    element type to read back; sizes 1, 2 and 4 are printed as 2, 4 and 8
	//      hex digits respectively, any other size prints no cells.
	// x    first row of the window (used for the "row" labels).
	// y    first column of the window (used for the "col" labels).
	//
	// The window is fetched with a single vst<T>() call: 16 horizontal vectors,
	// x advancing after each one, packed 16 elements apart in a local buffer.
	template <typename T, int x, int y> static void dump_matrix(const char *name) {
	  enum { kDim = 16 };
	  T cells[kDim * kDim];
	  vst<T>(x, y, cells, true, false, kDim, true, kDim);

	  // Pick the per-cell format once; it depends only on the element size.
	  const char *cell_fmt = NULL;
	  if (sizeof(T) == 1) {
	    cell_fmt = "0x%02x ";
	  } else if (sizeof(T) == 2) {
	    cell_fmt = "0x%04x ";
	  } else if (sizeof(T) == 4) {
	    cell_fmt = "0x%08x ";
	  }

	  printf("%s\n", name);
	  printf("col ");
	  for (int c = 0; c < kDim; ++c) printf("%5d", y + c);
	  puts("");

	  const T *cell = cells;
	  for (int r = 0; r < kDim; ++r) {
	    printf("row %2d:", x + r);
	    for (int c = 0; c < kDim; ++c, ++cell) {
	      if (cell_fmt) printf(cell_fmt, *cell);
	    }
	    puts("");
	  }
	}

	// Smoke test: load the 16-bit values 1..16 as a horizontal vector at (0,0),
	// multiply each lane by the immediate 0x1000 into the vector at (2,0), print
	// the accumulated cycle counter, then dump the 16x16 window at (0,0) as
	// 32-bit cells.
	int main(int argc, char **argv) {
	  (void)argc;
	  (void)argv;

	  uint16_t lanes[16];
	  for (int i = 0; i < 16; i++) lanes[i] = (uint16_t)(i + 1);

	  // A second operand vector (all 2s, loaded at (1,0)) was used for the
	  // non-immediate multiply and is currently disabled.
	  vld(0,0, lanes, false,false, 1, true, 16);
	  vmul32uu_imm(2,0, 0,0, 0x1000);

	  printf("clock cycles spent: %d\n", cycles_spent);
	  dump_matrix<uint32_t,0,0>("multed");
	  return 0;
	}
	#pragma once
	#include <assert.h>

	// Cycle counter that the test program prints as "clock cycles spent".
	// Nothing in the visible code of this native header modifies it.
	// NOTE(review): this is a definition, not a declaration; including this header
	// from more than one translation unit would define the symbol twice.
	int cycles_spent = 0;

	// Emit a single VPU vector-load instruction via inline assembly.
	//
	// T           element type; its width in bits (sizeof(T) * 8) is pasted into
	//             the mnemonic and the register selector.
	// x, y        register-file coordinates; passed as immediate ("i") operands,
	//             so they must be compile-time constants once this is inlined.
	// src         address of the data to load.
	// rep         must be 1; repeated loads are not implemented and assert.
	// xinc, yinc, horizontal, stride
	//             accepted for signature parity with the other vld
	//             implementation but ignored here.
	template <typename T> static inline void vld(int x, int y, const T *src, bool xinc, bool yinc, int rep, bool horizontal, int stride) {
	  (void)xinc;
	  (void)yinc;
	  (void)horizontal;
	  (void)stride;
	  if (rep == 1) { // no repetition
	    // Pass the pointer value itself (not the address of the local parameter),
	    // and clobber "memory" so the compiler has written the source data out
	    // before the instruction reads it.
	    asm volatile ("v%[width]ld H%[width](%[x], %[y]), (%[src])":
	        :[width] "i" (sizeof(T) * 8)
	        ,[x] "i" (x)
	        ,[y] "i" (y)
	        ,[src] "r"(src)
	        :"memory");
	  } else {
	    assert(0);
	  }
	}

	// Emit a VPU vector-store instruction via inline assembly.
	//
	// T           element type; its width in bits (sizeof(T) * 8) is pasted into
	//             the mnemonic and the register selector.
	// x, y        register-file coordinates; immediate ("i") operands, so they
	//             must be compile-time constants once this is inlined.
	// dst         address the data is stored to.
	// rep         1 emits a plain store; any other value emits the REP<rep> form
	//             (rep is then also an immediate operand).
	// stride      element stride for the repeated form; converted to bytes
	//             (stride * sizeof(T)) before being handed to the instruction.
	// xinc, yinc, horizontal
	//             accepted for signature parity with the other vst
	//             implementation but ignored here.
	template <typename T> static inline void vst(int x, int y, T *dst, bool xinc, bool yinc, int rep, bool horizontal, int stride) {
	  (void)xinc;
	  (void)yinc;
	  (void)horizontal;
	  // Both forms pass the pointer value itself (not the address of the local
	  // parameter) and clobber "memory", because the instruction writes through
	  // dst behind the compiler's back.
	  if (rep == 1) { // no repetition
	    asm volatile ("v%[width]st H%[width](%[x], %[y]), (%[dest])":
	        :[width] "i" (sizeof(T) * 8)
	        ,[x] "i" (x)
	        ,[y] "i" (y)
	        ,[dest] "r"(dst)
	        :"memory");
	  } else {
	    asm volatile ("v%[width]st H%[width](%[x], %[y]), (%[dest]+=%[stride]) REP%[rep]":
	        :[width] "i" (sizeof(T) * 8)
	        ,[x] "i" (x)
	        ,[y] "i" (y)
	        ,[dest] "r"(dst)
	        ,[rep] "i" (rep)
	        ,[stride] "r" (stride * sizeof(T))
	        :"memory");
	  }
	}
	#pragma once

	#include <stdint.h>
	#include <stdbool.h>

	// Emulated register file: a 64x64 grid of bytes, one instance per thread.
	__thread uint8_t matrix[64][64];
	// Per-thread running total of emulated cycles, incremented by the operations
	// defined in this header.
	__thread int cycles_spent = 0;

	// Split a 16-bit / 32-bit value into bytes and store them in row x at columns
	// y, y+16 (and y+32, y+48 for the 32-bit form), least-significant byte first.
	// Arguments are parenthesized so expressions such as `a | b` are split
	// correctly, and the body is wrapped in do/while(0) so the macro behaves as a
	// single statement (safe in an unbraced if/else). Each argument is evaluated
	// more than once, so avoid side effects in them.
	#define MATRIX16_WRITE(x,y,value) do { matrix[(x)][(y)] = (value) & 0xff; matrix[(x)][(y)+16] = ((value) >> 8) & 0xff; } while (0)
	#define MATRIX32_WRITE(x,y,value) do { matrix[(x)][(y)] = (value) & 0xff; matrix[(x)][(y)+16] = ((value) >> 8) & 0xff; matrix[(x)][(y)+32] = ((value) >> 16) & 0xff; matrix[(x)][(y)+48] = ((value) >> 24) & 0xff; } while (0)

	// Store `value` into the emulated register file at (x, y).
	// A one-byte T occupies the single cell matrix[x][y]; two- and four-byte
	// values are split across columns by MATRIX16_WRITE / MATRIX32_WRITE.
	// Any other sizeof(T) writes nothing.
	template <typename T> static inline void matrix_write(int x, int y, T value) {
	  const unsigned width = sizeof(value);
	  if (width == 1) {
	    matrix[x][y] = value;
	  } else if (width == 2) {
	    MATRIX16_WRITE(x,y,value);
	  } else if (width == 4) {
	    MATRIX32_WRITE(x,y,value);
	  }
	}

	// Read a value of type T from the emulated register file at (x, y).
	// A one-byte T is the single cell matrix[x][y]; wider values are reassembled
	// from row x at columns y, y+16 (and y+32, y+48 for four bytes),
	// least-significant byte first. Returns T() for any other sizeof(T).
	template <typename T> static inline T matrix_read(int x, int y) {
	  switch (sizeof(T)) {
	    case 1:
	      return matrix[x][y];
	    case 2:
	      return static_cast<uint32_t>(matrix[x][y])
	           | (static_cast<uint32_t>(matrix[x][y+16]) << 8);
	    case 4:
	      // Widen each byte to uint32_t before shifting: a uint8_t promotes to a
	      // signed int, and shifting a byte >= 0x80 left by 24 would overflow it.
	      return static_cast<uint32_t>(matrix[x][y])
	           | (static_cast<uint32_t>(matrix[x][y+16]) << 8)
	           | (static_cast<uint32_t>(matrix[x][y+32]) << 16)
	           | (static_cast<uint32_t>(matrix[x][y+48]) << 24);
	  }
	  // Unsupported element size: return a defined value instead of falling off
	  // the end of a non-void function.
	  return T();
	}

	// Emulated vector load: copy `rep` groups of 16 elements from `src` into the
	// register file, starting at (x, y).
	//
	// horizontal  true: the 16 elements go to (x, y..y+15);
	//             false: they go to (x..x+15, y).
	// stride      distance in elements between consecutive groups in `src`.
	// xinc, yinc  when set, x / y is advanced by one after each group.
	//
	// Adds 11 to cycles_spent per call plus 2 per group.
	template <typename T> static inline void vld(int x, int y, const T *src, bool xinc, bool yinc, int rep, bool horizontal, int stride) {
	  cycles_spent += 11;
	  // Per-lane step: horizontal vectors walk along y, vertical ones along x.
	  const int dx = horizontal ? 0 : 1;
	  const int dy = horizontal ? 1 : 0;
	  for (int pass = 0; pass < rep; ++pass) {
	    cycles_spent += 2;
	    const int base = stride * pass;
	    for (int lane = 0; lane < 16; ++lane) {
	      matrix_write(x + dx * lane, y + dy * lane, src[base + lane]);
	    }
	    if (xinc) ++x;
	    if (yinc) ++y;
	  }
	}

	// Emulated vector store: copy `rep` groups of 16 elements from the register
	// file, starting at (x, y), into `dst`.
	//
	// horizontal  true: each group is read from (x, y..y+15);
	//             false: it is read from (x..x+15, y).
	// stride      distance in elements between consecutive groups in `dst`.
	// xinc, yinc  when set, x / y is advanced by one after each group.
	//
	// Does not touch cycles_spent.
	template <typename T> static inline void vst(int x, int y, T *dst, bool xinc, bool yinc, int rep, bool horizontal, int stride) {
	  // Per-lane step: horizontal vectors walk along y, vertical ones along x.
	  const int dx = horizontal ? 0 : 1;
	  const int dy = horizontal ? 1 : 0;
	  for (int pass = 0; pass < rep; ++pass) {
	    const int base = stride * pass;
	    for (int lane = 0; lane < 16; ++lane) {
	      dst[base + lane] = matrix_read<T>(x + dx * lane, y + dy * lane);
	    }
	    if (xinc) ++x;
	    if (yinc) ++y;
	  }
	}

	// Emulated lane-wise multiply of two horizontal vectors.
	// For each of the 16 lanes, the 16-bit values at (xA, yA+i) and (xB, yB+i)
	// are read as unsigned, multiplied in 32 bits, and the product is written as
	// a 32-bit value at (xD, yD+i). Adds 2 to cycles_spent.
	static inline void vmul32uu(int xD, int yD, int xA, int yA, int xB, int yB) {
	  int lane = 0;
	  while (lane < 16) {
	    const uint32_t lhs = matrix_read<uint16_t>(xA, yA + lane);
	    const uint32_t rhs = matrix_read<uint16_t>(xB, yB + lane);
	    matrix_write<uint32_t>(xD, yD + lane, lhs * rhs);
	    ++lane;
	  }
	  cycles_spent += 2;
	}

	// Emulated lane-wise multiply of a horizontal vector by an immediate.
	// For each of the 16 lanes, the 16-bit value at (xA, yA+i) is read as
	// unsigned, multiplied by `b` in 32 bits, and the product is written as a
	// 32-bit value at (xD, yD+i). Adds 2 to cycles_spent.
	static inline void vmul32uu_imm(int xD, int yD, int xA, int yA, uint16_t b) {
	  const uint32_t factor = b;
	  int lane = 0;
	  while (lane < 16) {
	    const uint32_t lhs = matrix_read<uint16_t>(xA, yA + lane);
	    matrix_write<uint32_t>(xD, yD + lane, lhs * factor);
	    ++lane;
	  }
	  cycles_spent += 2;
	}