-
-
Save cleverca22/79143cb23a50d572b9d527c9ea479492 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "vpu-support.h" | |
#include <stdio.h> | |
#include <stdint.h> | |
// Stores a 16x16 block of T-sized elements starting at (x, y) via vst() into a
// local buffer and prints it as a table.
//
// Output layout: the name on its own line, a "col" header listing y..y+15,
// then one "row N:" line per row with each element printed in hex, zero-padded
// to the width of T (2, 4 or 8 hex digits). Elements of any other size are
// not printed.
template <typename T, int x, int y> static void dump_matrix(const char *name) {
  enum { kSide = 16 };

  // Row-major staging buffer: element (row, col) lives at cells[row * 16 + col].
  T cells[kSide * kSide];
  vst<T>(x, y, cells, true, false, 16, true, 16);

  printf("%s\n", name);

  printf("col ");
  for (int c = 0; c < kSide; c++) {
    printf("%5d", y + c);
  }
  puts("");

  // Walk the buffer linearly; every 16 elements make up one printed row.
  const T *cell = cells;
  for (int r = 0; r < kSide; r++) {
    printf("row %2d:", x + r);
    for (const T *row_end = cell + kSide; cell != row_end; cell++) {
      if (sizeof(T) == 1) {
        printf("0x%02x ", *cell);
      } else if (sizeof(T) == 2) {
        printf("0x%04x ", *cell);
      } else if (sizeof(T) == 4) {
        printf("0x%08x ", *cell);
      }
    }
    puts("");
  }
}
// Small driver: loads a 1..16 ramp of 16-bit values at (0, 0), multiplies it by
// the immediate 0x1000 into (2, 0), then reports the cycle counter and dumps
// the 16x16 block of 32-bit elements starting at (0, 0).
int main(int argc, char **argv) {
  const uint16_t ramp[16] = {
      1,  2,  3,  4,  5,  6,  7,  8,
      9, 10, 11, 12, 13, 14, 15, 16,
  };
  // A second, all-2s operand used to be loaded at (1, 0); it is currently
  // disabled in favour of the immediate multiply below:
  //   const uint16_t b[16] = {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2};
  //   vld(1,0, b, false,false, 1, true, 16);

  // Single (rep == 1) horizontal load, no x/y auto-increment.
  vld(0, 0, ramp, false, false, 1, true, 16);

  vmul32uu_imm(2, 0, 0, 0, 0x1000);

  printf("clock cycles spent: %d\n", cycles_spent);
  dump_matrix<uint32_t, 0, 0>("multed");
  return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include <assert.h> | |
// Cycle counter that callers read (main prints it). The vld()/vst() wrappers
// that follow in this header never modify it, so with this header it
// presumably stays 0 -- TODO confirm.
// NOTE(review): this is a non-inline global defined in a header; including the
// header from more than one translation unit would define it twice.
int cycles_spent = 0; | |
// Emits a single vector-load instruction that loads from memory at `src` into
// the location H<width>(x, y), where <width> is the bit width of T.
//
// x, y   - destination coordinates. They are passed with the "i" (immediate)
//          constraint, so they must be compile-time constants by the time the
//          asm is emitted (i.e. after this function has been inlined).
// src    - address to load from.
// rep    - only 1 (no repetition) is implemented; any other value asserts.
// xinc, yinc, horizontal, stride - accepted for signature parity but unused.
template <typename T> static inline void vld(int x, int y, const T *src, bool xinc, bool yinc, int rep, bool horizontal, int stride) {
  if (rep == 1) { // no repetition
    // Pass the pointer value itself: `&src` would be the address of the
    // parameter, not of the data. The "memory" clobber tells the compiler the
    // asm reads memory that is not listed as an operand, so pending stores to
    // the source buffer are not reordered past or dropped before the load.
    asm volatile ("v%[width]ld H%[width](%[x], %[y]), (%[src])"
        :
        : [width] "i" (sizeof(T) * 8)
        , [x] "i" (x)
        , [y] "i" (y)
        , [src] "r" (src)
        : "memory");
  } else {
    assert(0);
  }
}
// Emits a vector-store instruction that stores the location H<width>(x, y) to
// memory at `dst`, where <width> is the bit width of T.
//
// x, y   - source coordinates; passed with the "i" (immediate) constraint, so
//          they must be compile-time constants once this function is inlined.
// dst    - address to store to.
// rep    - 1 emits a plain store; any other value emits the REP<rep> form,
//          which post-increments the address by `stride` elements
//          (stride * sizeof(T) bytes) per repetition. `rep` is also an
//          immediate operand.
// xinc, yinc, horizontal - accepted for signature parity but unused.
template <typename T> static inline void vst(int x, int y, T *dst, bool xinc, bool yinc, int rep, bool horizontal, int stride) {
  // In both forms the pointer value itself is passed (`&dst` would be the
  // address of the parameter), and "memory" is clobbered because the asm
  // writes memory that is not listed as an output operand.
  if (rep == 1) { // no repetition
    asm volatile ("v%[width]st H%[width](%[x], %[y]), (%[dest])"
        :
        : [width] "i" (sizeof(T) * 8)
        , [x] "i" (x)
        , [y] "i" (y)
        , [dest] "r" (dst)
        : "memory");
  } else {
    asm volatile ("v%[width]st H%[width](%[x], %[y]), (%[dest]+=%[stride]) REP%[rep]"
        :
        : [width] "i" (sizeof(T) * 8)
        , [x] "i" (x)
        , [y] "i" (y)
        , [dest] "r" (dst)
        , [rep] "i" (rep)
        , [stride] "r" (stride * sizeof(T))
        : "memory");
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include <stdint.h> | |
#include <stdbool.h> | |
// Per-thread 64x64 byte matrix that the emulated vector operations read and
// write.
__thread uint8_t matrix[64][64];
// Per-thread cycle counter; the emulated operations add their cost to it.
__thread int cycles_spent = 0;

// Store a 16-bit value at (x, y): the low byte goes to column y and the high
// byte to column y+16.
// Arguments are parenthesized so that expressions such as `a | b` or `i + 1`
// expand correctly; note that each argument is still evaluated more than once.
#define MATRIX16_WRITE(x,y,value) { matrix[(x)][(y)] = (value) & 0xff; matrix[(x)][(y)+16] = ((value) >> 8) & 0xff; }

// Store a 32-bit value at (x, y): bytes 0..3 (least significant first) go to
// columns y, y+16, y+32 and y+48.
#define MATRIX32_WRITE(x,y,value) { matrix[(x)][(y)] = (value) & 0xff; matrix[(x)][(y)+16] = ((value) >> 8) & 0xff; matrix[(x)][(y)+32] = ((value) >> 16) & 0xff; matrix[(x)][(y)+48] = ((value) >> 24) & 0xff; }
template <typename T> static inline void matrix_write(int x, int y, T value) { | |
switch (sizeof(value)) { | |
case 1: | |
matrix[x][y] = value; | |
break; | |
case 2: | |
MATRIX16_WRITE(x,y,value); | |
break; | |
case 4: | |
MATRIX32_WRITE(x,y,value); | |
break; | |
} | |
} | |
template <typename T> static inline T matrix_read(int x, int y) { | |
switch (sizeof(T)) { | |
case 1: | |
return matrix[x][y]; | |
case 2: | |
return matrix[x][y] | (matrix[x][y+16] << 8); | |
case 4: | |
return matrix[x][y] | (matrix[x][y+16] << 8) | (matrix[x][y+32] << 16) | (matrix[x][y+48] << 24); | |
} | |
} | |
// Emulated vector load: copies `rep` groups of 16 elements from `src` into the
// matrix via matrix_write().
//
// Group r is read from src[stride*r .. stride*r + 15]. With `horizontal` set,
// the 16 elements go to (x, y+0..15); otherwise they go to (x+0..15, y).
// After each group, x and/or y advance by one when xinc / yinc are set.
// Cost model: 11 cycles up front plus 2 cycles per group.
template <typename T> static inline void vld(int x, int y, const T *src, bool xinc, bool yinc, int rep, bool horizontal, int stride) {
  // Per-element step along each axis, chosen once from the orientation.
  const int step_x = horizontal ? 0 : 1;
  const int step_y = horizontal ? 1 : 0;

  cycles_spent += 11;

  int group = 0;
  while (group < rep) {
    cycles_spent += 2;

    const int base = stride * group;
    for (int lane = 0; lane < 16; lane++) {
      matrix_write(x + (lane * step_x), y + (lane * step_y), src[base + lane]);
    }

    x += xinc ? 1 : 0;
    y += yinc ? 1 : 0;
    group++;
  }
}
template <typename T> static inline void vst(int x, int y, T *dst, bool xinc, bool yinc, int rep, bool horizontal, int stride) { | |
for (int r = 0; r < rep; r++) { | |
if (horizontal) { | |
for (int i=0; i<16; i++) { | |
dst[(stride*r) + i] = matrix_read<T>(x,y+i); | |
} | |
} else { | |
for (int i=0; i<16; i++) { | |
dst[(stride*r) + i] = matrix_read<T>(x+i,y); | |
} | |
} | |
if (xinc) x++; | |
if (yinc) y++; | |
} | |
} | |
static inline void vmul32uu(int xD, int yD, int xA, int yA, int xB, int yB) { | |
for (int i=0; i<16; i++) { | |
uint32_t a = matrix_read<uint16_t>(xA,yA+i); | |
uint32_t b = matrix_read<uint16_t>(xB,yB+i); | |
uint32_t d = a * b; | |
matrix_write<uint32_t>(xD, yD+i, d); | |
} | |
cycles_spent += 2; | |
} | |
static inline void vmul32uu_imm(int xD, int yD, int xA, int yA, uint16_t b) { | |
for (int i=0; i<16; i++) { | |
uint32_t a = matrix_read<uint16_t>(xA,yA+i); | |
uint32_t d = a * b; | |
matrix_write<uint32_t>(xD, yD+i, d); | |
} | |
cycles_spent += 2; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment