Skip to content

Instantly share code, notes, and snippets.

Last active May 31, 2021 17:22
Show Gist options
  • Save cleverca22/79143cb23a50d572b9d527c9ea479492 to your computer and use it in GitHub Desktop.
Save cleverca22/79143cb23a50d572b9d527c9ea479492 to your computer and use it in GitHub Desktop.
#include "vpu-support.h"
#include <stdio.h>
#include <stdint.h>
/**
 * Copy a 16x16 block out of the vector register file with vst and print it
 * as a labelled hex table.
 *
 * @tparam T    element type; its size (1, 2 or 4 bytes) selects the number of
 *              hex digits printed per element
 * @tparam x    first coordinate passed to vst; also the label of the first row
 * @tparam y    second coordinate passed to vst; also the label of the first column
 * @param name  heading printed above the table
 */
template <typename T, int x, int y> static void dump_matrix(const char *name) {
  T matrix[16 * 16];
  // 16 repetitions, 16 elements apart in the buffer, with xinc set and yinc clear.
  vst<T>(x, y, matrix, true, false, 16, true, 16);

  printf("%s\n", name);
  printf("col ");
  for (int i = y; i < (y + 16); i++) printf("%5d", i);
  printf("\n"); // end the header line so the first row starts on its own line

  for (int row = 0; row < 16; row++) {
    printf("row %2d:", x + row);
    for (int col = 0; col < 16; col++) {
      // %x expects an unsigned int; convert once instead of relying on promotion.
      const unsigned value = static_cast<unsigned>(matrix[(row * 16) + col]);
      // Each case must break: without it a 1-byte element would also be
      // printed in the 2- and 4-byte formats.
      switch (sizeof(T)) {
      case 1:
        printf("0x%02x ", value);
        break;
      case 2:
        printf("0x%04x ", value);
        break;
      case 4:
        printf("0x%08x ", value);
        break;
      }
    }
    printf("\n"); // one output line per matrix row
  }
}
// Test driver: loads sixteen 16-bit values with vld, runs vmul32uu_imm on
// them with the immediate 0x1000, then prints the cycles_spent counter.
// argc and argv are not used in the visible statements.
// NOTE(review): the end of this function (return / closing brace) is not
// present in this view.
int main(int argc, char **argv) {
// Source operand: the integers 1..16.
const uint16_t input[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
//const uint16_t b[16] = {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2};
// Single load (rep == 1) of input at coordinates (0,0).
vld(0,0, input, false,false, 1, true, 16);
//vld(1,0, b, false,false, 1, true, 16);
// Destination coordinates (2,0), source coordinates (0,0), immediate 0x1000.
vmul32uu_imm(2,0, 0,0, 0x1000);
printf("clock cycles spent: %d\n", cycles_spent);
#pragma once
#include <assert.h>
// Cycle counter, initialised to zero. It is defined (not merely declared)
// here, so including this header from more than one translation unit would
// produce duplicate definitions.
// NOTE(review): nothing in the visible part of this header updates it.
int cycles_spent = 0;
// Hardware-backed vld: emits an inline-assembly vector load whose mnemonic
// and register prefix are built from the bit width of T (sizeof(T) * 8).
//   x, y   coordinates substituted into the register operand as immediates
//   src    pointer to the memory to load from
//   rep    1 selects the single-instruction form shown below
//   xinc, yinc, horizontal, stride are not referenced in the visible part.
// NOTE(review): the rep != 1 branch and the closing braces are not present
// in this view.
template <typename T> static inline void vld(int x, int y, const T *src, bool xinc, bool yinc, int rep, bool horizontal, int stride) {
// NOTE(review): sizes is never referenced in the visible code.
const char *sizes[3] = { "H", "HX", "HY" };
if (rep == 1) { // no repetition
asm volatile ("v%[width]ld H%[width](%[x], %[y]), (%[src])":
:[width] "i" (sizeof(T) * 8)
,[x] "i" (x)
,[y] "i" (y)
// NOTE(review): &src is the address of the pointer parameter itself, not of
// the buffer it points to -- confirm whether src was intended here.
,[src] "r"(&src));
} else {
/**
 * Hardware-backed vst: emits an inline-assembly vector store whose mnemonic
 * and register prefix are built from the bit width of T (sizeof(T) * 8).
 *
 * @param x, y    coordinates substituted into the register operand; they use
 *                "i" (immediate) constraints, so they must be compile-time
 *                constants once this function has been inlined
 * @param dst     destination buffer in memory
 * @param rep     1 emits a single store; any other value emits the REP form,
 *                and must likewise be a constant after inlining
 * @param stride  distance between repetitions, in elements of T (converted
 *                to bytes for the instruction)
 * @param xinc, yinc, horizontal  accepted for signature compatibility; not
 *                used by either instruction form
 */
template <typename T> static inline void vst(int x, int y, T *dst, bool xinc, bool yinc, int rep, bool horizontal, int stride) {
  if (rep == 1) { // no repetition
    // Pass the buffer pointer itself: &dst would be the address of the local
    // pointer variable, and the store would land on top of it.
    // The "memory" clobber tells the compiler that the asm writes memory it
    // cannot see, so reads of *dst afterwards are not served from stale values.
    asm volatile ("v%[width]st H%[width](%[x], %[y]), (%[dest])":
                  :[width] "i" (sizeof(T) * 8)
                  ,[x] "i" (x)
                  ,[y] "i" (y)
                  ,[dest] "r"(dst)
                  : "memory");
  } else {
    asm volatile ("v%[width]st H%[width](%[x], %[y]), (%[dest]+=%[stride]) REP%[rep]":
                  :[width] "i" (sizeof(T) * 8)
                  ,[x] "i" (x)
                  ,[y] "i" (y)
                  ,[dest] "r"(dst)
                  ,[rep] "i" (rep)
                  ,[stride] "r" (stride * sizeof(T)) // byte stride
                  : "memory");
  }
}
#pragma once
#include <stdint.h>
#include <stdbool.h>
// Emulated register file: a 64x64 grid of bytes, one instance per thread.
// Multi-byte elements are split into byte planes 16 columns apart: byte k of
// the element at (x, y) is stored in matrix[x][y + 16*k], least significant
// byte first.
__thread uint8_t matrix[64][64];
// Running total of emulated cycles, one counter per thread.
__thread int cycles_spent = 0;

// Store the low 16 / 32 bits of value at (x, y) using the byte-plane layout.
// Arguments are parenthesised so that expressions can be passed safely.
#define MATRIX16_WRITE(x,y,value) { matrix[(x)][(y)] = (value) & 0xff; matrix[(x)][(y)+16] = ((value) >> 8) & 0xff; }
#define MATRIX32_WRITE(x,y,value) { matrix[(x)][(y)] = (value) & 0xff; matrix[(x)][(y)+16] = ((value) >> 8) & 0xff; matrix[(x)][(y)+32] = ((value) >> 16) & 0xff; matrix[(x)][(y)+48] = ((value) >> 24) & 0xff; }

/**
 * Write one element of type T at (x, y).
 *
 * @tparam T     1-, 2- or 4-byte integer type
 * @param x      first index into matrix
 * @param y      second index of the least significant byte; the caller must
 *               keep y + 16 * (sizeof(T) - 1) below 64
 * @param value  element to store
 */
template <typename T> static inline void matrix_write(int x, int y, T value) {
  static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4,
                "matrix_write supports 1-, 2- and 4-byte elements only");
  const uint32_t bits = static_cast<uint32_t>(value);
  // One byte per plane; planes beyond sizeof(T) are left untouched.
  for (int plane = 0; plane < static_cast<int>(sizeof(T)); plane++) {
    matrix[x][y + (16 * plane)] = static_cast<uint8_t>(bits >> (8 * plane));
  }
}

/**
 * Read one element of type T from (x, y); the inverse of matrix_write.
 *
 * @tparam T  1-, 2- or 4-byte integer type
 * @param x   first index into matrix
 * @param y   second index of the least significant byte
 * @return    the element reassembled from its byte planes
 */
template <typename T> static inline T matrix_read(int x, int y) {
  static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4,
                "matrix_read supports 1-, 2- and 4-byte elements only");
  uint32_t bits = 0;
  for (int plane = 0; plane < static_cast<int>(sizeof(T)); plane++) {
    // Widen before shifting so the top byte never shifts into a sign bit.
    bits |= static_cast<uint32_t>(matrix[x][y + (16 * plane)]) << (8 * plane);
  }
  return static_cast<T>(bits);
}
/**
 * Emulated vld: copy rep groups of 16 elements from memory into the matrix.
 *
 * @param x, y        coordinates of the first element of the first group
 * @param src         source buffer; group r starts at src[stride * r]
 * @param xinc, yinc  when set, x / y advance by one after each group
 * @param rep         number of groups to load
 * @param horizontal  true: the 16 elements go to (x, y+i);
 *                    false: they go to (x+i, y)
 * @param stride      distance between consecutive groups, in elements of T
 *
 * Adds 11 to cycles_spent per call plus 2 per group.
 */
template <typename T> static inline void vld(int x, int y, const T *src, bool xinc, bool yinc, int rep, bool horizontal, int stride) {
  cycles_spent += 11;
  for (int r = 0; r < rep; r++) {
    cycles_spent += 2;
    const T *group = src + (stride * r);
    for (int i = 0; i < 16; i++) {
      if (horizontal) {
        matrix_write(x, y + i, group[i]);
      } else {
        matrix_write(x + i, y, group[i]);
      }
    }
    // Step the base coordinates once per group, after all 16 elements are in.
    if (xinc) x++;
    if (yinc) y++;
  }
}
/**
 * Emulated vst: copy rep groups of 16 elements from the matrix out to memory.
 *
 * @param x, y        coordinates of the first element of the first group
 * @param dst         destination buffer; group r starts at dst[stride * r]
 * @param xinc, yinc  when set, x / y advance by one after each group
 * @param rep         number of groups to store
 * @param horizontal  true: the 16 elements come from (x, y+i);
 *                    false: they come from (x+i, y)
 * @param stride      distance between consecutive groups, in elements of T
 *
 * Unlike the emulated vld, this function does not change cycles_spent.
 */
template <typename T> static inline void vst(int x, int y, T *dst, bool xinc, bool yinc, int rep, bool horizontal, int stride) {
  for (int r = 0; r < rep; r++) {
    T *group = dst + (stride * r);
    for (int i = 0; i < 16; i++) {
      group[i] = horizontal ? matrix_read<T>(x, y + i)
                            : matrix_read<T>(x + i, y);
    }
    // Step the base coordinates once per group, after all 16 elements are out.
    if (xinc) x++;
    if (yinc) y++;
  }
}
// Emulated 16-lane multiply: for each i in 0..15, reads the 16-bit elements
// at (xA, yA+i) and (xB, yB+i), multiplies them and writes the 32-bit product
// to (xD, yD+i).
// NOTE(review): the closing braces are missing from this view, so it cannot
// be told whether the cycles_spent charge below applies once per call or
// once per lane -- confirm.
static inline void vmul32uu(int xD, int yD, int xA, int yA, int xB, int yB) {
for (int i=0; i<16; i++) {
// Operands are held in uint32_t, so the multiply is done in unsigned 32-bit
// arithmetic rather than on the 16-bit values directly.
uint32_t a = matrix_read<uint16_t>(xA,yA+i);
uint32_t b = matrix_read<uint16_t>(xB,yB+i);
uint32_t d = a * b;
matrix_write<uint32_t>(xD, yD+i, d);
cycles_spent += 2;
// Emulated 16-lane multiply by an immediate: for each i in 0..15, reads the
// 16-bit element at (xA, yA+i), multiplies it by b and writes the 32-bit
// product to (xD, yD+i).
// NOTE(review): the closing braces are missing from this view, so it cannot
// be told whether the cycles_spent charge below applies once per call or
// once per lane -- confirm.
static inline void vmul32uu_imm(int xD, int yD, int xA, int yA, uint16_t b) {
for (int i=0; i<16; i++) {
// a is uint32_t, so a * b is evaluated in unsigned 32-bit arithmetic.
uint32_t a = matrix_read<uint16_t>(xA,yA+i);
uint32_t d = a * b;
matrix_write<uint32_t>(xD, yD+i, d);
cycles_spent += 2;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment