Skip to content

Instantly share code, notes, and snippets.

@corsix
Created September 15, 2024 16:55
Show Gist options
  • Save corsix/604455f58d851b006cda2daa0ea9d095 to your computer and use it in GitHub Desktop.
Save corsix/604455f58d851b006cda2daa0ea9d095 to your computer and use it in GitHub Desktop.
#include <immintrin.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#define FATAL(fmt, ...) do {fprintf(stderr, fmt " (%s:%d)\n",##__VA_ARGS__,__FILE__,__LINE__); exit(1);} while(0)
#define ASSERT(cond) if (cond) {} else FATAL("Assertion failed: %s", #cond)
#define TENSTORRENT_IOCTL_QUERY_MAPPINGS 0xFA02
struct tenstorrent_mapping {
uint32_t mapping_id;
uint32_t reserved;
uint64_t mapping_base;
uint64_t mapping_size;
};
#define TENSTORRENT_MAPPING_RESOURCE0_UC 1
#define TENSTORRENT_MAPPING_RESOURCE0_WC 2
#define TENSTORRENT_MAPPING_RESOURCE2_UC 5
#define BAR0_WC_SIZE (464 << 20)
#define BAR0_SIZE (496 << 20)
#define MMAP_SIZE (512 << 20)
#define BAR4_SOC_TARGET_ADDRESS 0x1E000000
#define TLB_CONFIG_ADDR 0x1FC00000
#define TLB_IDX_0 0
#define TLB_IDX_UC0 184
#define TLB_CFG_UNICAST(x, y) (((y) << 6) + (x))
#define TLB_CFG_MULTICAST(x_start, y_start, x_end, y_end) ((1 << 25) + ((y_start) << 18) + ((x_start) << 12) + ((y_end) << 6) + (x_end))
#define TLB_CFG_NOC1 (1 << 24)
static char* set_tlb(char* dev, uint32_t idx, uint64_t cfg, uint32_t suitable_for_addr) {
char* result = dev;
uint32_t abits;
if (idx < 156) {
abits = 20;
result += (idx << 20);
} else if (idx < 166) {
abits = 21;
result += (156 << 20) + ((idx - 156) << 21);
} else {
abits = 24;
result += (156 << 20) + (10 << 21) + ((idx - 166) << 24);
}
cfg = (cfg << (36 - abits)) + (suitable_for_addr >>= abits);
((volatile uint64_t*)(dev + TLB_CONFIG_ADDR))[idx] = cfg;
return result - (suitable_for_addr << abits);
}
#define RV_ADDR_NOC0_MC_DISABLE_COL 0xFFB20110
typedef struct routing_cmd_t {
uint32_t target_addr;
uint16_t target_noc_xy; // From lo to hi: 4 bits zero, 6 bits NoC X, 6 bits NoC Y
uint16_t target_shelf_xy; // From lo to hi: 6 bits shelf-level X, 6 bits shelf-level Y, 4 bits unused
union {
uint32_t inline_data;
uint32_t data_block_length;
};
uint32_t flags;
uint16_t target_rack_xy; // From lo to hi: 8 bits rack X (rack #), 8 bits rack Y (shelf #)
uint16_t reserved[5];
uint32_t data_block_dma_addr;
} routing_cmd_t;
// Request flags:
#define CMD_WR_REQ (1u << 0)
#define CMD_RD_REQ (1u << 2)
#define CMD_DATA_BLOCK_DMA (1u << 4)
#define CMD_DATA_BLOCK (1u << 6)
#define CMD_BROADCAST (1u << 7)
#define CMD_USE_NOC1 (1u << 9)
#define CMD_TIMESTAMP (1u << 10)
#define CMD_ORDERED (1u << 12)
// Response flags:
#define CMD_WR_ACK (1u << 1)
#define CMD_RD_DATA (1u << 3)
#define CMD_DATA_BLOCK_UNAVAILABLE (1u << 30)
#define CMD_DEST_UNREACHABLE (1u << 31)
typedef struct eth_queue_t {
uint32_t wr_req_counter;
uint32_t wr_resp_counter;
uint32_t rd_req_counter;
uint32_t rd_resp_counter;
uint32_t error_counter;
uint32_t padding0[3];
uint32_t wr_idx;
uint32_t padding1[3];
uint32_t rd_idx;
uint32_t padding2[3];
routing_cmd_t contents[4];
} eth_queue_t;
typedef struct eth_base_firmware_queues_t {
uint64_t latency_counter[16];
eth_queue_t sq; // Contains requests, for host -> E tile
eth_queue_t reserved;
eth_queue_t cq; // Contains responses, for E tile -> host
char padding[4096 - sizeof(uint64_t)*16 - sizeof(eth_queue_t)*3];
char buffers[4][1024];
} eth_base_firmware_queues_t;
static void do_eth_cmd(eth_base_firmware_queues_t* q, routing_cmd_t* c) {
// Spin while sq full
uint32_t wr_idx = q->sq.wr_idx;
uint32_t rd_idx;
do {
rd_idx = *(volatile uint32_t*)&q->sq.rd_idx;
} while ((wr_idx - rd_idx) & 4u);
// Push to sq
routing_cmd_t* qc = q->sq.contents + (wr_idx & 3u);
*(volatile __m256i*)qc = _mm256_loadu_si256((__m256i*)c);
_mm_sfence();
*(volatile uint32_t*)&q->sq.wr_idx = (wr_idx + 1) & 7u;
// Spin while cq empty
rd_idx = q->cq.rd_idx;
do {
wr_idx = *(volatile uint32_t*)&q->cq.wr_idx;
} while (rd_idx == wr_idx);
// Wait for cq entry to be populated
qc = q->cq.contents + (rd_idx & 3u);
do {
_mm256_storeu_si256((__m256i*)c, *(volatile __m256i*)qc);
} while (c->flags == 0);
// Pop from cq
*(volatile uint32_t*)&q->cq.rd_idx = (rd_idx + 1) & 7u;
}
int main() {
int fd = open("/dev/tenstorrent/0", O_RDWR | O_CLOEXEC);
ASSERT(fd >= 0);
unsigned char resource_to_mapping[8] = {0};
struct tenstorrent_mapping mappings[sizeof(resource_to_mapping) + 1];
mappings[0].mapping_size = sizeof(resource_to_mapping);
ASSERT(ioctl(fd, TENSTORRENT_IOCTL_QUERY_MAPPINGS, &mappings[0].mapping_size) >= 0);
mappings[0].mapping_size = 0;
for (unsigned i = 1; i <= sizeof(resource_to_mapping); ++i) {
uint32_t resource = mappings[i].mapping_id;
if (resource < sizeof(resource_to_mapping)) {
resource_to_mapping[resource] = i;
}
}
struct tenstorrent_mapping* bar0uc = mappings + resource_to_mapping[TENSTORRENT_MAPPING_RESOURCE0_UC];
struct tenstorrent_mapping* bar0wc = mappings + resource_to_mapping[TENSTORRENT_MAPPING_RESOURCE0_WC];
struct tenstorrent_mapping* bar4uc = mappings + resource_to_mapping[TENSTORRENT_MAPPING_RESOURCE2_UC];
ASSERT(bar0uc->mapping_size >= BAR0_SIZE);
ASSERT(bar4uc->mapping_size >= MMAP_SIZE - BAR4_SOC_TARGET_ADDRESS);
char* dev = mmap(NULL, MMAP_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
ASSERT(dev != MAP_FAILED);
uint32_t wc_size = bar0wc->mapping_size;
if (wc_size) {
if (wc_size > BAR0_WC_SIZE) {
wc_size = BAR0_WC_SIZE;
}
if (mmap(dev, wc_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, bar0wc->mapping_base) == MAP_FAILED) {
wc_size = 0;
}
}
ASSERT(mmap(dev + wc_size, BAR0_SIZE - wc_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, bar0uc->mapping_base + wc_size) != MAP_FAILED);
ASSERT(mmap(dev + BAR0_SIZE, MMAP_SIZE - BAR0_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, bar4uc->mapping_base + (BAR0_SIZE - BAR4_SOC_TARGET_ADDRESS)) != MAP_FAILED);
char* l1_tlb = set_tlb(dev, TLB_IDX_0, TLB_CFG_UNICAST(8, 6), 0);
uint32_t q_addr = *(volatile uint32_t*)(l1_tlb + 0x170);
eth_base_firmware_queues_t* q = (eth_base_firmware_queues_t*)(l1_tlb + q_addr);
for (uint32_t shelf_y = 0; shelf_y < 2; ++shelf_y) {
for (uint32_t shelf_x = 0; shelf_x < 2; ++shelf_x) {
routing_cmd_t c;
c.target_rack_xy = (0 << 0) + (0 << 8);
c.target_shelf_xy = (shelf_x << 0) + (shelf_y << 6);
c.target_noc_xy = (8 << 4) + (0 << 10);
c.target_addr = RV_ADDR_NOC0_MC_DISABLE_COL;
c.flags = CMD_RD_REQ;
do_eth_cmd(q, &c);
printf("(%u, %u) -> ", shelf_x, shelf_y);
if (c.flags == CMD_RD_DATA) {
printf("value %u\n", c.inline_data);
} else {
printf("error %#08x\n", c.flags);
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment