-
-
Save corsix/604455f58d851b006cda2daa0ea9d095 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <immintrin.h> | |
#include <fcntl.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <sys/ioctl.h> | |
#include <sys/mman.h> | |
#define FATAL(fmt, ...) do {fprintf(stderr, fmt " (%s:%d)\n",##__VA_ARGS__,__FILE__,__LINE__); exit(1);} while(0) | |
#define ASSERT(cond) if (cond) {} else FATAL("Assertion failed: %s", #cond) | |
#define TENSTORRENT_IOCTL_QUERY_MAPPINGS 0xFA02 | |
struct tenstorrent_mapping { | |
uint32_t mapping_id; | |
uint32_t reserved; | |
uint64_t mapping_base; | |
uint64_t mapping_size; | |
}; | |
#define TENSTORRENT_MAPPING_RESOURCE0_UC 1 | |
#define TENSTORRENT_MAPPING_RESOURCE0_WC 2 | |
#define TENSTORRENT_MAPPING_RESOURCE2_UC 5 | |
#define BAR0_WC_SIZE (464 << 20) | |
#define BAR0_SIZE (496 << 20) | |
#define MMAP_SIZE (512 << 20) | |
#define BAR4_SOC_TARGET_ADDRESS 0x1E000000 | |
#define TLB_CONFIG_ADDR 0x1FC00000 | |
#define TLB_IDX_0 0 | |
#define TLB_IDX_UC0 184 | |
#define TLB_CFG_UNICAST(x, y) (((y) << 6) + (x)) | |
#define TLB_CFG_MULTICAST(x_start, y_start, x_end, y_end) ((1 << 25) + ((y_start) << 18) + ((x_start) << 12) + ((y_end) << 6) + (x_end)) | |
#define TLB_CFG_NOC1 (1 << 24) | |
static char* set_tlb(char* dev, uint32_t idx, uint64_t cfg, uint32_t suitable_for_addr) { | |
char* result = dev; | |
uint32_t abits; | |
if (idx < 156) { | |
abits = 20; | |
result += (idx << 20); | |
} else if (idx < 166) { | |
abits = 21; | |
result += (156 << 20) + ((idx - 156) << 21); | |
} else { | |
abits = 24; | |
result += (156 << 20) + (10 << 21) + ((idx - 166) << 24); | |
} | |
cfg = (cfg << (36 - abits)) + (suitable_for_addr >>= abits); | |
((volatile uint64_t*)(dev + TLB_CONFIG_ADDR))[idx] = cfg; | |
return result - (suitable_for_addr << abits); | |
} | |
#define RV_ADDR_NOC0_MC_DISABLE_COL 0xFFB20110 | |
typedef struct routing_cmd_t { | |
uint32_t target_addr; | |
uint16_t target_noc_xy; // From lo to hi: 4 bits zero, 6 bits NoC X, 6 bits NoC Y | |
uint16_t target_shelf_xy; // From lo to hi: 6 bits shelf-level X, 6 bits shelf-level Y, 4 bits unused | |
union { | |
uint32_t inline_data; | |
uint32_t data_block_length; | |
}; | |
uint32_t flags; | |
uint16_t target_rack_xy; // From lo to hi: 8 bits rack X (rack #), 8 bits rack Y (shelf #) | |
uint16_t reserved[5]; | |
uint32_t data_block_dma_addr; | |
} routing_cmd_t; | |
// Request flags: | |
#define CMD_WR_REQ (1u << 0) | |
#define CMD_RD_REQ (1u << 2) | |
#define CMD_DATA_BLOCK_DMA (1u << 4) | |
#define CMD_DATA_BLOCK (1u << 6) | |
#define CMD_BROADCAST (1u << 7) | |
#define CMD_USE_NOC1 (1u << 9) | |
#define CMD_TIMESTAMP (1u << 10) | |
#define CMD_ORDERED (1u << 12) | |
// Response flags: | |
#define CMD_WR_ACK (1u << 1) | |
#define CMD_RD_DATA (1u << 3) | |
#define CMD_DATA_BLOCK_UNAVAILABLE (1u << 30) | |
#define CMD_DEST_UNREACHABLE (1u << 31) | |
typedef struct eth_queue_t { | |
uint32_t wr_req_counter; | |
uint32_t wr_resp_counter; | |
uint32_t rd_req_counter; | |
uint32_t rd_resp_counter; | |
uint32_t error_counter; | |
uint32_t padding0[3]; | |
uint32_t wr_idx; | |
uint32_t padding1[3]; | |
uint32_t rd_idx; | |
uint32_t padding2[3]; | |
routing_cmd_t contents[4]; | |
} eth_queue_t; | |
typedef struct eth_base_firmware_queues_t { | |
uint64_t latency_counter[16]; | |
eth_queue_t sq; // Contains requests, for host -> E tile | |
eth_queue_t reserved; | |
eth_queue_t cq; // Contains responses, for E tile -> host | |
char padding[4096 - sizeof(uint64_t)*16 - sizeof(eth_queue_t)*3]; | |
char buffers[4][1024]; | |
} eth_base_firmware_queues_t; | |
static void do_eth_cmd(eth_base_firmware_queues_t* q, routing_cmd_t* c) { | |
// Spin while sq full | |
uint32_t wr_idx = q->sq.wr_idx; | |
uint32_t rd_idx; | |
do { | |
rd_idx = *(volatile uint32_t*)&q->sq.rd_idx; | |
} while ((wr_idx - rd_idx) & 4u); | |
// Push to sq | |
routing_cmd_t* qc = q->sq.contents + (wr_idx & 3u); | |
*(volatile __m256i*)qc = _mm256_loadu_si256((__m256i*)c); | |
_mm_sfence(); | |
*(volatile uint32_t*)&q->sq.wr_idx = (wr_idx + 1) & 7u; | |
// Spin while cq empty | |
rd_idx = q->cq.rd_idx; | |
do { | |
wr_idx = *(volatile uint32_t*)&q->cq.wr_idx; | |
} while (rd_idx == wr_idx); | |
// Wait for cq entry to be populated | |
qc = q->cq.contents + (rd_idx & 3u); | |
do { | |
_mm256_storeu_si256((__m256i*)c, *(volatile __m256i*)qc); | |
} while (c->flags == 0); | |
// Pop from cq | |
*(volatile uint32_t*)&q->cq.rd_idx = (rd_idx + 1) & 7u; | |
} | |
int main() { | |
int fd = open("/dev/tenstorrent/0", O_RDWR | O_CLOEXEC); | |
ASSERT(fd >= 0); | |
unsigned char resource_to_mapping[8] = {0}; | |
struct tenstorrent_mapping mappings[sizeof(resource_to_mapping) + 1]; | |
mappings[0].mapping_size = sizeof(resource_to_mapping); | |
ASSERT(ioctl(fd, TENSTORRENT_IOCTL_QUERY_MAPPINGS, &mappings[0].mapping_size) >= 0); | |
mappings[0].mapping_size = 0; | |
for (unsigned i = 1; i <= sizeof(resource_to_mapping); ++i) { | |
uint32_t resource = mappings[i].mapping_id; | |
if (resource < sizeof(resource_to_mapping)) { | |
resource_to_mapping[resource] = i; | |
} | |
} | |
struct tenstorrent_mapping* bar0uc = mappings + resource_to_mapping[TENSTORRENT_MAPPING_RESOURCE0_UC]; | |
struct tenstorrent_mapping* bar0wc = mappings + resource_to_mapping[TENSTORRENT_MAPPING_RESOURCE0_WC]; | |
struct tenstorrent_mapping* bar4uc = mappings + resource_to_mapping[TENSTORRENT_MAPPING_RESOURCE2_UC]; | |
ASSERT(bar0uc->mapping_size >= BAR0_SIZE); | |
ASSERT(bar4uc->mapping_size >= MMAP_SIZE - BAR4_SOC_TARGET_ADDRESS); | |
char* dev = mmap(NULL, MMAP_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); | |
ASSERT(dev != MAP_FAILED); | |
uint32_t wc_size = bar0wc->mapping_size; | |
if (wc_size) { | |
if (wc_size > BAR0_WC_SIZE) { | |
wc_size = BAR0_WC_SIZE; | |
} | |
if (mmap(dev, wc_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, bar0wc->mapping_base) == MAP_FAILED) { | |
wc_size = 0; | |
} | |
} | |
ASSERT(mmap(dev + wc_size, BAR0_SIZE - wc_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, bar0uc->mapping_base + wc_size) != MAP_FAILED); | |
ASSERT(mmap(dev + BAR0_SIZE, MMAP_SIZE - BAR0_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, bar4uc->mapping_base + (BAR0_SIZE - BAR4_SOC_TARGET_ADDRESS)) != MAP_FAILED); | |
char* l1_tlb = set_tlb(dev, TLB_IDX_0, TLB_CFG_UNICAST(8, 6), 0); | |
uint32_t q_addr = *(volatile uint32_t*)(l1_tlb + 0x170); | |
eth_base_firmware_queues_t* q = (eth_base_firmware_queues_t*)(l1_tlb + q_addr); | |
for (uint32_t shelf_y = 0; shelf_y < 2; ++shelf_y) { | |
for (uint32_t shelf_x = 0; shelf_x < 2; ++shelf_x) { | |
routing_cmd_t c; | |
c.target_rack_xy = (0 << 0) + (0 << 8); | |
c.target_shelf_xy = (shelf_x << 0) + (shelf_y << 6); | |
c.target_noc_xy = (8 << 4) + (0 << 10); | |
c.target_addr = RV_ADDR_NOC0_MC_DISABLE_COL; | |
c.flags = CMD_RD_REQ; | |
do_eth_cmd(q, &c); | |
printf("(%u, %u) -> ", shelf_x, shelf_y); | |
if (c.flags == CMD_RD_DATA) { | |
printf("value %u\n", c.inline_data); | |
} else { | |
printf("error %#08x\n", c.flags); | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment