Skip to content

Instantly share code, notes, and snippets.

@var77
Last active September 5, 2024 15:07
Show Gist options
  • Select an option

  • Save var77/fc028790e5f0f39246abdafb9282d747 to your computer and use it in GitHub Desktop.

Select an option

Save var77/fc028790e5f0f39246abdafb9282d747 to your computer and use it in GitHub Desktop.
Parse raw pgvector index file to JSON
char *parse_pgvector_index_to_json(const char *filename);
#ifndef PGVECTOR_INDEX_PARSER_H
#define PGVECTOR_INDEX_PARSER_H
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
#define TID_JSON_SIZE 120
#define HEAP_TID_CNT 10
#define ELEMENT_STRUCT_JSON_SIZE 280
#define METADATA_PAGE_JSON_SIZE 1024
#define ITEM_PTR_JSON_SIZE 256
#define MAX_FLOAT_JSON_SIZE 128
#ifndef HNSW_ELEMENT_TUPLE_TYPE
#define HNSW_ELEMENT_TUPLE_TYPE 1
#define HNSW_NEIGHBOR_TUPLE_TYPE 2
#ifndef BLCKSZ
#define BLCKSZ 8192
typedef unsigned int uint32; /* == 32 bits */
typedef unsigned short uint16; /* == 16 bits */
typedef signed short int16; /* == 16 bits */
typedef unsigned char uint8; /* == 8 bits */
typedef signed int int32; /* == 32 bits */
typedef uint16 LocationIndex;
typedef struct {
uint32 xlogid; /* high bits */
uint32 xrecoff; /* low bits */
} PageXLogRecPtr;
typedef uint32 TransactionId;
typedef struct ItemIdData {
unsigned lp_off : 15, /* offset to tuple (from start of page) */
lp_flags : 2, /* state of line pointer, see below */
lp_len : 15; /* byte length of tuple */
} ItemIdData;
typedef ItemIdData *ItemId;
typedef struct PageHeaderData {
/* XXX LSN is member of *any* block, not only page-organized ones */
PageXLogRecPtr pd_lsn; /* LSN: next byte after last byte of xlog
* record for last change to this page */
uint16 pd_checksum; /* checksum */
uint16 pd_flags; /* flag bits, see below */
LocationIndex pd_lower; /* offset to start of free space */
LocationIndex pd_upper; /* offset to end of free space */
LocationIndex pd_special; /* offset to start of special space */
uint16 pd_pagesize_version;
TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */
ItemIdData pd_linp[]; /* line pointer array */
} PageHeaderData;
typedef PageHeaderData *PageHeader;
typedef uint32 BlockNumber;
typedef uint16 OffsetNumber;
typedef struct BlockIdData {
uint16 bi_hi;
uint16 bi_lo;
} BlockIdData;
typedef BlockIdData *BlockId;
typedef struct ItemPointerData {
BlockIdData ip_blkid;
OffsetNumber ip_posid;
} ItemPointerData;
typedef ItemPointerData *ItemPointer;
#endif // Postgres defines
typedef struct HnswMetaPageData {
uint32 magicNumber;
uint32 version;
uint32 dimensions;
uint16 m;
uint16 efConstruction;
BlockNumber entryBlkno;
OffsetNumber entryOffno;
int16 entryLevel;
BlockNumber insertPage;
} HnswMetaPageData;
typedef HnswMetaPageData *HnswMetaPage;
typedef HnswMetaPageData *HnswMetaPage;
typedef struct HnswPageOpaqueData {
BlockNumber nextblkno;
uint16 unused;
uint16 page_id; /* for identification of HNSW indexes */
} HnswPageOpaqueData;
typedef HnswPageOpaqueData *HnswPageOpaque;
typedef struct Vector {
int32 vl_len_; /* varlena header (do not touch directly!) */
int16 dim; /* number of dimensions */
int16 unused; /* reserved for future use, always zero */
float x[];
} Vector;
typedef struct HnswElementTupleData {
uint8 type;
uint8 level;
uint8 deleted;
uint8 unused;
ItemPointerData heaptids[HEAP_TID_CNT];
ItemPointerData neighbortid;
uint16 unused2;
Vector data;
} HnswElementTupleData;
typedef HnswElementTupleData *HnswElementTuple;
typedef struct HnswNeighborTupleData {
uint8 type;
uint8 unused;
uint16 count;
ItemPointerData indextids[];
} HnswNeighborTupleData;
typedef HnswNeighborTupleData *HnswNeighborTuple;
#endif // pgvector defines
/* ******* START IMPLEMENTATION ******** */
static char *read_file_content(const char *filename, uint32 *page_cnt) {
char *buf;
FILE *file;
size_t bytes_to_read;
struct stat fst;
if (stat(filename, &fst) < 0) {
printf("could not stat file \"%s\" \n", filename);
return NULL;
}
if (fst.st_size < BLCKSZ) {
puts("file is smaller than page size");
return NULL;
}
bytes_to_read = (size_t)fst.st_size;
if ((file = fopen(filename, "rb")) == NULL) {
printf("could not open file \"%s\" for reading\n", filename);
return NULL;
}
buf = (char *)malloc(bytes_to_read + 1);
*page_cnt = bytes_to_read / BLCKSZ;
int length = fread(buf, 1, bytes_to_read, file);
if (ferror(file)) {
printf("could not read file \"%s\"\n", filename);
return NULL;
}
fclose(file);
buf[length] = '\0';
return buf;
}
static char *metapage_to_json(char *header_page) {
HnswMetaPage metapage = (HnswMetaPage)malloc(sizeof(HnswMetaPageData));
memcpy(metapage, header_page + offsetof(PageHeaderData, pd_linp),
sizeof(HnswMetaPageData));
char *ret = (char *)malloc(METADATA_PAGE_JSON_SIZE);
int len = snprintf(
ret, METADATA_PAGE_JSON_SIZE,
"{"
"\"magicNumber\": %u,"
"\"version\": %u,"
"\"dimensions\": %u,"
"\"m\": %u,"
"\"efConstruction\": %u,"
"\"entryBlkno\": %u,"
"\"entryOffno\": %u,"
"\"entryLevel\": %d,"
"\"insertPage\": %u"
"}",
metapage->magicNumber, metapage->version, metapage->dimensions,
metapage->m, metapage->efConstruction, metapage->entryBlkno,
metapage->entryOffno, metapage->entryLevel, metapage->insertPage);
ret[len] = '\0';
free(metapage);
return ret;
}
static char *linepointers_to_json(char *page, uint32 itemptr_count,
ItemId itemptrs) {
char itemptr_json[ITEM_PTR_JSON_SIZE];
uint32 size = itemptr_count * ITEM_PTR_JSON_SIZE;
char *ret = (char *)malloc(size);
memset(ret, 0x0, size);
for (uint32 i = 0; i < itemptr_count; i++) {
ItemIdData itemptr = ((ItemId)(page + sizeof(PageHeaderData)))[i];
itemptrs[i] = itemptr;
snprintf(itemptr_json, ITEM_PTR_JSON_SIZE,
"{ \"lp_off\": %u, \"lp_flags\": %u, \"lp_len\": %u }",
itemptr.lp_off, itemptr.lp_flags, itemptr.lp_len);
if (i == 0) {
snprintf(ret, size, "[%s,", itemptr_json);
} else {
snprintf(ret, size, "%s%s,", ret, itemptr_json);
}
}
memcpy(&ret[strlen(ret) - 1], "]", 1);
return ret;
}
static char *itemptr_to_json(ItemPointer itemptr) {
char *ret = (char *)malloc(ITEM_PTR_JSON_SIZE);
memset(ret, 0x0, ITEM_PTR_JSON_SIZE);
snprintf(
ret, TID_JSON_SIZE,
"{ \"ip_blkid\": { \"bi_hi\": %u, \"bi_lo\": %u }, \"ip_posid\": %u }",
itemptr->ip_blkid.bi_hi, itemptr->ip_blkid.bi_lo, itemptr->ip_posid);
uint32 len = strlen(ret);
ret[len] = '\0';
return ret;
}
static char *itemptrs_to_json(ItemPointer itemptrs, uint32 count) {
uint32 size = ITEM_PTR_JSON_SIZE * count;
char itemptr_json[ITEM_PTR_JSON_SIZE];
char *ret = (char *)malloc(size);
memset(ret, 0x0, size);
for (uint32 i = 0; i < count; i++) {
memcpy(&itemptr_json, itemptr_to_json(&itemptrs[i]), ITEM_PTR_JSON_SIZE);
if (i == 0) {
snprintf(ret, size, "[%s,", itemptr_json);
} else {
snprintf(ret, size, "%s%s,", ret, itemptr_json);
}
}
uint32 len = strlen(ret);
ret[len - 1] = ']';
ret[len] = '\0';
return ret;
}
static char *vector_to_json(Vector *vec) {
uint32 size = MAX_FLOAT_JSON_SIZE * vec->dim;
char *ret = (char *)malloc(size);
memset(ret, 0x0, size);
for (uint32 i = 0; i < vec->dim; i++) {
if (i == 0) {
snprintf(ret, size, "[%f,", vec->x[i]);
} else {
snprintf(ret, size, "%s%f,", ret, vec->x[i]);
}
}
uint32 len = strlen(ret);
ret[len - 1] = ']';
ret[len] = '\0';
return ret;
}
static char *tuples_to_json(char *page, uint32 blockNumber, ItemId itemptrs,
uint32 tuple_count, uint32 dim, uint32 m) {
uint32 tuple_vector_size = 64 * dim;
uint32 tuple_json_size = TID_JSON_SIZE * HEAP_TID_CNT +
ELEMENT_STRUCT_JSON_SIZE + tuple_vector_size;
uint32 neighbor_json_size = m * 4 * TID_JSON_SIZE + ELEMENT_STRUCT_JSON_SIZE;
uint32 tuple_size = tuple_json_size;
uint32 len = 0;
if (tuple_json_size < neighbor_json_size) {
tuple_size = neighbor_json_size;
}
char *tuple_json = (char *)malloc(tuple_size);
uint32 size = tuple_count * tuple_size;
char *ret = (char *)malloc(size);
memset(ret, 0x0, size);
for (uint32 i = 0; i < tuple_count; i++) {
memset(tuple_json, 0x0, tuple_size);
ItemIdData itemptr = itemptrs[i];
char *tuple = (char *)malloc(itemptr.lp_len);
memcpy(tuple, page + itemptr.lp_off, itemptr.lp_len);
ItemPointerData tid = {0};
tid.ip_blkid.bi_hi = (uint16)(blockNumber >> 16);
tid.ip_blkid.bi_lo = (uint16)(blockNumber & 0xFFFF);
tid.ip_posid = (OffsetNumber)(i + 1);
char *tid_json = itemptr_to_json(&tid);
uint8 type;
memcpy(&type, tuple, sizeof(uint8));
if (type == HNSW_ELEMENT_TUPLE_TYPE) {
HnswElementTuple element_tuple = (HnswElementTuple)tuple;
char *itemptrs_json =
itemptrs_to_json((ItemPointer)&element_tuple->heaptids, HEAP_TID_CNT);
char *neighbortid_json = itemptr_to_json(&element_tuple->neighbortid);
char *vector_json = NULL;
if (element_tuple->deleted) {
strncpy(vector_json, "[]", 3);
} else {
vector_json = vector_to_json(&element_tuple->data);
}
snprintf(
tuple_json, tuple_size,
"{"
"\"tid\": %s,"
"\"value\": {"
"\"type\": \"TYPE_ELEMENT\", \"level\": %u, \"deleted\": %u, "
"\"unused\": "
"%u,"
"\"heaptids\": %s, \"neighbortid\": %s, \"unused2\": %u, \"data\": %s"
"}"
"}",
tid_json, element_tuple->level, element_tuple->deleted,
element_tuple->unused, itemptrs_json, neighbortid_json,
element_tuple->unused2, vector_json);
free(itemptrs_json);
free(neighbortid_json);
free(vector_json);
} else if (type == HNSW_NEIGHBOR_TUPLE_TYPE) {
HnswNeighborTuple neighbor_tuple = (HnswNeighborTuple)tuple;
char *indextids_json = itemptrs_to_json(
(ItemPointer)&neighbor_tuple->indextids, neighbor_tuple->count);
snprintf(tuple_json, tuple_size,
"{"
"\"tid\": %s,"
"\"value\": {"
"\"type\": \"TYPE_NEIGHBOR\",\"unused\": %u,"
"\"count\": %u,"
"\"indextids\": %s"
"}"
"}",
tid_json, neighbor_tuple->unused, neighbor_tuple->count,
indextids_json);
free(indextids_json);
} else {
printf("Invalid tuple type: %u \n", type);
free(ret);
free(tuple_json);
free(tid_json);
return NULL;
}
if (i == 0) {
len = snprintf(ret, size, "[%s,", tuple_json);
} else {
len = snprintf(ret, size, "%s%s,", ret, tuple_json);
}
ret[len] = '\0';
free(tid_json);
}
free(tuple_json);
len = strlen(ret);
ret[len - 1] = ']';
ret[len] = '\0';
return ret;
}
static char *page_to_json(char *page, uint32 blockNumber, uint32 dim,
uint32 m) {
char *item_pointers = NULL;
char *elements = NULL;
PageHeaderData *page_hdr = (PageHeaderData *)malloc(sizeof(PageHeaderData));
memcpy(page_hdr, page, sizeof(PageHeaderData));
HnswPageOpaque opaque_data =
(HnswPageOpaqueData *)malloc(sizeof(HnswPageOpaqueData));
memcpy(opaque_data, page + page_hdr->pd_special, sizeof(HnswPageOpaqueData));
// Get Item Pointers
uint32 itemptr_count =
(page_hdr->pd_lower - sizeof(PageHeaderData)) / sizeof(ItemIdData);
if (itemptr_count == 0) {
strncpy(item_pointers, "[]", 3);
strncpy(elements, "[]", 3);
} else {
ItemId itemptrs = (ItemId)malloc(itemptr_count * sizeof(ItemIdData));
item_pointers = linepointers_to_json(page, itemptr_count, itemptrs);
// Get Elements
elements =
tuples_to_json(page, blockNumber, itemptrs, itemptr_count, dim, m);
if (!elements) {
free(item_pointers);
free(elements);
return NULL;
}
// =============
}
// =================
uint32 size = strlen(elements) + strlen(item_pointers) + 350;
char *ret = (char *)malloc(size);
memset(ret, 0x0, size);
int length = snprintf(
ret, size,
"{"
"\"header\": {"
"\"pd_lsn\": \"%u/%X\", \"pd_checksum\": %u, "
"\"pd_flags\": %u, \"pd_lower\": %u, \"pd_upper\": %u, "
"\"pd_special\": %u, \"pd_pagesize_version\": %u, "
"\"pd_prune_xid\": %u, \"pd_linp\": %s }, "
"\"elements\": %s,"
"\"special\": { \"nextblkno\": %u, \"unused\": %u, \"page_id\": %u } }",
page_hdr->pd_lsn.xlogid, page_hdr->pd_lsn.xrecoff, page_hdr->pd_checksum,
page_hdr->pd_flags, page_hdr->pd_lower, page_hdr->pd_upper,
page_hdr->pd_special, page_hdr->pd_pagesize_version,
page_hdr->pd_prune_xid, item_pointers, elements, opaque_data->nextblkno,
opaque_data->unused, opaque_data->page_id);
ret[length] = '\0';
free(item_pointers);
free(elements);
return ret;
}
static char *pages_to_json(const char *sector, uint32 pagecnt, uint32 dim,
uint32 m) {
uint32 size = 0;
char *ret = NULL;
char *page = (char *)malloc(BLCKSZ);
for (uint32 i = 1; i < pagecnt; i++) {
uint32 page_offset = i * BLCKSZ;
memcpy(page, sector + page_offset, BLCKSZ);
char *page_json = page_to_json(page, i, dim, m);
if (!page_json) {
free(page);
if (ret) {
free(ret);
}
return NULL;
}
if (i == 1) {
size = strlen(page_json) * pagecnt + 4;
ret = (char *)malloc(size);
memset(ret, 0x0, size);
snprintf(ret, size, "[%s,", page_json);
} else {
snprintf(ret, size, "%s%s,", ret, page_json);
}
free(page_json);
}
memcpy(&ret[strlen(ret) - 1], "]\0", 2);
free(page);
return ret;
}
char *parse_pgvector_index_to_json(const char *filename) {
uint32 page_count = 0;
const char *sector;
sector = read_file_content(filename, &page_count);
if (sector == NULL) {
puts("Could not read index file");
return NULL;
}
char *first_page = (char *)malloc(BLCKSZ);
memcpy(first_page, sector + BLCKSZ, BLCKSZ);
/* Metadata Page */
char *header_page = (char *)malloc(BLCKSZ);
memcpy(header_page, sector, BLCKSZ);
HnswMetaPage metapage = (HnswMetaPage)malloc(sizeof(HnswMetaPageData));
memcpy(metapage, header_page + offsetof(PageHeaderData, pd_linp),
sizeof(HnswMetaPageData));
char *metapage_json = metapage_to_json(header_page);
char *data_pages_json =
pages_to_json(sector, page_count, metapage->dimensions, metapage->m);
if (!data_pages_json) {
free(metapage_json);
return NULL;
}
uint32 size = strlen(metapage_json) + strlen(data_pages_json) + 256;
char *ret = (char *)malloc(size);
int len = snprintf(ret, size, "{ \"metadata\": %s, \"pages\": %s }",
metapage_json, data_pages_json);
free(metapage_json);
free(data_pages_json);
ret[len] = '\0';
return ret;
}
#endif // PGVECTOR_INDEX_PARSER_H
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment