-
-
Save var77/fc028790e5f0f39246abdafb9282d747 to your computer and use it in GitHub Desktop.
Parse raw pgvector index file to JSON
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| char *parse_pgvector_index_to_json(const char *filename); | |
| #ifndef PGVECTOR_INDEX_PARSER_H | |
| #define PGVECTOR_INDEX_PARSER_H | |
| #include <stddef.h> | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <string.h> | |
| #include <sys/stat.h> | |
| #include <unistd.h> | |
| #define TID_JSON_SIZE 120 | |
| #define HEAP_TID_CNT 10 | |
| #define ELEMENT_STRUCT_JSON_SIZE 280 | |
| #define METADATA_PAGE_JSON_SIZE 1024 | |
| #define ITEM_PTR_JSON_SIZE 256 | |
| #define MAX_FLOAT_JSON_SIZE 128 | |
| #ifndef HNSW_ELEMENT_TUPLE_TYPE | |
| #define HNSW_ELEMENT_TUPLE_TYPE 1 | |
| #define HNSW_NEIGHBOR_TUPLE_TYPE 2 | |
| #ifndef BLCKSZ | |
| #define BLCKSZ 8192 | |
| typedef unsigned int uint32; /* == 32 bits */ | |
| typedef unsigned short uint16; /* == 16 bits */ | |
| typedef signed short int16; /* == 16 bits */ | |
| typedef unsigned char uint8; /* == 8 bits */ | |
| typedef signed int int32; /* == 32 bits */ | |
| typedef uint16 LocationIndex; | |
| typedef struct { | |
| uint32 xlogid; /* high bits */ | |
| uint32 xrecoff; /* low bits */ | |
| } PageXLogRecPtr; | |
| typedef uint32 TransactionId; | |
| typedef struct ItemIdData { | |
| unsigned lp_off : 15, /* offset to tuple (from start of page) */ | |
| lp_flags : 2, /* state of line pointer, see below */ | |
| lp_len : 15; /* byte length of tuple */ | |
| } ItemIdData; | |
| typedef ItemIdData *ItemId; | |
| typedef struct PageHeaderData { | |
| /* XXX LSN is member of *any* block, not only page-organized ones */ | |
| PageXLogRecPtr pd_lsn; /* LSN: next byte after last byte of xlog | |
| * record for last change to this page */ | |
| uint16 pd_checksum; /* checksum */ | |
| uint16 pd_flags; /* flag bits, see below */ | |
| LocationIndex pd_lower; /* offset to start of free space */ | |
| LocationIndex pd_upper; /* offset to end of free space */ | |
| LocationIndex pd_special; /* offset to start of special space */ | |
| uint16 pd_pagesize_version; | |
| TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */ | |
| ItemIdData pd_linp[]; /* line pointer array */ | |
| } PageHeaderData; | |
| typedef PageHeaderData *PageHeader; | |
| typedef uint32 BlockNumber; | |
| typedef uint16 OffsetNumber; | |
| typedef struct BlockIdData { | |
| uint16 bi_hi; | |
| uint16 bi_lo; | |
| } BlockIdData; | |
| typedef BlockIdData *BlockId; | |
| typedef struct ItemPointerData { | |
| BlockIdData ip_blkid; | |
| OffsetNumber ip_posid; | |
| } ItemPointerData; | |
| typedef ItemPointerData *ItemPointer; | |
| #endif // Postgres defines | |
| typedef struct HnswMetaPageData { | |
| uint32 magicNumber; | |
| uint32 version; | |
| uint32 dimensions; | |
| uint16 m; | |
| uint16 efConstruction; | |
| BlockNumber entryBlkno; | |
| OffsetNumber entryOffno; | |
| int16 entryLevel; | |
| BlockNumber insertPage; | |
| } HnswMetaPageData; | |
| typedef HnswMetaPageData *HnswMetaPage; | |
| typedef HnswMetaPageData *HnswMetaPage; | |
| typedef struct HnswPageOpaqueData { | |
| BlockNumber nextblkno; | |
| uint16 unused; | |
| uint16 page_id; /* for identification of HNSW indexes */ | |
| } HnswPageOpaqueData; | |
| typedef HnswPageOpaqueData *HnswPageOpaque; | |
| typedef struct Vector { | |
| int32 vl_len_; /* varlena header (do not touch directly!) */ | |
| int16 dim; /* number of dimensions */ | |
| int16 unused; /* reserved for future use, always zero */ | |
| float x[]; | |
| } Vector; | |
| typedef struct HnswElementTupleData { | |
| uint8 type; | |
| uint8 level; | |
| uint8 deleted; | |
| uint8 unused; | |
| ItemPointerData heaptids[HEAP_TID_CNT]; | |
| ItemPointerData neighbortid; | |
| uint16 unused2; | |
| Vector data; | |
| } HnswElementTupleData; | |
| typedef HnswElementTupleData *HnswElementTuple; | |
| typedef struct HnswNeighborTupleData { | |
| uint8 type; | |
| uint8 unused; | |
| uint16 count; | |
| ItemPointerData indextids[]; | |
| } HnswNeighborTupleData; | |
| typedef HnswNeighborTupleData *HnswNeighborTuple; | |
| #endif // pgvector defines | |
| /* ******* START IMPLEMENTATION ******** */ | |
| static char *read_file_content(const char *filename, uint32 *page_cnt) { | |
| char *buf; | |
| FILE *file; | |
| size_t bytes_to_read; | |
| struct stat fst; | |
| if (stat(filename, &fst) < 0) { | |
| printf("could not stat file \"%s\" \n", filename); | |
| return NULL; | |
| } | |
| if (fst.st_size < BLCKSZ) { | |
| puts("file is smaller than page size"); | |
| return NULL; | |
| } | |
| bytes_to_read = (size_t)fst.st_size; | |
| if ((file = fopen(filename, "rb")) == NULL) { | |
| printf("could not open file \"%s\" for reading\n", filename); | |
| return NULL; | |
| } | |
| buf = (char *)malloc(bytes_to_read + 1); | |
| *page_cnt = bytes_to_read / BLCKSZ; | |
| int length = fread(buf, 1, bytes_to_read, file); | |
| if (ferror(file)) { | |
| printf("could not read file \"%s\"\n", filename); | |
| return NULL; | |
| } | |
| fclose(file); | |
| buf[length] = '\0'; | |
| return buf; | |
| } | |
| static char *metapage_to_json(char *header_page) { | |
| HnswMetaPage metapage = (HnswMetaPage)malloc(sizeof(HnswMetaPageData)); | |
| memcpy(metapage, header_page + offsetof(PageHeaderData, pd_linp), | |
| sizeof(HnswMetaPageData)); | |
| char *ret = (char *)malloc(METADATA_PAGE_JSON_SIZE); | |
| int len = snprintf( | |
| ret, METADATA_PAGE_JSON_SIZE, | |
| "{" | |
| "\"magicNumber\": %u," | |
| "\"version\": %u," | |
| "\"dimensions\": %u," | |
| "\"m\": %u," | |
| "\"efConstruction\": %u," | |
| "\"entryBlkno\": %u," | |
| "\"entryOffno\": %u," | |
| "\"entryLevel\": %d," | |
| "\"insertPage\": %u" | |
| "}", | |
| metapage->magicNumber, metapage->version, metapage->dimensions, | |
| metapage->m, metapage->efConstruction, metapage->entryBlkno, | |
| metapage->entryOffno, metapage->entryLevel, metapage->insertPage); | |
| ret[len] = '\0'; | |
| free(metapage); | |
| return ret; | |
| } | |
| static char *linepointers_to_json(char *page, uint32 itemptr_count, | |
| ItemId itemptrs) { | |
| char itemptr_json[ITEM_PTR_JSON_SIZE]; | |
| uint32 size = itemptr_count * ITEM_PTR_JSON_SIZE; | |
| char *ret = (char *)malloc(size); | |
| memset(ret, 0x0, size); | |
| for (uint32 i = 0; i < itemptr_count; i++) { | |
| ItemIdData itemptr = ((ItemId)(page + sizeof(PageHeaderData)))[i]; | |
| itemptrs[i] = itemptr; | |
| snprintf(itemptr_json, ITEM_PTR_JSON_SIZE, | |
| "{ \"lp_off\": %u, \"lp_flags\": %u, \"lp_len\": %u }", | |
| itemptr.lp_off, itemptr.lp_flags, itemptr.lp_len); | |
| if (i == 0) { | |
| snprintf(ret, size, "[%s,", itemptr_json); | |
| } else { | |
| snprintf(ret, size, "%s%s,", ret, itemptr_json); | |
| } | |
| } | |
| memcpy(&ret[strlen(ret) - 1], "]", 1); | |
| return ret; | |
| } | |
| static char *itemptr_to_json(ItemPointer itemptr) { | |
| char *ret = (char *)malloc(ITEM_PTR_JSON_SIZE); | |
| memset(ret, 0x0, ITEM_PTR_JSON_SIZE); | |
| snprintf( | |
| ret, TID_JSON_SIZE, | |
| "{ \"ip_blkid\": { \"bi_hi\": %u, \"bi_lo\": %u }, \"ip_posid\": %u }", | |
| itemptr->ip_blkid.bi_hi, itemptr->ip_blkid.bi_lo, itemptr->ip_posid); | |
| uint32 len = strlen(ret); | |
| ret[len] = '\0'; | |
| return ret; | |
| } | |
| static char *itemptrs_to_json(ItemPointer itemptrs, uint32 count) { | |
| uint32 size = ITEM_PTR_JSON_SIZE * count; | |
| char itemptr_json[ITEM_PTR_JSON_SIZE]; | |
| char *ret = (char *)malloc(size); | |
| memset(ret, 0x0, size); | |
| for (uint32 i = 0; i < count; i++) { | |
| memcpy(&itemptr_json, itemptr_to_json(&itemptrs[i]), ITEM_PTR_JSON_SIZE); | |
| if (i == 0) { | |
| snprintf(ret, size, "[%s,", itemptr_json); | |
| } else { | |
| snprintf(ret, size, "%s%s,", ret, itemptr_json); | |
| } | |
| } | |
| uint32 len = strlen(ret); | |
| ret[len - 1] = ']'; | |
| ret[len] = '\0'; | |
| return ret; | |
| } | |
| static char *vector_to_json(Vector *vec) { | |
| uint32 size = MAX_FLOAT_JSON_SIZE * vec->dim; | |
| char *ret = (char *)malloc(size); | |
| memset(ret, 0x0, size); | |
| for (uint32 i = 0; i < vec->dim; i++) { | |
| if (i == 0) { | |
| snprintf(ret, size, "[%f,", vec->x[i]); | |
| } else { | |
| snprintf(ret, size, "%s%f,", ret, vec->x[i]); | |
| } | |
| } | |
| uint32 len = strlen(ret); | |
| ret[len - 1] = ']'; | |
| ret[len] = '\0'; | |
| return ret; | |
| } | |
| static char *tuples_to_json(char *page, uint32 blockNumber, ItemId itemptrs, | |
| uint32 tuple_count, uint32 dim, uint32 m) { | |
| uint32 tuple_vector_size = 64 * dim; | |
| uint32 tuple_json_size = TID_JSON_SIZE * HEAP_TID_CNT + | |
| ELEMENT_STRUCT_JSON_SIZE + tuple_vector_size; | |
| uint32 neighbor_json_size = m * 4 * TID_JSON_SIZE + ELEMENT_STRUCT_JSON_SIZE; | |
| uint32 tuple_size = tuple_json_size; | |
| uint32 len = 0; | |
| if (tuple_json_size < neighbor_json_size) { | |
| tuple_size = neighbor_json_size; | |
| } | |
| char *tuple_json = (char *)malloc(tuple_size); | |
| uint32 size = tuple_count * tuple_size; | |
| char *ret = (char *)malloc(size); | |
| memset(ret, 0x0, size); | |
| for (uint32 i = 0; i < tuple_count; i++) { | |
| memset(tuple_json, 0x0, tuple_size); | |
| ItemIdData itemptr = itemptrs[i]; | |
| char *tuple = (char *)malloc(itemptr.lp_len); | |
| memcpy(tuple, page + itemptr.lp_off, itemptr.lp_len); | |
| ItemPointerData tid = {0}; | |
| tid.ip_blkid.bi_hi = (uint16)(blockNumber >> 16); | |
| tid.ip_blkid.bi_lo = (uint16)(blockNumber & 0xFFFF); | |
| tid.ip_posid = (OffsetNumber)(i + 1); | |
| char *tid_json = itemptr_to_json(&tid); | |
| uint8 type; | |
| memcpy(&type, tuple, sizeof(uint8)); | |
| if (type == HNSW_ELEMENT_TUPLE_TYPE) { | |
| HnswElementTuple element_tuple = (HnswElementTuple)tuple; | |
| char *itemptrs_json = | |
| itemptrs_to_json((ItemPointer)&element_tuple->heaptids, HEAP_TID_CNT); | |
| char *neighbortid_json = itemptr_to_json(&element_tuple->neighbortid); | |
| char *vector_json = NULL; | |
| if (element_tuple->deleted) { | |
| strncpy(vector_json, "[]", 3); | |
| } else { | |
| vector_json = vector_to_json(&element_tuple->data); | |
| } | |
| snprintf( | |
| tuple_json, tuple_size, | |
| "{" | |
| "\"tid\": %s," | |
| "\"value\": {" | |
| "\"type\": \"TYPE_ELEMENT\", \"level\": %u, \"deleted\": %u, " | |
| "\"unused\": " | |
| "%u," | |
| "\"heaptids\": %s, \"neighbortid\": %s, \"unused2\": %u, \"data\": %s" | |
| "}" | |
| "}", | |
| tid_json, element_tuple->level, element_tuple->deleted, | |
| element_tuple->unused, itemptrs_json, neighbortid_json, | |
| element_tuple->unused2, vector_json); | |
| free(itemptrs_json); | |
| free(neighbortid_json); | |
| free(vector_json); | |
| } else if (type == HNSW_NEIGHBOR_TUPLE_TYPE) { | |
| HnswNeighborTuple neighbor_tuple = (HnswNeighborTuple)tuple; | |
| char *indextids_json = itemptrs_to_json( | |
| (ItemPointer)&neighbor_tuple->indextids, neighbor_tuple->count); | |
| snprintf(tuple_json, tuple_size, | |
| "{" | |
| "\"tid\": %s," | |
| "\"value\": {" | |
| "\"type\": \"TYPE_NEIGHBOR\",\"unused\": %u," | |
| "\"count\": %u," | |
| "\"indextids\": %s" | |
| "}" | |
| "}", | |
| tid_json, neighbor_tuple->unused, neighbor_tuple->count, | |
| indextids_json); | |
| free(indextids_json); | |
| } else { | |
| printf("Invalid tuple type: %u \n", type); | |
| free(ret); | |
| free(tuple_json); | |
| free(tid_json); | |
| return NULL; | |
| } | |
| if (i == 0) { | |
| len = snprintf(ret, size, "[%s,", tuple_json); | |
| } else { | |
| len = snprintf(ret, size, "%s%s,", ret, tuple_json); | |
| } | |
| ret[len] = '\0'; | |
| free(tid_json); | |
| } | |
| free(tuple_json); | |
| len = strlen(ret); | |
| ret[len - 1] = ']'; | |
| ret[len] = '\0'; | |
| return ret; | |
| } | |
| static char *page_to_json(char *page, uint32 blockNumber, uint32 dim, | |
| uint32 m) { | |
| char *item_pointers = NULL; | |
| char *elements = NULL; | |
| PageHeaderData *page_hdr = (PageHeaderData *)malloc(sizeof(PageHeaderData)); | |
| memcpy(page_hdr, page, sizeof(PageHeaderData)); | |
| HnswPageOpaque opaque_data = | |
| (HnswPageOpaqueData *)malloc(sizeof(HnswPageOpaqueData)); | |
| memcpy(opaque_data, page + page_hdr->pd_special, sizeof(HnswPageOpaqueData)); | |
| // Get Item Pointers | |
| uint32 itemptr_count = | |
| (page_hdr->pd_lower - sizeof(PageHeaderData)) / sizeof(ItemIdData); | |
| if (itemptr_count == 0) { | |
| strncpy(item_pointers, "[]", 3); | |
| strncpy(elements, "[]", 3); | |
| } else { | |
| ItemId itemptrs = (ItemId)malloc(itemptr_count * sizeof(ItemIdData)); | |
| item_pointers = linepointers_to_json(page, itemptr_count, itemptrs); | |
| // Get Elements | |
| elements = | |
| tuples_to_json(page, blockNumber, itemptrs, itemptr_count, dim, m); | |
| if (!elements) { | |
| free(item_pointers); | |
| free(elements); | |
| return NULL; | |
| } | |
| // ============= | |
| } | |
| // ================= | |
| uint32 size = strlen(elements) + strlen(item_pointers) + 350; | |
| char *ret = (char *)malloc(size); | |
| memset(ret, 0x0, size); | |
| int length = snprintf( | |
| ret, size, | |
| "{" | |
| "\"header\": {" | |
| "\"pd_lsn\": \"%u/%X\", \"pd_checksum\": %u, " | |
| "\"pd_flags\": %u, \"pd_lower\": %u, \"pd_upper\": %u, " | |
| "\"pd_special\": %u, \"pd_pagesize_version\": %u, " | |
| "\"pd_prune_xid\": %u, \"pd_linp\": %s }, " | |
| "\"elements\": %s," | |
| "\"special\": { \"nextblkno\": %u, \"unused\": %u, \"page_id\": %u } }", | |
| page_hdr->pd_lsn.xlogid, page_hdr->pd_lsn.xrecoff, page_hdr->pd_checksum, | |
| page_hdr->pd_flags, page_hdr->pd_lower, page_hdr->pd_upper, | |
| page_hdr->pd_special, page_hdr->pd_pagesize_version, | |
| page_hdr->pd_prune_xid, item_pointers, elements, opaque_data->nextblkno, | |
| opaque_data->unused, opaque_data->page_id); | |
| ret[length] = '\0'; | |
| free(item_pointers); | |
| free(elements); | |
| return ret; | |
| } | |
| static char *pages_to_json(const char *sector, uint32 pagecnt, uint32 dim, | |
| uint32 m) { | |
| uint32 size = 0; | |
| char *ret = NULL; | |
| char *page = (char *)malloc(BLCKSZ); | |
| for (uint32 i = 1; i < pagecnt; i++) { | |
| uint32 page_offset = i * BLCKSZ; | |
| memcpy(page, sector + page_offset, BLCKSZ); | |
| char *page_json = page_to_json(page, i, dim, m); | |
| if (!page_json) { | |
| free(page); | |
| if (ret) { | |
| free(ret); | |
| } | |
| return NULL; | |
| } | |
| if (i == 1) { | |
| size = strlen(page_json) * pagecnt + 4; | |
| ret = (char *)malloc(size); | |
| memset(ret, 0x0, size); | |
| snprintf(ret, size, "[%s,", page_json); | |
| } else { | |
| snprintf(ret, size, "%s%s,", ret, page_json); | |
| } | |
| free(page_json); | |
| } | |
| memcpy(&ret[strlen(ret) - 1], "]\0", 2); | |
| free(page); | |
| return ret; | |
| } | |
| char *parse_pgvector_index_to_json(const char *filename) { | |
| uint32 page_count = 0; | |
| const char *sector; | |
| sector = read_file_content(filename, &page_count); | |
| if (sector == NULL) { | |
| puts("Could not read index file"); | |
| return NULL; | |
| } | |
| char *first_page = (char *)malloc(BLCKSZ); | |
| memcpy(first_page, sector + BLCKSZ, BLCKSZ); | |
| /* Metadata Page */ | |
| char *header_page = (char *)malloc(BLCKSZ); | |
| memcpy(header_page, sector, BLCKSZ); | |
| HnswMetaPage metapage = (HnswMetaPage)malloc(sizeof(HnswMetaPageData)); | |
| memcpy(metapage, header_page + offsetof(PageHeaderData, pd_linp), | |
| sizeof(HnswMetaPageData)); | |
| char *metapage_json = metapage_to_json(header_page); | |
| char *data_pages_json = | |
| pages_to_json(sector, page_count, metapage->dimensions, metapage->m); | |
| if (!data_pages_json) { | |
| free(metapage_json); | |
| return NULL; | |
| } | |
| uint32 size = strlen(metapage_json) + strlen(data_pages_json) + 256; | |
| char *ret = (char *)malloc(size); | |
| int len = snprintf(ret, size, "{ \"metadata\": %s, \"pages\": %s }", | |
| metapage_json, data_pages_json); | |
| free(metapage_json); | |
| free(data_pages_json); | |
| ret[len] = '\0'; | |
| return ret; | |
| } | |
| #endif // PGVECTOR_INDEX_PARSER_H |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment