Created
May 15, 2020 16:57
-
-
Save samkhn/f489875d75d7fdc9a9a3a0f215c2a4f3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Sparse Matrices | |
#include <stdio.h> | |
#include <stdlib.h> | |
void PrintMatrix(int** twoDimensionalArray, unsigned numRows, unsigned numColumns); | |
// COO: coordinate format. We store tuples of (row, column, value). | |
// Ex: | |
// For matrix M = { | |
// {1, 2}, | |
// {0, 4} | |
// } | |
// We will generate three tuples: (0, 0, 1), (0, 1, 2), (1, 1, 4) | |
// This will become C arrays: row = {0, 0, 1}, column = {0, 1, 1}, values = {1, 2, 4}. | |
// We never store zeroes; any coordinate not stored is zero | |
// | |
// Performance characteristics: (assuming matrix holds doubles) | |
// Every tuple will take up 16 bytes aka every non zero value in the matrix will now be 16 bytes. This can be weighed against 8 bytes for every element of a matrix. | |
// So ( 16 * sparsity * numRow * numCol < 8 * numRow * numCol ) -> ( sparsity < 0.5 ). This tells us COO works well if more than 50% of the values in a matrix are 0. | |
// Some problems: | |
// - COO takes more memory per value | |
// - if rows/columns aren't sorted (aka just a set of tuples) then lookups are linear (whereas in an double M[A][B] the lookup would be constant). | |
struct IntMatrixCoordinates; | |
typedef struct IntMatrixCoordinates IntMatrixCoordinates; | |
IntMatrixCoordinates* CreateIntMatrixCoordinate(int** twoDimensionalArray, unsigned numRows, unsigned numColumns); | |
void PrintMatrixCoordinates(IntMatrixCoordinates* coordinates); | |
// CSR: optimization of COO | |
// COO can store a lot of redundant values. For instance in the matrix M from the previous example, we see repeated instances of 0 in the row array. We can shave these off. | |
// We can compress further by saying that row array is instead a row_count. We keep columns and values sorted by array and that way, row_count[i] stores the number of nonzero values at row i. | |
// That representation can muddle lookups a little bit. When you do a lookup you'll often need to sum up the row counts to know where something lies. | |
// Instead we will store the offsets which is the sum of the counts from row_count[0] to row_count[i-1]. That way we know immediately which row and column to lookup. <- This is CSR. | |
// CSR stores three arrays | |
// row_offsets: int at index i of this array says that row i's columns start at row_offset[i]. The value at row_offset[i] is the index in column/value where the column starts. | |
// columns: (row sorted) container of unsigned ints | |
// values: (row sorted) container of values that we see in M | |
// Ex: | |
// For matrix N = { | |
// {3.0, 0.0, 0.0, 4.0, 0.0, 1.0}, | |
// {1.0, 0.0, 0.0, 0.0, 0.0, 0.0}, | |
// {0.0, 0.0, 0.0, 2.0, 0.0, 0.0}, | |
// } | |
// row_counts[0] = 3, [1] = 1, [2] = 1. | |
// CSR format will produce the following: | |
// row_offset: {0, 3, 4}, column = {0, 3, 5, 0, 3}, values = {3.0, 4.0, 1.0, 1.0, 2.0} | |
struct IntCompressedSparseRow; | |
typedef struct IntCompressedSparseRow IntCompressedSparseRow; | |
IntCompressedSparseRow* CreateIntCompressedSparseRow(int** twoDimensionalArray, unsigned numRows, unsigned numColumns); | |
int CompressedSparseRow_Get(IntCompressedSparseRow *csr, unsigned row, unsigned column); | |
void PrintCompressedSparseRow(IntCompressedSparseRow* csr); | |
int main() | |
{ | |
int oneDimensionArray[5] = {0, 1, 2, 3, 4}; | |
int *pOneDimensionalArray = NULL; | |
pOneDimensionalArray = (int*)calloc(5, sizeof(int)); | |
for (int i = 0; i < 5; i++) | |
pOneDimensionalArray[i] = i; | |
for (int i = 0; i < 5; i++) | |
{ | |
if (oneDimensionArray[i] != pOneDimensionalArray[i]) | |
{ | |
printf("One dimensional arrays aren't the same: at index %d oneDimensionArray has %d while pOneDimensionalArray has %d\n", i, oneDimensionArray[i], pOneDimensionalArray[i]); | |
break; | |
} | |
} | |
int twoDimensionalArray[3][3] = { | |
{0, 0, 1}, | |
{2, 0, 0}, | |
{1, 2, 0} | |
}; | |
// First index goes into a row, second index goes into a column. | |
if (twoDimensionalArray[2][2] != 0) | |
printf("Two dimensional array [%d][%d] got %d wanted %d\n", 2, 2, twoDimensionalArray[2][2], 0); | |
if (twoDimensionalArray[1][0] != 2) | |
printf("Two dimensional array [%d][%d] got %d wanted %d\n", 1, 0, twoDimensionalArray[1][0], 2); | |
if (twoDimensionalArray[2][1] != 2) | |
printf("Two dimensional array [%d][%d] got %d wanted %d\n", 2, 1, twoDimensionalArray[2][1], 2); | |
if (twoDimensionalArray[0][2] != twoDimensionalArray[0][2]) | |
printf("Two dimensional array [%d][%d] (value: %d) isn't same as [%d][%d] (value %d)\n", 0, 2, twoDimensionalArray[0][2], 2, 0, twoDimensionalArray[2][0]); | |
// Different ways to represent sparse matrices | |
// COO: | |
int** M = (int**)malloc(2 * sizeof(int*)); | |
for (int i = 0; i < 2; i++) | |
M[i] = (int*)malloc(2 * sizeof(int)); | |
M[0][0] = 1; | |
M[0][1] = 2; | |
M[1][0] = 0; | |
M[1][1] = 4; | |
IntMatrixCoordinates *coordinates = CreateIntMatrixCoordinate(M, 2, 2); | |
printf("Matrix:\n"); | |
PrintMatrix(M, 2, 2); | |
printf("in COO format is:\n"); | |
PrintMatrixCoordinates(coordinates); | |
// CSR: | |
int** N = (int**)malloc(3 * sizeof(int*)); | |
for (int i = 0; i < 3; i++) | |
N[i] = (int*)malloc(6 * sizeof(int)); | |
N[0][0] = 3; | |
N[0][3] = 4; | |
N[0][5] = 1; | |
N[1][0] = 1; | |
N[2][3] = 2; | |
IntCompressedSparseRow* csr = CreateIntCompressedSparseRow(N, 3, 6); | |
printf("Matrix:\n"); | |
PrintMatrix(N, 3, 6); | |
printf("in CSR format is:\n"); | |
PrintCompressedSparseRow(csr); | |
int lookup = CompressedSparseRow_Get(csr, 0, 0); | |
if (lookup != 3) | |
printf("CompressedSparseRow_Get(%d, %d) failed. Got %d. Wanted %d\n", 0, 0, lookup, 3); | |
lookup = CompressedSparseRow_Get(csr, 0, 3); | |
if (lookup != 4) | |
printf("CompressedSparseRow_Get(%d, %d) failed. Got %d. Wanted %d\n", 0, 3, lookup, 4); | |
lookup = CompressedSparseRow_Get(csr, 2, 3); | |
if (lookup != 2) | |
printf("CompressedSparseRow_Get(%d, %d) failed. Got %d. Wanted %d\n", 2, 3, lookup, 2); | |
free(pOneDimensionalArray); | |
for (int i = 0; i < 2; i++) | |
free(N[i]); | |
free(N); | |
for (int i = 0; i < 2; i++) | |
free(M[i]); | |
free(M); | |
free(coordinates); | |
return 0; | |
} | |
void PrintMatrix(int** twoDimensionalArray, unsigned numRows, unsigned numColumns) | |
{ | |
for (int i = 0; i < numRows; i++) | |
{ | |
for (int j = 0; j < numColumns; j++) | |
printf(" %d", twoDimensionalArray[i][j]); | |
printf("\n"); | |
} | |
} | |
typedef struct IntMatrixCoordinates | |
{ | |
int numTuples; | |
int *row; | |
int *column; | |
int *value; | |
} IntMatrixCoordinates; | |
IntMatrixCoordinates* CreateIntMatrixCoordinate(int** twoDimensionalArray, unsigned numRows, unsigned numColumns) | |
{ | |
IntMatrixCoordinates *coordinates = (IntMatrixCoordinates*)calloc(1, sizeof(IntMatrixCoordinates)); | |
long long rawValueCount = numRows * numColumns; | |
coordinates->row = (int*)calloc(rawValueCount, sizeof(int)); | |
coordinates->column = (int*)calloc(rawValueCount, sizeof(int)); | |
coordinates->value = (int*)calloc(rawValueCount, sizeof(int)); | |
int currentTupleIndex = 0; | |
for (unsigned i = 0; i < numRows; i++) | |
{ | |
for (unsigned j = 0; j < numColumns; j++) | |
{ | |
if (twoDimensionalArray[i][j] == 0) continue; | |
coordinates->row[currentTupleIndex] = i; | |
coordinates->column[currentTupleIndex] = j; | |
coordinates->value[currentTupleIndex++] = twoDimensionalArray[i][j]; | |
} | |
} | |
// Resize now that we know the max number of tuples | |
coordinates->numTuples = currentTupleIndex; | |
coordinates->row = (int*)realloc(coordinates->row, currentTupleIndex * sizeof(int)); | |
coordinates->column = (int*)realloc(coordinates->column, currentTupleIndex * sizeof(int)); | |
coordinates->value = (int*)realloc(coordinates->value, currentTupleIndex * sizeof(int)); | |
return coordinates; | |
} | |
void PrintMatrixCoordinates(IntMatrixCoordinates* coordinates) | |
{ | |
for (int i = 0; i < coordinates->numTuples; i++) | |
printf(" (%d %d %d)\n", coordinates->row[i], coordinates->column[i], coordinates->value[i]); | |
printf("\n"); | |
} | |
typedef struct IntCompressedSparseRow | |
{ | |
int numRowOffsets; | |
int numColumns; | |
int numValues; | |
int *row_offset; | |
int *column; | |
int *value; | |
} IntCompressedSparseRow; | |
IntCompressedSparseRow* CreateIntCompressedSparseRow(int** twoDimensionalArray, unsigned numRows, unsigned numColumns) | |
{ | |
IntCompressedSparseRow* csr = (IntCompressedSparseRow*)calloc(1, sizeof(IntCompressedSparseRow)); | |
long long rawValueCount = numRows * numColumns; | |
csr->row_offset = (int*)calloc(rawValueCount, sizeof(int)); | |
csr->column = (int*)calloc(rawValueCount, sizeof(int)); | |
csr->value = (int*)calloc(rawValueCount, sizeof(int)); | |
for (unsigned i = 0; i < numRows; i++) | |
{ | |
int rowOffsetMade = 0; | |
for (unsigned j = 0; j < numColumns; j++) | |
{ | |
if (twoDimensionalArray[i][j] == 0) continue; | |
csr->column[csr->numColumns] = j; | |
if (!rowOffsetMade) | |
{ | |
csr->row_offset[csr->numRowOffsets++] = csr->numColumns; | |
rowOffsetMade = 1; | |
} | |
csr->value[csr->numColumns++] = twoDimensionalArray[i][j]; | |
csr->numValues = csr->numColumns; | |
} | |
} | |
// Resize now that we know the max number of tuples | |
csr->row_offset = (int*)realloc(csr->row_offset, csr->numRowOffsets * sizeof(int)); | |
csr->column = (int*)realloc(csr->column, csr->numColumns * sizeof(int)); | |
csr->value = (int*)realloc(csr->value, csr->numValues * sizeof(int)); | |
return csr; | |
} | |
int CompressedSparseRow_Get(IntCompressedSparseRow *csr, unsigned row, unsigned column) | |
{ | |
if (!csr || row >= csr->numRowOffsets || column >= csr->numColumns) | |
return -1; | |
int columnEnd = csr->numColumns; | |
if (row != csr->numRowOffsets - 1) | |
columnEnd = csr->row_offset[row+1]; | |
for (int columnIndex = csr->column[csr->row_offset[row]]; columnIndex < columnEnd; columnIndex++) | |
if (csr->column[columnIndex] == column) return csr->value[columnIndex]; | |
return -1; | |
} | |
void PrintCompressedSparseRow(IntCompressedSparseRow* csr) | |
{ | |
printf(" row_offsets: {"); | |
for (int i = 0; i < csr->numRowOffsets; i++) | |
printf(" %d", csr->row_offset[i]); | |
printf(" }\n"); | |
printf(" columns: {"); | |
for (int i = 0; i < csr->numColumns; i++) | |
printf(" %d", csr->column[i]); | |
printf(" }\n"); | |
printf(" values: {"); | |
for (int i = 0; i < csr->numValues; i++) | |
printf(" %d", csr->value[i]); | |
printf(" }\n\n"); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment