Skip to content

Instantly share code, notes, and snippets.

@samkhn
Created May 15, 2020 16:57
Show Gist options
  • Save samkhn/f489875d75d7fdc9a9a3a0f215c2a4f3 to your computer and use it in GitHub Desktop.
Save samkhn/f489875d75d7fdc9a9a3a0f215c2a4f3 to your computer and use it in GitHub Desktop.
// Sparse Matrices
#include <stdio.h>
#include <stdlib.h>
void PrintMatrix(int** twoDimensionalArray, unsigned numRows, unsigned numColumns);
// COO: coordinate format. We store tuples of (row, column, value).
// Ex:
// For matrix M = {
// {1, 2},
// {0, 4}
// }
// We will generate three tuples: (0, 0, 1), (0, 1, 2), (1, 1, 4)
// This will become C arrays: row = {0, 0, 1}, column = {0, 1, 1}, values = {1, 2, 4}.
// We never store zeroes; any coordinate not stored is zero
//
// Performance characteristics: (assuming matrix holds doubles)
// Every tuple will take up 16 bytes aka every non zero value in the matrix will now be 16 bytes. This can be weighed against 8 bytes for every element of a matrix.
// So ( 16 * sparsity * numRow * numCol < 8 * numRow * numCol ) -> ( sparsity < 0.5 ). This tells us COO works well if more than 50% of the values in a matrix are 0.
// Some problems:
// - COO takes more memory per value
// - if rows/columns aren't sorted (aka just a set of tuples) then lookups are linear (whereas in an double M[A][B] the lookup would be constant).
struct IntMatrixCoordinates;
typedef struct IntMatrixCoordinates IntMatrixCoordinates;
IntMatrixCoordinates* CreateIntMatrixCoordinate(int** twoDimensionalArray, unsigned numRows, unsigned numColumns);
void PrintMatrixCoordinates(IntMatrixCoordinates* coordinates);
// CSR: optimization of COO
// COO can store a lot of redundant values. For instance in the matrix M from the previous example, we see repeated instances of 0 in the row array. We can shave these off.
// We can compress further by saying that row array is instead a row_count. We keep columns and values sorted by array and that way, row_count[i] stores the number of nonzero values at row i.
// That representation can muddle lookups a little bit. When you do a lookup you'll often need to sum up the row counts to know where something lies.
// Instead we will store the offsets which is the sum of the counts from row_count[0] to row_count[i-1]. That way we know immediately which row and column to lookup. <- This is CSR.
// CSR stores three arrays
// row_offsets: int at index i of this array says that row i's columns start at row_offset[i]. The value at row_offset[i] is the index in column/value where the column starts.
// columns: (row sorted) container of unsigned ints
// values: (row sorted) container of values that we see in M
// Ex:
// For matrix N = {
// {3.0, 0.0, 0.0, 4.0, 0.0, 1.0},
// {1.0, 0.0, 0.0, 0.0, 0.0, 0.0},
// {0.0, 0.0, 0.0, 2.0, 0.0, 0.0},
// }
// row_counts[0] = 3, [1] = 1, [2] = 1.
// CSR format will produce the following:
// row_offset: {0, 3, 4}, column = {0, 3, 5, 0, 3}, values = {3.0, 4.0, 1.0, 1.0, 2.0}
struct IntCompressedSparseRow;
typedef struct IntCompressedSparseRow IntCompressedSparseRow;
IntCompressedSparseRow* CreateIntCompressedSparseRow(int** twoDimensionalArray, unsigned numRows, unsigned numColumns);
int CompressedSparseRow_Get(IntCompressedSparseRow *csr, unsigned row, unsigned column);
void PrintCompressedSparseRow(IntCompressedSparseRow* csr);
int main()
{
int oneDimensionArray[5] = {0, 1, 2, 3, 4};
int *pOneDimensionalArray = NULL;
pOneDimensionalArray = (int*)calloc(5, sizeof(int));
for (int i = 0; i < 5; i++)
pOneDimensionalArray[i] = i;
for (int i = 0; i < 5; i++)
{
if (oneDimensionArray[i] != pOneDimensionalArray[i])
{
printf("One dimensional arrays aren't the same: at index %d oneDimensionArray has %d while pOneDimensionalArray has %d\n", i, oneDimensionArray[i], pOneDimensionalArray[i]);
break;
}
}
int twoDimensionalArray[3][3] = {
{0, 0, 1},
{2, 0, 0},
{1, 2, 0}
};
// First index goes into a row, second index goes into a column.
if (twoDimensionalArray[2][2] != 0)
printf("Two dimensional array [%d][%d] got %d wanted %d\n", 2, 2, twoDimensionalArray[2][2], 0);
if (twoDimensionalArray[1][0] != 2)
printf("Two dimensional array [%d][%d] got %d wanted %d\n", 1, 0, twoDimensionalArray[1][0], 2);
if (twoDimensionalArray[2][1] != 2)
printf("Two dimensional array [%d][%d] got %d wanted %d\n", 2, 1, twoDimensionalArray[2][1], 2);
if (twoDimensionalArray[0][2] != twoDimensionalArray[0][2])
printf("Two dimensional array [%d][%d] (value: %d) isn't same as [%d][%d] (value %d)\n", 0, 2, twoDimensionalArray[0][2], 2, 0, twoDimensionalArray[2][0]);
// Different ways to represent sparse matrices
// COO:
int** M = (int**)malloc(2 * sizeof(int*));
for (int i = 0; i < 2; i++)
M[i] = (int*)malloc(2 * sizeof(int));
M[0][0] = 1;
M[0][1] = 2;
M[1][0] = 0;
M[1][1] = 4;
IntMatrixCoordinates *coordinates = CreateIntMatrixCoordinate(M, 2, 2);
printf("Matrix:\n");
PrintMatrix(M, 2, 2);
printf("in COO format is:\n");
PrintMatrixCoordinates(coordinates);
// CSR:
int** N = (int**)malloc(3 * sizeof(int*));
for (int i = 0; i < 3; i++)
N[i] = (int*)malloc(6 * sizeof(int));
N[0][0] = 3;
N[0][3] = 4;
N[0][5] = 1;
N[1][0] = 1;
N[2][3] = 2;
IntCompressedSparseRow* csr = CreateIntCompressedSparseRow(N, 3, 6);
printf("Matrix:\n");
PrintMatrix(N, 3, 6);
printf("in CSR format is:\n");
PrintCompressedSparseRow(csr);
int lookup = CompressedSparseRow_Get(csr, 0, 0);
if (lookup != 3)
printf("CompressedSparseRow_Get(%d, %d) failed. Got %d. Wanted %d\n", 0, 0, lookup, 3);
lookup = CompressedSparseRow_Get(csr, 0, 3);
if (lookup != 4)
printf("CompressedSparseRow_Get(%d, %d) failed. Got %d. Wanted %d\n", 0, 3, lookup, 4);
lookup = CompressedSparseRow_Get(csr, 2, 3);
if (lookup != 2)
printf("CompressedSparseRow_Get(%d, %d) failed. Got %d. Wanted %d\n", 2, 3, lookup, 2);
free(pOneDimensionalArray);
for (int i = 0; i < 2; i++)
free(N[i]);
free(N);
for (int i = 0; i < 2; i++)
free(M[i]);
free(M);
free(coordinates);
return 0;
}
void PrintMatrix(int** twoDimensionalArray, unsigned numRows, unsigned numColumns)
{
for (int i = 0; i < numRows; i++)
{
for (int j = 0; j < numColumns; j++)
printf(" %d", twoDimensionalArray[i][j]);
printf("\n");
}
}
typedef struct IntMatrixCoordinates
{
int numTuples;
int *row;
int *column;
int *value;
} IntMatrixCoordinates;
IntMatrixCoordinates* CreateIntMatrixCoordinate(int** twoDimensionalArray, unsigned numRows, unsigned numColumns)
{
IntMatrixCoordinates *coordinates = (IntMatrixCoordinates*)calloc(1, sizeof(IntMatrixCoordinates));
long long rawValueCount = numRows * numColumns;
coordinates->row = (int*)calloc(rawValueCount, sizeof(int));
coordinates->column = (int*)calloc(rawValueCount, sizeof(int));
coordinates->value = (int*)calloc(rawValueCount, sizeof(int));
int currentTupleIndex = 0;
for (unsigned i = 0; i < numRows; i++)
{
for (unsigned j = 0; j < numColumns; j++)
{
if (twoDimensionalArray[i][j] == 0) continue;
coordinates->row[currentTupleIndex] = i;
coordinates->column[currentTupleIndex] = j;
coordinates->value[currentTupleIndex++] = twoDimensionalArray[i][j];
}
}
// Resize now that we know the max number of tuples
coordinates->numTuples = currentTupleIndex;
coordinates->row = (int*)realloc(coordinates->row, currentTupleIndex * sizeof(int));
coordinates->column = (int*)realloc(coordinates->column, currentTupleIndex * sizeof(int));
coordinates->value = (int*)realloc(coordinates->value, currentTupleIndex * sizeof(int));
return coordinates;
}
void PrintMatrixCoordinates(IntMatrixCoordinates* coordinates)
{
for (int i = 0; i < coordinates->numTuples; i++)
printf(" (%d %d %d)\n", coordinates->row[i], coordinates->column[i], coordinates->value[i]);
printf("\n");
}
typedef struct IntCompressedSparseRow
{
int numRowOffsets;
int numColumns;
int numValues;
int *row_offset;
int *column;
int *value;
} IntCompressedSparseRow;
IntCompressedSparseRow* CreateIntCompressedSparseRow(int** twoDimensionalArray, unsigned numRows, unsigned numColumns)
{
IntCompressedSparseRow* csr = (IntCompressedSparseRow*)calloc(1, sizeof(IntCompressedSparseRow));
long long rawValueCount = numRows * numColumns;
csr->row_offset = (int*)calloc(rawValueCount, sizeof(int));
csr->column = (int*)calloc(rawValueCount, sizeof(int));
csr->value = (int*)calloc(rawValueCount, sizeof(int));
for (unsigned i = 0; i < numRows; i++)
{
int rowOffsetMade = 0;
for (unsigned j = 0; j < numColumns; j++)
{
if (twoDimensionalArray[i][j] == 0) continue;
csr->column[csr->numColumns] = j;
if (!rowOffsetMade)
{
csr->row_offset[csr->numRowOffsets++] = csr->numColumns;
rowOffsetMade = 1;
}
csr->value[csr->numColumns++] = twoDimensionalArray[i][j];
csr->numValues = csr->numColumns;
}
}
// Resize now that we know the max number of tuples
csr->row_offset = (int*)realloc(csr->row_offset, csr->numRowOffsets * sizeof(int));
csr->column = (int*)realloc(csr->column, csr->numColumns * sizeof(int));
csr->value = (int*)realloc(csr->value, csr->numValues * sizeof(int));
return csr;
}
int CompressedSparseRow_Get(IntCompressedSparseRow *csr, unsigned row, unsigned column)
{
if (!csr || row >= csr->numRowOffsets || column >= csr->numColumns)
return -1;
int columnEnd = csr->numColumns;
if (row != csr->numRowOffsets - 1)
columnEnd = csr->row_offset[row+1];
for (int columnIndex = csr->column[csr->row_offset[row]]; columnIndex < columnEnd; columnIndex++)
if (csr->column[columnIndex] == column) return csr->value[columnIndex];
return -1;
}
void PrintCompressedSparseRow(IntCompressedSparseRow* csr)
{
printf(" row_offsets: {");
for (int i = 0; i < csr->numRowOffsets; i++)
printf(" %d", csr->row_offset[i]);
printf(" }\n");
printf(" columns: {");
for (int i = 0; i < csr->numColumns; i++)
printf(" %d", csr->column[i]);
printf(" }\n");
printf(" values: {");
for (int i = 0; i < csr->numValues; i++)
printf(" %d", csr->value[i]);
printf(" }\n\n");
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment