@swyxio
Last active April 16, 2024 19:52
Devin-coded version of @karpathy's train_gpt2.py ported to C, per his challenge https://x.com/swyx/status/1777496494448488541. This is where Devin stopped about 6 hours in; it is not complete, and I can prompt it to keep going.
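For reference, a minimal build command for the C files below (my assumption, not part of Devin's output; it assumes a file is saved as gpt2.c and depends only on the C standard library plus libm):

gcc -O2 -o gpt2 gpt2.c -lm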
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
// Constants for model dimensions, learning rate, etc.
#define VOCAB_SIZE 50257 // Example size, to be adjusted based on actual model
#define BLOCK_SIZE 1024 // Example size, to be adjusted based on actual model
#define N_LAYER 12 // Example size, to be adjusted based on actual model
#define N_HEAD 12 // Example size, to be adjusted based on actual model
#define N_EMBD 768 // Example size, to be adjusted based on actual model
#define LEARNING_RATE 0.001
// Data structures for model configuration and layers
typedef struct {
int vocab_size;
int block_size;
int n_layer;
int n_head;
int n_embd;
} GPTConfig;
typedef struct {
// Embedding layers, attention blocks, MLP blocks, etc.
float **embedding_weights; // Example for embedding weights
// Other components to be added
float ***queries;
float ***keys;
float ***values;
// Weights for queries, keys, and values
float **query_weights;
float **key_weights;
float **value_weights;
GPTConfig config; // Added config here
float **token_embeddings; // Embeddings for tokens
float **position_embeddings; // Embeddings for positions
float *ln_gamma; // Layer normalization gamma parameter
float *ln_beta; // Layer normalization beta parameter
float *mlp_weights_1; // Weights for the first MLP layer
float *mlp_weights_2; // Weights for the second MLP layer
} GPTModel;
// Function prototypes
void initialize_model(GPTModel *model, GPTConfig config);
void forward_pass(GPTModel *model, int *input_indices, float *output);
void backward_pass(GPTModel *model, float *grad_output, float *grad_input);
void update_weights(GPTModel *model);
void matrix_multiply(float *A, float *B, float *C, int n, int m, int k);
float gelu_activation(float x);
void test_matrix_multiply();
void test_gelu_activation();
void initialize_attention_matrices(GPTModel *model, GPTConfig config);
void compute_queries_keys_values(float *input, GPTModel *model, float ***queries, float ***keys, float ***values);
void test_initialize_attention_matrices();
void test_compute_queries_keys_values();
void free_attention_matrices(GPTModel *model, GPTConfig config);
void initialize_embeddings(GPTModel *model, GPTConfig config);
void free_model(GPTModel *model, GPTConfig config); // Prototype for new function to free model memory
void test_embeddings(); // Prototype for new unit test function
void layer_normalize(float **inputs, float *gamma, float *beta, int n, int m, float epsilon); // New function prototype
void test_layer_normalize(); // New unit test function prototype
void softmax(float *input, float *output, int length); // New function prototype for softmax
void dot_product_attention(float *queries, float *keys, float *values, float *output, int n_head, int block_size, int n_embd); // New function prototype for dot-product attention
void mlp_block(float *input, float *output, int block_size, int n_embd, float *mlp_weights_1, float *mlp_weights_2); // New function prototype for MLP block
// Function to flatten 3D attention matrices into 1D arrays
void flatten_attention_matrices(float ***matrices, float *flat_array, int n_head, int block_size, int n_embd_per_head) {
for (int h = 0; h < n_head; ++h) {
for (int i = 0; i < block_size; ++i) {
for (int j = 0; j < n_embd_per_head; ++j) {
flat_array[h * block_size * n_embd_per_head + i * n_embd_per_head + j] = matrices[h][i][j];
}
}
}
}
// Matrix multiplication: C[n x m] = A[n x k] * B[k x m], all stored row-major
void matrix_multiply(float *A, float *B, float *C, int n, int m, int k) {
// Ensure pointers are valid and dimensions are positive
assert(A != NULL && B != NULL && C != NULL);
assert(n > 0 && m > 0 && k > 0);
// Initialize C to zero
for (int i = 0; i < n * m; ++i) {
C[i] = 0;
}
// Perform matrix multiplication
for (int i = 0; i < n; ++i) {
for (int j = 0; j < k; ++j) { // j runs over the shared inner dimension k
for (int p = 0; p < m; ++p) { // p runs over the columns of B (and of C)
C[i * m + p] += A[i * k + j] * B[j * m + p]; // accumulate A[i][j] * B[j][p] into C[i][p]
}
}
}
}
// GELU activation function
float gelu_activation(float x) {
return 0.5 * x * (1.0 + tanh(sqrt(2.0 / M_PI) * (x + 0.044715 * pow(x, 3))));
}
// Softmax function
void softmax(float *input, float *output, int length) {
float max = input[0];
for (int i = 1; i < length; ++i) {
if (input[i] > max) {
max = input[i];
}
}
float sum = 0.0;
for (int i = 0; i < length; ++i) {
output[i] = exp(input[i] - max);
sum += output[i];
}
for (int i = 0; i < length; ++i) {
output[i] /= sum;
}
}
// Dot-product attention, computed independently per head
void dot_product_attention(float *queries, float *keys, float *values, float *output, int n_head, int block_size, int n_embd) {
// Temporary storage for the attention scores
float *attention_scores = (float*)malloc(block_size * block_size * sizeof(float));
for (int h = 0; h < n_head; ++h) {
// Compute the dot product between queries and keys for each head
matrix_multiply(queries + h * block_size * (n_embd / n_head), keys + h * block_size * (n_embd / n_head), attention_scores, block_size, block_size, n_embd / n_head);
// Apply softmax to the attention scores
for (int i = 0; i < block_size; ++i) {
softmax(attention_scores + i * block_size, attention_scores + i * block_size, block_size);
}
// Multiply by values to get the final attention output for this head
matrix_multiply(attention_scores, values + h * block_size * (n_embd / n_head), output + h * block_size * (n_embd / n_head), block_size, n_embd / n_head, block_size);
}
// Free the temporary storage for attention scores
free(attention_scores);
}
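// GPT-2's attention also scales the raw scores by 1/sqrt(head_dim) and applies a causal mask
// before the softmax; neither step appears in dot_product_attention above. A minimal sketch of
// that pre-softmax step, assuming attention_scores is a row-major [block_size x block_size]
// buffer for a single head (the helper name and layout are illustrative assumptions):
void scale_and_mask_scores(float *attention_scores, int block_size, int head_dim) {
    float scale = 1.0f / sqrtf((float)head_dim);
    for (int i = 0; i < block_size; ++i) {
        for (int j = 0; j < block_size; ++j) {
            if (j <= i) {
                // Position j is visible to position i: keep the score, scaled
                attention_scores[i * block_size + j] *= scale;
            } else {
                // Future position: push the score to a large negative value so softmax gives ~0
                attention_scores[i * block_size + j] = -1e9f;
            }
        }
    }
}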
// MLP block: linear layer, GELU activation, then a linear projection back to n_embd
void mlp_block(float *input, float *output, int block_size, int n_embd, float *mlp_weights_1, float *mlp_weights_2) {
// Define the intermediate size for the MLP
int intermediate_size = n_embd * 4; // GPT-2 uses a 4x expansion in the MLP (3072 for n_embd = 768)
float *intermediate_output = (float*)malloc(block_size * intermediate_size * sizeof(float));
// First linear layer
matrix_multiply(input, mlp_weights_1, intermediate_output, block_size, intermediate_size, n_embd);
// Apply GELU activation
for (int i = 0; i < block_size * intermediate_size; ++i) {
intermediate_output[i] = gelu_activation(intermediate_output[i]);
}
// Second linear layer to project back to n_embd dimensions
matrix_multiply(intermediate_output, mlp_weights_2, output, block_size, n_embd, intermediate_size);
// Free the intermediate output
free(intermediate_output);
}
// Initialize the model's weights, embeddings, and layer-norm parameters
void initialize_model(GPTModel *model, GPTConfig config) {
// Example of allocating memory for the embedding layer and initializing weights
// Assuming embedding weights are a 2D array with dimensions [vocab_size, n_embd]
model->embedding_weights = (float**)malloc(config.vocab_size * sizeof(float*));
for (int i = 0; i < config.vocab_size; ++i) {
model->embedding_weights[i] = (float*)malloc(config.n_embd * sizeof(float));
for (int j = 0; j < config.n_embd; ++j) {
// Initialize weights with random values, for example using a simple normal distribution
model->embedding_weights[i][j] = (float)rand() / (float)RAND_MAX;
}
}
// Allocate and initialize weights for queries, keys, and values
model->query_weights = (float**)malloc(config.n_head * sizeof(float*));
model->key_weights = (float**)malloc(config.n_head * sizeof(float*));
model->value_weights = (float**)malloc(config.n_head * sizeof(float*));
for (int h = 0; h < config.n_head; ++h) {
model->query_weights[h] = (float*)malloc((config.n_embd / config.n_head) * config.n_embd * sizeof(float));
model->key_weights[h] = (float*)malloc((config.n_embd / config.n_head) * config.n_embd * sizeof(float));
model->value_weights[h] = (float*)malloc((config.n_embd / config.n_head) * config.n_embd * sizeof(float));
for (int i = 0; i < (config.n_embd / config.n_head) * config.n_embd; ++i) {
model->query_weights[h][i] = (float)rand() / (float)RAND_MAX;
model->key_weights[h][i] = (float)rand() / (float)RAND_MAX;
model->value_weights[h][i] = (float)rand() / (float)RAND_MAX;
}
}
// Initialize ln_gamma and ln_beta
model->ln_gamma = (float*)malloc(config.n_embd * sizeof(float));
model->ln_beta = (float*)malloc(config.n_embd * sizeof(float));
for (int i = 0; i < config.n_embd; ++i) {
model->ln_gamma[i] = 1.0; // Typically initialized to ones
model->ln_beta[i] = 0.0; // Typically initialized to zeros
}
// Initialize MLP weights
int intermediate_size = config.n_embd * 4; // This can be a different size
model->mlp_weights_1 = (float*)malloc(config.n_embd * intermediate_size * sizeof(float));
model->mlp_weights_2 = (float*)malloc(intermediate_size * config.n_embd * sizeof(float));
// Random initialization of MLP weights (example)
for (int i = 0; i < config.n_embd * intermediate_size; ++i) {
model->mlp_weights_1[i] = (float)rand() / (float)RAND_MAX;
model->mlp_weights_2[i] = (float)rand() / (float)RAND_MAX;
}
// Allocate and initialize token embeddings
model->token_embeddings = (float**)malloc(config.vocab_size * sizeof(float*));
for (int i = 0; i < config.vocab_size; ++i) {
model->token_embeddings[i] = (float*)malloc(config.n_embd * sizeof(float));
for (int j = 0; j < config.n_embd; ++j) {
model->token_embeddings[i][j] = (float)rand() / (float)RAND_MAX;
}
}
// Allocate and initialize position embeddings
model->position_embeddings = (float**)malloc(config.block_size * sizeof(float*));
for (int i = 0; i < config.block_size; ++i) {
model->position_embeddings[i] = (float*)malloc(config.n_embd * sizeof(float));
for (int j = 0; j < config.n_embd; ++j) {
model->position_embeddings[i][j] = (float)rand() / (float)RAND_MAX;
}
}
// Note: This is a simplified example. In practice you would use a proper random initialization
// (e.g., Xavier initialization; a hedged sketch follows after this function) and also consider biases, layer normalization parameters, etc.
}
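// A hedged sketch of the Xavier-style (Glorot) uniform initialization mentioned in the note above,
// assuming a flat row-major [fan_in x fan_out] weight buffer (the helper name and layout are
// illustrative assumptions, not part of the code above):
void xavier_uniform_init(float *weights, int fan_in, int fan_out) {
    // Glorot uniform: draw from U(-limit, limit) with limit = sqrt(6 / (fan_in + fan_out))
    float limit = sqrtf(6.0f / (float)(fan_in + fan_out));
    for (int i = 0; i < fan_in * fan_out; ++i) {
        float u = (float)rand() / (float)RAND_MAX; // uniform in [0, 1]
        weights[i] = (2.0f * u - 1.0f) * limit;    // rescale to [-limit, limit]
    }
}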
// Initialize attention matrices for queries, keys, and values
void initialize_attention_matrices(GPTModel *model, GPTConfig config) {
// Assuming queries, keys, and values are 3D arrays with dimensions [n_head, block_size, n_embd/n_head]
// Allocate memory for queries, keys, and values
model->queries = (float***)malloc(config.n_head * sizeof(float**));
model->keys = (float***)malloc(config.n_head * sizeof(float**));
model->values = (float***)malloc(config.n_head * sizeof(float**));
for (int h = 0; h < config.n_head; ++h) {
model->queries[h] = (float**)malloc(config.block_size * sizeof(float*));
model->keys[h] = (float**)malloc(config.block_size * sizeof(float*));
model->values[h] = (float**)malloc(config.block_size * sizeof(float*));
for (int s = 0; s < config.block_size; ++s) {
model->queries[h][s] = (float*)calloc(config.block_size * (config.n_embd / config.n_head), sizeof(float));
model->keys[h][s] = (float*)calloc(config.block_size * (config.n_embd / config.n_head), sizeof(float));
model->values[h][s] = (float*)calloc(config.block_size * (config.n_embd / config.n_head), sizeof(float));
}
}
}
// Compute queries, keys, and values for the self-attention mechanism
void compute_queries_keys_values(float *input, GPTModel *model, float ***queries, float ***keys, float ***values) {
printf("Entering compute_queries_keys_values\n");
printf("Model config - n_head: %d, block_size: %d, n_embd: %d\n", model->config.n_head, model->config.block_size, model->config.n_embd);
// Use the model's weights to compute queries, keys, and values from the input
// This will involve matrix multiplication and addition operations
for (int h = 0; h < model->config.n_head; ++h) {
printf("Matrix dimensions for queries (head %d): (%d, %d) * (%d, %d)\n", h, model->config.block_size, model->config.n_embd / model->config.n_head, model->config.n_embd, model->config.n_embd / model->config.n_head);
printf("Computing queries for head %d\n", h);
matrix_multiply(model->query_weights[h], input, (*queries)[h], model->config.block_size, model->config.n_embd / model->config.n_head, model->config.n_embd);
printf("Matrix dimensions for keys (head %d): (%d, %d) * (%d, %d)\n", h, model->config.block_size, model->config.n_embd / model->config.n_head, model->config.n_embd, model->config.n_embd / model->config.n_head);
printf("Computing keys for head %d\n", h);
matrix_multiply(model->key_weights[h], input, (*keys)[h], model->config.block_size, model->config.n_embd / model->config.n_head, model->config.n_embd);
printf("Matrix dimensions for values (head %d): (%d, %d) * (%d, %d)\n", h, model->config.block_size, model->config.n_embd / model->config.n_head, model->config.n_embd, model->config.n_embd / model->config.n_head);
printf("Computing values for head %d\n", h);
matrix_multiply(model->value_weights[h], input, (*values)[h], model->config.block_size, model->config.n_embd / model->config.n_head, model->config.n_embd);
}
printf("Exiting compute_queries_keys_values\n");
}
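// For reference, a hedged sketch of how a single head's query projection would line up with
// matrix_multiply's C[n x m] = A[n x k] * B[k x m] convention, assuming input is a row-major
// [block_size x n_embd] buffer and w_q is a row-major [n_embd x head_dim] buffer (buffer names
// and layouts here are illustrative assumptions, not the layout used by the code above):
void project_queries_single_head(float *input, float *w_q, float *q_out, int block_size, int n_embd, int head_dim) {
    // Q = X * W_q: block_size rows, head_dim columns, n_embd as the shared inner dimension
    matrix_multiply(input, w_q, q_out, block_size, head_dim, n_embd);
}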
// Unit test for matrix multiplication
void test_matrix_multiply() {
// Create test matrices A, B, and C
float A[2][3] = {{1, 2, 3}, {4, 5, 6}};
float B[3][2] = {{7, 8}, {9, 10}, {11, 12}};
float C[2][2] = {0};
// Expected result of multiplication
float expected[2][2] = {{58, 64}, {139, 154}};
// Perform matrix multiplication
matrix_multiply(&A[0][0], &B[0][0], &C[0][0], 2, 2, 3);
// Assert each element of the result matrix C is as expected
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 2; ++j) {
assert(fabs(C[i][j] - expected[i][j]) < 1e-5);
}
}
}
// Unit test for GELU activation
void test_gelu_activation() {
// Test input and expected output
float input = 0.5;
float expected_output = 0.3457; // Approximate expected value
printf("GELU activation input: %f\n", input);
printf("Expected output: %f\n", expected_output);
float output = gelu_activation(input);
printf("Actual output: %f\n", output);
printf("Difference: %f\n", fabs(output - expected_output));
// Assert the output is as expected
assert(fabs(output - expected_output) < 1e-4);
}
// Unit test for initializing attention matrices
void test_initialize_attention_matrices() {
GPTConfig config = {VOCAB_SIZE, BLOCK_SIZE, N_LAYER, N_HEAD, N_EMBD};
GPTModel model;
initialize_model(&model, config); // Assuming this also initializes attention matrices
initialize_attention_matrices(&model, config);
// Check if memory allocation was successful and dimensions are correct
assert(model.queries != NULL);
assert(model.keys != NULL);
assert(model.values != NULL);
for (int h = 0; h < config.n_head; ++h) {
assert(model.queries[h] != NULL);
assert(model.keys[h] != NULL);
assert(model.values[h] != NULL);
for (int s = 0; s < config.block_size; ++s) {
assert(model.queries[h][s] != NULL);
assert(model.keys[h][s] != NULL);
assert(model.values[h][s] != NULL);
}
}
// Clean up
free_attention_matrices(&model, config);
}
// Unit test for computing queries, keys, and values
void test_compute_queries_keys_values() {
GPTConfig config = {VOCAB_SIZE, BLOCK_SIZE, N_LAYER, N_HEAD, N_EMBD};
GPTModel model;
model.config = config; // Set the model configuration
initialize_model(&model, config); // Assuming this also initializes attention matrices
initialize_attention_matrices(&model, config);
// Ensure that the weights are not NULL
assert(model.query_weights != NULL);
assert(model.key_weights != NULL);
assert(model.value_weights != NULL);
for (int h = 0; h < config.n_head; ++h) {
assert(model.query_weights[h] != NULL);
assert(model.key_weights[h] != NULL);
assert(model.value_weights[h] != NULL);
}
// Create mock input and model weights for testing
float *input = (float*)malloc(config.block_size * config.n_embd * sizeof(float));
// Initialize input with some values
for (int i = 0; i < config.block_size * config.n_embd; ++i) {
input[i] = i;
}
// Assuming model weights are initialized in initialize_model
compute_queries_keys_values(input, &model, model.queries, model.keys, model.values);
// Check if queries, keys, and values are computed correctly
// This would involve checking the results of the matrix multiplication operations
// ...
// Clean up
free(input);
free_attention_matrices(&model, config);
}
// Function to free attention matrices
void free_attention_matrices(GPTModel *model, GPTConfig config) {
for (int h = 0; h < config.n_head; ++h) {
for (int s = 0; s < config.block_size; ++s) {
free(model->queries[h][s]);
free(model->keys[h][s]);
free(model->values[h][s]);
}
free(model->queries[h]);
free(model->keys[h]);
free(model->values[h]);
}
free(model->queries);
free(model->keys);
free(model->values);
}
// New function to initialize embeddings
void initialize_embeddings(GPTModel *model, GPTConfig config) {
// Allocate memory for token embeddings
model->token_embeddings = (float**)malloc(config.vocab_size * sizeof(float*));
for (int i = 0; i < config.vocab_size; ++i) {
model->token_embeddings[i] = (float*)malloc(config.n_embd * sizeof(float));
// Initialize weights with random values
for (int j = 0; j < config.n_embd; ++j) {
model->token_embeddings[i][j] = (float)rand() / (float)RAND_MAX;
}
}
// Allocate memory for position embeddings
model->position_embeddings = (float**)malloc(config.block_size * sizeof(float*));
for (int i = 0; i < config.block_size; ++i) {
model->position_embeddings[i] = (float*)malloc(config.n_embd * sizeof(float));
// Initialize weights with random values
for (int j = 0; j < config.n_embd; ++j) {
model->position_embeddings[i][j] = (float)rand() / (float)RAND_MAX;
}
}
}
// Forward pass: apply embeddings, self-attention, the MLP block, and layer normalization
void forward_pass(GPTModel *model, int *input_indices, float *output) {
// Allocate memory for the output array if not already allocated
if (output == NULL) {
output = (float*)malloc(model->config.block_size * model->config.n_embd * sizeof(float));
assert(output != NULL); // Ensure memory allocation was successful
}
// Apply token and position embeddings to input indices
for (int i = 0; i < model->config.block_size; ++i) {
int index = input_indices[i];
assert(index >= 0 && index < model->config.vocab_size);
assert(model->token_embeddings != NULL);
assert(model->position_embeddings != NULL);
printf("Embedding size (n_embd): %d\n", model->config.n_embd);
for (int j = 0; j < model->config.n_embd; ++j) {
assert(model->token_embeddings[index] != NULL);
assert(model->position_embeddings[i] != NULL);
printf("i: %d, j: %d, index: %d, token_emb_ptr: %p, pos_emb_ptr: %p\n", i, j, index, (void*)model->token_embeddings[index], (void*)model->position_embeddings[i]);
if (j >= model->config.n_embd) {
fprintf(stderr, "Error: Variable 'j' exceeded bounds: %d\n", j);
abort();
}
output[i * model->config.n_embd + j] = model->token_embeddings[index][j] + model->position_embeddings[i][j];
}
}
// Flatten the 3D arrays into 1D arrays for dot_product_attention
float *queries_flat = (float*)malloc(model->config.n_head * model->config.block_size * (model->config.n_embd / model->config.n_head) * sizeof(float));
float *keys_flat = (float*)malloc(model->config.n_head * model->config.block_size * (model->config.n_embd / model->config.n_head) * sizeof(float));
float *values_flat = (float*)malloc(model->config.n_head * model->config.block_size * (model->config.n_embd / model->config.n_head) * sizeof(float));
float *self_attention_output_flat = (float*)malloc(model->config.n_head * model->config.block_size * (model->config.n_embd / model->config.n_head) * sizeof(float));
// Call self-attention mechanism
compute_queries_keys_values(output, model, model->queries, model->keys, model->values);
flatten_attention_matrices(model->queries, queries_flat, model->config.n_head, model->config.block_size, model->config.n_embd / model->config.n_head);
flatten_attention_matrices(model->keys, keys_flat, model->config.n_head, model->config.block_size, model->config.n_embd / model->config.n_head);
flatten_attention_matrices(model->values, values_flat, model->config.n_head, model->config.block_size, model->config.n_embd / model->config.n_head);
dot_product_attention(queries_flat, keys_flat, values_flat, self_attention_output_flat, model->config.n_head, model->config.block_size, model->config.n_embd);
// Flatten the 3D self_attention_output into a 1D array for mlp_block
float *mlp_output_flat = (float*)malloc(model->config.block_size * model->config.n_embd * sizeof(float));
// Call MLP block
mlp_block(self_attention_output_flat, mlp_output_flat, model->config.block_size, model->config.n_embd, model->mlp_weights_1, model->mlp_weights_2);
// Create a temporary 2D array for layer normalization
float **mlp_output_2d = (float**)malloc(model->config.block_size * sizeof(float*));
for (int i = 0; i < model->config.block_size; ++i) {
mlp_output_2d[i] = &mlp_output_flat[i * model->config.n_embd];
}
// Apply final layer normalization to the output of the MLP block
layer_normalize(mlp_output_2d, model->ln_gamma, model->ln_beta, model->config.block_size, model->config.n_embd, 1e-5);
// Free the temporary 2D array
free(mlp_output_2d);
// Copy the final output to the output variable
for (int i = 0; i < model->config.block_size; ++i) {
for (int j = 0; j < model->config.n_embd; ++j) {
output[i * model->config.n_embd + j] = mlp_output_flat[i * model->config.n_embd + j];
}
}
// Free intermediate variables
free(queries_flat);
free(keys_flat);
free(values_flat);
free(self_attention_output_flat);
free(mlp_output_flat);
}
// Unit test for token and position embeddings
void test_embeddings() {
GPTConfig config = {VOCAB_SIZE, BLOCK_SIZE, N_LAYER, N_HEAD, N_EMBD};
GPTModel model;
initialize_model(&model, config); // Initialize the model with embeddings
// Create mock input indices (for simplicity, use indices 0 to block_size-1)
int input_indices[BLOCK_SIZE];
for (int i = 0; i < BLOCK_SIZE; ++i) {
input_indices[i] = i;
}
// Allocate memory for the output of the forward pass
float *output = (float*)malloc(BLOCK_SIZE * N_EMBD * sizeof(float));
// Apply embeddings using the forward pass
forward_pass(&model, input_indices, output);
// Check if the output contains the correct values
for (int i = 0; i < BLOCK_SIZE; ++i) {
for (int j = 0; j < N_EMBD; ++j) {
float expected_value = model.token_embeddings[input_indices[i]][j] + model.position_embeddings[i][j];
assert(fabs(output[i * N_EMBD + j] - expected_value) < 1e-5);
}
}
// Clean up
free(output);
free_model(&model, config); // This function will need to be implemented to free all allocated memory in the model
}
// Function to free the model
void free_model(GPTModel *model, GPTConfig config) {
// Free token and position embeddings
for (int i = 0; i < config.vocab_size; ++i) {
free(model->token_embeddings[i]);
}
free(model->token_embeddings);
for (int i = 0; i < config.block_size; ++i) {
free(model->position_embeddings[i]);
}
free(model->position_embeddings);
// Free queries, keys, and values
free_attention_matrices(model, config);
// Free weights for queries, keys, and values
for (int h = 0; h < config.n_head; ++h) {
free(model->query_weights[h]);
free(model->key_weights[h]);
free(model->value_weights[h]);
}
free(model->query_weights);
free(model->key_weights);
free(model->value_weights);
// Free layer normalization parameters
free(model->ln_gamma);
free(model->ln_beta);
// Free MLP weights
free(model->mlp_weights_1);
free(model->mlp_weights_2);
// Free any other dynamically allocated memory within the model
// ...
}
// Layer normalization: for each row, y_j = gamma[j] * (x_j - mean) / sqrt(variance + epsilon) + beta[j]
void layer_normalize(float **inputs, float *gamma, float *beta, int n, int m, float epsilon) {
for (int i = 0; i < n; ++i) {
float sum = 0.0;
for (int j = 0; j < m; ++j) {
sum += inputs[i][j];
}
float mean = sum / m;
float variance_sum = 0.0;
for (int j = 0; j < m; ++j) {
variance_sum += (inputs[i][j] - mean) * (inputs[i][j] - mean);
}
float variance = variance_sum / m;
for (int j = 0; j < m; ++j) {
inputs[i][j] = (inputs[i][j] - mean) / sqrt(variance + epsilon);
inputs[i][j] = inputs[i][j] * gamma[j] + beta[j];
}
}
}
// Unit test for layer normalization
void test_layer_normalize() {
int n = 2; // Number of input vectors
int m = 3; // Number of features
float epsilon = 1e-5;
float **inputs = (float**)malloc(n * sizeof(float*));
float *gamma = (float*)malloc(m * sizeof(float));
float *beta = (float*)malloc(m * sizeof(float));
// Initialize each input row with simple values, gamma to ones, and beta to zeros
for (int i = 0; i < n; ++i) {
inputs[i] = (float*)malloc(m * sizeof(float));
for (int j = 0; j < m; ++j) {
inputs[i][j] = (float)(i * m + j);
}
}
for (int j = 0; j < m; ++j) {
gamma[j] = 1.0;
beta[j] = 0.0;
}
// Call layer_normalize
layer_normalize(inputs, gamma, beta, n, m, epsilon);
// Check if the output is normalized correctly
// ...
// Clean up
free(gamma);
free(beta);
for (int i = 0; i < n; ++i) {
free(inputs[i]);
}
free(inputs);
}
// Add the new unit test to the main function
int main(int argc, char *argv[]) {
// Run unit tests
test_matrix_multiply();
test_gelu_activation();
test_initialize_attention_matrices();
test_compute_queries_keys_values();
test_embeddings(); // New unit test for embeddings
test_layer_normalize(); // New unit test for layer normalization
// Rest of the main function...
// ...
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
// Constants for model dimensions, learning rate, etc.
#define VOCAB_SIZE 50257 // Example size, to be adjusted based on actual model
#define BLOCK_SIZE 1024 // Example size, to be adjusted based on actual model
#define N_LAYER 12 // Example size, to be adjusted based on actual model
#define N_HEAD 12 // Example size, to be adjusted based on actual model
#define N_EMBD 768 // Example size, to be adjusted based on actual model
#define LEARNING_RATE 0.001
// Data structures for model configuration and layers
typedef struct {
int vocab_size;
int block_size;
int n_layer;
int n_head;
int n_embd;
} GPTConfig;
typedef struct {
// Embedding layers, attention blocks, MLP blocks, etc.
float **embedding_weights; // Example for embedding weights
// Other components to be added
float ***queries;
float ***keys;
float ***values;
// Weights for queries, keys, and values
float **query_weights;
float **key_weights;
float **value_weights;
GPTConfig config; // Added config here
float **token_embeddings; // Embeddings for tokens
float **position_embeddings; // Embeddings for positions
float *ln_gamma; // Layer normalization gamma parameter
float *ln_beta; // Layer normalization beta parameter
float *mlp_weights_1; // Weights for the first MLP layer
float *mlp_weights_2; // Weights for the second MLP layer
} GPTModel;
// Function prototypes
void initialize_model(GPTModel *model, GPTConfig config);
void forward_pass(GPTModel *model, int *input_indices, float **output);
void backward_pass(GPTModel *model, float *grad_output, float *grad_input);
void update_weights(GPTModel *model);
void matrix_multiply(float *A, float *B, float *C, int n, int m, int k);
float gelu_activation(float x);
void test_matrix_multiply();
void test_gelu_activation();
void initialize_attention_matrices(GPTModel *model, GPTConfig config);
void compute_queries_keys_values(float *input, GPTModel *model, float ***queries, float ***keys, float ***values);
void test_initialize_attention_matrices();
void test_compute_queries_keys_values();
void free_attention_matrices(GPTModel *model, GPTConfig config);
void initialize_embeddings(GPTModel *model, GPTConfig config);
void free_model(GPTModel *model, GPTConfig config); // Prototype for new function to free model memory
void test_embeddings(); // Prototype for new unit test function
void layer_normalize(float **inputs, float *gamma, float *beta, int n, int m, float epsilon); // New function prototype
void test_layer_normalize(); // New unit test function prototype
void softmax(float *input, float *output, int length); // New function prototype for softmax
void dot_product_attention(float *queries, float *keys, float *values, float *output, int n_head, int block_size, int n_embd); // New function prototype for dot-product attention
void mlp_block(float *input, float *output, int block_size, int n_embd, float *mlp_weights_1, float *mlp_weights_2); // New function prototype for MLP block
// Function to flatten 3D attention matrices into 1D arrays
void flatten_attention_matrices(float ***matrices, float *flat_array, int n_head, int block_size, int n_embd_per_head) {
for (int h = 0; h < n_head; ++h) {
for (int i = 0; i < block_size; ++i) {
for (int j = 0; j < n_embd_per_head; ++j) {
flat_array[h * block_size * n_embd_per_head + i * n_embd_per_head + j] = matrices[h][i][j];
}
}
}
}
// Matrix multiplication function
void matrix_multiply(float *A, float *B, float *C, int n, int m, int k) {
// Ensure that the pointers are not NULL and dimensions are greater than zero
if (A == NULL || B == NULL || C == NULL) {
fprintf(stderr, "Null pointer provided to matrix_multiply function\n");
exit(EXIT_FAILURE);
}
if (n <= 0 || m <= 0 || k <= 0) {
fprintf(stderr, "Invalid dimensions provided to matrix_multiply function\n");
exit(EXIT_FAILURE);
}
// Diagnostic print statements
printf("Matrix A address: %p, Matrix B address: %p, Matrix C address: %p\n", (void*)A, (void*)B, (void*)C);
printf("Matrix dimensions - n: %d, m: %d, k: %d\n", n, m, k);
// Initialize C to zero
for (int i = 0; i < n * m; ++i) {
C[i] = 0;
}
// Perform matrix multiplication
for (int i = 0; i < n; ++i) {
for (int j = 0; j < k; ++j) {
for (int p = 0; p < m; ++p) {
C[i * m + p] += A[i * k + j] * B[j * m + p];
}
}
}
}
// GELU activation function
float gelu_activation(float x) {
return 0.5 * x * (1.0 + tanh(sqrt(2.0 / M_PI) * (x + 0.044715 * pow(x, 3))));
}
// Softmax function
void softmax(float *input, float *output, int length) {
float max = input[0];
for (int i = 1; i < length; ++i) {
if (input[i] > max) {
max = input[i];
}
}
float sum = 0.0;
for (int i = 0; i < length; ++i) {
output[i] = exp(input[i] - max);
sum += output[i];
}
for (int i = 0; i < length; ++i) {
output[i] /= sum;
}
}
// Dot-product attention, computed independently per head
void dot_product_attention(float *queries, float *keys, float *values, float *output, int n_head, int block_size, int n_embd) {
// Temporary storage for the attention scores
float *attention_scores = (float*)malloc(block_size * block_size * sizeof(float));
for (int h = 0; h < n_head; ++h) {
// Compute the dot product between queries and keys for each head
matrix_multiply(queries + h * block_size * (n_embd / n_head), keys + h * block_size * (n_embd / n_head), attention_scores, block_size, block_size, n_embd / n_head);
// Apply softmax to the attention scores
for (int i = 0; i < block_size; ++i) {
softmax(attention_scores + i * block_size, attention_scores + i * block_size, block_size);
}
// Multiply by values to get the final attention output for this head
matrix_multiply(attention_scores, values + h * block_size * (n_embd / n_head), output + h * block_size * (n_embd / n_head), block_size, n_embd / n_head, block_size);
}
// Free the temporary storage for attention scores
free(attention_scores);
}
// MLP block: linear layer, GELU activation, then a linear projection back to n_embd
void mlp_block(float *input, float *output, int block_size, int n_embd, float *mlp_weights_1, float *mlp_weights_2) {
// Define the intermediate size for the MLP
int intermediate_size = n_embd * 4; // This can be a different size
float *intermediate_output = (float*)malloc(block_size * intermediate_size * sizeof(float));
// First linear layer
matrix_multiply(input, mlp_weights_1, intermediate_output, block_size, intermediate_size, n_embd);
// Apply GELU activation
for (int i = 0; i < block_size * intermediate_size; ++i) {
intermediate_output[i] = gelu_activation(intermediate_output[i]);
}
// Second linear layer to project back to n_embd dimensions
matrix_multiply(intermediate_output, mlp_weights_2, output, block_size, n_embd, intermediate_size);
// Free the intermediate output
free(intermediate_output);
}
// Check for successful allocation and handle errors
#define CHECK_ALLOCATION(ptr) if ((ptr) == NULL) { \
fprintf(stderr, "Memory allocation failed\n"); \
free_model(model, config); \
exit(EXIT_FAILURE); \
}
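// Note: this macro expects `model` and `config` to be in scope at the call site, and it calls
// free_model on a model that may only be partially initialized at that point.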
// Initialize the model's weights, embeddings, and layer-norm parameters
void initialize_model(GPTModel *model, GPTConfig config) {
// Example of allocating memory for the embedding layer and initializing weights
// Assuming embedding weights are a 2D array with dimensions [vocab_size, n_embd]
model->embedding_weights = (float**)malloc(config.vocab_size * sizeof(float*));
CHECK_ALLOCATION(model->embedding_weights);
for (int i = 0; i < config.vocab_size; ++i) {
model->embedding_weights[i] = (float*)malloc(config.n_embd * sizeof(float));
CHECK_ALLOCATION(model->embedding_weights[i]);
for (int j = 0; j < config.n_embd; ++j) {
// Initialize weights with random values, for example using a simple normal distribution
model->embedding_weights[i][j] = (float)rand() / (float)RAND_MAX;
}
}
// Allocate and initialize weights for queries, keys, and values
model->query_weights = (float**)malloc(config.n_head * sizeof(float*));
CHECK_ALLOCATION(model->query_weights);
model->key_weights = (float**)malloc(config.n_head * sizeof(float*));
CHECK_ALLOCATION(model->key_weights);
model->value_weights = (float**)malloc(config.n_head * sizeof(float*));
CHECK_ALLOCATION(model->value_weights);
for (int h = 0; h < config.n_head; ++h) {
model->query_weights[h] = (float*)malloc(config.n_embd * (config.n_embd / config.n_head) * sizeof(float));
CHECK_ALLOCATION(model->query_weights[h]);
model->key_weights[h] = (float*)malloc(config.n_embd * (config.n_embd / config.n_head) * sizeof(float));
CHECK_ALLOCATION(model->key_weights[h]);
model->value_weights[h] = (float*)malloc(config.n_embd * (config.n_embd / config.n_head) * sizeof(float));
CHECK_ALLOCATION(model->value_weights[h]);
for (int i = 0; i < config.n_embd * (config.n_embd / config.n_head); ++i) {
model->query_weights[h][i] = (float)rand() / (float)RAND_MAX;
model->key_weights[h][i] = (float)rand() / (float)RAND_MAX;
model->value_weights[h][i] = (float)rand() / (float)RAND_MAX;
}
}
// Initialize ln_gamma and ln_beta
model->ln_gamma = (float*)malloc(config.n_embd * sizeof(float));
CHECK_ALLOCATION(model->ln_gamma);
model->ln_beta = (float*)malloc(config.n_embd * sizeof(float));
CHECK_ALLOCATION(model->ln_beta);
for (int i = 0; i < config.n_embd; ++i) {
model->ln_gamma[i] = 1.0; // Typically initialized to ones
model->ln_beta[i] = 0.0; // Typically initialized to zeros
}
// Initialize MLP weights
int intermediate_size = config.n_embd * 4; // This can be a different size
model->mlp_weights_1 = (float*)malloc(config.n_embd * intermediate_size * sizeof(float));
CHECK_ALLOCATION(model->mlp_weights_1);
model->mlp_weights_2 = (float*)malloc(intermediate_size * config.n_embd * sizeof(float));
CHECK_ALLOCATION(model->mlp_weights_2);
// Random initialization of MLP weights (example)
for (int i = 0; i < config.n_embd * intermediate_size; ++i) {
model->mlp_weights_1[i] = (float)rand() / (float)RAND_MAX;
model->mlp_weights_2[i] = (float)rand() / (float)RAND_MAX;
}
// Allocate and initialize token embeddings
model->token_embeddings = (float**)malloc(config.vocab_size * sizeof(float*));
CHECK_ALLOCATION(model->token_embeddings);
for (int i = 0; i < config.vocab_size; ++i) {
model->token_embeddings[i] = (float*)malloc(config.n_embd * sizeof(float));
CHECK_ALLOCATION(model->token_embeddings[i]);
for (int j = 0; j < config.n_embd; ++j) {
model->token_embeddings[i][j] = (float)rand() / (float)RAND_MAX;
}
}
// Allocate and initialize position embeddings
model->position_embeddings = (float**)malloc(config.block_size * sizeof(float*));
for (int i = 0; i < config.block_size; ++i) {
model->position_embeddings[i] = (float*)malloc(config.n_embd * sizeof(float));
for (int j = 0; j < config.n_embd; ++j) {
model->position_embeddings[i][j] = (float)rand() / (float)RAND_MAX;
}
}
// Note: This is a simplified example. In practice, you would need to implement a proper random initialization
// (e.g., Xavier initialization) and also consider biases, layer normalization parameters, etc.
}
// Initialize attention matrices for queries, keys, and values
void initialize_attention_matrices(GPTModel *model, GPTConfig config) {
// Allocate memory for queries, keys, and values
model->queries = (float***)malloc(config.n_head * sizeof(float**));
model->keys = (float***)malloc(config.n_head * sizeof(float**));
model->values = (float***)malloc(config.n_head * sizeof(float**));
if (!model->queries || !model->keys || !model->values) {
fprintf(stderr, "Allocation failed for attention matrices\n");
if (model->queries) free(model->queries);
if (model->keys) free(model->keys);
if (model->values) free(model->values);
exit(EXIT_FAILURE);
}
for (int h = 0; h < config.n_head; ++h) {
model->queries[h] = (float**)malloc(config.block_size * sizeof(float*));
model->keys[h] = (float**)malloc(config.block_size * sizeof(float*));
model->values[h] = (float**)malloc(config.block_size * sizeof(float*));
if (!model->queries[h] || !model->keys[h] || !model->values[h]) {
fprintf(stderr, "Allocation failed for attention matrix heads\n");
// Free any allocated memory
for (int i = 0; i < h; ++i) {
free(model->queries[i]);
free(model->keys[i]);
free(model->values[i]);
}
free(model->queries);
free(model->keys);
free(model->values);
exit(EXIT_FAILURE);
}
for (int i = 0; i < config.block_size; ++i) {
model->queries[h][i] = (float*)calloc(config.n_embd / config.n_head, sizeof(float));
model->keys[h][i] = (float*)calloc(config.n_embd / config.n_head, sizeof(float));
model->values[h][i] = (float*)calloc(config.n_embd / config.n_head, sizeof(float));
if (!model->queries[h][i] || !model->keys[h][i] || !model->values[h][i]) {
fprintf(stderr, "Allocation failed for attention matrix blocks\n");
// Free any allocated memory
for (int j = 0; j <= h; ++j) {
for (int k = 0; k < (j < h ? config.block_size : i); ++k) {
if (model->queries[j][k]) free(model->queries[j][k]);
if (model->keys[j][k]) free(model->keys[j][k]);
if (model->values[j][k]) free(model->values[j][k]);
}
if (model->queries[j]) free(model->queries[j]);
if (model->keys[j]) free(model->keys[j]);
if (model->values[j]) free(model->values[j]);
}
free(model->queries);
free(model->keys);
free(model->values);
exit(EXIT_FAILURE);
}
}
}
}
// Compute queries, keys, and values for the self-attention mechanism
void compute_queries_keys_values(float *input, GPTModel *model, float ***queries, float ***keys, float ***values) {
printf("Entering compute_queries_keys_values\n");
printf("Model config - n_head: %d, block_size: %d, n_embd: %d\n", model->config.n_head, model->config.block_size, model->config.n_embd);
// Compute queries, keys, and values for each head
for (int h = 0; h < model->config.n_head; ++h) {
// Dimension checks for matrix_multiply
int expected_block_size = model->config.block_size;
int expected_n_embd_div_n_head = model->config.n_embd / model->config.n_head;
int expected_n_embd = model->config.n_embd;
if (expected_block_size != model->config.block_size || expected_n_embd_div_n_head != (model->config.n_embd / model->config.n_head) || expected_n_embd != model->config.n_embd) {
fprintf(stderr, "Dimension mismatch before matrix_multiply call for queries\n");
exit(EXIT_FAILURE);
}
matrix_multiply(model->query_weights[h], input, (*queries)[h], model->config.block_size, model->config.n_embd / model->config.n_head, model->config.n_embd);
// Dimension checks for matrix_multiply
if (expected_block_size != model->config.block_size || expected_n_embd_div_n_head != (model->config.n_embd / model->config.n_head) || expected_n_embd != model->config.n_embd) {
fprintf(stderr, "Dimension mismatch before matrix_multiply call for keys\n");
exit(EXIT_FAILURE);
}
matrix_multiply(model->key_weights[h], input, (*keys)[h], model->config.block_size, model->config.n_embd / model->config.n_head, model->config.n_embd);
// Dimension checks for matrix_multiply
if (expected_block_size != model->config.block_size || expected_n_embd_div_n_head != (model->config.n_embd / model->config.n_head) || expected_n_embd != model->config.n_embd) {
fprintf(stderr, "Dimension mismatch before matrix_multiply call for values\n");
exit(EXIT_FAILURE);
}
matrix_multiply(model->value_weights[h], input, (*values)[h], model->config.block_size, model->config.n_embd / model->config.n_head, model->config.n_embd);
}
printf("Exiting compute_queries_keys_values - n_head: %d, block_size: %d, n_embd: %d\n", model->config.n_head, model->config.block_size, model->config.n_embd);
}
// Unit test for matrix multiplication
void test_matrix_multiply() {
// Create test matrices A, B, and C
float A[2][3] = {{1, 2, 3}, {4, 5, 6}};
float B[3][2] = {{7, 8}, {9, 10}, {11, 12}};
float C[2][2] = {0};
// Expected result of multiplication
float expected[2][2] = {{58, 64}, {139, 154}};
// Perform matrix multiplication
matrix_multiply(&A[0][0], &B[0][0], &C[0][0], 2, 2, 3);
// Assert each element of the result matrix C is as expected
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 2; ++j) {
assert(fabs(C[i][j] - expected[i][j]) < 1e-5);
}
}
}
// Unit test for GELU activation
void test_gelu_activation() {
// Test input and expected output
float input = 0.5;
float expected_output = 0.3457; // Approximate expected value
printf("GELU activation input: %f\n", input);
printf("Expected output: %f\n", expected_output);
float output = gelu_activation(input);
printf("Actual output: %f\n", output);
printf("Difference: %f\n", fabs(output - expected_output));
// Assert the output is as expected
assert(fabs(output - expected_output) < 1e-4);
}
// Unit test for initializing attention matrices
void test_initialize_attention_matrices() {
GPTConfig config = {VOCAB_SIZE, BLOCK_SIZE, N_LAYER, N_HEAD, N_EMBD};
GPTModel model;
initialize_model(&model, config); // Assuming this also initializes attention matrices
initialize_attention_matrices(&model, config);
// Check if memory allocation was successful and dimensions are correct
assert(model.queries != NULL);
assert(model.keys != NULL);
assert(model.values != NULL);
for (int h = 0; h < config.n_head; ++h) {
assert(model.queries[h] != NULL);
assert(model.keys[h] != NULL);
assert(model.values[h] != NULL);
for (int s = 0; s < config.block_size; ++s) {
assert(model.queries[h][s] != NULL);
assert(model.keys[h][s] != NULL);
assert(model.values[h][s] != NULL);
}
}
// Clean up
free_attention_matrices(&model, config);
}
// Unit test for computing queries, keys, and values
void test_compute_queries_keys_values() {
GPTConfig config = {VOCAB_SIZE, BLOCK_SIZE, N_LAYER, N_HEAD, N_EMBD};
GPTModel model;
model.config = config; // Set the model configuration
initialize_model(&model, config); // Assuming this also initializes attention matrices
initialize_attention_matrices(&model, config);
// Ensure that the weights are not NULL
assert(model.query_weights != NULL);
assert(model.key_weights != NULL);
assert(model.value_weights != NULL);
for (int h = 0; h < config.n_head; ++h) {
assert(model.query_weights[h] != NULL);
assert(model.key_weights[h] != NULL);
assert(model.value_weights[h] != NULL);
}
// Create mock input and model weights for testing
float *input = (float*)malloc(config.block_size * config.n_embd * sizeof(float));
// Initialize input with some values
for (int i = 0; i < config.block_size * config.n_embd; ++i) {
input[i] = i;
}
// Assuming model weights are initialized in initialize_model
compute_queries_keys_values(input, &model, model.queries, model.keys, model.values);
// Check if queries, keys, and values are computed correctly
// This would involve checking the results of the matrix multiplication operations
// ...
// Clean up
free(input);
free_attention_matrices(&model, config);
}
// Function to free attention matrices
void free_attention_matrices(GPTModel *model, GPTConfig config) {
if (model->queries != NULL) {
for (int h = 0; h < config.n_head; ++h) {
if (model->queries[h] != NULL) {
for (int s = 0; s < config.block_size; ++s) {
if (model->queries[h][s] != NULL) {
free(model->queries[h][s]);
model->queries[h][s] = NULL;
}
}
free(model->queries[h]);
model->queries[h] = NULL;
}
}
free(model->queries);
model->queries = NULL;
}
if (model->keys != NULL) {
for (int h = 0; h < config.n_head; ++h) {
if (model->keys[h] != NULL) {
for (int s = 0; s < config.block_size; ++s) {
if (model->keys[h][s] != NULL) {
free(model->keys[h][s]);
model->keys[h][s] = NULL;
}
}
free(model->keys[h]);
model->keys[h] = NULL;
}
}
free(model->keys);
model->keys = NULL;
}
if (model->values != NULL) {
for (int h = 0; h < config.n_head; ++h) {
if (model->values[h] != NULL) {
for (int s = 0; s < config.block_size; ++s) {
if (model->values[h][s] != NULL) {
free(model->values[h][s]);
model->values[h][s] = NULL;
}
}
free(model->values[h]);
model->values[h] = NULL;
}
}
free(model->values);
model->values = NULL;
}
}
// New function to initialize embeddings
void initialize_embeddings(GPTModel *model, GPTConfig config) {
// Allocate memory for token embeddings
model->token_embeddings = (float**)malloc(config.vocab_size * sizeof(float*));
CHECK_ALLOCATION(model->token_embeddings);
for (int i = 0; i < config.vocab_size; ++i) {
model->token_embeddings[i] = (float*)malloc(config.n_embd * sizeof(float));
CHECK_ALLOCATION(model->token_embeddings[i]);
// Initialize weights with random values
for (int j = 0; j < config.n_embd; ++j) {
model->token_embeddings[i][j] = (float)rand() / (float)RAND_MAX;
}
}
// Allocate memory for position embeddings
model->position_embeddings = (float**)malloc(config.block_size * sizeof(float*));
CHECK_ALLOCATION(model->position_embeddings);
for (int i = 0; i < config.block_size; ++i) {
model->position_embeddings[i] = (float*)malloc(config.n_embd * sizeof(float));
CHECK_ALLOCATION(model->position_embeddings[i]);
// Initialize weights with random values
for (int j = 0; j < config.n_embd; ++j) {
model->position_embeddings[i][j] = (float)rand() / (float)RAND_MAX;
}
}
}
// Forward pass: apply embeddings, self-attention, the MLP block, and layer normalization
void forward_pass(GPTModel *model, int *input_indices, float **output) {
printf("Entering forward_pass - n_head: %d, block_size: %d, n_embd: %d\n", model->config.n_head, model->config.block_size, model->config.n_embd);
// Allocate memory for the output array if not already allocated
if (*output == NULL) {
*output = (float*)malloc(model->config.block_size * model->config.n_embd * sizeof(float));
assert(*output != NULL); // Ensure memory allocation was successful
}
printf("After embeddings - block_size: %d, n_embd: %d\n", model->config.block_size, model->config.n_embd);
// Apply token and position embeddings to input indices
for (int i = 0; i < model->config.block_size; ++i) {
int index = input_indices[i];
assert(index >= 0 && index < model->config.vocab_size);
assert(model->token_embeddings != NULL);
assert(model->position_embeddings != NULL);
for (int j = 0; j < model->config.n_embd; ++j) {
assert(model->token_embeddings[index] != NULL);
assert(model->position_embeddings[i] != NULL);
assert(i < model->config.block_size); // Assert that i is within the expected range
assert(j < model->config.n_embd); // Assert that j is within the expected range
(*output)[i * model->config.n_embd + j] = model->token_embeddings[index][j] + model->position_embeddings[i][j];
}
}
printf("Before compute_queries_keys_values - n_head: %d, block_size: %d, n_embd: %d\n", model->config.n_head, model->config.block_size, model->config.n_embd);
compute_queries_keys_values(*output, model, model->queries, model->keys, model->values);
printf("After compute_queries_keys_values - n_head: %d, block_size: %d, n_embd: %d\n", model->config.n_head, model->config.block_size, model->config.n_embd);
float *queries_flat = (float*)malloc(model->config.n_head * model->config.block_size * (model->config.n_embd / model->config.n_head) * sizeof(float));
assert(queries_flat != NULL); // Ensure memory allocation was successful
float *keys_flat = (float*)malloc(model->config.n_head * model->config.block_size * (model->config.n_embd / model->config.n_head) * sizeof(float));
assert(keys_flat != NULL); // Ensure memory allocation was successful
float *values_flat = (float*)malloc(model->config.n_head * model->config.block_size * (model->config.n_embd / model->config.n_head) * sizeof(float));
assert(values_flat != NULL); // Ensure memory allocation was successful
float *self_attention_output_flat = (float*)malloc(model->config.block_size * model->config.n_embd * sizeof(float));
assert(self_attention_output_flat != NULL); // Ensure memory allocation was successful
// Flatten the per-head 3D matrices into the 1D buffers expected by dot_product_attention
flatten_attention_matrices(model->queries, queries_flat, model->config.n_head, model->config.block_size, model->config.n_embd / model->config.n_head);
flatten_attention_matrices(model->keys, keys_flat, model->config.n_head, model->config.block_size, model->config.n_embd / model->config.n_head);
flatten_attention_matrices(model->values, values_flat, model->config.n_head, model->config.block_size, model->config.n_embd / model->config.n_head);
printf("Before dot_product_attention - n_head: %d, block_size: %d, n_embd: %d\n", model->config.n_head, model->config.block_size, model->config.n_embd);
dot_product_attention(queries_flat, keys_flat, values_flat, self_attention_output_flat, model->config.n_head, model->config.block_size, model->config.n_embd);
printf("After dot_product_attention - n_head: %d, block_size: %d, n_embd: %d\n", model->config.n_head, model->config.block_size, model->config.n_embd);
float *mlp_output_flat = (float*)malloc(model->config.block_size * model->config.n_embd * sizeof(float));
assert(mlp_output_flat != NULL); // Ensure memory allocation was successful
printf("Before mlp_block - n_head: %d, block_size: %d, n_embd: %d\n", model->config.n_head, model->config.block_size, model->config.n_embd);
mlp_block(self_attention_output_flat, mlp_output_flat, model->config.block_size, model->config.n_embd, model->mlp_weights_1, model->mlp_weights_2);
printf("After mlp_block - n_head: %d, block_size: %d, n_embd: %d\n", model->config.n_head, model->config.block_size, model->config.n_embd);
float **mlp_output_2d = (float**)malloc(model->config.block_size * sizeof(float*));
for (int i = 0; i < model->config.block_size; ++i) {
mlp_output_2d[i] = &mlp_output_flat[i * model->config.n_embd];
}
printf("Before layer_normalize - n_head: %d, block_size: %d, n_embd: %d\n", model->config.n_head, model->config.block_size, model->config.n_embd);
layer_normalize(mlp_output_2d, model->ln_gamma, model->ln_beta, model->config.block_size, model->config.n_embd, 1e-5);
printf("After layer_normalize - n_head: %d, block_size: %d, n_embd: %d\n", model->config.n_head, model->config.block_size, model->config.n_embd);
free(mlp_output_2d);
for (int i = 0; i < model->config.block_size; ++i) {
for (int j = 0; j < model->config.n_embd; ++j) {
(*output)[i * model->config.n_embd + j] = mlp_output_flat[i * model->config.n_embd + j];
}
}
free(queries_flat);
free(keys_flat);
free(values_flat);
free(self_attention_output_flat);
free(mlp_output_flat);
printf("At end of forward_pass (before returning) - n_head: %d, block_size: %d, n_embd: %d\n", model->config.n_head, model->config.block_size, model->config.n_embd);
}
// Unit test for token and position embeddings
void test_embeddings() {
GPTConfig config = {VOCAB_SIZE, BLOCK_SIZE, N_LAYER, N_HEAD, N_EMBD};
GPTModel model;
initialize_model(&model, config); // Initialize the model with embeddings
// Create mock input indices (for simplicity, use indices 0 to block_size-1)
int input_indices[BLOCK_SIZE];
for (int i = 0; i < BLOCK_SIZE; ++i) {
input_indices[i] = i;
}
// Allocate memory for the output of the forward pass
float *output = NULL;
// Apply embeddings using the forward pass
forward_pass(&model, input_indices, &output);
// Check if the output contains the correct values
for (int i = 0; i < BLOCK_SIZE; ++i) {
for (int j = 0; j < N_EMBD; ++j) {
float expected_value = model.token_embeddings[input_indices[i]][j] + model.position_embeddings[i][j];
assert(fabs(output[i * N_EMBD + j] - expected_value) < 1e-5);
}
}
// Clean up
free(output);
free_model(&model, config); // This function will need to be implemented to free all allocated memory in the model
}
// Function to free the model
void free_model(GPTModel *model, GPTConfig config) {
printf("Entering free_model - n_head: %d, block_size: %d, n_embd: %d\n", config.n_head, config.block_size, config.n_embd);
// Free token and position embeddings
if (model->token_embeddings != NULL) {
for (int i = 0; i < config.vocab_size; ++i) {
free(model->token_embeddings[i]);
}
free(model->token_embeddings);
model->token_embeddings = NULL;
}
if (model->position_embeddings != NULL) {
for (int i = 0; i < config.block_size; ++i) {
free(model->position_embeddings[i]);
}
free(model->position_embeddings);
model->position_embeddings = NULL;
}
// Free embedding weights
if (model->embedding_weights != NULL) {
for (int i = 0; i < config.vocab_size; ++i) {
free(model->embedding_weights[i]);
}
free(model->embedding_weights);
model->embedding_weights = NULL;
}
// Free layer normalization parameters
if (model->ln_gamma != NULL) {
free(model->ln_gamma);
model->ln_gamma = NULL;
}
if (model->ln_beta != NULL) {
free(model->ln_beta);
model->ln_beta = NULL;
}
// Free MLP weights
if (model->mlp_weights_1 != NULL) {
free(model->mlp_weights_1);
model->mlp_weights_1 = NULL;
}
if (model->mlp_weights_2 != NULL) {
free(model->mlp_weights_2);
model->mlp_weights_2 = NULL;
}
// Free attention matrices if they have not been freed already
if (model->queries != NULL || model->keys != NULL || model->values != NULL) {
free_attention_matrices(model, config);
}
// Reset the model configuration to a known state
model->config.vocab_size = 0;
model->config.block_size = 0;
model->config.n_layer = 0;
model->config.n_head = 0;
model->config.n_embd = 0;
printf("Exiting free_model - n_head: %d, block_size: %d, n_embd: %d\n", config.n_head, config.block_size, config.n_embd);
}
// Layer normalization function
void layer_normalize(float **inputs, float *gamma, float *beta, int n, int m, float epsilon) {
for (int i = 0; i < n; ++i) {
float sum = 0.0;
for (int j = 0; j < m; ++j) {
sum += inputs[i][j];
}
float mean = sum / m;
float variance_sum = 0.0;
for (int j = 0; j < m; ++j) {
variance_sum += (inputs[i][j] - mean) * (inputs[i][j] - mean);
}
float variance = variance_sum / m;
for (int j = 0; j < m; ++j) {
inputs[i][j] = (inputs[i][j] - mean) / sqrt(variance + epsilon);
inputs[i][j] = inputs[i][j] * gamma[j] + beta[j];
}
}
}
// Unit test for layer normalization
void test_layer_normalize() {
int n = 2; // Number of input vectors
int m = 3; // Number of features
float epsilon = 1e-5;
float **inputs = (float**)malloc(n * sizeof(float*));
float *gamma = (float*)malloc(m * sizeof(float));
float *beta = (float*)malloc(m * sizeof(float));
// Initialize each input row with simple values, gamma to ones, and beta to zeros
for (int i = 0; i < n; ++i) {
inputs[i] = (float*)malloc(m * sizeof(float));
for (int j = 0; j < m; ++j) {
inputs[i][j] = (float)(i * m + j);
}
}
for (int j = 0; j < m; ++j) {
gamma[j] = 1.0;
beta[j] = 0.0;
}
// Call layer_normalize
layer_normalize(inputs, gamma, beta, n, m, epsilon);
// Check if the output is normalized correctly
// ...
// Clean up
free(gamma);
free(beta);
for (int i = 0; i < n; ++i) {
free(inputs[i]);
}
free(inputs);
}
// Add the new unit test to the main function
int main(int argc, char *argv[]) {
GPTConfig config = {VOCAB_SIZE, BLOCK_SIZE, N_LAYER, N_HEAD, N_EMBD};
GPTModel model = {0}; // Zero-initialize the model structure
// Initialize the model with the configuration
initialize_model(&model, config);
// Run unit tests with fresh model instances
test_matrix_multiply();
free_model(&model, config);
initialize_model(&model, config);
test_gelu_activation();
free_model(&model, config);
initialize_model(&model, config);
test_initialize_attention_matrices();
free_model(&model, config);
initialize_model(&model, config);
test_compute_queries_keys_values();
free_model(&model, config);
initialize_model(&model, config);
test_embeddings(); // New unit test for embeddings
free_model(&model, config);
initialize_model(&model, config);
test_layer_normalize(); // New unit test for layer normalization
free_model(&model, config);
// Rest of the main function...
// ...
return 0;
}
// I realized I wasn't uploading train_gpt2.py in previous days, which was extremely unfair to Devin - this is
// where I started to be more hands-on rather than hands-off, explicitly telling it to read and log train_gpt2.py first before
// attempting the gpt2.c impl
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <string.h>
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
// Constants for model dimensions, learning rate, etc.
#define VOCAB_SIZE 50257 // Example size, to be adjusted based on actual model
#define BLOCK_SIZE 1024 // Example size, to be adjusted based on actual model
#define N_LAYER 12 // Example size, to be adjusted based on actual model
#define N_HEAD 12 // Example size, to be adjusted based on actual model
#define N_EMBD 768 // Example size, to be adjusted based on actual model
#define LEARNING_RATE 0.001
// Data structures for model configuration and layers
typedef struct {
int vocab_size;
int block_size;
int n_layer;
int n_head;
int n_embd;
} GPTConfig;
typedef struct {
// Embedding layers, attention blocks, MLP blocks, etc.
float **embedding_weights; // Example for embedding weights
// Other components to be added
float ***queries;
float ***keys;
float ***values;
// Weights for queries, keys, and values
float **query_weights;
float **key_weights;
float **value_weights;
GPTConfig config; // Added config here
float **token_embeddings; // Embeddings for tokens
float **position_embeddings; // Embeddings for positions
float *ln_gamma; // Layer normalization gamma parameter
float *ln_beta; // Layer normalization beta parameter
float *mlp_weights_1; // Weights for the first MLP layer
float *mlp_weights_2; // Weights for the second MLP layer
} GPTModel;
// Function prototypes
void initialize_model(GPTModel *model, GPTConfig config);
void forward_pass(GPTModel *model, int *input_indices, float **output);
void backward_pass(GPTModel *model, float *grad_output, float *grad_input);
void update_weights(GPTModel *model);
void matrix_multiply(float *A, float *B, float *C, int n, int m, int k);
float gelu_activation(float x);
void test_matrix_multiply();
void test_gelu_activation();
void initialize_attention_matrices(GPTModel *model, GPTConfig config);
void compute_queries_keys_values(float *input, GPTModel *model, float ***queries, float ***keys, float ***values);
void test_initialize_attention_matrices();
void test_compute_queries_keys_values();
void free_attention_matrices(GPTModel *model, GPTConfig config);
void initialize_embeddings(GPTModel *model, GPTConfig config);
void free_model(GPTModel *model, GPTConfig config); // Prototype for new function to free model memory
void test_embeddings(); // Prototype for new unit test function
void layer_normalize(float **inputs, float *gamma, float *beta, int n, int m, float epsilon); // New function prototype
void test_layer_normalize(); // New unit test function prototype
void softmax(float *input, float *output, int length); // New function prototype for softmax
void dot_product_attention(float *queries, float *keys, float *values, float *output, int n_head, int block_size, int n_embd); // New function prototype for dot-product attention
void mlp_block(float *input, float *output, int block_size, int n_embd, float *mlp_weights_1, float *mlp_weights_2); // New function prototype for MLP block
// Function to flatten 3D attention matrices into 1D arrays
void flatten_attention_matrices(float ***matrices, float *flat_array, int n_head, int block_size, int n_embd_per_head) {
for (int h = 0; h < n_head; ++h) {
for (int i = 0; i < block_size; ++i) {
for (int j = 0; j < n_embd_per_head; ++j) {
flat_array[h * block_size * n_embd_per_head + i * n_embd_per_head + j] = matrices[h][i][j];
}
}
}
}
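// Resulting layout: element (h, i, j) of the per-head matrices lands at flat index
// h * block_size * n_embd_per_head + i * n_embd_per_head + j in the output buffer.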
// Matrix multiplication function with boundary checks
void matrix_multiply(float *A, float *B, float *C, int n, int m, int k) {
// Ensure that the pointers are not NULL and dimensions are greater than zero
if (A == NULL || B == NULL || C == NULL) {
fprintf(stderr, "Null pointer provided to matrix_multiply function\n");
exit(EXIT_FAILURE);
}
if (n <= 0 || m <= 0 || k <= 0) {
fprintf(stderr, "Invalid dimensions provided to matrix_multiply function\n");
exit(EXIT_FAILURE);
}
printf("Matrix multiplication dimensions: A[%d][%d], B[%d][%d], C[%d][%d]\n", n, m, m, k, n, k);
// Perform matrix multiplication
// The outer loop iterates over the rows of matrix A and the result matrix C
for (int i = 0; i < n; ++i) {
// The inner loop iterates over the columns of matrix B and the result matrix C
for (int j = 0; j < k; ++j) {
float sum = 0;
// The innermost loop performs the dot product of the i-th row of matrix A and the j-th column of matrix B
for (int p = 0; p < m; ++p) {
sum += A[i * m + p] * B[p * k + j];
}
// Log the indices and sum before writing to matrix C
printf("Attempting to write to C at index [%d][%d] (linear index %d), total allocated size: %d\n", i, j, i * k + j, n * k);
C[i * k + j] = sum;
}
}
}
// GELU activation function
float gelu_activation(float x) {
return 0.5 * x * (1.0 + tanh(sqrt(2.0 / M_PI) * (x + 0.044715 * pow(x, 3))));
}
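// Note: this is the tanh approximation of GELU; the exact form is 0.5 * x * (1 + erf(x / sqrt(2))).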
// Softmax function
void softmax(float *input, float *output, int length) {
float max = input[0];
for (int i = 1; i < length; ++i) {
if (input[i] > max) {
max = input[i];
}
}
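    // Subtract the row maximum before exponentiating: softmax is shift-invariant,
    // and this keeps exp() from overflowing for large inputs.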
float sum = 0.0;
for (int i = 0; i < length; ++i) {
output[i] = exp(input[i] - max);
sum += output[i];
}
for (int i = 0; i < length; ++i) {
output[i] /= sum;
}
}
// Corrected dot_product_attention function
void dot_product_attention(float *queries, float *keys, float *values, float *output, int n_head, int block_size, int n_embd) {
    printf("Entering dot_product_attention\n");
    int d = n_embd / n_head; // Per-head embedding dimension
    // Temporary storage for the attention scores and the transposed keys of one head
    float *attention_scores = (float*)malloc(block_size * block_size * sizeof(float));
    float *keys_T = (float*)malloc(d * block_size * sizeof(float));
    if (attention_scores == NULL || keys_T == NULL) {
        fprintf(stderr, "Memory allocation failed in dot_product_attention\n");
        exit(EXIT_FAILURE);
    }
    for (int h = 0; h < n_head; ++h) {
        float *q = queries + h * block_size * d;
        float *k_mat = keys + h * block_size * d;
        float *v = values + h * block_size * d;
        // Transpose this head's keys so that scores = Q (block_size x d) * K^T (d x block_size)
        for (int i = 0; i < block_size; ++i) {
            for (int j = 0; j < d; ++j) {
                keys_T[j * block_size + i] = k_mat[i * d + j];
            }
        }
        matrix_multiply(q, keys_T, attention_scores, block_size, d, block_size);
        // Scale by 1/sqrt(d) as in standard scaled dot-product attention, then softmax each row
        float scale = 1.0f / sqrtf((float)d);
        for (int i = 0; i < block_size * block_size; ++i) {
            attention_scores[i] *= scale;
        }
        for (int i = 0; i < block_size; ++i) {
            softmax(attention_scores + i * block_size, attention_scores + i * block_size, block_size);
        }
        // Multiply the (block_size x block_size) scores by this head's values (block_size x d)
        matrix_multiply(attention_scores, v, output + h * block_size * d, block_size, block_size, d);
    }
    // Free the temporary storage
    free(attention_scores);
    free(keys_T);
    printf("Exiting dot_product_attention\n");
}
// Corrected mlp_block function
void mlp_block(float *input, float *output, int block_size, int n_embd, float *mlp_weights_1, float *mlp_weights_2) {
// Define the intermediate size for the MLP
int intermediate_size = n_embd * 4; // This can be a different size
float *intermediate_output = (float*)malloc(block_size * intermediate_size * sizeof(float));
    // First linear layer: input (block_size x n_embd) times mlp_weights_1 (n_embd x intermediate_size)
    matrix_multiply(input, mlp_weights_1, intermediate_output, block_size, n_embd, intermediate_size);
// Apply GELU activation
for (int i = 0; i < block_size * intermediate_size; ++i) {
intermediate_output[i] = gelu_activation(intermediate_output[i]);
}
    // Second linear layer: project back from intermediate_size to n_embd dimensions
    matrix_multiply(intermediate_output, mlp_weights_2, output, block_size, intermediate_size, n_embd);
// Free the intermediate output
free(intermediate_output);
}
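// Note: following the usual GPT-2 block, the hidden layer expands to 4 * n_embd and projects back;
// bias terms and the residual connection around the MLP are not handled inside this function.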
// Check for successful allocation and handle errors
#define CHECK_ALLOCATION(ptr) if ((ptr) == NULL) { \
fprintf(stderr, "Memory allocation failed\n"); \
free_model(model, config); \
exit(EXIT_FAILURE); \
}
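// Note: this macro expands to code that references local variables named `model` and `config`,
// so it can only be used where both are in scope (as in initialize_model below).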
// Add print statements to the initialize_model function
void initialize_model(GPTModel *model, GPTConfig config) {
printf("Entering initialize_model\n");
// Example of allocating memory for the embedding layer and initializing weights
// Assuming embedding weights are a 2D array with dimensions [vocab_size, n_embd]
model->embedding_weights = (float**)malloc(config.vocab_size * sizeof(float*));
CHECK_ALLOCATION(model->embedding_weights);
for (int i = 0; i < config.vocab_size; ++i) {
model->embedding_weights[i] = (float*)malloc(config.n_embd * sizeof(float));
CHECK_ALLOCATION(model->embedding_weights[i]);
for (int j = 0; j < config.n_embd; ++j) {
// Initialize weights with random values, for example using a simple normal distribution
model->embedding_weights[i][j] = (float)rand() / (float)RAND_MAX;
}
}
// Allocate and initialize weights for queries, keys, and values
model->query_weights = (float**)malloc(config.n_head * sizeof(float*));
CHECK_ALLOCATION(model->query_weights);
model->key_weights = (float**)malloc(config.n_head * sizeof(float*));
CHECK_ALLOCATION(model->key_weights);
model->value_weights = (float**)malloc(config.n_head * sizeof(float*));
CHECK_ALLOCATION(model->value_weights);
for (int h = 0; h < config.n_head; ++h) {
model->query_weights[h] = (float*)malloc(config.n_embd * (config.n_embd / config.n_head) * sizeof(float));
CHECK_ALLOCATION(model->query_weights[h]);
model->key_weights[h] = (float*)malloc(config.n_embd * (config.n_embd / config.n_head) * sizeof(float));
CHECK_ALLOCATION(model->key_weights[h]);
model->value_weights[h] = (float*)malloc(config.n_embd * (config.n_embd / config.n_head) * sizeof(float));
CHECK_ALLOCATION(model->value_weights[h]);
for (int i = 0; i < config.n_embd; ++i) {
for (int j = 0; j < config.n_embd / config.n_head; ++j) {
model->query_weights[h][i * (config.n_embd / config.n_head) + j] = ((float)rand() / (float)RAND_MAX - 0.5) * sqrt(2.0 / (config.n_embd + config.n_embd / config.n_head));
model->key_weights[h][i * (config.n_embd / config.n_head) + j] = ((float)rand() / (float)RAND_MAX - 0.5) * sqrt(2.0 / (config.n_embd + config.n_embd / config.n_head));
model->value_weights[h][i * (config.n_embd / config.n_head) + j] = ((float)rand() / (float)RAND_MAX - 0.5) * sqrt(2.0 / (config.n_embd + config.n_embd / config.n_head));
}
}
}
// Initialize ln_gamma and ln_beta
model->ln_gamma = (float*)malloc(config.n_embd * sizeof(float));
CHECK_ALLOCATION(model->ln_gamma);
model->ln_beta = (float*)malloc(config.n_embd * sizeof(float));
CHECK_ALLOCATION(model->ln_beta);
for (int i = 0; i < config.n_embd; ++i) {
model->ln_gamma[i] = 1.0; // Typically initialized to ones
model->ln_beta[i] = 0.0; // Typically initialized to zeros
}
// Initialize MLP weights
int intermediate_size = config.n_embd * 4; // This can be a different size
model->mlp_weights_1 = (float*)malloc(config.n_embd * intermediate_size * sizeof(float));
CHECK_ALLOCATION(model->mlp_weights_1);
model->mlp_weights_2 = (float*)malloc(intermediate_size * config.n_embd * sizeof(float));
CHECK_ALLOCATION(model->mlp_weights_2);
// Random initialization of MLP weights (example)
for (int i = 0; i < config.n_embd * intermediate_size; ++i) {
model->mlp_weights_1[i] = (float)rand() / (float)RAND_MAX;
model->mlp_weights_2[i] = (float)rand() / (float)RAND_MAX;
}
// Allocate and initialize token embeddings
model->token_embeddings = (float**)malloc(config.vocab_size * sizeof(float*));
CHECK_ALLOCATION(model->token_embeddings);
for (int i = 0; i < config.vocab_size; ++i) {
model->token_embeddings[i] = (float*)malloc(config.n_embd * sizeof(float));
CHECK_ALLOCATION(model->token_embeddings[i]);
for (int j = 0; j < config.n_embd; ++j) {
model->token_embeddings[i][j] = (float)rand() / (float)RAND_MAX;
}
}
// Allocate and initialize position embeddings
    model->position_embeddings = (float**)malloc(config.block_size * sizeof(float*));
    CHECK_ALLOCATION(model->position_embeddings);
    for (int i = 0; i < config.block_size; ++i) {
        model->position_embeddings[i] = (float*)malloc(config.n_embd * sizeof(float));
        CHECK_ALLOCATION(model->position_embeddings[i]);
for (int j = 0; j < config.n_embd; ++j) {
model->position_embeddings[i][j] = (float)rand() / (float)RAND_MAX;
}
}
// Note: This is a simplified example. In practice, you would need to implement a proper random initialization
// (e.g., Xavier initialization) and also consider biases, layer normalization parameters, etc.
printf("Exiting initialize_model\n");
}
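// A minimal sketch of the Xavier/Glorot-style initialization mentioned in the note above.
// The helper name and its reliance on rand() are illustrative assumptions, not part of the original code;
// it fills a fan_in x fan_out weight matrix with values drawn uniformly from [-limit, +limit].
static void xavier_init(float *weights, int fan_in, int fan_out) {
    float limit = sqrtf(6.0f / (float)(fan_in + fan_out)); // Glorot uniform bound
    for (int i = 0; i < fan_in * fan_out; ++i) {
        float u = (float)rand() / (float)RAND_MAX;  // uniform in [0, 1]
        weights[i] = (2.0f * u - 1.0f) * limit;     // rescaled to [-limit, +limit]
    }
}
// Example usage (hypothetical): xavier_init(model->mlp_weights_1, config.n_embd, config.n_embd * 4);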
// Initialize attention matrices for queries, keys, and values
void initialize_attention_matrices(GPTModel *model, GPTConfig config) {
int n_head = config.n_head;
int block_size = config.block_size;
int k = config.n_embd / n_head; // Corrected number of columns in the result matrix
printf("Initializing attention matrices...\n");
fflush(stdout);
printf("n_head: %d, block_size: %d, k: %d\n", n_head, block_size, k);
fflush(stdout);
printf("Debug: block_size=%d, n_head=%d, model=%p, model->queries=%p\n", block_size, n_head, (void*)model, (void*)model->queries);
fflush(stdout);
// Allocate memory for the array of pointers for queries, keys, and values
model->queries = (float***)malloc(n_head * sizeof(float**));
if (model->queries == NULL) {
fprintf(stderr, "Failed to allocate memory for queries\n");
exit(EXIT_FAILURE);
}
printf("Allocated memory for queries array of pointers: %p, size: %lu\n", (void*)model->queries, n_head * sizeof(float**));
model->keys = (float***)malloc(n_head * sizeof(float**));
if (model->keys == NULL) {
fprintf(stderr, "Failed to allocate memory for keys\n");
exit(EXIT_FAILURE);
}
printf("Allocated memory for keys array of pointers: %p, size: %lu\n", (void*)model->keys, n_head * sizeof(float**));
model->values = (float***)malloc(n_head * sizeof(float**));
if (model->values == NULL) {
fprintf(stderr, "Failed to allocate memory for values\n");
exit(EXIT_FAILURE);
}
printf("Allocated memory for values array of pointers: %p, size: %lu\n", (void*)model->values, n_head * sizeof(float**));
// Allocate 2D arrays for each head
for (int i = 0; i < n_head; ++i) {
model->queries[i] = (float**)calloc(block_size, sizeof(float*));
if (model->queries[i] == NULL) {
fprintf(stderr, "Failed to allocate memory for queries for head %d\n", i);
free_attention_matrices(model, config);
exit(EXIT_FAILURE);
}
printf("Allocated memory for queries 2D array for head %d: %p, size: %lu\n", i, (void*)model->queries[i], block_size * sizeof(float*));
model->keys[i] = (float**)calloc(block_size, sizeof(float*));
if (model->keys[i] == NULL) {
fprintf(stderr, "Failed to allocate memory for keys for head %d\n", i);
free_attention_matrices(model, config);
exit(EXIT_FAILURE);
}
printf("Allocated memory for keys 2D array for head %d: %p, size: %lu\n", i, (void*)model->keys[i], block_size * sizeof(float*));
model->values[i] = (float**)calloc(block_size, sizeof(float*));
if (model->values[i] == NULL) {
fprintf(stderr, "Failed to allocate memory for values for head %d\n", i);
free_attention_matrices(model, config);
exit(EXIT_FAILURE);
}
printf("Allocated memory for values 2D array for head %d: %p, size: %lu\n", i, (void*)model->values[i], block_size * sizeof(float*));
}
}
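// Memory layout note: queries/keys/values are arrays of n_head pointers, each pointing to block_size
// row pointers; the per-row buffers of n_embd / n_head floats are allocated later in compute_queries_keys_values.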
// Compute queries, keys, and values for each head
void compute_queries_keys_values(float *input, GPTModel *model, float ***queries, float ***keys, float ***values) {
printf("Entering compute_queries_keys_values\n");
// Verify that the input matrix has the correct dimensions
assert(input != NULL);
// Assertions to verify the dimensions of the matrices before multiplication
for (int h = 0; h < model->config.n_head; ++h) {
assert(model->query_weights[h] != NULL);
assert(model->key_weights[h] != NULL);
assert(model->value_weights[h] != NULL);
assert(queries[h] != NULL);
assert(keys[h] != NULL);
assert(values[h] != NULL);
}
// Allocate memory for each row of queries, keys, and values for each head
int k = model->config.n_embd / model->config.n_head; // Corrected number of columns in the result matrix
for (int h = 0; h < model->config.n_head; ++h) {
for (int s = 0; s < model->config.block_size; ++s) {
queries[h][s] = (float*)calloc(k, sizeof(float));
keys[h][s] = (float*)calloc(k, sizeof(float));
values[h][s] = (float*)calloc(k, sizeof(float));
if (!queries[h][s] || !keys[h][s] || !values[h][s]) {
fprintf(stderr, "Allocation failed for attention matrix rows for head %d, row %d\n", h, s);
// Handle allocation failure: free any allocated memory and exit
free_attention_matrices(model, model->config);
exit(EXIT_FAILURE);
}
}
}
// Compute queries, keys, and values for each head
    int matrix_multiply_count = 0; // Counter to track the number of matrix_multiply calls
    for (int h = 0; h < model->config.n_head; ++h) {
        for (int s = 0; s < model->config.block_size; ++s) {
            // Multiply the s-th input row (1 x n_embd) by the per-head weight matrix (n_embd x k),
            // so each result fits the k floats allocated above for queries/keys/values[h][s]
            float *input_row = input + s * model->config.n_embd;
            matrix_multiply(input_row, model->query_weights[h], queries[h][s], 1, model->config.n_embd, k);
            matrix_multiply_count++;
            matrix_multiply(input_row, model->key_weights[h], keys[h][s], 1, model->config.n_embd, k);
            matrix_multiply_count++;
            matrix_multiply(input_row, model->value_weights[h], values[h][s], 1, model->config.n_embd, k);
            matrix_multiply_count++;
        }
    }
printf("Exiting compute_queries_keys_values with %d calls to matrix_multiply\n", matrix_multiply_count);
}
// Unit test for matrix multiplication
void test_matrix_multiply() {
// Create test matrices A, B, and C
float A[2][3] = {{1, 2, 3}, {4, 5, 6}};
float B[3][2] = {{7, 8}, {9, 10}, {11, 12}};
float C[2][2] = {0};
// Expected result of multiplication
float expected[2][2] = {{58, 64}, {139, 154}};
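    // e.g., expected[0][0] = 1*7 + 2*9 + 3*11 = 58 and expected[1][1] = 4*8 + 5*10 + 6*12 = 154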
// Perform matrix multiplication
    // A is 2x3 and B is 3x2, so n = 2, m = 3, k = 2
    matrix_multiply(&A[0][0], &B[0][0], &C[0][0], 2, 3, 2);
// Assert each element of the result matrix C is as expected
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 2; ++j) {
assert(fabs(C[i][j] - expected[i][j]) < 1e-5);
}
}
}
// Unit test for GELU activation
void test_gelu_activation() {
// Test input and expected output
float input = 0.5;
float expected_output = 0.3457; // Approximate expected value
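    // gelu(0.5) = 0.5 * 0.5 * (1 + tanh(0.79788 * (0.5 + 0.044715 * 0.125))), which is approximately 0.3457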
float output = gelu_activation(input);
// Assert the output is as expected
assert(fabs(output - expected_output) < 1e-4);
}
// Unit test for initializing attention matrices
void test_initialize_attention_matrices() {
GPTConfig config = {VOCAB_SIZE, BLOCK_SIZE, N_LAYER, N_HEAD, N_EMBD};
    GPTModel model = {0}; // Zero-initialize so all pointers start out NULL
initialize_model(&model, config); // Assuming this also initializes attention matrices
initialize_attention_matrices(&model, config);
// Check if memory allocation was successful and dimensions are correct
assert(model.queries != NULL);
assert(model.keys != NULL);
assert(model.values != NULL);
for (int h = 0; h < config.n_head; ++h) {
assert(model.queries[h] != NULL);
assert(model.keys[h] != NULL);
assert(model.values[h] != NULL);
        // Row buffers are allocated lazily in compute_queries_keys_values, so immediately after
        // initialization each row pointer should still be NULL (the arrays are calloc'd)
        for (int s = 0; s < config.block_size; ++s) {
            assert(model.queries[h][s] == NULL);
            assert(model.keys[h][s] == NULL);
            assert(model.values[h][s] == NULL);
        }
}
// Clean up
free_attention_matrices(&model, config);
}
// Unit test for computing queries, keys, and values
void test_compute_queries_keys_values() {
GPTConfig config = {VOCAB_SIZE, BLOCK_SIZE, N_LAYER, N_HEAD, N_EMBD};
GPTModel model = {0}; // Zero-initialize the model structure to ensure all fields are set to a valid state
model.config = config; // Set the model configuration
initialize_model(&model, config); // Assuming this also initializes attention matrices
initialize_attention_matrices(&model, config);
// Ensure that the weights are not NULL
assert(model.query_weights != NULL);
assert(model.key_weights != NULL);
assert(model.value_weights != NULL);
for (int h = 0; h < config.n_head; ++h) {
assert(model.query_weights[h] != NULL);
assert(model.key_weights[h] != NULL);
assert(model.value_weights[h] != NULL);
}
// Create mock input and model weights for testing
float *input = (float*)calloc(config.block_size * config.n_embd, sizeof(float)); // Use calloc to ensure the input is initialized to zero
// Check if input allocation was successful
assert(input != NULL);
printf("Debug: Input allocation successful.\n");
// Initialize input with some values
for (int i = 0; i < config.block_size * config.n_embd; ++i) {
input[i] = i;
}
// Print the first few elements of the input array for verification
printf("Debug: First elements of input array after initialization:\n");
for (int i = 0; i < 5; ++i) {
printf("input[%d] = %f\n", i, input[i]);
}
fflush(stdout);
compute_queries_keys_values(input, &model, model.queries, model.keys, model.values);
// Check if queries, keys, and values are computed correctly
// This would involve checking the results of the matrix multiplication operations
// ...
// Clean up
free(input);
free_attention_matrices(&model, config);
}
// Function to free attention matrices
void free_attention_matrices(GPTModel *model, GPTConfig config) {
if (model->queries != NULL) {
for (int h = 0; h < config.n_head; ++h) {
if (model->queries[h] != NULL) {
for (int s = 0; s < config.block_size; ++s) {
if (model->queries[h][s] != NULL) {
free(model->queries[h][s]);
model->queries[h][s] = NULL;
}
}
free(model->queries[h]);
model->queries[h] = NULL;
}
}
free(model->queries);
model->queries = NULL;
}
if (model->keys != NULL) {
for (int h = 0; h < config.n_head; ++h) {
if (model->keys[h] != NULL) {
for (int s = 0; s < config.block_size; ++s) {
if (model->keys[h][s] != NULL) {
free(model->keys[h][s]);
model->keys[h][s] = NULL;
}
}
free(model->keys[h]);
model->keys[h] = NULL;
}
}
free(model->keys);
model->keys = NULL;
}
if (model->values != NULL) {
for (int h = 0; h < config.n_head; ++h) {
if (model->values[h] != NULL) {
for (int s = 0; s < config.block_size; ++s) {
if (model->values[h][s] != NULL) {
free(model->values[h][s]);
model->values[h][s] = NULL;
}
}
free(model->values[h]);
model->values[h] = NULL;
}
}
free(model->values);
model->values = NULL;
}
}
// New function to initialize embeddings
void initialize_embeddings(GPTModel *model, GPTConfig config) {
// Allocate memory for token embeddings
model->token_embeddings = (float**)malloc(config.vocab_size * sizeof(float*));
CHECK_ALLOCATION(model->token_embeddings);
for (int i = 0; i < config.vocab_size; ++i) {
model->token_embeddings[i] = (float*)malloc(config.n_embd * sizeof(float));
CHECK_ALLOCATION(model->token_embeddings[i]);
// Initialize weights with random values
for (int j = 0; j < config.n_embd; ++j) {
model->token_embeddings[i][j] = (float)rand() / (float)RAND_MAX;
}
}
// Allocate memory for position embeddings
model->position_embeddings = (float**)malloc(config.block_size * sizeof(float*));
CHECK_ALLOCATION(model->position_embeddings);
for (int i = 0; i < config.block_size; ++i) {
model->position_embeddings[i] = (float*)malloc(config.n_embd * sizeof(float));
CHECK_ALLOCATION(model->position_embeddings[i]);
// Initialize weights with random values
for (int j = 0; j < config.n_embd; ++j) {
model->position_embeddings[i][j] = (float)rand() / (float)RAND_MAX;
}
}
}
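// Note: initialize_model above already allocates token_embeddings and position_embeddings, so calling
// initialize_embeddings on the same model without freeing them first would leak the earlier allocations.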
// Modify forward_pass function to apply embeddings and call flatten_attention_matrices
void forward_pass(GPTModel *model, int *input_indices, float **output) {
// Allocate memory for the output array if not already allocated
if (*output == NULL) {
*output = (float*)malloc(model->config.block_size * model->config.n_embd * sizeof(float));
assert(*output != NULL); // Ensure memory allocation was successful
}
// Apply token and position embeddings to input indices
for (int i = 0; i < model->config.block_size; ++i) {
int index = input_indices[i];
assert(index >= 0 && index < model->config.vocab_size);
assert(model->token_embeddings != NULL);
assert(model->position_embeddings != NULL);
for (int j = 0; j < model->config.n_embd; ++j) {
assert(model->token_embeddings[index] != NULL);
assert(model->position_embeddings[i] != NULL);
assert(i < model->config.block_size); // Assert that i is within the expected range
assert(j < model->config.n_embd); // Assert that j is within the expected range
(*output)[i * model->config.n_embd + j] = model->token_embeddings[index][j] + model->position_embeddings[i][j];
}
}
    compute_queries_keys_values(*output, model, model->queries, model->keys, model->values);
    int n_embd_per_head = model->config.n_embd / model->config.n_head;
    float *queries_flat = (float*)malloc(model->config.n_head * model->config.block_size * n_embd_per_head * sizeof(float));
    assert(queries_flat != NULL); // Ensure memory allocation was successful
    float *keys_flat = (float*)malloc(model->config.n_head * model->config.block_size * n_embd_per_head * sizeof(float));
    assert(keys_flat != NULL); // Ensure memory allocation was successful
    float *values_flat = (float*)malloc(model->config.n_head * model->config.block_size * n_embd_per_head * sizeof(float));
    assert(values_flat != NULL); // Ensure memory allocation was successful
    // Flatten the per-head 3D matrices into the contiguous buffers expected by dot_product_attention
    flatten_attention_matrices(model->queries, queries_flat, model->config.n_head, model->config.block_size, n_embd_per_head);
    flatten_attention_matrices(model->keys, keys_flat, model->config.n_head, model->config.block_size, n_embd_per_head);
    flatten_attention_matrices(model->values, values_flat, model->config.n_head, model->config.block_size, n_embd_per_head);
    float *self_attention_output_flat = (float*)malloc(model->config.block_size * model->config.n_embd * sizeof(float));
    assert(self_attention_output_flat != NULL); // Ensure memory allocation was successful
    dot_product_attention(queries_flat, keys_flat, values_flat, self_attention_output_flat, model->config.n_head, model->config.block_size, model->config.n_embd);
float *mlp_output_flat = (float*)malloc(model->config.block_size * model->config.n_embd * sizeof(float));
assert(mlp_output_flat != NULL); // Ensure memory allocation was successful
mlp_block(self_attention_output_flat, mlp_output_flat, model->config.block_size, model->config.n_embd, model->mlp_weights_1, model->mlp_weights_2);
float **mlp_output_2d = (float**)malloc(model->config.block_size * sizeof(float*));
for (int i = 0; i < model->config.block_size; ++i) {
mlp_output_2d[i] = &mlp_output_flat[i * model->config.n_embd];
}
layer_normalize(mlp_output_2d, model->ln_gamma, model->ln_beta, model->config.block_size, model->config.n_embd, 1e-5);
free(mlp_output_2d);
for (int i = 0; i < model->config.block_size; ++i) {
for (int j = 0; j < model->config.n_embd; ++j) {
(*output)[i * model->config.n_embd + j] = mlp_output_flat[i * model->config.n_embd + j];
}
}
free(queries_flat);
free(keys_flat);
free(values_flat);
free(self_attention_output_flat);
free(mlp_output_flat);
}
// Unit test for token and position embeddings
void test_embeddings() {
GPTConfig config = {VOCAB_SIZE, BLOCK_SIZE, N_LAYER, N_HEAD, N_EMBD};
    GPTModel model = {0}; // Zero-initialize the model structure
    model.config = config; // forward_pass reads dimensions from model.config
    initialize_model(&model, config); // Initialize the model with embeddings
    initialize_attention_matrices(&model, config); // forward_pass also needs the attention matrices
// Create mock input indices (for simplicity, use indices 0 to block_size-1)
int input_indices[BLOCK_SIZE];
for (int i = 0; i < BLOCK_SIZE; ++i) {
input_indices[i] = i;
}
// Allocate memory for the output of the forward pass
float *output = NULL;
// Apply embeddings using the forward pass
forward_pass(&model, input_indices, &output);
// Check if the output contains the correct values
for (int i = 0; i < BLOCK_SIZE; ++i) {
for (int j = 0; j < N_EMBD; ++j) {
float expected_value = model.token_embeddings[input_indices[i]][j] + model.position_embeddings[i][j];
assert(fabs(output[i * N_EMBD + j] - expected_value) < 1e-5);
}
}
// Clean up
free(output);
free_model(&model, config); // This function will need to be implemented to free all allocated memory in the model
}
// Function to free the model
void free_model(GPTModel *model, GPTConfig config) {
// Free token and position embeddings
if (model->token_embeddings != NULL) {
for (int i = 0; i < config.vocab_size; ++i) {
free(model->token_embeddings[i]);
}
free(model->token_embeddings);
model->token_embeddings = NULL;
}
if (model->position_embeddings != NULL) {
for (int i = 0; i < config.block_size; ++i) {
free(model->position_embeddings[i]);
}
free(model->position_embeddings);
model->position_embeddings = NULL;
}
// Free embedding weights
if (model->embedding_weights != NULL) {
for (int i = 0; i < config.vocab_size; ++i) {
free(model->embedding_weights[i]);
}
free(model->embedding_weights);
model->embedding_weights = NULL;
}
// Free layer normalization parameters
if (model->ln_gamma != NULL) {
free(model->ln_gamma);
model->ln_gamma = NULL;
}
if (model->ln_beta != NULL) {
free(model->ln_beta);
model->ln_beta = NULL;
}
// Free MLP weights
if (model->mlp_weights_1 != NULL) {
free(model->mlp_weights_1);
model->mlp_weights_1 = NULL;
}
if (model->mlp_weights_2 != NULL) {
free(model->mlp_weights_2);
model->mlp_weights_2 = NULL;
}
    // Free queries, keys, and values (guard against models whose attention matrices were never allocated)
    if (model->queries != NULL || model->keys != NULL || model->values != NULL) {
        free_attention_matrices(model, config);
    }
// Free query, key, and value weights
if (model->query_weights != NULL) {
for (int i = 0; i < config.n_head; ++i) {
free(model->query_weights[i]);
}
free(model->query_weights);
model->query_weights = NULL;
}
if (model->key_weights != NULL) {
for (int i = 0; i < config.n_head; ++i) {
free(model->key_weights[i]);
}
free(model->key_weights);
model->key_weights = NULL;
}
if (model->value_weights != NULL) {
for (int i = 0; i < config.n_head; ++i) {
free(model->value_weights[i]);
}
free(model->value_weights);
model->value_weights = NULL;
}
// Reset the model configuration to a known state
model->config.vocab_size = 0;
model->config.block_size = 0;
model->config.n_layer = 0;
model->config.n_head = 0;
model->config.n_embd = 0;
}
// Layer normalization function
void layer_normalize(float **inputs, float *gamma, float *beta, int n, int m, float epsilon) {
for (int i = 0; i < n; ++i) {
float sum = 0.0;
for (int j = 0; j < m; ++j) {
sum += inputs[i][j];
}
float mean = sum / m;
float variance_sum = 0.0;
for (int j = 0; j < m; ++j) {
variance_sum += (inputs[i][j] - mean) * (inputs[i][j] - mean);
}
float variance = variance_sum / m;
for (int j = 0; j < m; ++j) {
inputs[i][j] = (inputs[i][j] - mean) / sqrt(variance + epsilon);
inputs[i][j] = inputs[i][j] * gamma[j] + beta[j];
}
}
}
// Unit test for layer normalization
void test_layer_normalize() {
int n = 2; // Number of input vectors
int m = 3; // Number of features
float epsilon = 1e-5;
    float **inputs = (float**)malloc(n * sizeof(float*));
    float *gamma = (float*)malloc(m * sizeof(float));
    float *beta = (float*)malloc(m * sizeof(float));
    // Initialize inputs with simple values, gamma to ones, beta to zeros
    for (int i = 0; i < n; ++i) {
        inputs[i] = (float*)malloc(m * sizeof(float));
        for (int j = 0; j < m; ++j) {
            inputs[i][j] = (float)(i * m + j);
        }
    }
    for (int j = 0; j < m; ++j) {
        gamma[j] = 1.0;
        beta[j] = 0.0;
    }
    // Call layer_normalize
    layer_normalize(inputs, gamma, beta, n, m, epsilon);
    // Check that each normalized row has approximately zero mean
    for (int i = 0; i < n; ++i) {
        float row_sum = 0.0;
        for (int j = 0; j < m; ++j) {
            row_sum += inputs[i][j];
        }
        assert(fabs(row_sum / m) < 1e-3);
    }
// Clean up
free(gamma);
free(beta);
for (int i = 0; i < n; ++i) {
free(inputs[i]);
}
free(inputs);
}
// Add the new unit test to the main function
int main(int argc, char *argv[]) {
printf("Starting main function\n");
GPTConfig config = {VOCAB_SIZE, BLOCK_SIZE, N_LAYER, N_HEAD, N_EMBD};
GPTModel model = {0}; // Zero-initialize the model structure
printf("Calling initialize_model\n");
initialize_model(&model, config);
printf("initialize_model completed\n");
// Commenting out all other tests to isolate test_compute_queries_keys_values
// test_matrix_multiply();
// free_model(&model, config);
// initialize_model(&model, config);
// test_gelu_activation();
// free_model(&model, config);
// initialize_model(&model, config);
// test_initialize_attention_matrices();
// free_model(&model, config);
// initialize_model(&model, config);
// test_embeddings(); // New unit test for embeddings
// free_model(&model, config);
// initialize_model(&model, config);
// test_layer_normalize(); // New unit test for layer normalization
// free_model(&model, config);
printf("Calling test_compute_queries_keys_values\n");
test_compute_queries_keys_values();
printf("test_compute_queries_keys_values completed\n");
free_model(&model, config);
// Rest of the main function...
// ...
return 0;
}