From 2d2ec7ba797e279a9196d0f6fb409f3781d174bc Mon Sep 17 00:00:00 2001
From: Liu Liu <i@liuliu.me>
Date: Sat, 8 Feb 2014 19:25:02 -0500
Subject: for testing with multiple sampling
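
Oversample each test image into 10 patches (the four corner crops and the
center crop, each followed by its horizontal mirror), run all 10 through
the net in one batch, and average the softmax outputs over the patches
before taking the argmax. The test loop therefore walks the test set in
steps of mini_batch / 10 and exits after the first evaluation pass.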
---
lib/cuda/cwc_convnet.cu | 203 +++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 193 insertions(+), 10 deletions(-)
diff --git a/lib/cuda/cwc_convnet.cu b/lib/cuda/cwc_convnet.cu
index 094cb7a..017dd0b 100644
--- a/lib/cuda/cwc_convnet.cu
+++ b/lib/cuda/cwc_convnet.cu
@@ -142,7 +142,7 @@ static void _cwc_convnet_alloc_reserved(ccv_convnet_t* convnet, int batch, ccv_c
ccv_convnet_compact(convnet);
else if (GPU(convnet))
return; // it is allocated properly, no-op
- convnet->reserved = (cwc_convnet_t*)ccmalloc(sizeof(cwc_convnet_t) + sizeof(cwc_convnet_layer_vary_t) * convnet->count + sizeof(ccv_convnet_layer_t) * convnet->count * 3 + sizeof(float*) * convnet->count * 10);
+ convnet->reserved = (cwc_convnet_t*)ccmalloc(sizeof(cwc_convnet_t) + sizeof(cwc_convnet_layer_vary_t) * convnet->count + sizeof(ccv_convnet_layer_t) * convnet->count * 3 + sizeof(float*) * convnet->count * 7);
GPU(convnet)->batch = batch;
GPU(convnet)->layer_params = layer_params;
GPU(convnet)->device.memory_usage = 0;
@@ -1572,6 +1572,71 @@ static void _cwc_convnet_tests_return(int batch, int count, float* a, int* c, co
(batch, count, a, c);
}
+template <int input_per_thread>
+__global__ static void _cwc_kern_convnet_tests_return_10(const int batch, const int count, float* a, int* c)
+{
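+ // one thread per test image: sum the 10 per-crop softmax outputs for each class (proportional to their average) and report the argmax class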
+ int i, j;
+ const int thidx = threadIdx.x;
+ float max_val = a[thidx * 10];
+ for (j = 1; j < 10; j++)
+ max_val += a[thidx * 10 + j];
+ int max_idx = 0;
+ for (i = 1; i < count; i++)
+ {
+ float val = a[i * batch + thidx * 10];
+ for (j = 1; j < 10; j++)
+ val += a[i * batch + thidx * 10 + j];
+ if (val > max_val)
+ max_val = val, max_idx = i;
+ }
+ c[thidx] = max_idx;
+}
+
+template <int input_per_thread>
+__global__ static void _cwc_kern_convnet_softmax_10(const int batch, const int count, float* a)
+{
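+ // numerically stable softmax down each column: subtract the running max over the count classes, exponentiate, then normalize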
+ int i;
+ extern __shared__ float shared[];
+ const int thidx = threadIdx.x;
+ float max_val = a[thidx];
+ for (i = 1; i < count; i++)
+ {
+ shared[thidx] = a[i * batch + thidx];
+ if (shared[thidx] > max_val)
+ max_val = shared[thidx];
+ }
+ float val = 0;
+ for (i = 0; i < count; i++)
+ {
+ shared[thidx] = a[i * batch + thidx];
+ val += (shared[thidx] = expf(shared[thidx] - max_val));
+ a[i * batch + thidx] = shared[thidx];
+ }
+ val = 1.0f / val;
+ for (i = 0; i < count; i++)
+ a[i * batch + thidx] *= val;
+}
+
+static void _cwc_convnet_tests_return_10(int batch, int count, float* a, int* c, const cudaStream_t& stream)
+{
+ dim3 num_blocks_sm(1);
+ dim3 threads_per_block_sm(batch / 10 * 10);
+ assert(threads_per_block_sm.x <= 1024);
+ int shared_memory_size = sizeof(float) * (batch / 10 * 10);
+ _cwc_kern_convnet_softmax_10
+ <1>
+ <<<num_blocks_sm, threads_per_block_sm, shared_memory_size, stream>>>
+ (batch, count, a);
+ // softmax first, because we want to average the normalized outputs over the 10 crops of each input
+ dim3 num_blocks(1);
+ dim3 threads_per_block(batch / 10);
+ assert(threads_per_block.x <= 1024);
+ _cwc_kern_convnet_tests_return_10
+ <1>
+ <<<num_blocks, threads_per_block, 0, stream>>>
+ (batch, count, a, c);
+}
+
template <int momentum_read>
__global__ static void _cwc_kern_net_sgd(float* a, float* grad, float* momentum,
const int count,
@@ -1670,6 +1735,120 @@ static void _cwc_convnet_net_sgd(ccv_convnet_t* convnet, int momentum_read, int
}
}
+static void _cwc_convnet_batch_formation_10(gsl_rng* rng, ccv_array_t* categorizeds, ccv_dense_matrix_t* mean_activity, ccv_dense_matrix_t* eigenvectors, ccv_dense_matrix_t* eigenvalues, float color_gain, int* idx, ccv_size_t dim, int rows, int cols, int channels, int symmetric, int batch, int offset, int size, float* b, int* c)
+{
+ int i, k, x;
+ assert(size <= batch);
+ for (i = 0; i < size; i++)
+ {
+ ccv_categorized_t* categorized = (ccv_categorized_t*)ccv_array_get(categorizeds, idx ? idx[offset + i] : offset + i);
+ if (c)
+ c[i] = categorized->c;
+ switch (categorized->type)
+ {
+ case CCV_CATEGORIZED_DENSE_MATRIX:
+ assert(rows == categorized->matrix->rows && cols == categorized->matrix->cols && channels == CCV_GET_CHANNEL(categorized->matrix->type));
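+ // note: the pre-decoded path writes a single copy at column i; the other 9 crop slots for this image are left untouched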
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i] = categorized->matrix->data.f32[x * channels + k];
+ break;
+ case CCV_CATEGORIZED_FILE:
+ {
+ ccv_dense_matrix_t* image = 0;
+ ccv_read(categorized->file.filename, &image, CCV_IO_ANY_FILE | CCV_IO_RGB_COLOR);
+ if (image)
+ {
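+ // extract 10 patches: the four corner crops and the center crop, each followed by its horizontal mirror, packed at columns i * 10 .. i * 10 + 9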
+ // 1
+ ccv_dense_matrix_t* input = 0;
+ ccv_slice(image, (ccv_matrix_t**)&input, CCV_32F, 0, 0, dim.height, dim.width);
+ ccv_subtract(input, mean_activity, (ccv_matrix_t**)&input, 0);
+ ccv_dense_matrix_t* patch = 0;
+ ccv_slice(input, (ccv_matrix_t**)&patch, CCV_32F, 0, 0, rows, cols);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10] = patch->data.f32[x * channels + k];
+ // 2
+ ccv_flip(patch, &patch, 0, CCV_FLIP_X);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10 + 1] = patch->data.f32[x * channels + k];
+ ccv_matrix_free(patch);
+ ccv_matrix_free(input);
+ // 3
+ input = 0;
+ ccv_slice(image, (ccv_matrix_t**)&input, CCV_32F, image->rows - dim.height, 0, dim.height, dim.width);
+ ccv_subtract(input, mean_activity, (ccv_matrix_t**)&input, 0);
+ patch = 0;
+ ccv_slice(input, (ccv_matrix_t**)&patch, CCV_32F, input->rows - rows, 0, rows, cols);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10 + 2] = patch->data.f32[x * channels + k];
+ // 4
+ ccv_flip(patch, &patch, 0, CCV_FLIP_X);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10 + 3] = patch->data.f32[x * channels + k];
+ ccv_matrix_free(patch);
+ ccv_matrix_free(input);
+ // 5
+ input = 0;
+ ccv_slice(image, (ccv_matrix_t**)&input, CCV_32F, 0, image->cols - dim.width, dim.height, dim.width);
+ ccv_subtract(input, mean_activity, (ccv_matrix_t**)&input, 0);
+ patch = 0;
+ ccv_slice(input, (ccv_matrix_t**)&patch, CCV_32F, 0, input->cols - cols, rows, cols);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10 + 4] = patch->data.f32[x * channels + k];
+ // 6
+ ccv_flip(patch, &patch, 0, CCV_FLIP_X);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10 + 5] = patch->data.f32[x * channels + k];
+ ccv_matrix_free(patch);
+ ccv_matrix_free(input);
+ // 7
+ input = 0;
+ ccv_slice(image, (ccv_matrix_t**)&input, CCV_32F, image->rows - dim.height, image->cols - dim.width, dim.height, dim.width);
+ ccv_subtract(input, mean_activity, (ccv_matrix_t**)&input, 0);
+ patch = 0;
+ ccv_slice(input, (ccv_matrix_t**)&patch, CCV_32F, input->rows - rows, input->cols - cols, rows, cols);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10 + 6] = patch->data.f32[x * channels + k];
+ // 8
+ ccv_flip(patch, &patch, 0, CCV_FLIP_X);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10 + 7] = patch->data.f32[x * channels + k];
+ ccv_matrix_free(patch);
+ ccv_matrix_free(input);
+ // 9
+ input = 0;
+ ccv_slice(image, (ccv_matrix_t**)&input, CCV_32F, (image->rows - dim.height) / 2, (image->cols - dim.width) / 2, dim.height, dim.width);
+ ccv_subtract(input, mean_activity, (ccv_matrix_t**)&input, 0);
+ patch = 0;
+ ccv_slice(input, (ccv_matrix_t**)&patch, CCV_32F, (input->rows - rows) / 2, (input->cols - cols) / 2, rows, cols);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10 + 8] = patch->data.f32[x * channels + k];
+ // 10
+ ccv_flip(patch, &patch, 0, CCV_FLIP_X);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10 + 9] = patch->data.f32[x * channels + k];
+ ccv_matrix_free(patch);
+ ccv_matrix_free(input);
+ // we loaded it in, deallocate it now
+ if (categorized->type != CCV_CATEGORIZED_DENSE_MATRIX)
+ ccv_matrix_free(image);
+ } else
+ printf("cannot load %s.\n", categorized->file.filename);
+ break;
+ }
+ }
+ }
+}
+
static void _cwc_convnet_batch_formation(gsl_rng* rng, ccv_array_t* categorizeds, ccv_dense_matrix_t* mean_activity, ccv_dense_matrix_t* eigenvectors, ccv_dense_matrix_t* eigenvalues, float color_gain, int* idx, ccv_size_t dim, int rows, int cols, int channels, int symmetric, int batch, int offset, int size, float* b, int* c)
{
int i, k, x;
@@ -2282,7 +2461,7 @@ void cwc_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categoriz
z.eigenvectors = 0;
z.eigenvalues = 0;
z.line_no = 0;
- int miss;
+ int miss, test_batch;
float elapsed_time;
ccv_function_state_begin(_cwc_convnet_supervised_train_function_state_read, z, filename);
_cwc_convnet_mean_formation(categorizeds, z.convnet->input, z.convnet->channels, params.symmetric, &z.convnet->mean_activity);
@@ -2343,12 +2522,15 @@ void cwc_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categoriz
cudaDeviceSynchronize(); // synchronize at this point
// using context-1's cublas handle because we will wait this handle to finish when the copy to context-0 is required in testing
_cwc_convnet_dor_mean_net(z.convnet, params.layer_params, GPU(z.convnet)->contexts[1].device.cublas);
+ cudaDeviceSynchronize(); // synchronize at this point
+ ccv_function_state_resume(_cwc_convnet_supervised_train_function_state_write, z, filename);
// run tests
miss = 0;
- for (i = j = 0; i < tests->rnum; i += params.mini_batch, j++)
+ test_batch = params.mini_batch / 10;
+ for (i = j = 0; i < tests->rnum; i += test_batch, j++)
{
cwc_convnet_context_t* context = GPU(z.convnet)->contexts + (j % 2);
- _cwc_convnet_batch_formation(0, tests, z.convnet->mean_activity, 0, 0, 0, 0, z.convnet->input, z.convnet->rows, z.convnet->cols, z.convnet->channels, params.symmetric, params.mini_batch, i, ccv_min(params.mini_batch, tests->rnum - i), context->host.input, 0);
+ _cwc_convnet_batch_formation_10(0, tests, z.convnet->mean_activity, 0, 0, 0, 0, z.convnet->input, z.convnet->rows, z.convnet->cols, z.convnet->channels, params.symmetric, params.mini_batch, i, ccv_min(test_batch, tests->rnum - i), context->host.input, 0);
cudaMemcpyAsync(context->device.input, context->host.input, sizeof(float) * z.convnet->rows * z.convnet->cols * z.convnet->channels * params.mini_batch, cudaMemcpyHostToDevice, context->device.stream);
assert(cudaGetLastError() == cudaSuccess);
if (j > 0)
@@ -2358,27 +2540,27 @@ void cwc_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categoriz
assert(cudaGetLastError() == cudaSuccess);
if (j > 0) // we have another result, pull these
{
- for (k = 0; k < params.mini_batch; k++)
+ for (k = 0; k < test_batch; k++)
{
- ccv_categorized_t* test = (ccv_categorized_t*)ccv_array_get(tests, k + i - params.mini_batch);
+ ccv_categorized_t* test = (ccv_categorized_t*)ccv_array_get(tests, k + i - test_batch);
if (test->c != test_returns[(j + 1) % 2].host[k])
++miss;
}
cudaEventElapsedTime(&elapsed_time, iteration, stop);
- FLUSH(" - at epoch %03d / %d => with miss rate %.2f%% at %d / %d (%.3f sec)", z.t + 1, params.max_epoch, miss * 100.0f / i, j + 1, (tests->rnum + params.mini_batch - 1) / params.mini_batch, elapsed_time / 1000);
+ FLUSH(" - at epoch %03d / %d => with miss rate %.2f%% at %d / %d (%.3f sec)", z.t + 1, params.max_epoch, miss * 100.0f / i, j + 1, (tests->rnum + test_batch - 1) / test_batch, elapsed_time / 1000);
}
cudaEventRecord(iteration, context->device.stream);
_cwc_convnet_encode_impl(z.convnet, context->device.input, params.mini_batch, 0, context);
assert(cudaGetLastError() == cudaSuccess);
- _cwc_convnet_tests_return(params.mini_batch, category_count, GPU(z.convnet)->forwards[z.convnet->count - 1], test_returns[j % 2].device, context->device.stream);
+ _cwc_convnet_tests_return_10(params.mini_batch, category_count, GPU(z.convnet)->forwards[z.convnet->count - 1], test_returns[j % 2].device, context->device.stream);
assert(cudaGetLastError() == cudaSuccess);
cudaMemcpyAsync(test_returns[j % 2].host, test_returns[j % 2].device, sizeof(int) * params.mini_batch, cudaMemcpyDeviceToHost, context->device.stream);
assert(cudaGetLastError() == cudaSuccess);
}
cudaDeviceSynchronize(); // synchronize at this point
- for (i = 0; i <= (tests->rnum - 1) % params.mini_batch; i++)
+ for (i = 0; i <= (tests->rnum - 1) % test_batch; i++)
{
- ccv_categorized_t* test = (ccv_categorized_t*)ccv_array_get(tests, i + (tests->rnum - 1) / params.mini_batch * params.mini_batch);
+ ccv_categorized_t* test = (ccv_categorized_t*)ccv_array_get(tests, i + (tests->rnum - 1) / test_batch * test_batch);
if (test->c != test_returns[(j + 1) % 2].host[i])
++miss;
}
@@ -2387,6 +2569,7 @@ void cwc_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categoriz
elapsed_time = 0;
cudaEventElapsedTime(&elapsed_time, start, stop);
FLUSH(" - at epoch %03d / %d (%03d - %d) => with miss rate %.2f%% (%.3f sec)\n", z.t + 1, params.max_epoch, z.i + 1, ccv_min(z.i + params.iterations, aligned_batches), miss * 100.0f / tests->rnum, elapsed_time / 1000);
+ exit(0);
ccv_function_state_resume(_cwc_convnet_supervised_train_function_state_write, z, filename);
}
if (z.t + 1 < params.max_epoch)
--
1.8.1.2
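
For reference, a minimal host-side sketch (not part of the patch) of what the two kernels _cwc_kern_convnet_softmax_10 and _cwc_kern_convnet_tests_return_10 compute together; predict_10crop and its parameters are illustrative names, and a is assumed to hold count class scores for batch columns, with each test image owning 10 consecutive columns:

#include <math.h>

/* Stable softmax down each of an image's 10 crop columns, then argmax of
 * the class scores summed over the crops. The sum is 10x the average, so
 * the argmax matches that of the averaged prediction. */
static int predict_10crop(float* a, int batch, int count, int image)
{
	int i, j;
	for (j = 0; j < 10; j++)
	{
		int col = image * 10 + j;
		/* max over classes for this column */
		float max_val = a[col];
		for (i = 1; i < count; i++)
			if (a[i * batch + col] > max_val)
				max_val = a[i * batch + col];
		/* exponentiate and normalize in place */
		float sum = 0;
		for (i = 0; i < count; i++)
			sum += (a[i * batch + col] = expf(a[i * batch + col] - max_val));
		for (i = 0; i < count; i++)
			a[i * batch + col] /= sum;
	}
	/* argmax of per-class scores summed over the 10 crops */
	int max_idx = 0;
	float best = 0;
	for (i = 0; i < count; i++)
	{
		float val = 0;
		for (j = 0; j < 10; j++)
			val += a[i * batch + image * 10 + j];
		if (i == 0 || val > best)
		{
			best = val;
			max_idx = i;
		}
	}
	return max_idx;
}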