Created
March 7, 2014 21:39
-
-
Save liuliu/9420735 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From 2d2ec7ba797e279a9196d0f6fb409f3781d174bc Mon Sep 17 00:00:00 2001 | |
From: Liu Liu <i@liuliu.me> | |
Date: Sat, 8 Feb 2014 19:25:02 -0500 | |
Subject: for testing with multiple sampling | |
--- | |
lib/cuda/cwc_convnet.cu | 203 +++++++++++++++++++++++++++++++++++++++++++++--- | |
1 file changed, 193 insertions(+), 10 deletions(-) | |
diff --git a/lib/cuda/cwc_convnet.cu b/lib/cuda/cwc_convnet.cu | |
index 094cb7a..017dd0b 100644 | |
--- a/lib/cuda/cwc_convnet.cu | |
+++ b/lib/cuda/cwc_convnet.cu | |
@@ -142,7 +142,7 @@ static void _cwc_convnet_alloc_reserved(ccv_convnet_t* convnet, int batch, ccv_c | |
ccv_convnet_compact(convnet); | |
else if (GPU(convnet)) | |
return; // it is allocated properly, no-op | |
- convnet->reserved = (cwc_convnet_t*)ccmalloc(sizeof(cwc_convnet_t) + sizeof(cwc_convnet_layer_vary_t) * convnet->count + sizeof(ccv_convnet_layer_t) * convnet->count * 3 + sizeof(float*) * convnet->count * 10); | |
+ convnet->reserved = (cwc_convnet_t*)ccmalloc(sizeof(cwc_convnet_t) + sizeof(cwc_convnet_layer_vary_t) * convnet->count + sizeof(ccv_convnet_layer_t) * convnet->count * 3 + sizeof(float*) * convnet->count * 7); | |
GPU(convnet)->batch = batch; | |
GPU(convnet)->layer_params = layer_params; | |
GPU(convnet)->device.memory_usage = 0; | |
@@ -1572,6 +1572,71 @@ static void _cwc_convnet_tests_return(int batch, int count, float* a, int* c, co | |
(batch, count, a, c); | |
} | |
+template <int input_per_thread> | |
+__global__ static void _cwc_kern_convnet_tests_return_10(const int batch, const int count, float* a, int* c) | |
+{ | |
+ int i, j; | |
+ const int thidx = threadIdx.x; | |
+ float max_val = a[thidx * 10]; | |
+ for (j = 1; j < 10; j++) | |
+ max_val += a[thidx * 10 + j]; | |
+ int max_idx = 0; | |
+ for (i = 1; i < count; i++) | |
+ { | |
+ float val = a[i * batch + thidx * 10]; | |
+ for (j = 1; j < 10; j++) | |
+ val += a[i * batch + thidx * 10 + j]; | |
+ if (val > max_val) | |
+ max_val = val, max_idx = i; | |
+ } | |
+ c[thidx] = max_idx; | |
+} | |
+ | |
+template <int input_per_thread> | |
+__global__ static void _cwc_kern_convnet_softmax_10(const int batch, const int count, float* a) | |
+{ | |
+ int i; | |
+ extern float shared[]; | |
+ const int thidx = threadIdx.x; | |
+ float max_val = a[thidx]; | |
+ for (i = 1; i < count; i++) | |
+ { | |
+ shared[thidx] = a[i * batch + thidx]; | |
+ if (shared[thidx] > max_val) | |
+ max_val = shared[thidx]; | |
+ } | |
+ float val = 0; | |
+ for (i = 0; i < count; i++) | |
+ { | |
+ shared[thidx] = a[i * batch + thidx]; | |
+ val += (shared[thidx] = expf(shared[thidx] - max_val)); | |
+ a[i * batch + thidx] = shared[thidx]; | |
+ } | |
+ val = 1.0 / val; | |
+ for (i = 0; i < count; i++) | |
+ a[i * batch + thidx] *= val; | |
+} | |
+ | |
+static void _cwc_convnet_tests_return_10(int batch, int count, float* a, int* c, const cudaStream_t& stream) | |
+{ | |
+ dim3 num_blocks_sm(1); | |
+ dim3 threads_per_block_sm(batch / 10 * 10); | |
+ assert(threads_per_block_sm.x <= 1024); | |
+ int shared_memory_size = sizeof(float) * (batch / 10 * 10); | |
+ _cwc_kern_convnet_softmax_10 | |
+ <1> | |
+ <<<num_blocks_sm, threads_per_block_sm, shared_memory_size, stream>>> | |
+ (batch, count, a); | |
+ // because we want to average the output through different inputs, softmax output first | |
+ dim3 num_blocks(1); | |
+ dim3 threads_per_block(batch / 10); | |
+ assert(threads_per_block.x <= 1024); | |
+ _cwc_kern_convnet_tests_return_10 | |
+ <1> | |
+ <<<num_blocks, threads_per_block, 0, stream>>> | |
+ (batch, count, a, c); | |
+} | |
+ | |
template <int momentum_read> | |
__global__ static void _cwc_kern_net_sgd(float* a, float* grad, float* momentum, | |
const int count, | |
@@ -1670,6 +1735,120 @@ static void _cwc_convnet_net_sgd(ccv_convnet_t* convnet, int momentum_read, int | |
} | |
} | |
+static void _cwc_convnet_batch_formation_10(gsl_rng* rng, ccv_array_t* categorizeds, ccv_dense_matrix_t* mean_activity, ccv_dense_matrix_t* eigenvectors, ccv_dense_matrix_t* eigenvalues, float color_gain, int* idx, ccv_size_t dim, int rows, int cols, int channels, int symmetric, int batch, int offset, int size, float* b, int* c) | |
+{ | |
+ int i, k, x; | |
+ assert(size <= batch); | |
+ for (i = 0; i < size; i++) | |
+ { | |
+ ccv_categorized_t* categorized = (ccv_categorized_t*)ccv_array_get(categorizeds, idx ? idx[offset + i] : offset + i); | |
+ if (c) | |
+ c[i] = categorized->c; | |
+ switch (categorized->type) | |
+ { | |
+ case CCV_CATEGORIZED_DENSE_MATRIX: | |
+ assert(rows == categorized->matrix->rows && cols == categorized->matrix->cols && channels == CCV_GET_CHANNEL(categorized->matrix->type)); | |
+ for (k = 0; k < channels; k++) | |
+ for (x = 0; x < rows * cols; x++) | |
+ b[(k * rows * cols + x) * batch + i] = categorized->matrix->data.f32[x * channels + k]; | |
+ break; | |
+ case CCV_CATEGORIZED_FILE: | |
+ { | |
+ ccv_dense_matrix_t* image = 0; | |
+ ccv_read(categorized->file.filename, &image, CCV_IO_ANY_FILE | CCV_IO_RGB_COLOR); | |
+ if (image) | |
+ { | |
+ // 1 | |
+ ccv_dense_matrix_t* input = 0; | |
+ ccv_slice(image, (ccv_matrix_t**)&input, CCV_32F, 0, 0, dim.height, dim.width); | |
+ ccv_subtract(input, mean_activity, (ccv_matrix_t**)&input, 0); | |
+ ccv_dense_matrix_t* patch = 0; | |
+ ccv_slice(input, (ccv_matrix_t**)&patch, CCV_32F, 0, 0, rows, cols); | |
+ for (k = 0; k < channels; k++) | |
+ for (x = 0; x < rows * cols; x++) | |
+ b[(k * rows * cols + x) * batch + i * 10] = patch->data.f32[x * channels + k]; | |
+ // 2 | |
+ ccv_flip(patch, &patch, 0, CCV_FLIP_X); | |
+ for (k = 0; k < channels; k++) | |
+ for (x = 0; x < rows * cols; x++) | |
+ b[(k * rows * cols + x) * batch + i * 10 + 1] = patch->data.f32[x * channels + k]; | |
+ ccv_matrix_free(patch); | |
+ ccv_matrix_free(input); | |
+ // 3 | |
+ input = 0; | |
+ ccv_slice(image, (ccv_matrix_t**)&input, CCV_32F, image->rows - dim.height, 0, dim.height, dim.width); | |
+ ccv_subtract(input, mean_activity, (ccv_matrix_t**)&input, 0); | |
+ patch = 0; | |
+ ccv_slice(input, (ccv_matrix_t**)&patch, CCV_32F, input->rows - rows, 0, rows, cols); | |
+ for (k = 0; k < channels; k++) | |
+ for (x = 0; x < rows * cols; x++) | |
+ b[(k * rows * cols + x) * batch + i * 10 + 2] = patch->data.f32[x * channels + k]; | |
+ // 4 | |
+ ccv_flip(patch, &patch, 0, CCV_FLIP_X); | |
+ for (k = 0; k < channels; k++) | |
+ for (x = 0; x < rows * cols; x++) | |
+ b[(k * rows * cols + x) * batch + i * 10 + 3] = patch->data.f32[x * channels + k]; | |
+ ccv_matrix_free(patch); | |
+ ccv_matrix_free(input); | |
+ // 5 | |
+ input = 0; | |
+ ccv_slice(image, (ccv_matrix_t**)&input, CCV_32F, 0, image->cols - dim.width, dim.height, dim.width); | |
+ ccv_subtract(input, mean_activity, (ccv_matrix_t**)&input, 0); | |
+ patch = 0; | |
+ ccv_slice(input, (ccv_matrix_t**)&patch, CCV_32F, 0, input->cols - cols, rows, cols); | |
+ for (k = 0; k < channels; k++) | |
+ for (x = 0; x < rows * cols; x++) | |
+ b[(k * rows * cols + x) * batch + i * 10 + 4] = patch->data.f32[x * channels + k]; | |
+ // 6 | |
+ ccv_flip(patch, &patch, 0, CCV_FLIP_X); | |
+ for (k = 0; k < channels; k++) | |
+ for (x = 0; x < rows * cols; x++) | |
+ b[(k * rows * cols + x) * batch + i * 10 + 5] = patch->data.f32[x * channels + k]; | |
+ ccv_matrix_free(patch); | |
+ ccv_matrix_free(input); | |
+ // 7 | |
+ input = 0; | |
+ ccv_slice(image, (ccv_matrix_t**)&input, CCV_32F, image->rows - dim.height, image->cols - dim.width, dim.height, dim.width); | |
+ ccv_subtract(input, mean_activity, (ccv_matrix_t**)&input, 0); | |
+ patch = 0; | |
+ ccv_slice(input, (ccv_matrix_t**)&patch, CCV_32F, input->rows - rows, input->cols - cols, rows, cols); | |
+ for (k = 0; k < channels; k++) | |
+ for (x = 0; x < rows * cols; x++) | |
+ b[(k * rows * cols + x) * batch + i * 10 + 6] = patch->data.f32[x * channels + k]; | |
+ // 8 | |
+ ccv_flip(patch, &patch, 0, CCV_FLIP_X); | |
+ for (k = 0; k < channels; k++) | |
+ for (x = 0; x < rows * cols; x++) | |
+ b[(k * rows * cols + x) * batch + i * 10 + 7] = patch->data.f32[x * channels + k]; | |
+ ccv_matrix_free(patch); | |
+ ccv_matrix_free(input); | |
+ // 9 | |
+ input = 0; | |
+ ccv_slice(image, (ccv_matrix_t**)&input, CCV_32F, (image->rows - dim.height) / 2, (image->cols - dim.width) / 2, dim.height, dim.width); | |
+ ccv_subtract(input, mean_activity, (ccv_matrix_t**)&input, 0); | |
+ patch = 0; | |
+ ccv_slice(input, (ccv_matrix_t**)&patch, CCV_32F, (input->rows - rows) / 2, (input->cols - cols) / 2, rows, cols); | |
+ for (k = 0; k < channels; k++) | |
+ for (x = 0; x < rows * cols; x++) | |
+ b[(k * rows * cols + x) * batch + i * 10 + 8] = patch->data.f32[x * channels + k]; | |
+ // 10 | |
+ ccv_flip(patch, &patch, 0, CCV_FLIP_X); | |
+ for (k = 0; k < channels; k++) | |
+ for (x = 0; x < rows * cols; x++) | |
+ b[(k * rows * cols + x) * batch + i * 10 + 9] = patch->data.f32[x * channels + k]; | |
+ ccv_matrix_free(patch); | |
+ ccv_matrix_free(input); | |
+ // we loaded it in, deallocate it now | |
+ if (categorized->type != CCV_CATEGORIZED_DENSE_MATRIX) | |
+ ccv_matrix_free(image); | |
+ } else | |
+ printf("cannot load %s.\n", categorized->file.filename); | |
+ break; | |
+ } | |
+ } | |
+ } | |
+} | |
+ | |
static void _cwc_convnet_batch_formation(gsl_rng* rng, ccv_array_t* categorizeds, ccv_dense_matrix_t* mean_activity, ccv_dense_matrix_t* eigenvectors, ccv_dense_matrix_t* eigenvalues, float color_gain, int* idx, ccv_size_t dim, int rows, int cols, int channels, int symmetric, int batch, int offset, int size, float* b, int* c) | |
{ | |
int i, k, x; | |
@@ -2282,7 +2461,7 @@ void cwc_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categoriz | |
z.eigenvectors = 0; | |
z.eigenvalues = 0; | |
z.line_no = 0; | |
- int miss; | |
+ int miss, test_batch; | |
float elapsed_time; | |
ccv_function_state_begin(_cwc_convnet_supervised_train_function_state_read, z, filename); | |
_cwc_convnet_mean_formation(categorizeds, z.convnet->input, z.convnet->channels, params.symmetric, &z.convnet->mean_activity); | |
@@ -2343,12 +2522,15 @@ void cwc_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categoriz | |
cudaDeviceSynchronize(); // synchronize at this point | |
// using context-1's cublas handle because we will wait this handle to finish when the copy to context-0 is required in testing | |
_cwc_convnet_dor_mean_net(z.convnet, params.layer_params, GPU(z.convnet)->contexts[1].device.cublas); | |
+ cudaDeviceSynchronize(); // synchronize at this point | |
+ ccv_function_state_resume(_cwc_convnet_supervised_train_function_state_write, z, filename); | |
// run tests | |
miss = 0; | |
- for (i = j = 0; i < tests->rnum; i += params.mini_batch, j++) | |
+ test_batch = params.mini_batch / 10; | |
+ for (i = j = 0; i < tests->rnum; i += test_batch, j++) | |
{ | |
cwc_convnet_context_t* context = GPU(z.convnet)->contexts + (j % 2); | |
- _cwc_convnet_batch_formation(0, tests, z.convnet->mean_activity, 0, 0, 0, 0, z.convnet->input, z.convnet->rows, z.convnet->cols, z.convnet->channels, params.symmetric, params.mini_batch, i, ccv_min(params.mini_batch, tests->rnum - i), context->host.input, 0); | |
+ _cwc_convnet_batch_formation_10(0, tests, z.convnet->mean_activity, 0, 0, 0, 0, z.convnet->input, z.convnet->rows, z.convnet->cols, z.convnet->channels, params.symmetric, params.mini_batch, i, ccv_min(test_batch, tests->rnum - i), context->host.input, 0); | |
cudaMemcpyAsync(context->device.input, context->host.input, sizeof(float) * z.convnet->rows * z.convnet->cols * z.convnet->channels * params.mini_batch, cudaMemcpyHostToDevice, context->device.stream); | |
assert(cudaGetLastError() == cudaSuccess); | |
if (j > 0) | |
@@ -2358,27 +2540,27 @@ void cwc_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categoriz | |
assert(cudaGetLastError() == cudaSuccess); | |
if (j > 0) // we have another result, pull these | |
{ | |
- for (k = 0; k < params.mini_batch; k++) | |
+ for (k = 0; k < test_batch; k++) | |
{ | |
- ccv_categorized_t* test = (ccv_categorized_t*)ccv_array_get(tests, k + i - params.mini_batch); | |
+ ccv_categorized_t* test = (ccv_categorized_t*)ccv_array_get(tests, k + i - test_batch); | |
if (test->c != test_returns[(j + 1) % 2].host[k]) | |
++miss; | |
} | |
cudaEventElapsedTime(&elapsed_time, iteration, stop); | |
- FLUSH(" - at epoch %03d / %d => with miss rate %.2f%% at %d / %d (%.3f sec)", z.t + 1, params.max_epoch, miss * 100.0f / i, j + 1, (tests->rnum + params.mini_batch - 1) / params.mini_batch, elapsed_time / 1000); | |
+ FLUSH(" - at epoch %03d / %d => with miss rate %.2f%% at %d / %d (%.3f sec)", z.t + 1, params.max_epoch, miss * 100.0f / i, j + 1, (tests->rnum + test_batch - 1) / test_batch, elapsed_time / 1000); | |
} | |
cudaEventRecord(iteration, context->device.stream); | |
_cwc_convnet_encode_impl(z.convnet, context->device.input, params.mini_batch, 0, context); | |
assert(cudaGetLastError() == cudaSuccess); | |
- _cwc_convnet_tests_return(params.mini_batch, category_count, GPU(z.convnet)->forwards[z.convnet->count - 1], test_returns[j % 2].device, context->device.stream); | |
+ _cwc_convnet_tests_return_10(params.mini_batch, category_count, GPU(z.convnet)->forwards[z.convnet->count - 1], test_returns[j % 2].device, context->device.stream); | |
assert(cudaGetLastError() == cudaSuccess); | |
cudaMemcpyAsync(test_returns[j % 2].host, test_returns[j % 2].device, sizeof(int) * params.mini_batch, cudaMemcpyDeviceToHost, context->device.stream); | |
assert(cudaGetLastError() == cudaSuccess); | |
} | |
cudaDeviceSynchronize(); // synchronize at this point | |
- for (i = 0; i <= (tests->rnum - 1) % params.mini_batch; i++) | |
+ for (i = 0; i <= (tests->rnum - 1) % test_batch; i++) | |
{ | |
- ccv_categorized_t* test = (ccv_categorized_t*)ccv_array_get(tests, i + (tests->rnum - 1) / params.mini_batch * params.mini_batch); | |
+ ccv_categorized_t* test = (ccv_categorized_t*)ccv_array_get(tests, i + (tests->rnum - 1) / test_batch * test_batch); | |
if (test->c != test_returns[(j + 1) % 2].host[i]) | |
++miss; | |
} | |
@@ -2387,6 +2569,7 @@ void cwc_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categoriz | |
elapsed_time = 0; | |
cudaEventElapsedTime(&elapsed_time, start, stop); | |
FLUSH(" - at epoch %03d / %d (%03d - %d) => with miss rate %.2f%% (%.3f sec)\n", z.t + 1, params.max_epoch, z.i + 1, ccv_min(z.i + params.iterations, aligned_batches), miss * 100.0f / tests->rnum, elapsed_time / 1000); | |
+ exit(0); | |
ccv_function_state_resume(_cwc_convnet_supervised_train_function_state_write, z, filename); | |
} | |
if (z.t + 1 < params.max_epoch) | |
-- | |
1.8.1.2 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment