From 2d2ec7ba797e279a9196d0f6fb409f3781d174bc Mon Sep 17 00:00:00 2001
From: Liu Liu <i@liuliu.me>
Date: Sat, 8 Feb 2014 19:25:02 -0500
Subject: for testing with multiple sampling
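
Oversample each test image into 10 patches (the four corner crops and the
center crop, each followed by its horizontal mirror), run all 10 through
the net in one batch, and average the softmax outputs over the patches
before taking the argmax. The test loop therefore walks the test set in
steps of mini_batch / 10 and exits after the first evaluation pass.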
---
lib/cuda/cwc_convnet.cu | 203 +++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 193 insertions(+), 10 deletions(-)
diff --git a/lib/cuda/cwc_convnet.cu b/lib/cuda/cwc_convnet.cu
index 094cb7a..017dd0b 100644
--- a/lib/cuda/cwc_convnet.cu
+++ b/lib/cuda/cwc_convnet.cu
@@ -142,7 +142,7 @@ static void _cwc_convnet_alloc_reserved(ccv_convnet_t* convnet, int batch, ccv_c
ccv_convnet_compact(convnet);
else if (GPU(convnet))
return; // it is allocated properly, no-op
- convnet->reserved = (cwc_convnet_t*)ccmalloc(sizeof(cwc_convnet_t) + sizeof(cwc_convnet_layer_vary_t) * convnet->count + sizeof(ccv_convnet_layer_t) * convnet->count * 3 + sizeof(float*) * convnet->count * 10);
+ convnet->reserved = (cwc_convnet_t*)ccmalloc(sizeof(cwc_convnet_t) + sizeof(cwc_convnet_layer_vary_t) * convnet->count + sizeof(ccv_convnet_layer_t) * convnet->count * 3 + sizeof(float*) * convnet->count * 7);
GPU(convnet)->batch = batch;
GPU(convnet)->layer_params = layer_params;
GPU(convnet)->device.memory_usage = 0;
@@ -1572,6 +1572,71 @@ static void _cwc_convnet_tests_return(int batch, int count, float* a, int* c, co
(batch, count, a, c);
}
+template <int input_per_thread>
+__global__ static void _cwc_kern_convnet_tests_return_10(const int batch, const int count, float* a, int* c)
+{
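+ // one thread per test image: sum the 10 per-crop softmax outputs for each class (proportional to their average) and report the argmax class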
+ int i, j;
+ const int thidx = threadIdx.x;
+ float max_val = a[thidx * 10];
+ for (j = 1; j < 10; j++)
+ max_val += a[thidx * 10 + j];
+ int max_idx = 0;
+ for (i = 1; i < count; i++)
+ {
+ float val = a[i * batch + thidx * 10];
+ for (j = 1; j < 10; j++)
+ val += a[i * batch + thidx * 10 + j];
+ if (val > max_val)
+ max_val = val, max_idx = i;
+ }
+ c[thidx] = max_idx;
+}
+
+template <int input_per_thread>
+__global__ static void _cwc_kern_convnet_softmax_10(const int batch, const int count, float* a)
+{
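+ // numerically stable softmax down each column: subtract the running max over the count classes, exponentiate, then normalize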
+ int i;
+ extern __shared__ float shared[];
+ const int thidx = threadIdx.x;
+ float max_val = a[thidx];
+ for (i = 1; i < count; i++)
+ {
+ shared[thidx] = a[i * batch + thidx];
+ if (shared[thidx] > max_val)
+ max_val = shared[thidx];
+ }
+ float val = 0;
+ for (i = 0; i < count; i++)
+ {
+ shared[thidx] = a[i * batch + thidx];
+ val += (shared[thidx] = expf(shared[thidx] - max_val));
+ a[i * batch + thidx] = shared[thidx];
+ }
+ val = 1.0f / val;
+ for (i = 0; i < count; i++)
+ a[i * batch + thidx] *= val;
+}
+
+static void _cwc_convnet_tests_return_10(int batch, int count, float* a, int* c, const cudaStream_t& stream)
+{
+ dim3 num_blocks_sm(1);
+ dim3 threads_per_block_sm(batch / 10 * 10);
+ assert(threads_per_block_sm.x <= 1024);
+ int shared_memory_size = sizeof(float) * (batch / 10 * 10);
+ _cwc_kern_convnet_softmax_10
+ <1>
+ <<<num_blocks_sm, threads_per_block_sm, shared_memory_size, stream>>>
+ (batch, count, a);
+ // softmax first, because we want to average the normalized outputs over the 10 crops of each input
+ dim3 num_blocks(1);
+ dim3 threads_per_block(batch / 10);
+ assert(threads_per_block.x <= 1024);
+ _cwc_kern_convnet_tests_return_10
+ <1>
+ <<<num_blocks, threads_per_block, 0, stream>>>
+ (batch, count, a, c);
+}
+
template <int momentum_read>
__global__ static void _cwc_kern_net_sgd(float* a, float* grad, float* momentum,
const int count,
@@ -1670,6 +1735,120 @@ static void _cwc_convnet_net_sgd(ccv_convnet_t* convnet, int momentum_read, int
}
}
+static void _cwc_convnet_batch_formation_10(gsl_rng* rng, ccv_array_t* categorizeds, ccv_dense_matrix_t* mean_activity, ccv_dense_matrix_t* eigenvectors, ccv_dense_matrix_t* eigenvalues, float color_gain, int* idx, ccv_size_t dim, int rows, int cols, int channels, int symmetric, int batch, int offset, int size, float* b, int* c)
+{
+ int i, k, x;
+ assert(size <= batch);
+ for (i = 0; i < size; i++)
+ {
+ ccv_categorized_t* categorized = (ccv_categorized_t*)ccv_array_get(categorizeds, idx ? idx[offset + i] : offset + i);
+ if (c)
+ c[i] = categorized->c;
+ switch (categorized->type)
+ {
+ case CCV_CATEGORIZED_DENSE_MATRIX:
+ assert(rows == categorized->matrix->rows && cols == categorized->matrix->cols && channels == CCV_GET_CHANNEL(categorized->matrix->type));
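+ // note: the pre-decoded path writes a single copy at column i; the other 9 crop slots for this image are left untouched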
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i] = categorized->matrix->data.f32[x * channels + k];
+ break;
+ case CCV_CATEGORIZED_FILE:
+ {
+ ccv_dense_matrix_t* image = 0;
+ ccv_read(categorized->file.filename, &image, CCV_IO_ANY_FILE | CCV_IO_RGB_COLOR);
+ if (image)
+ {
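+ // extract 10 patches: the four corner crops and the center crop, each followed by its horizontal mirror, packed at columns i * 10 .. i * 10 + 9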
+ // 1
+ ccv_dense_matrix_t* input = 0;
+ ccv_slice(image, (ccv_matrix_t**)&input, CCV_32F, 0, 0, dim.height, dim.width);
+ ccv_subtract(input, mean_activity, (ccv_matrix_t**)&input, 0);
+ ccv_dense_matrix_t* patch = 0;
+ ccv_slice(input, (ccv_matrix_t**)&patch, CCV_32F, 0, 0, rows, cols);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10] = patch->data.f32[x * channels + k];
+ // 2
+ ccv_flip(patch, &patch, 0, CCV_FLIP_X);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10 + 1] = patch->data.f32[x * channels + k];
+ ccv_matrix_free(patch);
+ ccv_matrix_free(input);
+ // 3
+ input = 0;
+ ccv_slice(image, (ccv_matrix_t**)&input, CCV_32F, image->rows - dim.height, 0, dim.height, dim.width);
+ ccv_subtract(input, mean_activity, (ccv_matrix_t**)&input, 0);
+ patch = 0;
+ ccv_slice(input, (ccv_matrix_t**)&patch, CCV_32F, input->rows - rows, 0, rows, cols);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10 + 2] = patch->data.f32[x * channels + k];
+ // 4
+ ccv_flip(patch, &patch, 0, CCV_FLIP_X);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10 + 3] = patch->data.f32[x * channels + k];
+ ccv_matrix_free(patch);
+ ccv_matrix_free(input);
+ // 5
+ input = 0;
+ ccv_slice(image, (ccv_matrix_t**)&input, CCV_32F, 0, image->cols - dim.width, dim.height, dim.width);
+ ccv_subtract(input, mean_activity, (ccv_matrix_t**)&input, 0);
+ patch = 0;
+ ccv_slice(input, (ccv_matrix_t**)&patch, CCV_32F, 0, input->cols - cols, rows, cols);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10 + 4] = patch->data.f32[x * channels + k];
+ // 6
+ ccv_flip(patch, &patch, 0, CCV_FLIP_X);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10 + 5] = patch->data.f32[x * channels + k];
+ ccv_matrix_free(patch);
+ ccv_matrix_free(input);
+ // 7
+ input = 0;
+ ccv_slice(image, (ccv_matrix_t**)&input, CCV_32F, image->rows - dim.height, image->cols - dim.width, dim.height, dim.width);
+ ccv_subtract(input, mean_activity, (ccv_matrix_t**)&input, 0);
+ patch = 0;
+ ccv_slice(input, (ccv_matrix_t**)&patch, CCV_32F, input->rows - rows, input->cols - cols, rows, cols);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10 + 6] = patch->data.f32[x * channels + k];
+ // 8
+ ccv_flip(patch, &patch, 0, CCV_FLIP_X);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10 + 7] = patch->data.f32[x * channels + k];
+ ccv_matrix_free(patch);
+ ccv_matrix_free(input);
+ // 9
+ input = 0;
+ ccv_slice(image, (ccv_matrix_t**)&input, CCV_32F, (image->rows - dim.height) / 2, (image->cols - dim.width) / 2, dim.height, dim.width);
+ ccv_subtract(input, mean_activity, (ccv_matrix_t**)&input, 0);
+ patch = 0;
+ ccv_slice(input, (ccv_matrix_t**)&patch, CCV_32F, (input->rows - rows) / 2, (input->cols - cols) / 2, rows, cols);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10 + 8] = patch->data.f32[x * channels + k];
+ // 10
+ ccv_flip(patch, &patch, 0, CCV_FLIP_X);
+ for (k = 0; k < channels; k++)
+ for (x = 0; x < rows * cols; x++)
+ b[(k * rows * cols + x) * batch + i * 10 + 9] = patch->data.f32[x * channels + k];
+ ccv_matrix_free(patch);
+ ccv_matrix_free(input);
+ // we loaded it in, deallocate it now
+ if (categorized->type != CCV_CATEGORIZED_DENSE_MATRIX)
+ ccv_matrix_free(image);
+ } else
+ printf("cannot load %s.\n", categorized->file.filename);
+ break;
+ }
+ }
+ }
+}
+
static void _cwc_convnet_batch_formation(gsl_rng* rng, ccv_array_t* categorizeds, ccv_dense_matrix_t* mean_activity, ccv_dense_matrix_t* eigenvectors, ccv_dense_matrix_t* eigenvalues, float color_gain, int* idx, ccv_size_t dim, int rows, int cols, int channels, int symmetric, int batch, int offset, int size, float* b, int* c)
{
int i, k, x;
@@ -2282,7 +2461,7 @@ void cwc_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categoriz
z.eigenvectors = 0;
z.eigenvalues = 0;
z.line_no = 0;
- int miss;
+ int miss, test_batch;
float elapsed_time;
ccv_function_state_begin(_cwc_convnet_supervised_train_function_state_read, z, filename);
_cwc_convnet_mean_formation(categorizeds, z.convnet->input, z.convnet->channels, params.symmetric, &z.convnet->mean_activity);
@@ -2343,12 +2522,15 @@ void cwc_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categoriz
cudaDeviceSynchronize(); // synchronize at this point
// using context-1's cublas handle because we will wait this handle to finish when the copy to context-0 is required in testing
_cwc_convnet_dor_mean_net(z.convnet, params.layer_params, GPU(z.convnet)->contexts[1].device.cublas);
+ cudaDeviceSynchronize(); // synchronize at this point
+ ccv_function_state_resume(_cwc_convnet_supervised_train_function_state_write, z, filename);
// run tests
miss = 0;
- for (i = j = 0; i < tests->rnum; i += params.mini_batch, j++)
+ test_batch = params.mini_batch / 10;
+ for (i = j = 0; i < tests->rnum; i += test_batch, j++)
{
cwc_convnet_context_t* context = GPU(z.convnet)->contexts + (j % 2);
- _cwc_convnet_batch_formation(0, tests, z.convnet->mean_activity, 0, 0, 0, 0, z.convnet->input, z.convnet->rows, z.convnet->cols, z.convnet->channels, params.symmetric, params.mini_batch, i, ccv_min(params.mini_batch, tests->rnum - i), context->host.input, 0);
+ _cwc_convnet_batch_formation_10(0, tests, z.convnet->mean_activity, 0, 0, 0, 0, z.convnet->input, z.convnet->rows, z.convnet->cols, z.convnet->channels, params.symmetric, params.mini_batch, i, ccv_min(test_batch, tests->rnum - i), context->host.input, 0);
cudaMemcpyAsync(context->device.input, context->host.input, sizeof(float) * z.convnet->rows * z.convnet->cols * z.convnet->channels * params.mini_batch, cudaMemcpyHostToDevice, context->device.stream);
assert(cudaGetLastError() == cudaSuccess);
if (j > 0)
@@ -2358,27 +2540,27 @@ void cwc_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categoriz
assert(cudaGetLastError() == cudaSuccess);
if (j > 0) // we have another result, pull these
{
- for (k = 0; k < params.mini_batch; k++)
+ for (k = 0; k < test_batch; k++)
{
- ccv_categorized_t* test = (ccv_categorized_t*)ccv_array_get(tests, k + i - params.mini_batch);
+ ccv_categorized_t* test = (ccv_categorized_t*)ccv_array_get(tests, k + i - test_batch);
if (test->c != test_returns[(j + 1) % 2].host[k])
++miss;
}
cudaEventElapsedTime(&elapsed_time, iteration, stop);
- FLUSH(" - at epoch %03d / %d => with miss rate %.2f%% at %d / %d (%.3f sec)", z.t + 1, params.max_epoch, miss * 100.0f / i, j + 1, (tests->rnum + params.mini_batch - 1) / params.mini_batch, elapsed_time / 1000);
+ FLUSH(" - at epoch %03d / %d => with miss rate %.2f%% at %d / %d (%.3f sec)", z.t + 1, params.max_epoch, miss * 100.0f / i, j + 1, (tests->rnum + test_batch - 1) / test_batch, elapsed_time / 1000);
}
cudaEventRecord(iteration, context->device.stream);
_cwc_convnet_encode_impl(z.convnet, context->device.input, params.mini_batch, 0, context);
assert(cudaGetLastError() == cudaSuccess);
- _cwc_convnet_tests_return(params.mini_batch, category_count, GPU(z.convnet)->forwards[z.convnet->count - 1], test_returns[j % 2].device, context->device.stream);
+ _cwc_convnet_tests_return_10(params.mini_batch, category_count, GPU(z.convnet)->forwards[z.convnet->count - 1], test_returns[j % 2].device, context->device.stream);
assert(cudaGetLastError() == cudaSuccess);
cudaMemcpyAsync(test_returns[j % 2].host, test_returns[j % 2].device, sizeof(int) * params.mini_batch, cudaMemcpyDeviceToHost, context->device.stream);
assert(cudaGetLastError() == cudaSuccess);
}
cudaDeviceSynchronize(); // synchronize at this point
- for (i = 0; i <= (tests->rnum - 1) % params.mini_batch; i++)
+ for (i = 0; i <= (tests->rnum - 1) % test_batch; i++)
{
- ccv_categorized_t* test = (ccv_categorized_t*)ccv_array_get(tests, i + (tests->rnum - 1) / params.mini_batch * params.mini_batch);
+ ccv_categorized_t* test = (ccv_categorized_t*)ccv_array_get(tests, i + (tests->rnum - 1) / test_batch * test_batch);
if (test->c != test_returns[(j + 1) % 2].host[i])
++miss;
}
@@ -2387,6 +2569,7 @@ void cwc_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categoriz
elapsed_time = 0;
cudaEventElapsedTime(&elapsed_time, start, stop);
FLUSH(" - at epoch %03d / %d (%03d - %d) => with miss rate %.2f%% (%.3f sec)\n", z.t + 1, params.max_epoch, z.i + 1, ccv_min(z.i + params.iterations, aligned_batches), miss * 100.0f / tests->rnum, elapsed_time / 1000);
+ exit(0);
ccv_function_state_resume(_cwc_convnet_supervised_train_function_state_write, z, filename);
}
if (z.t + 1 < params.max_epoch)
--
1.8.1.2
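
For reference, a minimal host-side sketch (not part of the patch) of what the two kernels _cwc_kern_convnet_softmax_10 and _cwc_kern_convnet_tests_return_10 compute together; predict_10crop and its parameters are illustrative names, and a is assumed to hold count class scores for batch columns, with each test image owning 10 consecutive columns:

#include <math.h>

/* Stable softmax down each of an image's 10 crop columns, then argmax of
 * the class scores summed over the crops. The sum is 10x the average, so
 * the argmax matches that of the averaged prediction. */
static int predict_10crop(float* a, int batch, int count, int image)
{
	int i, j;
	for (j = 0; j < 10; j++)
	{
		int col = image * 10 + j;
		/* max over classes for this column */
		float max_val = a[col];
		for (i = 1; i < count; i++)
			if (a[i * batch + col] > max_val)
				max_val = a[i * batch + col];
		/* exponentiate and normalize in place */
		float sum = 0;
		for (i = 0; i < count; i++)
			sum += (a[i * batch + col] = expf(a[i * batch + col] - max_val));
		for (i = 0; i < count; i++)
			a[i * batch + col] /= sum;
	}
	/* argmax of per-class scores summed over the 10 crops */
	int max_idx = 0;
	float best = 0;
	for (i = 0; i < count; i++)
	{
		float val = 0;
		for (j = 0; j < 10; j++)
			val += a[i * batch + image * 10 + j];
		if (i == 0 || val > best)
		{
			best = val;
			max_idx = i;
		}
	}
	return max_idx;
}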