panmari/Output_linux_gcc4.8.txt

## Output_linux_gcc4.8.txt
OpenCV version: 3.0.0-dev
OpenCV VCS version: 3.0.0-556-g33f5ac5
Build type: release
Parallel framework: pthreads
CPU features: mmx sse sse2 sse3
OpenCL Platforms:
    NVIDIA CUDA
        dGPU: GeForce GTX TITAN (OpenCL 1.2 CUDA)
Current OpenCL device:
    Type = dGPU
    Name = GeForce GTX TITAN
    Version = OpenCL 1.2 CUDA
    Compute units = 14
    Max work group size = 1024
    Local memory size = 48 kB
    Max memory allocation size = 1 GB 511 MB 848 kB
    Double support = Yes
    Host unified memory = No
    Has AMD Blas = No
    Has AMD Fft = No
    Preferred vector width char = 1
    Preferred vector width short = 1
    Preferred vector width int = 1
    Preferred vector width long = 1
    Preferred vector width float = 1
    Preferred vector width double = 1
Note: Google Test filter = performance_test*
[==========] Running 1 test from 1 test case.
[----------] Global test environment set-up.
[----------] 1 test from performance_test
[ RUN      ] performance_test.performance_test_division_of_3d_by_1d
Size 		Method 1 	Method 2 	Method 3	Method 4
[2 x 2] 	0.051497 	0.005057 	0.001207 	0.034242
[10 x 10] 	0.01181 	0.002343 	0.00148 	0.006504
[100 x 100] 	0.173996 	0.176055 	0.071535 	0.159307
[1000 x 1000] 	12.8465 	17.618 	7.50787 	10.1962
[2000 x 2000] 	46.1933 	70.5931 	29.2439 	28.4596
[       OK ] performance_test.performance_test_division_of_3d_by_1d (477 ms)
[----------] 1 test from performance_test (477 ms total)

[----------] Global test environment tear-down
[==========] 1 test from 1 test case ran. (477 ms total)
[  PASSED  ] 1 test.

## Output_windows_visualstudio2013
OpenCV version: 3.0.0-dev
OpenCV VCS version: 3.0.0-580-g0f1fdd8
Build type: release
Parallel framework: ms-concurrency
CPU features: popcnt mmx sse sse2 sse3 ssse3 sse4.1 sse4.2
OpenCL Platforms:
    NVIDIA CUDA
        dGPU: GeForce GTX 960 (OpenCL 1.2 CUDA)
Current OpenCL device:
    Type = dGPU
    Name = GeForce GTX 960
    Version = OpenCL 1.2 CUDA
    Compute units = 8
    Max work group size = 1024
    Local memory size = 48 kB
    Max memory allocation size = 512 MB
    Double support = Yes
    Host unified memory = No
    Has AMD Blas = No
    Has AMD Fft = No
    Preferred vector width char = 1
    Preferred vector width short = 1
    Preferred vector width int = 1
    Preferred vector width long = 1
    Preferred vector width float = 1
    Preferred vector width double = 1
[----------] 1 test from performance_test
[ RUN      ] performance_test.performance_test_division_of_3d_by_1d
Size            Method 1        Method 2        Method 3        Method 4
[2 x 2]         0.0185935       0.00364579      0.00109374      0.00765615
[10 x 10]       0.0109374       0.00218747      0.00145831      0.00546868
[100 x 100]     0.172446        0.143644        0.0765615       0.10828
[1000 x 1000]   17.9836         15.5434         9.18447         13.7391
[2000 x 2000]   72.7801         69.2262         34.0422         56.9764
[       OK ] performance_test.performance_test_division_of_3d_by_1d (686 ms)
[----------] 1 test from performance_test (687 ms total)

## PeformanceTest.cpp
#include "opencv2/ts.hpp"
#include "opencv2/imgproc/imgproc.hpp"

using namespace std;
using namespace cv;

// Benchmark: divide every pixel of a 3-channel float matrix (CV_32FC3) by the
// matching element of a single-channel float matrix (CV_32FC1), implemented in
// four different ways. For each matrix size one row is printed with the
// elapsed time of every method in milliseconds, and the results of methods
// 2-4 are checked against the result of method 1.
TEST(performance_test, performance_test_division_of_3d_by_1d) {
    const vector<Size> sizes{Size(2, 2), Size(10, 10), Size(100, 100), Size(1000, 1000), Size(2000, 2000)};

    // Milliseconds elapsed since the tick count `start_ticks`.
    const auto elapsed_ms = [](double start_ticks) {
        return (static_cast<double>(getTickCount()) - start_ticks) * 1000. / getTickFrequency();
    };

    cout << "Size \t\tMethod 1 \tMethod 2 \tMethod 3" << "\tMethod 4" << endl;

    for (const Size& sz : sizes) {
        // Random numerators in [0, 200) and weights in [1, 10), so no weight is zero.
        Mat weighted_sum(sz, CV_32FC3);
        randu(weighted_sum, 0.0, 200.0);

        Mat weights(sz, CV_32FC1);
        randu(weights, 1.0, 10.0);

        // Methods 2 and 3 modify their input in place, so every method gets
        // its own deep copy of the numerator.
        Mat ws1 = weighted_sum.clone();
        Mat ws2 = weighted_sum.clone();
        Mat ws3 = weighted_sum.clone();
        Mat ws4 = weighted_sum.clone();

        // Method 1 @panmari: split into channels, divide each channel, merge back.
        const double tic1 = static_cast<double>(getTickCount());
        Mat rec1;
        vector<Mat> channels(3);
        split(ws1, channels);
        // Bind by reference so the quotient always ends up in the vector element
        // that merge() reads, rather than in a temporary header copy.
        for (Mat& chan : channels) {
            divide(chan, weights, chan);
        }
        merge(channels, rec1);

        const double toc1 = elapsed_ms(tic1);

        // Method 2 @Miki: view both matrices as a single row and divide the
        // three components of every pixel by its weight, in place.
        const double tic2 = static_cast<double>(getTickCount());
        Mat rec2 = ws2.reshape(3, 1);
        Mat ww = weights.reshape(1, 1);
        for (int i = 0; i < rec2.cols; ++i) {
            const float w = ww.at<float>(0, i);
            Vec3f* v = rec2.ptr<Vec3f>(0, i);
            v->val[0] /= w;
            v->val[1] /= w;
            v->val[2] /= w;
        }
        rec2 = rec2.reshape(3, ws2.rows);

        const double toc2 = elapsed_ms(tic2);

        // Method 3 @Miki (+ @Micka): like method 2, but walks raw pointers and
        // multiplies by the reciprocal of the weight instead of dividing.
        const double tic3 = static_cast<double>(getTickCount());
        Mat3f rec3 = ws3.reshape(3, 1);
        //Mat3f rec3 = ws3.reshape(3, 1).clone(); // To not override original image
        Mat1f ww3 = weights.reshape(1, 1);

        Vec3f* prec3 = rec3.ptr<Vec3f>(0);
        float* pww = ww3.ptr<float>(0);

        for (int i = 0; i < rec3.cols; ++i)
        {
            const float scale = 1. / (*pww);
            (*prec3)[0] *= scale;
            (*prec3)[1] *= scale;
            (*prec3)[2] *= scale;

            ++prec3; ++pww;
        }
        rec3 = rec3.reshape(3, ws3.rows);

        const double toc3 = elapsed_ms(tic3);

        // Method 4 @Micka: replicate the weights into three channels and do a
        // single element-wise divide.
        const double tic4 = static_cast<double>(getTickCount());
        Mat3f rec4;
        Mat3f w3ch;
        cvtColor(weights, w3ch, COLOR_GRAY2BGR);
        divide(ws4, w3ch, rec4);

        const double toc4 = elapsed_ms(tic4);

        cout << sz << " \t" << toc1 << " \t" << toc2 << " \t" << toc3 << " \t" << toc4 << endl;

        // Check for equality of methods.
        Mat diff;
        absdiff(rec1, rec2, diff);
        EXPECT_EQ(0, countNonZero(diff.reshape(1)));

        // Method 3 multiplies by a reciprocal, so it is compared with a
        // tolerance: only differences above 1e-4 count as mismatches.
        absdiff(rec1, rec3, diff);
        threshold(diff, diff, 1e-4, 1, THRESH_BINARY);
        EXPECT_EQ(0, countNonZero(diff.reshape(1)));

        absdiff(rec1, rec4, diff);
        EXPECT_EQ(0, countNonZero(diff.reshape(1)));
    }
}
	OpenCV version: 3.0.0-dev
	OpenCV VCS version: 3.0.0-556-g33f5ac5
	Build type: release
	Parallel framework: pthreads
	CPU features: mmx sse sse2 sse3
	OpenCL Platforms:
	NVIDIA CUDA
	dGPU: GeForce GTX TITAN (OpenCL 1.2 CUDA)
	Current OpenCL device:
	Type = dGPU
	Name = GeForce GTX TITAN
	Version = OpenCL 1.2 CUDA
	Compute units = 14
	Max work group size = 1024
	Local memory size = 48 kB
	Max memory allocation size = 1 GB 511 MB 848 kB
	Double support = Yes
	Host unified memory = No
	Has AMD Blas = No
	Has AMD Fft = No
	Preferred vector width char = 1
	Preferred vector width short = 1
	Preferred vector width int = 1
	Preferred vector width long = 1
	Preferred vector width float = 1
	Preferred vector width double = 1
	Note: Google Test filter = performance_test*
	[==========] Running 1 test from 1 test case.
	[----------] Global test environment set-up.
	[----------] 1 test from performance_test
	[ RUN ] performance_test.performance_test_division_of_3d_by_1d
	Size Method 1 Method 2 Method 3 Method 4
	[2 x 2] 0.051497 0.005057 0.001207 0.034242
	[10 x 10] 0.01181 0.002343 0.00148 0.006504
	[100 x 100] 0.173996 0.176055 0.071535 0.159307
	[1000 x 1000] 12.8465 17.618 7.50787 10.1962
	[2000 x 2000] 46.1933 70.5931 29.2439 28.4596
	[ OK ] performance_test.performance_test_division_of_3d_by_1d (477 ms)
	[----------] 1 test from performance_test (477 ms total)

	[----------] Global test environment tear-down
	[==========] 1 test from 1 test case ran. (477 ms total)
	[ PASSED ] 1 test.
	OpenCV version: 3.0.0-dev
	OpenCV VCS version: 3.0.0-580-g0f1fdd8
	Build type: release
	Parallel framework: ms-concurrency
	CPU features: popcnt mmx sse sse2 sse3 ssse3 sse4.1 sse4.2
	OpenCL Platforms:
	NVIDIA CUDA
	dGPU: GeForce GTX 960 (OpenCL 1.2 CUDA)
	Current OpenCL device:
	Type = dGPU
	Name = GeForce GTX 960
	Version = OpenCL 1.2 CUDA
	Compute units = 8
	Max work group size = 1024
	Local memory size = 48 kB
	Max memory allocation size = 512 MB
	Double support = Yes
	Host unified memory = No
	Has AMD Blas = No
	Has AMD Fft = No
	Preferred vector width char = 1
	Preferred vector width short = 1
	Preferred vector width int = 1
	Preferred vector width long = 1
	Preferred vector width float = 1
	Preferred vector width double = 1
	[----------] 1 test from performance_test
	[ RUN ] performance_test.performance_test_division_of_3d_by_1d
	Size Method 1 Method 2 Method 3 Method 4
	[2 x 2] 0.0185935 0.00364579 0.00109374 0.00765615
	[10 x 10] 0.0109374 0.00218747 0.00145831 0.00546868
	[100 x 100] 0.172446 0.143644 0.0765615 0.10828
	[1000 x 1000] 17.9836 15.5434 9.18447 13.7391
	[2000 x 2000] 72.7801 69.2262 34.0422 56.9764
	[ OK ] performance_test.performance_test_division_of_3d_by_1d (686 ms)
	[----------] 1 test from performance_test (687 ms total)
	#include "opencv2/ts.hpp"
	#include "opencv2/imgproc/imgproc.hpp"

	using namespace std;
	using namespace cv;

	// Benchmark: divide every pixel of a 3-channel float matrix (CV_32FC3) by the
	// matching element of a single-channel float matrix (CV_32FC1), implemented in
	// four different ways. For each matrix size one row is printed with the
	// elapsed time of every method in milliseconds, and the results of methods
	// 2-4 are checked against the result of method 1.
	TEST(performance_test, performance_test_division_of_3d_by_1d) {
	    const vector<Size> sizes{Size(2, 2), Size(10, 10), Size(100, 100), Size(1000, 1000), Size(2000, 2000)};

	    // Milliseconds elapsed since the tick count `start_ticks`.
	    const auto elapsed_ms = [](double start_ticks) {
	        return (static_cast<double>(getTickCount()) - start_ticks) * 1000. / getTickFrequency();
	    };

	    cout << "Size \t\tMethod 1 \tMethod 2 \tMethod 3" << "\tMethod 4" << endl;

	    for (const Size& sz : sizes) {
	        // Random numerators in [0, 200) and weights in [1, 10), so no weight is zero.
	        Mat weighted_sum(sz, CV_32FC3);
	        randu(weighted_sum, 0.0, 200.0);

	        Mat weights(sz, CV_32FC1);
	        randu(weights, 1.0, 10.0);

	        // Methods 2 and 3 modify their input in place, so every method gets
	        // its own deep copy of the numerator.
	        Mat ws1 = weighted_sum.clone();
	        Mat ws2 = weighted_sum.clone();
	        Mat ws3 = weighted_sum.clone();
	        Mat ws4 = weighted_sum.clone();

	        // Method 1 @panmari: split into channels, divide each channel, merge back.
	        const double tic1 = static_cast<double>(getTickCount());
	        Mat rec1;
	        vector<Mat> channels(3);
	        split(ws1, channels);
	        // Bind by reference so the quotient always ends up in the vector element
	        // that merge() reads, rather than in a temporary header copy.
	        for (Mat& chan : channels) {
	            divide(chan, weights, chan);
	        }
	        merge(channels, rec1);

	        const double toc1 = elapsed_ms(tic1);

	        // Method 2 @Miki: view both matrices as a single row and divide the
	        // three components of every pixel by its weight, in place.
	        const double tic2 = static_cast<double>(getTickCount());
	        Mat rec2 = ws2.reshape(3, 1);
	        Mat ww = weights.reshape(1, 1);
	        for (int i = 0; i < rec2.cols; ++i) {
	            const float w = ww.at<float>(0, i);
	            Vec3f* v = rec2.ptr<Vec3f>(0, i);
	            v->val[0] /= w;
	            v->val[1] /= w;
	            v->val[2] /= w;
	        }
	        rec2 = rec2.reshape(3, ws2.rows);

	        const double toc2 = elapsed_ms(tic2);

	        // Method 3 @Miki (+ @Micka): like method 2, but walks raw pointers and
	        // multiplies by the reciprocal of the weight instead of dividing.
	        const double tic3 = static_cast<double>(getTickCount());
	        Mat3f rec3 = ws3.reshape(3, 1);
	        //Mat3f rec3 = ws3.reshape(3, 1).clone(); // To not override original image
	        Mat1f ww3 = weights.reshape(1, 1);

	        Vec3f* prec3 = rec3.ptr<Vec3f>(0);
	        float* pww = ww3.ptr<float>(0);

	        for (int i = 0; i < rec3.cols; ++i)
	        {
	            const float scale = 1. / (*pww);
	            // Scale the three channels of the CURRENT pixel. The pointer must be
	            // dereferenced before indexing: prec3[k] would address the k-th
	            // following pixel and run past the end of the row.
	            (*prec3)[0] *= scale;
	            (*prec3)[1] *= scale;
	            (*prec3)[2] *= scale;

	            ++prec3; ++pww;
	        }
	        rec3 = rec3.reshape(3, ws3.rows);

	        const double toc3 = elapsed_ms(tic3);

	        // Method 4 @Micka: replicate the weights into three channels and do a
	        // single element-wise divide.
	        const double tic4 = static_cast<double>(getTickCount());
	        Mat3f rec4;
	        Mat3f w3ch;
	        cvtColor(weights, w3ch, COLOR_GRAY2BGR);
	        divide(ws4, w3ch, rec4);

	        const double toc4 = elapsed_ms(tic4);

	        cout << sz << " \t" << toc1 << " \t" << toc2 << " \t" << toc3 << " \t" << toc4 << endl;

	        // Check for equality of methods.
	        Mat diff;
	        absdiff(rec1, rec2, diff);
	        EXPECT_EQ(0, countNonZero(diff.reshape(1)));

	        // Method 3 multiplies by a reciprocal, so it is compared with a
	        // tolerance: only differences above 1e-4 count as mismatches.
	        absdiff(rec1, rec3, diff);
	        threshold(diff, diff, 1e-4, 1, THRESH_BINARY);
	        EXPECT_EQ(0, countNonZero(diff.reshape(1)));

	        absdiff(rec1, rec4, diff);
	        EXPECT_EQ(0, countNonZero(diff.reshape(1)));
	    }
	}