MtShan/ReadMe.txt

## ReadMe.txt
========================================================================
    CONSOLE APPLICATION : ConsoleApplication1 Project Overview
========================================================================

AppWizard has created this ConsoleApplication1 application for you.

This file contains a summary of what you will find in each of the files that
make up your ConsoleApplication1 application.


ConsoleApplication1.vcxproj
    This is the main project file for VC++ projects generated using an Application Wizard.
    It contains information about the version of Visual C++ that generated the file, and
    information about the platforms, configurations, and project features selected with the
    Application Wizard.

ConsoleApplication1.vcxproj.filters
    This is the filters file for VC++ projects generated using an Application Wizard.
    It contains information about the association between the files in your project
    and the filters. This association is used in the IDE to show grouping of files with
    similar extensions under a specific node (e.g., ".cpp" files are associated with the
    "Source Files" filter).

ConsoleApplication1.cpp
    This is the main application source file.

/////////////////////////////////////////////////////////////////////////////
Other standard files:

StdAfx.h, StdAfx.cpp
    These files are used to build a precompiled header (PCH) file
    named ConsoleApplication1.pch and a precompiled types file named StdAfx.obj.

/////////////////////////////////////////////////////////////////////////////
Other notes:

AppWizard uses "TODO:" comments to indicate parts of the source code you
should add to or customize.

/////////////////////////////////////////////////////////////////////////////

## simd.cpp
// ConsoleApplication1.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"

// Scalar reference implementation of the dot product.
//
// vec1, vec2 : arrays of at least n floats; both are declared __restrict.
// n          : number of element pairs to multiply and accumulate.
//
// Returns the sum of vec1[i] * vec2[i] for i in [0, n), accumulated in index
// order in a single float. A non-positive n yields 0.
float dot_normal(
	_In_reads_(n) const float* __restrict vec1,
	_In_reads_(n) const float* __restrict vec2,
	int n)
{
	float acc = 0;
	const float* lhs = vec1;
	const float* rhs = vec2;
	// Walk both arrays in lock step, front to back.
	for (int remaining = n; remaining > 0; --remaining)
	{
		acc += (*lhs) * (*rhs);
		++lhs;
		++rhs;
	}
	return acc;
}

// Dot product using 128-bit SSE registers.
//
// vec1, vec2 : arrays of at least n __m128 blocks (4 floats per block).
// n          : number of __m128 blocks, i.e. the float count divided by 4.
//
// Keeps four lane-wise running sums while iterating over the blocks, then
// adds the four lanes together in lane order 0..3 and returns that total.
// A non-positive n yields 0.
float dot_sse(
	_In_reads_(n) const __m128* __restrict vec1,
	_In_reads_(n) const __m128* __restrict vec2,
	int n)
{
	__m128 lanes = _mm_setzero_ps();
	int block = 0;
	while (block < n)
	{
		lanes = _mm_add_ps(lanes, _mm_mul_ps(vec1[block], vec2[block]));
		++block;
	}

	// Horizontal reduction: fold the lanes left to right.
	const float* lane = reinterpret_cast<const float*>(&lanes);
	float total = lane[0];
	for (int k = 1; k < 4; ++k)
	{
		total += lane[k];
	}
	return total;
}

// Dot product using 256-bit AVX registers.
//
// vec1, vec2 : arrays of at least n __m256 blocks (8 floats per block).
// n          : number of __m256 blocks, i.e. the float count divided by 8.
//
// Keeps eight lane-wise running sums while iterating over the blocks, then
// adds the eight lanes together in lane order 0..7 and returns that total.
// A non-positive n yields 0.
float dot_avx(
	_In_reads_(n) const __m256* __restrict vec1,
	_In_reads_(n) const __m256* __restrict vec2,
	int n)
{
	__m256 lanes = _mm256_setzero_ps();
	int block = 0;
	while (block < n)
	{
		lanes = _mm256_add_ps(lanes, _mm256_mul_ps(vec1[block], vec2[block]));
		++block;
	}

	// Horizontal reduction: fold the lanes left to right.
	const float* lane = reinterpret_cast<const float*>(&lanes);
	float total = lane[0];
	for (int k = 1; k < 8; ++k)
	{
		total += lane[k];
	}
	return total;
}

// Dot product using 256-bit AVX registers with two independent accumulators.
//
// vec1, vec2 : arrays of at least n __m256 blocks (8 floats per block).
// n          : number of __m256 blocks; must be a multiple of 2 because the
//              loop consumes blocks in pairs.
//
// Returns the sum of all lane-wise products: the two accumulators are added
// together and the eight lanes of the result are summed in lane order 0..7.
//
// Throws std::invalid_argument when n is not a multiple of 2.
float dot_avx_2(
	_In_reads_(n) const __m256* __restrict vec1,
	_In_reads_(n) const __m256* __restrict vec2,
	int n)
{
	// Reject odd counts: the loop reads vec[i + 1], so an odd n would read one
	// block past the end. (The check used to be inverted and rejected every
	// valid even n instead.)
	if (n % 2 != 0)
	{
		throw std::invalid_argument("n must be multiple of 2.");
	}
	auto u1 = _mm256_setzero_ps();
	auto u2 = _mm256_setzero_ps();
	for (auto i = 0; i < n; i += 2)
	{
		// Two separate accumulation chains per iteration.
		auto x1 = _mm256_mul_ps(vec1[i], vec2[i]);
		auto x2 = _mm256_mul_ps(vec1[i + 1], vec2[i + 1]);
		u1 = _mm256_add_ps(u1, x1);
		u2 = _mm256_add_ps(u2, x2);
	}
	// Merge the two chains, then reduce the eight lanes horizontally.
	u1 = _mm256_add_ps(u1, u2);

	auto t = reinterpret_cast<float*>(&u1);
	return t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
}

template<class Function>
double calc_for_a_moment(Function&& t, int nLoop)
{
	LARGE_INTEGER frequency;
	QueryPerformanceFrequency(&frequency);

	LARGE_INTEGER startingTime;
	QueryPerformanceCounter(&startingTime);

	concurrency::combinable<float> sum;
	concurrency::parallel_for(0, nLoop, [t, &sum](int i)
	{
		sum.local() += t();
	}, concurrency::static_partitioner());
	LARGE_INTEGER endingTime;
	QueryPerformanceCounter(&endingTime);
	auto elapsedMicroseconds = ((endingTime.QuadPart - startingTime.QuadPart) * 1000000.0) / frequency.QuadPart;

	//printf("result=%f\n", sum.combine(std::plus<float>()) / nLoop);

	return elapsedMicroseconds / nLoop;
}

// Reports whether AVX instructions can be used on this machine.
//
// Three conditions must all hold:
//   1) CPUID leaf 1, ECX bit 27: the OS uses XSAVE/XRSTOR (so YMM registers
//      can be saved on a context switch),
//   2) CPUID leaf 1, ECX bit 28: the CPU supports AVX,
//   3) XGETBV: the feature-enabled mask has bits 1 and 2 set (0x6), i.e. the
//      OS will actually save and restore the YMM registers.
//
// XGETBV is only executed after 1) and 2) have been confirmed, because the
// instruction is not available on every CPU.
//
// ref: http://insufficientlycomplicated.wordpress.com/2011/11/07/detecting-intel-advanced-vector-extensions-avx-in-visual-studio/
bool IsAvxSupported()
{
	int regs[4];
	__cpuid(regs, 1);

	const int ecx = regs[2];
	const bool osUsesXsave = (ecx & (1 << 27)) != 0;
	const bool cpuHasAvx = (ecx & (1 << 28)) != 0;
	if (!osUsesXsave || !cpuHasAvx)
	{
		return false;
	}

	// Check that the OS will save the YMM registers.
	const unsigned long long enabledFeatures = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
	return (enabledFeatures & 0x6) == 0x6;
}

int _tmain(int argc, _TCHAR* argv [])
{
	_CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);

	bool avxSupported = IsAvxSupported();

	const int len_begin = 16;
	const int len_end = 512 * 1024;
	const int len_fact = 2;
	const int run_loops = 1000;
	_ASSERT(len_begin % 16 == 0 && len_fact % 2 == 0);

	std::mt19937 rng;
	std::uniform_real_distribution<> dst(-1, 1);

	printf("len, tNormal, tSse, tAvx, tAvx2\n");

	for (auto len = len_begin; len <= len_end; len *= len_fact)
	{
		auto vec1 = reinterpret_cast<float*>(_aligned_malloc(sizeof(float) * len, 32));
		auto vec2 = reinterpret_cast<float*>(_aligned_malloc(sizeof(float) * len, 32));
		std::generate(vec1, vec1 + len, [&rng, &dst]() { return dst(rng); });
		std::generate(vec2, vec2 + len, [&rng, &dst]() { return dst(rng); });

		auto tNormal =
			calc_for_a_moment([vec1, vec2, len]() {
			return dot_normal(vec1, vec2, len);
		}, run_loops);
		auto tSse =
			calc_for_a_moment([vec1, vec2, len]() {
			return dot_sse(
				reinterpret_cast<__m128*>(vec1),
				reinterpret_cast<__m128*>(vec2),
				len / 4);
		}, run_loops);
		double tAvx = 0;
		double tAvx2 = 0;
		if (avxSupported)
		{
			tAvx =
				calc_for_a_moment([vec1, vec2, len]() {
				return dot_avx(
					reinterpret_cast<__m256*>(vec1),
					reinterpret_cast<__m256*>(vec2),
					len / 8);
			}, run_loops);
			tAvx2 =
				calc_for_a_moment([vec1, vec2, len]() {
				return dot_avx_2(
					reinterpret_cast<__m256*>(vec1),
					reinterpret_cast<__m256*>(vec2),
					len / 8);
			}, run_loops);
		}

		printf("%d, %f, %f, %f, %f\n", len, tNormal, tSse, tAvx, tAvx2);

		_aligned_free(vec1);
		_aligned_free(vec2);
	}
	return 0;
}

## stdafx.cpp
// stdafx.cpp : source file that includes just the standard includes
// ConsoleApplication1.pch will be the pre-compiled header
// stdafx.obj will contain the pre-compiled type information

#include "stdafx.h"

// TODO: reference any additional headers you need in STDAFX.H
// and not in this file

## stdafx.h
// stdafx.h : include file for standard system include files,
// or project specific include files that are used frequently, but
// are changed infrequently
//

#pragma once

#include "targetver.h"

#include <stdio.h>
#include <tchar.h>


// TODO: reference additional headers your program requires here
#define _CRTDBG_MAP_ALLOC
#include <stdlib.h>
#include <crtdbg.h>

#include <windows.h>
#include <ppl.h>
#include <xmmintrin.h>
#include <immintrin.h>

#include <exception>
#include <random>
#include <algorithm>

## targetvar.h
#pragma once

// Including SDKDDKVer.h defines the highest available Windows platform.

// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.

#include <SDKDDKVer.h>
	========================================================================
	CONSOLE APPLICATION : ConsoleApplication1 Project Overview
	========================================================================

	AppWizard has created this ConsoleApplication1 application for you.

	This file contains a summary of what you will find in each of the files that
	make up your ConsoleApplication1 application.


	ConsoleApplication1.vcxproj
	This is the main project file for VC++ projects generated using an Application Wizard.
	It contains information about the version of Visual C++ that generated the file, and
	information about the platforms, configurations, and project features selected with the
	Application Wizard.

	ConsoleApplication1.vcxproj.filters
	This is the filters file for VC++ projects generated using an Application Wizard.
	It contains information about the association between the files in your project
	and the filters. This association is used in the IDE to show grouping of files with
	similar extensions under a specific node (e.g., ".cpp" files are associated with the
	"Source Files" filter).

	ConsoleApplication1.cpp
	This is the main application source file.

	/////////////////////////////////////////////////////////////////////////////
	Other standard files:

	StdAfx.h, StdAfx.cpp
	These files are used to build a precompiled header (PCH) file
	named ConsoleApplication1.pch and a precompiled types file named StdAfx.obj.

	/////////////////////////////////////////////////////////////////////////////
	Other notes:

	AppWizard uses "TODO:" comments to indicate parts of the source code you
	should add to or customize.

	/////////////////////////////////////////////////////////////////////////////
	// ConsoleApplication1.cpp : Defines the entry point for the console application.
	//

	#include "stdafx.h"

	// Scalar reference implementation of the dot product.
	//
	// vec1, vec2 : arrays of at least n floats; both are declared __restrict.
	// n          : number of element pairs to multiply and accumulate.
	//
	// Returns the sum of vec1[i] * vec2[i] for i in [0, n), accumulated in index
	// order in a single float. A non-positive n yields 0.
	float dot_normal(
		_In_reads_(n) const float* __restrict vec1,
		_In_reads_(n) const float* __restrict vec2,
		int n)
	{
		float acc = 0;
		const float* lhs = vec1;
		const float* rhs = vec2;
		// Walk both arrays in lock step, front to back.
		for (int remaining = n; remaining > 0; --remaining)
		{
			acc += (*lhs) * (*rhs);
			++lhs;
			++rhs;
		}
		return acc;
	}

	// Dot product using 128-bit SSE registers.
	//
	// vec1, vec2 : arrays of at least n __m128 blocks (4 floats per block).
	// n          : number of __m128 blocks, i.e. the float count divided by 4.
	//
	// Keeps four lane-wise running sums while iterating over the blocks, then
	// adds the four lanes together in lane order 0..3 and returns that total.
	// A non-positive n yields 0.
	float dot_sse(
		_In_reads_(n) const __m128* __restrict vec1,
		_In_reads_(n) const __m128* __restrict vec2,
		int n)
	{
		__m128 lanes = _mm_setzero_ps();
		int block = 0;
		while (block < n)
		{
			lanes = _mm_add_ps(lanes, _mm_mul_ps(vec1[block], vec2[block]));
			++block;
		}

		// Horizontal reduction: fold the lanes left to right.
		const float* lane = reinterpret_cast<const float*>(&lanes);
		float total = lane[0];
		for (int k = 1; k < 4; ++k)
		{
			total += lane[k];
		}
		return total;
	}

	// Dot product using 256-bit AVX registers.
	//
	// vec1, vec2 : arrays of at least n __m256 blocks (8 floats per block).
	// n          : number of __m256 blocks, i.e. the float count divided by 8.
	//
	// Keeps eight lane-wise running sums while iterating over the blocks, then
	// adds the eight lanes together in lane order 0..7 and returns that total.
	// A non-positive n yields 0.
	float dot_avx(
		_In_reads_(n) const __m256* __restrict vec1,
		_In_reads_(n) const __m256* __restrict vec2,
		int n)
	{
		__m256 lanes = _mm256_setzero_ps();
		int block = 0;
		while (block < n)
		{
			lanes = _mm256_add_ps(lanes, _mm256_mul_ps(vec1[block], vec2[block]));
			++block;
		}

		// Horizontal reduction: fold the lanes left to right.
		const float* lane = reinterpret_cast<const float*>(&lanes);
		float total = lane[0];
		for (int k = 1; k < 8; ++k)
		{
			total += lane[k];
		}
		return total;
	}

	// Dot product using 256-bit AVX registers with two independent accumulators.
	//
	// vec1, vec2 : arrays of at least n __m256 blocks (8 floats per block).
	// n          : number of __m256 blocks; must be a multiple of 2 because the
	//              loop consumes blocks in pairs.
	//
	// Returns the sum of all lane-wise products: the two accumulators are added
	// together and the eight lanes of the result are summed in lane order 0..7.
	//
	// Throws std::invalid_argument when n is not a multiple of 2.
	float dot_avx_2(
		_In_reads_(n) const __m256* __restrict vec1,
		_In_reads_(n) const __m256* __restrict vec2,
		int n)
	{
		// Reject odd counts: the loop reads vec[i + 1], so an odd n would read one
		// block past the end. (The check used to be inverted and rejected every
		// valid even n instead.)
		if (n % 2 != 0)
		{
			throw std::invalid_argument("n must be multiple of 2.");
		}
		auto u1 = _mm256_setzero_ps();
		auto u2 = _mm256_setzero_ps();
		for (auto i = 0; i < n; i += 2)
		{
			// Two separate accumulation chains per iteration.
			auto x1 = _mm256_mul_ps(vec1[i], vec2[i]);
			auto x2 = _mm256_mul_ps(vec1[i + 1], vec2[i + 1]);
			u1 = _mm256_add_ps(u1, x1);
			u2 = _mm256_add_ps(u2, x2);
		}
		// Merge the two chains, then reduce the eight lanes horizontally.
		u1 = _mm256_add_ps(u1, u2);

		auto t = reinterpret_cast<float*>(&u1);
		return t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
	}

	template<class Function>
	double calc_for_a_moment(Function&& t, int nLoop)
	{
	LARGE_INTEGER frequency;
	QueryPerformanceFrequency(&frequency);

	LARGE_INTEGER startingTime;
	QueryPerformanceCounter(&startingTime);

	concurrency::combinable<float> sum;
	concurrency::parallel_for(0, nLoop, [t, &sum](int i)
	{
	sum.local() += t();
	}, concurrency::static_partitioner());
	LARGE_INTEGER endingTime;
	QueryPerformanceCounter(&endingTime);
	auto elapsedMicroseconds = ((endingTime.QuadPart - startingTime.QuadPart) * 1000000.0) / frequency.QuadPart;

	//printf("result=%f\n", sum.combine(std::plus<float>()) / nLoop);

	return elapsedMicroseconds / nLoop;
	}

	// Reports whether AVX instructions can be used on this machine.
	//
	// Three conditions must all hold:
	//   1) CPUID leaf 1, ECX bit 27: the OS uses XSAVE/XRSTOR (so YMM registers
	//      can be saved on a context switch),
	//   2) CPUID leaf 1, ECX bit 28: the CPU supports AVX,
	//   3) XGETBV: the feature-enabled mask has bits 1 and 2 set (0x6), i.e. the
	//      OS will actually save and restore the YMM registers.
	//
	// XGETBV is only executed after 1) and 2) have been confirmed, because the
	// instruction is not available on every CPU.
	//
	// ref: http://insufficientlycomplicated.wordpress.com/2011/11/07/detecting-intel-advanced-vector-extensions-avx-in-visual-studio/
	bool IsAvxSupported()
	{
		int regs[4];
		__cpuid(regs, 1);

		// Explicit bit tests replace the escaped "\|\| false" residue, which was
		// not valid C++.
		const int ecx = regs[2];
		const bool osUsesXsave = (ecx & (1 << 27)) != 0;
		const bool cpuHasAvx = (ecx & (1 << 28)) != 0;
		if (!osUsesXsave || !cpuHasAvx)
		{
			return false;
		}

		// Check that the OS will save the YMM registers.
		const unsigned long long enabledFeatures = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
		return (enabledFeatures & 0x6) == 0x6;
	}

	int _tmain(int argc, _TCHAR* argv [])
	{
	_CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF \| _CRTDBG_LEAK_CHECK_DF);

	bool avxSupported = IsAvxSupported();

	const int len_begin = 16;
	const int len_end = 512 * 1024;
	const int len_fact = 2;
	const int run_loops = 1000;
	_ASSERT(len_begin % 16 == 0 && len_fact % 2 == 0);

	std::mt19937 rng;
	std::uniform_real_distribution<> dst(-1, 1);

	printf("len, tNormal, tSse, tAvx, tAvx2\n");

	for (auto len = len_begin; len <= len_end; len *= len_fact)
	{
	auto vec1 = reinterpret_cast<float>(_aligned_malloc(sizeof(float) len, 32));
	auto vec2 = reinterpret_cast<float>(_aligned_malloc(sizeof(float) len, 32));
	std::generate(vec1, vec1 + len, [&rng, &dst]() { return dst(rng); });
	std::generate(vec2, vec2 + len, [&rng, &dst]() { return dst(rng); });

	auto tNormal =
	calc_for_a_moment([vec1, vec2, len]() {
	return dot_normal(vec1, vec2, len);
	}, run_loops);
	auto tSse =
	calc_for_a_moment([vec1, vec2, len]() {
	return dot_sse(
	reinterpret_cast<__m128*>(vec1),
	reinterpret_cast<__m128*>(vec2),
	len / 4);
	}, run_loops);
	double tAvx = 0;
	double tAvx2 = 0;
	if (avxSupported)
	{
	tAvx =
	calc_for_a_moment([vec1, vec2, len]() {
	return dot_avx(
	reinterpret_cast<__m256*>(vec1),
	reinterpret_cast<__m256*>(vec2),
	len / 8);
	}, run_loops);
	tAvx2 =
	calc_for_a_moment([vec1, vec2, len]() {
	return dot_avx_2(
	reinterpret_cast<__m256*>(vec1),
	reinterpret_cast<__m256*>(vec2),
	len / 8);
	}, run_loops);
	}

	printf("%d, %f, %f, %f, %f\n", len, tNormal, tSse, tAvx, tAvx2);

	_aligned_free(vec1);
	_aligned_free(vec2);
	}
	return 0;
	}
	// stdafx.cpp : source file that includes just the standard includes
	// ConsoleApplication1.pch will be the pre-compiled header
	// stdafx.obj will contain the pre-compiled type information

	#include "stdafx.h"

	// TODO: reference any additional headers you need in STDAFX.H
	// and not in this file
	// stdafx.h : include file for standard system include files,
	// or project specific include files that are used frequently, but
	// are changed infrequently
	//

	#pragma once

	#include "targetver.h"

	#include <stdio.h>
	#include <tchar.h>



	// TODO: reference additional headers your program requires here
	#define _CRTDBG_MAP_ALLOC
	#include <stdlib.h>
	#include <crtdbg.h>

	#include <windows.h>
	#include <ppl.h>
	#include <xmmintrin.h>
	#include <immintrin.h>

	#include <exception>
	#include <random>
	#include <algorithm>
	#pragma once

	// Including SDKDDKVer.h defines the highest available Windows platform.

	// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
	// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.

	#include <SDKDDKVer.h>