This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
======================================================================== | |
CONSOLE APPLICATION : ConsoleApplication1 Project Overview | |
======================================================================== | |
AppWizard has created this ConsoleApplication1 application for you. | |
This file contains a summary of what you will find in each of the files that | |
make up your ConsoleApplication1 application. | |
ConsoleApplication1.vcxproj | |
This is the main project file for VC++ projects generated using an Application Wizard. | |
It contains information about the version of Visual C++ that generated the file, and | |
information about the platforms, configurations, and project features selected with the | |
Application Wizard. | |
ConsoleApplication1.vcxproj.filters | |
This is the filters file for VC++ projects generated using an Application Wizard.
It contains information about the association between the files in your project
and the filters. This association is used in the IDE to show grouping of files with
similar extensions under a specific node (e.g., ".cpp" files are associated with the
"Source Files" filter).
ConsoleApplication1.cpp | |
This is the main application source file. | |
///////////////////////////////////////////////////////////////////////////// | |
Other standard files: | |
StdAfx.h, StdAfx.cpp | |
These files are used to build a precompiled header (PCH) file | |
named ConsoleApplication1.pch and a precompiled types file named StdAfx.obj. | |
///////////////////////////////////////////////////////////////////////////// | |
Other notes: | |
AppWizard uses "TODO:" comments to indicate parts of the source code you | |
should add to or customize. | |
///////////////////////////////////////////////////////////////////////////// |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ConsoleApplication1.cpp : Defines the entry point for the console application. | |
// | |
#include "stdafx.h" | |
// Scalar reference dot product.
//
// Multiplies the first `n` elements of `vec1` and `vec2` pairwise and adds the
// products, in index order, into a single float accumulator that starts at 0.
// Returns 0 when `n` is not positive.
float dot_normal(
    _In_reads_(n) const float* __restrict vec1,
    _In_reads_(n) const float* __restrict vec2,
    int n)
{
    float acc = 0.0f;
    const float* lhs = vec1;
    const float* rhs = vec2;
    for (int remaining = n; remaining > 0; --remaining)
    {
        acc += *lhs++ * *rhs++;
    }
    return acc;
}
// SSE dot product over packed groups of four floats.
//
// `n` counts __m128 elements (not floats). Each iteration multiplies one pair
// of __m128 values lane-wise and adds the result to a four-lane accumulator;
// the four lanes are then added together in lane order 0..3.
// Returns 0 when `n` is not positive.
float dot_sse(
    _In_reads_(n) const __m128* __restrict vec1,
    _In_reads_(n) const __m128* __restrict vec2,
    int n)
{
    __m128 acc = _mm_setzero_ps();
    for (int block = 0; block < n; ++block)
    {
        acc = _mm_add_ps(acc, _mm_mul_ps(vec1[block], vec2[block]));
    }

    // Copy the lanes out so the horizontal sum works on plain floats.
    float lanes[4];
    _mm_storeu_ps(lanes, acc);
    return lanes[0] + lanes[1] + lanes[2] + lanes[3];
}
// AVX dot product over packed groups of eight floats.
//
// `n` counts __m256 elements (not floats). Each iteration multiplies one pair
// of __m256 values lane-wise and adds the result to an eight-lane accumulator;
// the eight lanes are then added together in lane order 0..7.
// Returns 0 when `n` is not positive.
float dot_avx(
    _In_reads_(n) const __m256* __restrict vec1,
    _In_reads_(n) const __m256* __restrict vec2,
    int n)
{
    __m256 acc = _mm256_setzero_ps();
    for (int block = 0; block < n; ++block)
    {
        acc = _mm256_add_ps(acc, _mm256_mul_ps(vec1[block], vec2[block]));
    }

    // Copy the lanes out so the horizontal sum works on plain floats.
    float lanes[8];
    _mm256_storeu_ps(lanes, acc);
    return lanes[0] + lanes[1] + lanes[2] + lanes[3]
        + lanes[4] + lanes[5] + lanes[6] + lanes[7];
}
// AVX dot product with the loop unrolled by two.
//
// `n` counts __m256 elements (not floats) and must be even, because every
// iteration consumes elements `i` and `i + 1`. Two independent eight-lane
// accumulators are kept (one per unrolled slot), merged after the loop, and
// the eight lanes of the merged value are added in lane order 0..7.
//
// Throws std::invalid_argument when `n` is odd. (The original guard was
// inverted: it rejected every even `n` and let odd `n` read one element past
// the end of both inputs.)
// Returns 0 when `n` is 0 or a negative even number.
float dot_avx_2(
    _In_reads_(n) const __m256* __restrict vec1,
    _In_reads_(n) const __m256* __restrict vec2,
    int n)
{
    if (n % 2 != 0)
    {
        throw std::invalid_argument("n must be multiple of 2.");
    }

    __m256 accEven = _mm256_setzero_ps();
    __m256 accOdd = _mm256_setzero_ps();
    for (int i = 0; i < n; i += 2)
    {
        accEven = _mm256_add_ps(accEven, _mm256_mul_ps(vec1[i], vec2[i]));
        accOdd = _mm256_add_ps(accOdd, _mm256_mul_ps(vec1[i + 1], vec2[i + 1]));
    }

    // Merge the two partial accumulators, then sum the lanes as plain floats.
    float lanes[8];
    _mm256_storeu_ps(lanes, _mm256_add_ps(accEven, accOdd));
    return lanes[0] + lanes[1] + lanes[2] + lanes[3]
        + lanes[4] + lanes[5] + lanes[6] + lanes[7];
}
template<class Function> | |
double calc_for_a_moment(Function&& t, int nLoop) | |
{ | |
LARGE_INTEGER frequency; | |
QueryPerformanceFrequency(&frequency); | |
LARGE_INTEGER startingTime; | |
QueryPerformanceCounter(&startingTime); | |
concurrency::combinable<float> sum; | |
concurrency::parallel_for(0, nLoop, [t, &sum](int i) | |
{ | |
sum.local() += t(); | |
}, concurrency::static_partitioner()); | |
LARGE_INTEGER endingTime; | |
QueryPerformanceCounter(&endingTime); | |
auto elapsedMicroseconds = ((endingTime.QuadPart - startingTime.QuadPart) * 1000000.0) / frequency.QuadPart; | |
//printf("result=%f\n", sum.combine(std::plus<float>()) / nLoop); | |
return elapsedMicroseconds / nLoop; | |
} | |
bool IsAvxSupported() | |
{ | |
//ref: http://insufficientlycomplicated.wordpress.com/2011/11/07/detecting-intel-advanced-vector-extensions-avx-in-visual-studio/ | |
bool avxSupported = false; | |
// Checking for AVX requires 3 things: | |
// 1) CPUID indicates that the OS uses XSAVE and XRSTORE | |
// instructions (allowing saving YMM registers on context | |
// switch) | |
// 2) CPUID indicates support for AVX | |
// 3) XGETBV indicates the AVX registers will be saved and | |
// restored on context switch | |
// | |
// Note that XGETBV is only available on 686 or later CPUs, so | |
// the instruction needs to be conditionally run. | |
int cpuInfo[4]; | |
__cpuid(cpuInfo, 1); | |
bool osUsesXSAVE_XRSTORE = cpuInfo[2] & (1 << 27) || false; | |
bool cpuAVXSuport = cpuInfo[2] & (1 << 28) || false; | |
if (osUsesXSAVE_XRSTORE && cpuAVXSuport) | |
{ | |
// Check if the OS will save the YMM registers | |
unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); | |
avxSupported = (xcrFeatureMask & 0x6) == 0x6; | |
} | |
return avxSupported; | |
} | |
int _tmain(int argc, _TCHAR* argv []) | |
{ | |
_CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF); | |
bool avxSupported = IsAvxSupported(); | |
const int len_begin = 16; | |
const int len_end = 512 * 1024; | |
const int len_fact = 2; | |
const int run_loops = 1000; | |
_ASSERT(len_begin % 16 == 0 && len_fact % 2 == 0); | |
std::mt19937 rng; | |
std::uniform_real_distribution<> dst(-1, 1); | |
printf("len, tNormal, tSse, tAvx, tAvx2\n"); | |
for (auto len = len_begin; len <= len_end; len *= len_fact) | |
{ | |
auto vec1 = reinterpret_cast<float*>(_aligned_malloc(sizeof(float) * len, 32)); | |
auto vec2 = reinterpret_cast<float*>(_aligned_malloc(sizeof(float) * len, 32)); | |
std::generate(vec1, vec1 + len, [&rng, &dst]() { return dst(rng); }); | |
std::generate(vec2, vec2 + len, [&rng, &dst]() { return dst(rng); }); | |
auto tNormal = | |
calc_for_a_moment([vec1, vec2, len]() { | |
return dot_normal(vec1, vec2, len); | |
}, run_loops); | |
auto tSse = | |
calc_for_a_moment([vec1, vec2, len]() { | |
return dot_sse( | |
reinterpret_cast<__m128*>(vec1), | |
reinterpret_cast<__m128*>(vec2), | |
len / 4); | |
}, run_loops); | |
double tAvx = 0; | |
double tAvx2 = 0; | |
if (avxSupported) | |
{ | |
tAvx = | |
calc_for_a_moment([vec1, vec2, len]() { | |
return dot_avx( | |
reinterpret_cast<__m256*>(vec1), | |
reinterpret_cast<__m256*>(vec2), | |
len / 8); | |
}, run_loops); | |
tAvx2 = | |
calc_for_a_moment([vec1, vec2, len]() { | |
return dot_avx_2( | |
reinterpret_cast<__m256*>(vec1), | |
reinterpret_cast<__m256*>(vec2), | |
len / 8); | |
}, run_loops); | |
} | |
printf("%d, %f, %f, %f, %f\n", len, tNormal, tSse, tAvx, tAvx2); | |
_aligned_free(vec1); | |
_aligned_free(vec2); | |
} | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// stdafx.cpp : source file that includes just the standard includes | |
// ConsoleApplication1.pch will be the pre-compiled header | |
// stdafx.obj will contain the pre-compiled type information | |
#include "stdafx.h" | |
// TODO: reference any additional headers you need in STDAFX.H | |
// and not in this file |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// stdafx.h : include file for standard system include files, | |
// or project specific include files that are used frequently, but | |
// are changed infrequently | |
// | |
#pragma once | |
#include "targetver.h" | |
#include <stdio.h> | |
#include <tchar.h> | |
// TODO: reference additional headers your program requires here | |
#define _CRTDBG_MAP_ALLOC | |
#include <stdlib.h> | |
#include <crtdbg.h> | |
#include <windows.h> | |
#include <ppl.h> | |
#include <xmmintrin.h> | |
#include <immintrin.h> | |
#include <exception> | |
#include <random> | |
#include <algorithm> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
// Including SDKDDKVer.h defines the highest available Windows platform. | |
// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and | |
// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h. | |
#include <SDKDDKVer.h> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment