Skip to content

Instantly share code, notes, and snippets.

@MtShan
Forked from belltailjp/simd.cpp
Last active February 9, 2016 08:03
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MtShan/375a371f0f215db85540 to your computer and use it in GitHub Desktop.
Save MtShan/375a371f0f215db85540 to your computer and use it in GitHub Desktop.
========================================================================
CONSOLE APPLICATION : ConsoleApplication1 Project Overview
========================================================================
AppWizard has created this ConsoleApplication1 application for you.
This file contains a summary of what you will find in each of the files that
make up your ConsoleApplication1 application.
ConsoleApplication1.vcxproj
This is the main project file for VC++ projects generated using an Application Wizard.
It contains information about the version of Visual C++ that generated the file, and
information about the platforms, configurations, and project features selected with the
Application Wizard.
ConsoleApplication1.vcxproj.filters
This is the filters file for VC++ projects generated using an Application Wizard.
It contains information about the association between the files in your project
and the filters. This association is used in the IDE to show grouping of files with
similar extensions under a specific node (e.g., ".cpp" files are associated with the
"Source Files" filter).
ConsoleApplication1.cpp
This is the main application source file.
/////////////////////////////////////////////////////////////////////////////
Other standard files:
StdAfx.h, StdAfx.cpp
These files are used to build a precompiled header (PCH) file
named ConsoleApplication1.pch and a precompiled types file named StdAfx.obj.
/////////////////////////////////////////////////////////////////////////////
Other notes:
AppWizard uses "TODO:" comments to indicate parts of the source code you
should add to or customize.
/////////////////////////////////////////////////////////////////////////////
// ConsoleApplication1.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
/// Scalar reference implementation of the dot product.
///
/// Multiplies vec1[i] by vec2[i] for every i in [0, n) and accumulates the
/// products, in index order, into a single float.
///
/// @param vec1 First operand; must provide at least n readable floats.
/// @param vec2 Second operand; must provide at least n readable floats.
/// @param n    Number of float elements to process. A value <= 0 yields 0.
/// @return The accumulated sum of the element-wise products.
float dot_normal(
    _In_reads_(n) const float* __restrict vec1,
    _In_reads_(n) const float* __restrict vec2,
    int n)
{
    float acc = 0;
    int idx = 0;
    while (idx < n)
    {
        acc += vec1[idx] * vec2[idx];
        ++idx;
    }
    return acc;
}
/// Dot product using 128-bit SSE vectors (four floats per step).
///
/// Keeps four running partial sums in one __m128 accumulator, then adds the
/// four lanes together once the loop is done.
///
/// @param vec1 First operand, viewed as an array of n __m128 values.
/// @param vec2 Second operand, viewed as an array of n __m128 values.
/// @param n    Number of __m128 vectors (not floats) to process.
/// @return Sum of all lane-wise products.
float dot_sse(
    _In_reads_(n) const __m128* __restrict vec1,
    _In_reads_(n) const __m128* __restrict vec2,
    int n)
{
    __m128 acc = _mm_setzero_ps();
    for (int k = 0; k < n; ++k)
    {
        acc = _mm_add_ps(acc, _mm_mul_ps(vec1[k], vec2[k]));
    }

    // Spill the accumulator to memory and reduce the lanes in index order.
    float lanes[4];
    _mm_storeu_ps(lanes, acc);
    return lanes[0] + lanes[1] + lanes[2] + lanes[3];
}
/// Dot product using 256-bit AVX vectors (eight floats per step).
///
/// Keeps eight running partial sums in one __m256 accumulator, then adds the
/// eight lanes together once the loop is done.
///
/// @param vec1 First operand, viewed as an array of n __m256 values.
/// @param vec2 Second operand, viewed as an array of n __m256 values.
/// @param n    Number of __m256 vectors (not floats) to process.
/// @return Sum of all lane-wise products.
float dot_avx(
    _In_reads_(n) const __m256* __restrict vec1,
    _In_reads_(n) const __m256* __restrict vec2,
    int n)
{
    __m256 acc = _mm256_setzero_ps();
    for (int k = 0; k < n; ++k)
    {
        acc = _mm256_add_ps(acc, _mm256_mul_ps(vec1[k], vec2[k]));
    }

    // Spill the accumulator to memory and reduce the lanes in index order.
    float lanes[8];
    _mm256_storeu_ps(lanes, acc);
    return lanes[0] + lanes[1] + lanes[2] + lanes[3]
         + lanes[4] + lanes[5] + lanes[6] + lanes[7];
}
/// Dot product using 256-bit AVX vectors, unrolled by two.
///
/// Each iteration multiplies two consecutive __m256 pairs and adds them into
/// two independent accumulators, which are merged after the loop and then
/// reduced lane by lane.
///
/// @param vec1 First operand, viewed as an array of n __m256 values.
/// @param vec2 Second operand, viewed as an array of n __m256 values.
/// @param n    Number of __m256 vectors (not floats) to process; must be even.
/// @return Sum of all lane-wise products.
/// @throws std::invalid_argument if n is not a multiple of 2.
float dot_avx_2(
    _In_reads_(n) const __m256* __restrict vec1,
    _In_reads_(n) const __m256* __restrict vec2,
    int n)
{
    // The loop consumes vectors i and i + 1 together, so an odd count would
    // read one vector past the end of both inputs. (The original test was
    // inverted and rejected every valid, even count instead.)
    if (n % 2 != 0)
    {
        throw std::invalid_argument("n must be multiple of 2.");
    }

    auto u1 = _mm256_setzero_ps();
    auto u2 = _mm256_setzero_ps();
    for (auto i = 0; i < n; i += 2)
    {
        auto x1 = _mm256_mul_ps(vec1[i], vec2[i]);
        auto x2 = _mm256_mul_ps(vec1[i + 1], vec2[i + 1]);
        u1 = _mm256_add_ps(u1, x1);
        u2 = _mm256_add_ps(u2, x2);
    }

    // Merge the two accumulators, then reduce the eight lanes in index order.
    u1 = _mm256_add_ps(u1, u2);
    float t[8];
    _mm256_storeu_ps(t, u1);
    return t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
}
template<class Function>
double calc_for_a_moment(Function&& t, int nLoop)
{
LARGE_INTEGER frequency;
QueryPerformanceFrequency(&frequency);
LARGE_INTEGER startingTime;
QueryPerformanceCounter(&startingTime);
concurrency::combinable<float> sum;
concurrency::parallel_for(0, nLoop, [t, &sum](int i)
{
sum.local() += t();
}, concurrency::static_partitioner());
LARGE_INTEGER endingTime;
QueryPerformanceCounter(&endingTime);
auto elapsedMicroseconds = ((endingTime.QuadPart - startingTime.QuadPart) * 1000000.0) / frequency.QuadPart;
//printf("result=%f\n", sum.combine(std::plus<float>()) / nLoop);
return elapsedMicroseconds / nLoop;
}
/// Reports whether AVX instructions can be used on this machine.
///
/// Three conditions must all hold:
///   1) CPUID leaf 1, ECX bit 27: the OS uses XSAVE/XRSTOR, so extended
///      register state can be saved on a context switch.
///   2) CPUID leaf 1, ECX bit 28: the CPU supports AVX.
///   3) XGETBV: bits 1 and 2 (mask 0x6) of the extended-feature mask are
///      both set, i.e. the OS saves and restores the YMM registers.
///
/// XGETBV is only executed once conditions 1 and 2 have passed, because the
/// instruction is not available on every CPU.
///
/// ref: http://insufficientlycomplicated.wordpress.com/2011/11/07/detecting-intel-advanced-vector-extensions-avx-in-visual-studio/
///
/// @return true when all three checks pass, false otherwise.
bool IsAvxSupported()
{
    int regs[4];
    __cpuid(regs, 1);

    const int ecx = regs[2];
    const bool osSavesExtendedState = (ecx & (1 << 27)) != 0;
    const bool cpuHasAvx = (ecx & (1 << 28)) != 0;
    if (!osSavesExtendedState || !cpuHasAvx)
    {
        return false;
    }

    // Check that the OS will actually preserve the YMM registers.
    const unsigned long long featureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
    return (featureMask & 0x6) == 0x6;
}
int _tmain(int argc, _TCHAR* argv [])
{
_CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
bool avxSupported = IsAvxSupported();
const int len_begin = 16;
const int len_end = 512 * 1024;
const int len_fact = 2;
const int run_loops = 1000;
_ASSERT(len_begin % 16 == 0 && len_fact % 2 == 0);
std::mt19937 rng;
std::uniform_real_distribution<> dst(-1, 1);
printf("len, tNormal, tSse, tAvx, tAvx2\n");
for (auto len = len_begin; len <= len_end; len *= len_fact)
{
auto vec1 = reinterpret_cast<float*>(_aligned_malloc(sizeof(float) * len, 32));
auto vec2 = reinterpret_cast<float*>(_aligned_malloc(sizeof(float) * len, 32));
std::generate(vec1, vec1 + len, [&rng, &dst]() { return dst(rng); });
std::generate(vec2, vec2 + len, [&rng, &dst]() { return dst(rng); });
auto tNormal =
calc_for_a_moment([vec1, vec2, len]() {
return dot_normal(vec1, vec2, len);
}, run_loops);
auto tSse =
calc_for_a_moment([vec1, vec2, len]() {
return dot_sse(
reinterpret_cast<__m128*>(vec1),
reinterpret_cast<__m128*>(vec2),
len / 4);
}, run_loops);
double tAvx = 0;
double tAvx2 = 0;
if (avxSupported)
{
tAvx =
calc_for_a_moment([vec1, vec2, len]() {
return dot_avx(
reinterpret_cast<__m256*>(vec1),
reinterpret_cast<__m256*>(vec2),
len / 8);
}, run_loops);
tAvx2 =
calc_for_a_moment([vec1, vec2, len]() {
return dot_avx_2(
reinterpret_cast<__m256*>(vec1),
reinterpret_cast<__m256*>(vec2),
len / 8);
}, run_loops);
}
printf("%d, %f, %f, %f, %f\n", len, tNormal, tSse, tAvx, tAvx2);
_aligned_free(vec1);
_aligned_free(vec2);
}
return 0;
}
// stdafx.cpp : source file that includes just the standard includes
// ConsoleApplication1.pch will be the pre-compiled header
// stdafx.obj will contain the pre-compiled type information
#include "stdafx.h"
// TODO: reference any additional headers you need in STDAFX.H
// and not in this file
// stdafx.h : include file for standard system include files,
// or project specific include files that are used frequently, but
// are changed infrequently
//
#pragma once
#include "targetver.h"
#include <stdio.h>
#include <tchar.h>
// TODO: reference additional headers your program requires here
#define _CRTDBG_MAP_ALLOC
#include <stdlib.h>
#include <crtdbg.h>
#include <windows.h>
#include <ppl.h>
#include <xmmintrin.h>
#include <immintrin.h>
#include <exception>
#include <stdexcept>
#include <random>
#include <algorithm>
#pragma once
// Including SDKDDKVer.h defines the highest available Windows platform.
// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
#include <SDKDDKVer.h>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment