Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
#include <iostream>
#include <intrin.h>
#include <cmath>
#include <ctime>
#include <random>
#include "Chrono.h"
// An 8 wide float
#define float8 __m256
// Set 1 value to all 8 lanes
#define set1(a) _mm256_set1_ps((a))
// Add
#define add8(a,b) _mm256_add_ps((a),(b))
// Min
#define min8(a,b) _mm256_min_ps((a),(b))
// Shuffles
#define permute8(a,b,c) _mm256_permute2f128_ps((a),(b),(c))
#define blend8(a,b,c) _mm256_blend_ps((a),(b),(c))
#define shuffle8(a,b,c) _mm256_shuffle_ps((a),(b),(c))
// Length of test arrays 2^22
#define LEN 4194304
// Padding for aligned arrays
#define BUFFER 8
using namespace std;
/* HELPERS */
// For a given pointer, return the 32 bit aligned version
float* alignMem(float *ptr) {
float *aptr = ptr;
int align = (((unsigned long long) ptr) & 31) / 4;
aptr += 8 - align;
return aptr;
};
// Make an aligned array of the given length.
// The randomize parameter fills the array with
// random values.
float* makeArray(int len, bool randomize = true) {
float *list = new float[len + BUFFER];
float *alist = alignMem(list);
for (int i = 0; i < len && randomize; i++) {
alist[i] = (float) (rand() % 1000) / 1000.0f;
}
return alist;
};
// Prints a float array to the console
void printArray(float *list, int len) {
cout << "[ ";
for (int i = 0; i < len; i++) {
cout << list[i] << " ";
}
cout << " ]" << endl;
};
/* EXPERIMENTS */
// Take two float arrays, a & b, and sum them into a third array, r
void addVal(float *list1, float *list2, float *res, int len) {
for (int i = 0; i < len; i++) {
res[i] = list1[i] + list2[i];
}
};
// Same as above, but using AVX to chunk the workload by 8
void addValAVX(float *list1, float *list2, float *res, int len) {
for (int i = 0; i < len; i += 8) {
// Get address of start of each float8 group
float8 *a = (float8*) &list1[i];
float8 *b = (float8*) &list2[i];
float8 *r = (float8*) &res[i];
// Sum the groups of 8 into the result group
*r = add8(*a, *b);
}
};
// Take two float array and interleave their values so the result
// array contains even indexed elements from a, and odd indexed elements from b
void blendVal(float *list1, float *list2, float *res, int len) {
for (int i = 0; i < len; i++) {
res[i] = (i % 2 == 0) ? list1[i] : list2[i];
}
};
// Same as above, but using AVX to chunk the workload by groups of 8,
// and by using the 'blend' command to choose each element without an
// 'if-else' condition
void blendValAVX(float *list1, float *list2, float *res, int len) {
// Mask. A 0 means from list1, a 1 means from list 2
// You have to convert your binary mask to hex to compile it!
const int mask = 0xAA; // 0b10101010
for (int i = 0; i < len; i += 8) {
// Get address of start of each float8 group
float8 *a = (float8*) &list1[i];
float8 *b = (float8*) &list2[i];
float8 *r = (float8*) &res[i];
// Select from a & b using the mask
*r = blend8(*a, *b, mask);
}
};
// Like above, take two float arrays, but this time take the first two values
// from list a, then the second two from list b, then the third two from list a,
// and so on...
void shuffleVal(float *list1, float *list2, float *res, int len) {
for (int i = 0; i < len; i++) {
// This results in a stream of true/false
// TT FF TT FF TT FF ... repeating
if ((i / 2) % 2 == 0) {
res[i] = list1[i];
}
else
{
res[i] = list2[i];
}
}
};
// Does the above shuffle using AVX to grab each pair of numbers
void shuffleValAVX(float *list1, float *list2, float *res, int len) {
// Mask. Grab the first and second elements from a,
// then the third and fourth elements from b
const int mask = 0xE4; // 0b 11 10 01 00
for (int i = 0; i < len; i += 8) {
// Get address of start of each float8 group
float8 *a = (float8*) &list1[i];
float8 *b = (float8*) &list2[i];
float8 *r = (float8*) &res[i];
// Select from a and b using the mask
*r = shuffle8(*a, *b, mask);
}
};
// For an array of floats, compute the minumum value
float minVal(float *list1, int len) {
// Start with the highest number possibly in the list
float m = (float) RAND_MAX;
for (int i = 0; i < len; i++) {
// For each value check if it is less than the
// current minimum, and replace it if it is
if (m > list1[i]) m = list1[i];
}
return m;
};
// Find the minimum value in the array, this time using AVX
// to chunk the workload into groups of 8 which are solved
// in parallel
float minValAVX(float *list, int len) {
const int shufflemask = 0xB1; // 0b 10 11 00 01
const int shufflemask2 = 0x4E; // 0b 01 00 11 10
const int permutemask = 0x21; // 0b 00 10 00 01
// Creat buffer for results
int sublen = len / 8;
float *sub = new float[sublen + BUFFER];
float *sublist = alignMem(sub);
float8 res, b;
float *res0 = (float*) &res;
bool reduce = true;
do {
if (len < 8) {
memset(&sublist[len], 0, (8 - len) * sizeof(float));
reduce = false;
}
// Loop over all float8's in the list
int j = 0;
for (int i = 0; i < len; i += 8) {
// Grab the next float8 and compute the sum
res = *(float8*) &list[i];
// Shuffle 1 & find sum
b = shuffle8(res, res, shufflemask);
res = min8(res, b);
// Shuffle 2 & find sum
b = shuffle8(res, res, shufflemask2);
res = min8(res, b);
// Permute & find sum
b = permute8(res, res, permutemask);
res = min8(res, b);
// Chuck the result into the next index of the result array
sublist[j] = *res0;
j++;
}
len = j;
list = sublist;
} while (reduce);
float m = sublist[0];
delete[] sub;
return m;
};
// For an array of floats, compute the sum
float sumVal(float *list1, int len) {
float sum = 0;
for (int i = 0; i < len; i++) {
sum += list1[i];
}
return sum;
};
// Find the sum of the array, this time using AVX
// to chunk the workload into groups of 8 which are solved
// in parallel
float sumValAVX(float *list, int len) {
const int shufflemask = 0xB1; // 0b 10 11 00 01
const int shufflemask2 = 0x4E; // 0b 01 00 11 10
const int permutemask = 0x21; // 0b 00 10 00 01
// Creat buffer for results
int sublen = len / 8;
float *sub = new float[sublen + BUFFER];
float *sublist = alignMem(sub);
float8 res, b;
float *res0 = (float*) &res;
bool reduce = true;
do {
if (len < 8) {
memset(&list[len], 0, ((8 - len) + 1) * sizeof(float));
reduce = false;
}
// Loop over all float8's in the list
int j = 0;
for (int i = 0; i < len; i += 8) {
// Grab the next float8 and compute the sum
res = *(float8*) &list[i];
// Shuffle 1 & find sum
b = shuffle8(res, res, shufflemask);
res = add8(res, b);
// Shuffle 2 & find sum
b = shuffle8(res, res, shufflemask2);
res = add8(res, b);
// Permute & find sum
b = permute8(res, res, permutemask);
res = add8(res, b);
// Chuck the result into the next index of the result array
sublist[j] = *res0;
j++;
}
len = j;
list = sublist;
} while (reduce);
float sum = sublist[0];
delete [] sub;
return sum;
};
/* RUN */
int main() {
srand(time(NULL));
// Make some random data to play with...
cout << "Array 1" << endl;
float *list1 = makeArray(LEN);
//printArray(list1, LEN);
cout << "Array 2" << endl;
float *list2 = makeArray(LEN);
//printArray(list2, LEN);
float *res = makeArray(LEN, false);
// Start the clock!
Chrono c;
/* Test the Add function! */
cout << endl << "Add Test" << endl;
addVal(list1, list2, res, LEN);
//cout << " "; printArray(res, LEN);
c.PrintElapsedTime_us("\n Time: ");
addValAVX(list1, list2, res, LEN);
//cout << "AVX "; printArray(res, LEN);
c.PrintElapsedTime_us("\nAVX Time: ");
/* Test the Blend function! */
cout << endl << "Blend Test" << endl;
blendVal(list1, list2, res, LEN);
//cout << " "; printArray(res, LEN);
c.PrintElapsedTime_us("\n Time: ");
blendValAVX(list1, list2, res, LEN);
//cout << "AVX "; printArray(res, LEN);
c.PrintElapsedTime_us("\nAVX Time: ");
/* Test the Shuffle function! */
cout << endl << "Shuffle Test" << endl;
shuffleVal(list1, list2, res, LEN);
//cout << " "; printArray(res, LEN);
c.PrintElapsedTime_us("\n Time: ");
shuffleValAVX(list1, list2, res, LEN);
//cout << "AVX "; printArray(res, LEN);
c.PrintElapsedTime_us("\nAVX Time: ");
/* Test the Min function! */
cout << endl << "Min Test" << endl;
cout << "\n " << minVal(list1, LEN) << endl;
c.PrintElapsedTime_us("\n Time: ");
cout << "\nAVX " << minValAVX(list1, LEN) << endl;
c.PrintElapsedTime_us("\nAVX Time: ");
/* Test the Sum function! */
cout << endl << "Sum Test" << endl;
cout << "\n " << sumVal(list1, LEN) << endl;
c.PrintElapsedTime_us("\n Time: ");
cout << "\nAVX " << sumValAVX(list1, LEN) << endl;
c.PrintElapsedTime_us("\nAVX Time: ");
cout << endl << "Len: " << LEN << endl;
system("PAUSE");
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment