Skip to content

Instantly share code, notes, and snippets.

@hohoCode
Last active December 15, 2015 08:29
Show Gist options
  • Save hohoCode/5231592 to your computer and use it in GitHub Desktop.
Save hohoCode/5231592 to your computer and use it in GitHub Desktop.
SSE Code
/**
 * Intersect two runs of sentence_num and emit the combined bit indexes.
 *
 * The "short" run is sentence_num[short_start..short_end] and the "long" run
 * is sentence_num[long_start..long_end] (both bounds inclusive). For every
 * value present in both runs, the two matching bit_index entries are combined
 * as (x >> 1) & y; when the result is non-zero, the value and the combined
 * bits are appended to res_sentence_num / res_bit_index at *globalIndexStart,
 * which is then incremented.
 *
 * Full blocks of 4 ints are compared with SSE2; the leftovers are handled by
 * a scalar merge.
 *
 * Requirements implied by the merge logic: both runs must be sorted in
 * ascending order. The result arrays must be large enough for every match;
 * no capacity check is (or can be) performed here.
 *
 * @param sentence_num      array holding both runs
 * @param bit_index         bit masks parallel to sentence_num
 * @param shortlen          element count of the short run
 * @param short_start       first index of the short run
 * @param short_end         last index (inclusive) of the short run
 * @param longlen           element count of the long run
 * @param long_start        first index of the long run
 * @param long_end          last index (inclusive) of the long run
 * @param distance          only its sign is used, and only in the scalar
 *                          part, to choose which operand is shifted
 * @param res_sentence_num  output: matched values
 * @param res_bit_index     output: combined bit masks
 * @param globalIndexStart  in/out: next free slot in the output arrays
 * @return item_bound with low = *globalIndexStart on entry and
 *         up = *globalIndexStart on exit, i.e. [low, up) was written
 *
 * NOTE(review): the vectorized part always shifts the short-run entry and
 * ignores `distance`, whereas the scalar part shifts the long-run entry when
 * distance < 0. Confirm with the callers whether negative distances are
 * expected here.
 */
inline item_bound sequential_SSE(int *sentence_num, unsigned long *bit_index, int shortlen, int short_start, int short_end, int longlen, int long_start, int long_end, int distance, int *res_sentence_num, unsigned long *res_bit_index, int *globalIndexStart)
{
    int sind = short_start;
    int lind = long_start;

    // Offset of the last full 4-int block in each run; negative when the run
    // holds fewer than 4 elements.
    int shortLenReg = (shortlen / 4) * 4 - 4;
    int longLenReg = (longlen / 4) * 4 - 4;
    const bool vectorized = (shortLenReg >= 0 && longLenReg >= 0);

    item_bound resultBoundary;
    resultBoundary.low = *globalIndexStart;
    resultBoundary.up = *globalIndexStart;

    if (vectorized) {
        // Turn the relative offsets into absolute indexes into sentence_num.
        shortLenReg += short_start;
        longLenReg += long_start;

        // Unaligned loads: the start indexes are arbitrary, so the 16-byte
        // alignment demanded by _mm_load_si128 cannot be assumed.
        __m128i short_a = _mm_loadu_si128((const __m128i *) &sentence_num[sind]);
        __m128i long_b = _mm_loadu_si128((const __m128i *) &sentence_num[lind]);
        int a_max = sentence_num[sind + 3];
        int b_max = sentence_num[lind + 3];

        for (;;) {
            // Broadcast each long-run lane and compare it with all 4 short-run lanes.
            const __m128i cmp_mask1 = _mm_cmpeq_epi32(short_a, _mm_shuffle_epi32(long_b, _MM_SHUFFLE(0, 0, 0, 0)));
            const __m128i cmp_mask2 = _mm_cmpeq_epi32(short_a, _mm_shuffle_epi32(long_b, _MM_SHUFFLE(1, 1, 1, 1)));
            const __m128i cmp_mask3 = _mm_cmpeq_epi32(short_a, _mm_shuffle_epi32(long_b, _MM_SHUFFLE(2, 2, 2, 2)));
            const __m128i cmp_mask4 = _mm_cmpeq_epi32(short_a, _mm_shuffle_epi32(long_b, _MM_SHUFFLE(3, 3, 3, 3)));
            const __m128i cmp_any = _mm_or_si128(_mm_or_si128(cmp_mask1, cmp_mask2), _mm_or_si128(cmp_mask3, cmp_mask4));

            if (_mm_movemask_ps(_mm_castsi128_ps(cmp_any)) != 0) {
                // At least one pair matched. mask[c] has bit s set when
                // short lane s equals long lane c.
                const int mask[4] = {
                    _mm_movemask_ps(_mm_castsi128_ps(cmp_mask1)),
                    _mm_movemask_ps(_mm_castsi128_ps(cmp_mask2)),
                    _mm_movemask_ps(_mm_castsi128_ps(cmp_mask3)),
                    _mm_movemask_ps(_mm_castsi128_ps(cmp_mask4))
                };

                // Spill the short block once so lanes can be read with a
                // run-time index (_mm_extract_epi32 needs a constant lane).
                int short_vals[4];
                _mm_storeu_si128((__m128i *) short_vals, short_a);

                for (int ccc = 0; ccc < 4; ccc++) {
                    if (mask[ccc] == 0) {
                        continue;
                    }
                    for (int bitpos = 0; bitpos < 4; ++bitpos) {
                        if (((mask[ccc] >> bitpos) & 1) == 0) {
                            continue;
                        }
                        const unsigned long merged = (bit_index[sind + bitpos] >> 1) & bit_index[lind + ccc];
                        if (merged != 0) {
                            // Record the match in the output arrays.
                            res_bit_index[*globalIndexStart] = merged;
                            res_sentence_num[*globalIndexStart] = short_vals[bitpos];
                            (*globalIndexStart)++;
                        }
                    }
                }
            }

            // Advance the block whose largest element is smaller. The bound
            // is checked BEFORE reloading so that no block beyond the last
            // full one is ever read.
            if (a_max <= b_max) {
                sind += 4;
                if (sind > shortLenReg) {
                    break;
                }
                short_a = _mm_loadu_si128((const __m128i *) &sentence_num[sind]);
                a_max = sentence_num[sind + 3];
            } else {
                lind += 4;
                if (lind > longLenReg) {
                    break;
                }
                long_b = _mm_loadu_si128((const __m128i *) &sentence_num[lind]);
                b_max = sentence_num[lind + 3];
            }
        }
    }

    // Scalar merge for whatever the vectorized part did not cover.
    int outer_starter;
    int outer_ender;
    int prev;
    int inner_ender;
    if (vectorized && lind == longLenReg + 4) {
        // The long run used up its full blocks: walk its leftovers in the
        // outer loop and flip the sign of `distance` to account for the
        // swapped roles of i and j below.
        outer_starter = lind;
        outer_ender = long_end;
        prev = sind;
        inner_ender = short_end;
        distance = -distance;
    } else if (!vectorized || sind == shortLenReg + 4) {
        // Either nothing was vectorized or the short run used up its full
        // blocks: walk the (remaining) short run in the outer loop.
        outer_starter = sind;
        outer_ender = short_end;
        prev = lind;
        inner_ender = long_end;
    } else {
        // The vector loop only stops when one side passed its last full
        // block, so this indicates a broken invariant.
        std::cerr << "Can this be possible?!" << std::endl;
        exit(EXIT_FAILURE);
    }

    for (int i = outer_starter; i <= outer_ender; i++) {
        const int item = sentence_num[i];
        // `prev` remembers where the previous search stopped, so the inner
        // scan never restarts from the beginning.
        for (int j = prev; j <= inner_ender; j++) {
            prev = j;
            if (item < sentence_num[j]) {
                break;
            }
            if (item == sentence_num[j]) {
                unsigned long merged;
                if (distance < 0) {
                    merged = (bit_index[j] >> 1) & bit_index[i];
                } else {
                    merged = (bit_index[i] >> 1) & bit_index[j];
                }
                if (merged != 0) {
                    res_bit_index[*globalIndexStart] = merged;
                    res_sentence_num[*globalIndexStart] = item;
                    (*globalIndexStart)++;
                }
                break;
            }
        }
    }

    resultBoundary.up = *globalIndexStart;
    return resultBoundary;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment