-
-
Save monkins1010/da38310c88ce80acb4a879be611839ce to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// mul2.cpp : Defines the entry point for the console application. | |
// | |
#include "stdafx.h" | |
// ConsoleApplication1.cpp : This file contains the 'main' function. Program execution begins and ends there. | |
// | |
#pragma warning (disable : 4146) | |
//#include "pch.h" | |
#include <iostream> | |
#include <emmintrin.h> | |
#include <stdio.h> | |
#include <stdint.h> | |
#define FIELD_WIDTH 16 | |
#define FIELD_SIZE (1 << FIELD_WIDTH) | |
#define Q (FIELD_SIZE - 1) | |
#define gf2_16_irreducible ((1 << 12) | (1<<3) | (1<<1) | (1<<0)) | |
// irreducible polynomial is | |
// x^17 + x^12 + x^3 + x + 1 | |
uint16_t gf2_16_log_table[FIELD_SIZE]; | |
uint16_t gf2_16_exp_table[2 * FIELD_SIZE]; | |
//void init_tables(); | |
// This function computes, in a branch-free way, the value: | |
// (x == 0) ? 0xFFFF : 0x0000 | |
inline uint16_t zeroMask(uint16_t x) { | |
uint32_t y = (uint32_t)x; | |
uint32_t z = (y - 1) >> 16; | |
return (uint16_t)z; | |
} | |
// Compute polynomial addition in GF(2^16) | |
inline uint16_t gf2_16_add(uint16_t x, uint16_t y) { | |
return (x ^ y); | |
} | |
inline uint16_t logsum_modQ(uint16_t a, uint16_t b) { | |
uint32_t sum = ((uint32_t)a) + ((uint32_t)b); | |
uint16_t d = ((uint16_t)sum) + ((uint16_t)(sum >> 16)); | |
return d; | |
} | |
// Compute exp( (a + b)%Q ) | |
inline uint16_t gf2_16_expadd(uint16_t zmask, uint16_t a, uint16_t b) { | |
uint32_t sum = ((uint32_t)a) + ((uint32_t)b); | |
uint16_t t = gf2_16_exp_table[sum]; | |
return (zmask & t) ^ t; | |
} | |
// Compute exp( (a + b + c)%Q ) | |
inline uint16_t gf2_16_expadd3(uint16_t zmask, uint16_t a, uint16_t b, uint16_t c) { | |
uint32_t sum = ((uint32_t)logsum_modQ(a, b)) + ((uint32_t)c); | |
uint16_t t = gf2_16_exp_table[sum]; | |
return (zmask & t) ^ t; | |
} | |
// Compute polynomial multiplication in GF(2^16) | |
inline uint16_t gf2_16_mult(uint16_t x, uint16_t y) { | |
uint16_t a = gf2_16_log_table[x]; | |
uint16_t b = gf2_16_log_table[y]; | |
uint16_t z = zeroMask(x) | zeroMask(y); | |
return gf2_16_expadd(z, a, b); | |
} | |
// Compute polynomial division in GF(2^16). | |
// In other words, find q such that | |
// mult(q, y) = x (mod irreducible) | |
// | |
// Precondition: y != 0 | |
inline uint16_t gf2_16_div(uint16_t x, uint16_t y) { | |
uint16_t a = gf2_16_log_table[x]; | |
uint16_t b = gf2_16_log_table[y]; | |
uint16_t z = zeroMask(x); | |
return gf2_16_expadd(z, a, Q - b); | |
} | |
// precondition x != 0 | |
inline uint16_t gf2_16_inv(uint16_t x) { | |
uint16_t a = gf2_16_log_table[x]; | |
uint16_t b = Q - a; | |
return gf2_16_exp_table[b]; | |
} | |
inline uint16_t gf2_16_exp(uint16_t x) { | |
return gf2_16_exp_table[x]; | |
} | |
inline uint16_t gf2_16_log(uint16_t x) { | |
return gf2_16_log_table[x]; | |
} | |
// We want to calculate a^x, for 0 <= x < 2^16, and a != 0. | |
// | |
// Let Q = 2^16-1, which is the order of the mutiplicative | |
// group of GF(2^16). It is a fact of this multiplicative group | |
// that a^Q = 1. Now, let l = log(a), then | |
// a^x = exp(log(a))^x = exp(l)^x = exp( l*x ). | |
// Moreover, because of the previous fact, | |
// this is equal to exp( (l*x)%Q ). | |
// | |
// Finally, it is another amazing fact of modular arithmetic | |
// that we can calculate (z%Q) very easily when z | |
// is a 32-bit number: we can simply add the high | |
// 16-bits to the low-16 bits. This result may carry into | |
// a 17-bit number. This is fine, as our exponential table | |
// is defined for twice the field size, i.e., for 17-bit indices. | |
inline uint16_t gf2_16_pow(uint16_t a, uint16_t x) { | |
uint32_t log; | |
// First calculate the multiplied logarithm. | |
log = (uint32_t)gf2_16_log_table[a]; | |
log = log * (uint32_t)x; | |
// Now take the modulus by (2^16-1). The following | |
// works by some magic of modular arithmetic. | |
log = (log >> 16) + (log & 0xFFFF); | |
return gf2_16_exp_table[log]; | |
} | |
inline uint16_t nextPower(uint16_t b) { | |
return | |
(b >> 15) ? | |
((b << 1) ^ gf2_16_irreducible) : | |
(b << 1); | |
} | |
void init_tables() { | |
// avoid multiple init | |
if (gf2_16_exp_table[0] != 0) return; | |
unsigned int i = 0; | |
uint16_t b = 1; | |
for (; i < Q; i++) { | |
gf2_16_exp_table[i] = b; | |
gf2_16_log_table[b] = i; | |
b = nextPower(b); | |
} | |
for (; i < (2 * FIELD_SIZE); i++) { | |
gf2_16_exp_table[i] = b; | |
b = nextPower(b); | |
} | |
} | |
__m128i gf2_128_mult(__m128i a, __m128i b) { | |
uint16_t a0, a1, a2, a3, a4, a5, a6, a7; | |
uint16_t b0, b1, b2, b3, b4, b5, b6, b7; | |
uint16_t za0, za1, za2, za3, za4, za5, za6, za7; | |
uint16_t zb0, zb1, zb2, zb3, zb4, zb5, zb6, zb7; | |
uint32_t alog0, alog1, alog2, alog3; | |
uint32_t alog4, alog5, alog6, alog7; | |
uint32_t blog0, blog1, blog2, blog3; | |
uint32_t blog4, blog5, blog6, blog7; | |
uint16_t c0, c1, c2, c3, c4, c5, c6, c7; | |
uint16_t c8, c9, c10, c11, c12, c13, c14; | |
uint16_t d0, d1, d2, d3, d4, d5, d6, d7; | |
a0 = ((uint16_t*)&a)[0]; | |
a1 = ((uint16_t*)&a)[1]; | |
a2 = ((uint16_t*)&a)[2]; | |
a3 = ((uint16_t*)&a)[3]; | |
a4 = ((uint16_t*)&a)[4]; | |
a5 = ((uint16_t*)&a)[5]; | |
a6 = ((uint16_t*)&a)[6]; | |
a7 = ((uint16_t*)&a)[7]; | |
b0 = ((uint16_t*)&b)[0]; | |
b1 = ((uint16_t*)&b)[1]; | |
b2 = ((uint16_t*)&b)[2]; | |
b3 = ((uint16_t*)&b)[3]; | |
b4 = ((uint16_t*)&b)[4]; | |
b5 = ((uint16_t*)&b)[5]; | |
b6 = ((uint16_t*)&b)[6]; | |
b7 = ((uint16_t*)&b)[7]; | |
alog0 = gf2_16_log_table[a0]; | |
alog1 = gf2_16_log_table[a1]; | |
alog2 = gf2_16_log_table[a2]; | |
alog3 = gf2_16_log_table[a3]; | |
alog4 = gf2_16_log_table[a4]; | |
alog5 = gf2_16_log_table[a5]; | |
alog6 = gf2_16_log_table[a6]; | |
alog7 = gf2_16_log_table[a7]; | |
blog0 = gf2_16_log_table[b0]; | |
blog1 = gf2_16_log_table[b1]; | |
blog2 = gf2_16_log_table[b2]; | |
blog3 = gf2_16_log_table[b3]; | |
blog4 = gf2_16_log_table[b4]; | |
blog5 = gf2_16_log_table[b5]; | |
blog6 = gf2_16_log_table[b6]; | |
blog7 = gf2_16_log_table[b7]; | |
za0 = zeroMask(a0); | |
za1 = zeroMask(a1); | |
za2 = zeroMask(a2); | |
za3 = zeroMask(a3); | |
za4 = zeroMask(a4); | |
za5 = zeroMask(a5); | |
za6 = zeroMask(a6); | |
za7 = zeroMask(a7); | |
zb0 = zeroMask(b0); | |
zb1 = zeroMask(b1); | |
zb2 = zeroMask(b2); | |
zb3 = zeroMask(b3); | |
zb4 = zeroMask(b4); | |
zb5 = zeroMask(b5); | |
zb6 = zeroMask(b6); | |
zb7 = zeroMask(b7); | |
c0 = gf2_16_expadd(za0 | zb0, alog0, blog0); | |
c1 = gf2_16_expadd(za0 | zb1, alog0, blog1); | |
c1 ^= gf2_16_expadd(za1 | zb0, alog1, blog0); | |
c2 = gf2_16_expadd(za0 | zb2, alog0, blog2); | |
c2 ^= gf2_16_expadd(za1 | zb1, alog1, blog1); | |
c2 ^= gf2_16_expadd(za2 | zb0, alog2, blog0); | |
c3 = gf2_16_expadd(za0 | zb3, alog0, blog3); | |
c3 ^= gf2_16_expadd(za1 | zb2, alog1, blog2); | |
c3 ^= gf2_16_expadd(za2 | zb1, alog2, blog1); | |
c3 ^= gf2_16_expadd(za3 | zb0, alog3, blog0); | |
c4 = gf2_16_expadd(za0 | zb4, alog0, blog4); | |
c4 ^= gf2_16_expadd(za1 | zb3, alog1, blog3); | |
c4 ^= gf2_16_expadd(za2 | zb2, alog2, blog2); | |
c4 ^= gf2_16_expadd(za3 | zb1, alog3, blog1); | |
c4 ^= gf2_16_expadd(za4 | zb0, alog4, blog0); | |
c5 = gf2_16_expadd(za0 | zb5, alog0, blog5); | |
c5 ^= gf2_16_expadd(za1 | zb4, alog1, blog4); | |
c5 ^= gf2_16_expadd(za2 | zb3, alog2, blog3); | |
c5 ^= gf2_16_expadd(za3 | zb2, alog3, blog2); | |
c5 ^= gf2_16_expadd(za4 | zb1, alog4, blog1); | |
c5 ^= gf2_16_expadd(za5 | zb0, alog5, blog0); | |
c6 = gf2_16_expadd(za0 | zb6, alog0, blog6); | |
c6 ^= gf2_16_expadd(za1 | zb5, alog1, blog5); | |
c6 ^= gf2_16_expadd(za2 | zb4, alog2, blog4); | |
c6 ^= gf2_16_expadd(za3 | zb3, alog3, blog3); | |
c6 ^= gf2_16_expadd(za4 | zb2, alog4, blog2); | |
c6 ^= gf2_16_expadd(za5 | zb1, alog5, blog1); | |
c6 ^= gf2_16_expadd(za6 | zb0, alog6, blog0); | |
c7 = gf2_16_expadd(za0 | zb7, alog0, blog7); | |
c7 ^= gf2_16_expadd(za1 | zb6, alog1, blog6); | |
c7 ^= gf2_16_expadd(za2 | zb5, alog2, blog5); | |
c7 ^= gf2_16_expadd(za3 | zb4, alog3, blog4); | |
c7 ^= gf2_16_expadd(za4 | zb3, alog4, blog3); | |
c7 ^= gf2_16_expadd(za5 | zb2, alog5, blog2); | |
c7 ^= gf2_16_expadd(za6 | zb1, alog6, blog1); | |
c7 ^= gf2_16_expadd(za7 | zb0, alog7, blog0); | |
c8 = gf2_16_expadd(za1 | zb7, alog1, blog7); | |
c8 ^= gf2_16_expadd(za2 | zb6, alog2, blog6); | |
c8 ^= gf2_16_expadd(za3 | zb5, alog3, blog5); | |
c8 ^= gf2_16_expadd(za4 | zb4, alog4, blog4); | |
c8 ^= gf2_16_expadd(za5 | zb3, alog5, blog3); | |
c8 ^= gf2_16_expadd(za6 | zb2, alog6, blog2); | |
c8 ^= gf2_16_expadd(za7 | zb1, alog7, blog1); | |
c9 = gf2_16_expadd(za2 | zb7, alog2, blog7); | |
c9 ^= gf2_16_expadd(za3 | zb6, alog3, blog6); | |
c9 ^= gf2_16_expadd(za4 | zb5, alog4, blog5); | |
c9 ^= gf2_16_expadd(za5 | zb4, alog5, blog4); | |
c9 ^= gf2_16_expadd(za6 | zb3, alog6, blog3); | |
c9 ^= gf2_16_expadd(za7 | zb2, alog7, blog2); | |
c10 = gf2_16_expadd(za3 | zb7, alog3, blog7); | |
c10 ^= gf2_16_expadd(za4 | zb6, alog4, blog6); | |
c10 ^= gf2_16_expadd(za5 | zb5, alog5, blog5); | |
c10 ^= gf2_16_expadd(za6 | zb4, alog6, blog4); | |
c10 ^= gf2_16_expadd(za7 | zb3, alog7, blog3); | |
c11 = gf2_16_expadd(za4 | zb7, alog4, blog7); | |
c11 ^= gf2_16_expadd(za5 | zb6, alog5, blog6); | |
c11 ^= gf2_16_expadd(za6 | zb5, alog6, blog5); | |
c11 ^= gf2_16_expadd(za7 | zb4, alog7, blog4); | |
c12 = gf2_16_expadd(za5 | zb7, alog5, blog7); | |
c12 ^= gf2_16_expadd(za6 | zb6, alog6, blog6); | |
c12 ^= gf2_16_expadd(za7 | zb5, alog7, blog5); | |
c13 = gf2_16_expadd(za6 | zb7, alog6, blog7); | |
c13 ^= gf2_16_expadd(za7 | zb6, alog7, blog6); | |
c14 = gf2_16_expadd(za7 | zb7, alog7, blog7); | |
// Now, modular reduction | |
uint16_t log8 = 3; // gf2_16_log_table[8]; | |
uint16_t c14x8 = gf2_16_expadd(zeroMask(c14), gf2_16_log_table[c14], log8); | |
uint16_t c13x8 = gf2_16_expadd(zeroMask(c13), gf2_16_log_table[c13], log8); | |
uint16_t c12x8 = gf2_16_expadd(zeroMask(c12), gf2_16_log_table[c12], log8); | |
uint16_t c11x8 = gf2_16_expadd(zeroMask(c11), gf2_16_log_table[c11], log8); | |
uint16_t c10x8 = gf2_16_expadd(zeroMask(c10), gf2_16_log_table[c10], log8); | |
uint16_t c9x8 = gf2_16_expadd(zeroMask(c9), gf2_16_log_table[c9], log8); | |
uint16_t c8x8 = gf2_16_expadd(zeroMask(c8), gf2_16_log_table[c8], log8); | |
d7 = c14 ^ c12 ^ c7; | |
d6 = c14x8 ^ c13 ^ c11 ^ c6; | |
d5 = c13x8 ^ c12 ^ c10 ^ c5; | |
d4 = c14 ^ c12x8 ^ c11 ^ c9 ^ c4; | |
d3 = c13 ^ c11x8 ^ c10 ^ c8 ^ c3; | |
d2 = c14 ^ c10x8 ^ c9 ^ c2; | |
d1 = c14x8 ^ c13 ^ c9x8 ^ c8 ^ c1; | |
d0 = c13x8 ^ c8x8 ^ c0; | |
__m128i d; | |
((uint16_t*)&d)[7] = d7; | |
((uint16_t*)&d)[6] = d6; | |
((uint16_t*)&d)[5] = d5; | |
((uint16_t*)&d)[4] = d4; | |
((uint16_t*)&d)[3] = d3; | |
((uint16_t*)&d)[2] = d2; | |
((uint16_t*)&d)[1] = d1; | |
((uint16_t*)&d)[0] = d0; | |
return d; | |
} | |
__m128i _mm_clmulepi64_si128_emu(__m128i ai, __m128i bi, int imm) | |
{ | |
uint64_t a = *((uint64_t*)&ai + (imm & 1)); | |
uint64_t b = *((uint64_t*)&bi + ((imm & 0x10) >> 4)); | |
uint8_t i; //window size s = 4, | |
//uint64_t two_s = 16; //2^s | |
//uint64_t smask = 15; //s 15 | |
uint64_t u[16]; | |
uint64_t r[2]; | |
uint64_t tmp; | |
uint64_t ifmask; | |
//Precomputation | |
u[0] = 0; | |
u[1] = b; | |
//#pragma unroll | |
for (i = 2; i < 16; i += 2) { | |
u[i] = u[i >> 1] << 1; //even indices: left shift | |
u[i + 1] = u[i] ^ b; //odd indices: xor b | |
} | |
//Multiply | |
r[0] = u[a & 15]; //first window only affects lower word | |
r[1] = 0; | |
//#pragma unroll | |
for (i = 4; i < 64; i += 4) { | |
tmp = u[a >> i & 15]; | |
r[0] ^= tmp << i; | |
r[1] ^= tmp >> (64 - i); | |
} | |
//Repair | |
uint64_t m = 0xEEEEEEEEEEEEEEEE; //s=4 => 16 times 1110 | |
//#pragma unroll | |
for (i = 1; i < 4; i++) { | |
tmp = ((a & m) >> i); | |
m &= m << 1; //shift mask to exclude all bit j': j' mod s = i | |
ifmask = -((b >> (64 - i)) & 1); //if the (64-i)th bit of b is 1 | |
r[1] ^= (tmp & ifmask); | |
} | |
__m128i out; | |
((uint64_t*)&out)[0] = r[0]; | |
((uint64_t*)&out)[1] = r[1]; | |
return out; | |
} | |
int main() | |
{ | |
//uint64_t junk[4] = { 0x23ab2293110083ad,0x23ab2293110083ad,0x23ab2293110083ad,0x23ab2293110083ad }; | |
//uint64_t junk2[4] = { 0x23ab229110083ad,0x0,0x23ab2293110083ad,0x0 }; | |
uint64_t junk[2] = { 0xff1, 0x0 }; | |
uint64_t junk2[2] = { 0xff1, 0x0 }; | |
init_tables(); | |
const int product2 = 0x00; | |
__m128i result3 = gf2_128_mult(((__m128i*)&junk[0])[0], ((__m128i*)&junk2[0])[0]); | |
__m128i result1 = _mm_clmulepi64_si128_emu(((__m128i*)&junk[0])[0], ((__m128i*)&junk2[0])[0], product2); | |
__m128i result2 = _mm_clmulepi64_si128(((__m128i*)&junk[0])[0], ((__m128i*)&junk2[0])[0], 0x00); | |
printf_s("[Emulatad]0x%I64x times 0x%I64x without a carry bit : 0x%I64x%I64x\n", | |
junk[0], junk2[0], result1.m128i_u64[0], result1.m128i_u64[1]); | |
printf_s("[Intrinsic]0x%I64x times 0x%I64x without a carry bit : 0x%I64x%I64x\n", | |
junk[0], junk2[0], result2.m128i_u64[0], result2.m128i_u64[1]); | |
printf_s("[gfmul128]0x%I64x times 0x%I64x without a carry bit : 0x%I64x%I64x\n", | |
junk[0], junk2[0], result3.m128i_u64[0], result3.m128i_u64[1]); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment