Skip to content

Instantly share code, notes, and snippets.

@monkins1010
Created January 26, 2019 16:35
Show Gist options
  • Save monkins1010/da38310c88ce80acb4a879be611839ce to your computer and use it in GitHub Desktop.
Save monkins1010/da38310c88ce80acb4a879be611839ce to your computer and use it in GitHub Desktop.
// mul2.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
// ConsoleApplication1.cpp : This file contains the 'main' function. Program execution begins and ends there.
//
#pragma warning (disable : 4146)
//#include "pch.h"
#include <iostream>
#include <emmintrin.h>
#include <stdio.h>
#include <stdint.h>
#define FIELD_WIDTH 16
#define FIELD_SIZE (1 << FIELD_WIDTH)
#define Q (FIELD_SIZE - 1)
#define gf2_16_irreducible ((1 << 12) | (1<<3) | (1<<1) | (1<<0))
// irreducible polynomial is
// x^17 + x^12 + x^3 + x + 1
uint16_t gf2_16_log_table[FIELD_SIZE];
uint16_t gf2_16_exp_table[2 * FIELD_SIZE];
//void init_tables();
// This function computes, in a branch-free way, the value:
// (x == 0) ? 0xFFFF : 0x0000
inline uint16_t zeroMask(uint16_t x) {
uint32_t y = (uint32_t)x;
uint32_t z = (y - 1) >> 16;
return (uint16_t)z;
}
// Compute polynomial addition in GF(2^16)
inline uint16_t gf2_16_add(uint16_t x, uint16_t y) {
return (x ^ y);
}
inline uint16_t logsum_modQ(uint16_t a, uint16_t b) {
uint32_t sum = ((uint32_t)a) + ((uint32_t)b);
uint16_t d = ((uint16_t)sum) + ((uint16_t)(sum >> 16));
return d;
}
// Compute exp( (a + b)%Q )
inline uint16_t gf2_16_expadd(uint16_t zmask, uint16_t a, uint16_t b) {
uint32_t sum = ((uint32_t)a) + ((uint32_t)b);
uint16_t t = gf2_16_exp_table[sum];
return (zmask & t) ^ t;
}
// Compute exp( (a + b + c)%Q )
inline uint16_t gf2_16_expadd3(uint16_t zmask, uint16_t a, uint16_t b, uint16_t c) {
uint32_t sum = ((uint32_t)logsum_modQ(a, b)) + ((uint32_t)c);
uint16_t t = gf2_16_exp_table[sum];
return (zmask & t) ^ t;
}
// Compute polynomial multiplication in GF(2^16)
inline uint16_t gf2_16_mult(uint16_t x, uint16_t y) {
uint16_t a = gf2_16_log_table[x];
uint16_t b = gf2_16_log_table[y];
uint16_t z = zeroMask(x) | zeroMask(y);
return gf2_16_expadd(z, a, b);
}
// Compute polynomial division in GF(2^16).
// In other words, find q such that
// mult(q, y) = x (mod irreducible)
//
// Precondition: y != 0
inline uint16_t gf2_16_div(uint16_t x, uint16_t y) {
uint16_t a = gf2_16_log_table[x];
uint16_t b = gf2_16_log_table[y];
uint16_t z = zeroMask(x);
return gf2_16_expadd(z, a, Q - b);
}
// precondition x != 0
inline uint16_t gf2_16_inv(uint16_t x) {
uint16_t a = gf2_16_log_table[x];
uint16_t b = Q - a;
return gf2_16_exp_table[b];
}
inline uint16_t gf2_16_exp(uint16_t x) {
return gf2_16_exp_table[x];
}
inline uint16_t gf2_16_log(uint16_t x) {
return gf2_16_log_table[x];
}
// We want to calculate a^x, for 0 <= x < 2^16, and a != 0.
//
// Let Q = 2^16-1, which is the order of the mutiplicative
// group of GF(2^16). It is a fact of this multiplicative group
// that a^Q = 1. Now, let l = log(a), then
// a^x = exp(log(a))^x = exp(l)^x = exp( l*x ).
// Moreover, because of the previous fact,
// this is equal to exp( (l*x)%Q ).
//
// Finally, it is another amazing fact of modular arithmetic
// that we can calculate (z%Q) very easily when z
// is a 32-bit number: we can simply add the high
// 16-bits to the low-16 bits. This result may carry into
// a 17-bit number. This is fine, as our exponential table
// is defined for twice the field size, i.e., for 17-bit indices.
inline uint16_t gf2_16_pow(uint16_t a, uint16_t x) {
uint32_t log;
// First calculate the multiplied logarithm.
log = (uint32_t)gf2_16_log_table[a];
log = log * (uint32_t)x;
// Now take the modulus by (2^16-1). The following
// works by some magic of modular arithmetic.
log = (log >> 16) + (log & 0xFFFF);
return gf2_16_exp_table[log];
}
inline uint16_t nextPower(uint16_t b) {
return
(b >> 15) ?
((b << 1) ^ gf2_16_irreducible) :
(b << 1);
}
void init_tables() {
// avoid multiple init
if (gf2_16_exp_table[0] != 0) return;
unsigned int i = 0;
uint16_t b = 1;
for (; i < Q; i++) {
gf2_16_exp_table[i] = b;
gf2_16_log_table[b] = i;
b = nextPower(b);
}
for (; i < (2 * FIELD_SIZE); i++) {
gf2_16_exp_table[i] = b;
b = nextPower(b);
}
}
__m128i gf2_128_mult(__m128i a, __m128i b) {
uint16_t a0, a1, a2, a3, a4, a5, a6, a7;
uint16_t b0, b1, b2, b3, b4, b5, b6, b7;
uint16_t za0, za1, za2, za3, za4, za5, za6, za7;
uint16_t zb0, zb1, zb2, zb3, zb4, zb5, zb6, zb7;
uint32_t alog0, alog1, alog2, alog3;
uint32_t alog4, alog5, alog6, alog7;
uint32_t blog0, blog1, blog2, blog3;
uint32_t blog4, blog5, blog6, blog7;
uint16_t c0, c1, c2, c3, c4, c5, c6, c7;
uint16_t c8, c9, c10, c11, c12, c13, c14;
uint16_t d0, d1, d2, d3, d4, d5, d6, d7;
a0 = ((uint16_t*)&a)[0];
a1 = ((uint16_t*)&a)[1];
a2 = ((uint16_t*)&a)[2];
a3 = ((uint16_t*)&a)[3];
a4 = ((uint16_t*)&a)[4];
a5 = ((uint16_t*)&a)[5];
a6 = ((uint16_t*)&a)[6];
a7 = ((uint16_t*)&a)[7];
b0 = ((uint16_t*)&b)[0];
b1 = ((uint16_t*)&b)[1];
b2 = ((uint16_t*)&b)[2];
b3 = ((uint16_t*)&b)[3];
b4 = ((uint16_t*)&b)[4];
b5 = ((uint16_t*)&b)[5];
b6 = ((uint16_t*)&b)[6];
b7 = ((uint16_t*)&b)[7];
alog0 = gf2_16_log_table[a0];
alog1 = gf2_16_log_table[a1];
alog2 = gf2_16_log_table[a2];
alog3 = gf2_16_log_table[a3];
alog4 = gf2_16_log_table[a4];
alog5 = gf2_16_log_table[a5];
alog6 = gf2_16_log_table[a6];
alog7 = gf2_16_log_table[a7];
blog0 = gf2_16_log_table[b0];
blog1 = gf2_16_log_table[b1];
blog2 = gf2_16_log_table[b2];
blog3 = gf2_16_log_table[b3];
blog4 = gf2_16_log_table[b4];
blog5 = gf2_16_log_table[b5];
blog6 = gf2_16_log_table[b6];
blog7 = gf2_16_log_table[b7];
za0 = zeroMask(a0);
za1 = zeroMask(a1);
za2 = zeroMask(a2);
za3 = zeroMask(a3);
za4 = zeroMask(a4);
za5 = zeroMask(a5);
za6 = zeroMask(a6);
za7 = zeroMask(a7);
zb0 = zeroMask(b0);
zb1 = zeroMask(b1);
zb2 = zeroMask(b2);
zb3 = zeroMask(b3);
zb4 = zeroMask(b4);
zb5 = zeroMask(b5);
zb6 = zeroMask(b6);
zb7 = zeroMask(b7);
c0 = gf2_16_expadd(za0 | zb0, alog0, blog0);
c1 = gf2_16_expadd(za0 | zb1, alog0, blog1);
c1 ^= gf2_16_expadd(za1 | zb0, alog1, blog0);
c2 = gf2_16_expadd(za0 | zb2, alog0, blog2);
c2 ^= gf2_16_expadd(za1 | zb1, alog1, blog1);
c2 ^= gf2_16_expadd(za2 | zb0, alog2, blog0);
c3 = gf2_16_expadd(za0 | zb3, alog0, blog3);
c3 ^= gf2_16_expadd(za1 | zb2, alog1, blog2);
c3 ^= gf2_16_expadd(za2 | zb1, alog2, blog1);
c3 ^= gf2_16_expadd(za3 | zb0, alog3, blog0);
c4 = gf2_16_expadd(za0 | zb4, alog0, blog4);
c4 ^= gf2_16_expadd(za1 | zb3, alog1, blog3);
c4 ^= gf2_16_expadd(za2 | zb2, alog2, blog2);
c4 ^= gf2_16_expadd(za3 | zb1, alog3, blog1);
c4 ^= gf2_16_expadd(za4 | zb0, alog4, blog0);
c5 = gf2_16_expadd(za0 | zb5, alog0, blog5);
c5 ^= gf2_16_expadd(za1 | zb4, alog1, blog4);
c5 ^= gf2_16_expadd(za2 | zb3, alog2, blog3);
c5 ^= gf2_16_expadd(za3 | zb2, alog3, blog2);
c5 ^= gf2_16_expadd(za4 | zb1, alog4, blog1);
c5 ^= gf2_16_expadd(za5 | zb0, alog5, blog0);
c6 = gf2_16_expadd(za0 | zb6, alog0, blog6);
c6 ^= gf2_16_expadd(za1 | zb5, alog1, blog5);
c6 ^= gf2_16_expadd(za2 | zb4, alog2, blog4);
c6 ^= gf2_16_expadd(za3 | zb3, alog3, blog3);
c6 ^= gf2_16_expadd(za4 | zb2, alog4, blog2);
c6 ^= gf2_16_expadd(za5 | zb1, alog5, blog1);
c6 ^= gf2_16_expadd(za6 | zb0, alog6, blog0);
c7 = gf2_16_expadd(za0 | zb7, alog0, blog7);
c7 ^= gf2_16_expadd(za1 | zb6, alog1, blog6);
c7 ^= gf2_16_expadd(za2 | zb5, alog2, blog5);
c7 ^= gf2_16_expadd(za3 | zb4, alog3, blog4);
c7 ^= gf2_16_expadd(za4 | zb3, alog4, blog3);
c7 ^= gf2_16_expadd(za5 | zb2, alog5, blog2);
c7 ^= gf2_16_expadd(za6 | zb1, alog6, blog1);
c7 ^= gf2_16_expadd(za7 | zb0, alog7, blog0);
c8 = gf2_16_expadd(za1 | zb7, alog1, blog7);
c8 ^= gf2_16_expadd(za2 | zb6, alog2, blog6);
c8 ^= gf2_16_expadd(za3 | zb5, alog3, blog5);
c8 ^= gf2_16_expadd(za4 | zb4, alog4, blog4);
c8 ^= gf2_16_expadd(za5 | zb3, alog5, blog3);
c8 ^= gf2_16_expadd(za6 | zb2, alog6, blog2);
c8 ^= gf2_16_expadd(za7 | zb1, alog7, blog1);
c9 = gf2_16_expadd(za2 | zb7, alog2, blog7);
c9 ^= gf2_16_expadd(za3 | zb6, alog3, blog6);
c9 ^= gf2_16_expadd(za4 | zb5, alog4, blog5);
c9 ^= gf2_16_expadd(za5 | zb4, alog5, blog4);
c9 ^= gf2_16_expadd(za6 | zb3, alog6, blog3);
c9 ^= gf2_16_expadd(za7 | zb2, alog7, blog2);
c10 = gf2_16_expadd(za3 | zb7, alog3, blog7);
c10 ^= gf2_16_expadd(za4 | zb6, alog4, blog6);
c10 ^= gf2_16_expadd(za5 | zb5, alog5, blog5);
c10 ^= gf2_16_expadd(za6 | zb4, alog6, blog4);
c10 ^= gf2_16_expadd(za7 | zb3, alog7, blog3);
c11 = gf2_16_expadd(za4 | zb7, alog4, blog7);
c11 ^= gf2_16_expadd(za5 | zb6, alog5, blog6);
c11 ^= gf2_16_expadd(za6 | zb5, alog6, blog5);
c11 ^= gf2_16_expadd(za7 | zb4, alog7, blog4);
c12 = gf2_16_expadd(za5 | zb7, alog5, blog7);
c12 ^= gf2_16_expadd(za6 | zb6, alog6, blog6);
c12 ^= gf2_16_expadd(za7 | zb5, alog7, blog5);
c13 = gf2_16_expadd(za6 | zb7, alog6, blog7);
c13 ^= gf2_16_expadd(za7 | zb6, alog7, blog6);
c14 = gf2_16_expadd(za7 | zb7, alog7, blog7);
// Now, modular reduction
uint16_t log8 = 3; // gf2_16_log_table[8];
uint16_t c14x8 = gf2_16_expadd(zeroMask(c14), gf2_16_log_table[c14], log8);
uint16_t c13x8 = gf2_16_expadd(zeroMask(c13), gf2_16_log_table[c13], log8);
uint16_t c12x8 = gf2_16_expadd(zeroMask(c12), gf2_16_log_table[c12], log8);
uint16_t c11x8 = gf2_16_expadd(zeroMask(c11), gf2_16_log_table[c11], log8);
uint16_t c10x8 = gf2_16_expadd(zeroMask(c10), gf2_16_log_table[c10], log8);
uint16_t c9x8 = gf2_16_expadd(zeroMask(c9), gf2_16_log_table[c9], log8);
uint16_t c8x8 = gf2_16_expadd(zeroMask(c8), gf2_16_log_table[c8], log8);
d7 = c14 ^ c12 ^ c7;
d6 = c14x8 ^ c13 ^ c11 ^ c6;
d5 = c13x8 ^ c12 ^ c10 ^ c5;
d4 = c14 ^ c12x8 ^ c11 ^ c9 ^ c4;
d3 = c13 ^ c11x8 ^ c10 ^ c8 ^ c3;
d2 = c14 ^ c10x8 ^ c9 ^ c2;
d1 = c14x8 ^ c13 ^ c9x8 ^ c8 ^ c1;
d0 = c13x8 ^ c8x8 ^ c0;
__m128i d;
((uint16_t*)&d)[7] = d7;
((uint16_t*)&d)[6] = d6;
((uint16_t*)&d)[5] = d5;
((uint16_t*)&d)[4] = d4;
((uint16_t*)&d)[3] = d3;
((uint16_t*)&d)[2] = d2;
((uint16_t*)&d)[1] = d1;
((uint16_t*)&d)[0] = d0;
return d;
}
__m128i _mm_clmulepi64_si128_emu(__m128i ai, __m128i bi, int imm)
{
uint64_t a = *((uint64_t*)&ai + (imm & 1));
uint64_t b = *((uint64_t*)&bi + ((imm & 0x10) >> 4));
uint8_t i; //window size s = 4,
//uint64_t two_s = 16; //2^s
//uint64_t smask = 15; //s 15
uint64_t u[16];
uint64_t r[2];
uint64_t tmp;
uint64_t ifmask;
//Precomputation
u[0] = 0;
u[1] = b;
//#pragma unroll
for (i = 2; i < 16; i += 2) {
u[i] = u[i >> 1] << 1; //even indices: left shift
u[i + 1] = u[i] ^ b; //odd indices: xor b
}
//Multiply
r[0] = u[a & 15]; //first window only affects lower word
r[1] = 0;
//#pragma unroll
for (i = 4; i < 64; i += 4) {
tmp = u[a >> i & 15];
r[0] ^= tmp << i;
r[1] ^= tmp >> (64 - i);
}
//Repair
uint64_t m = 0xEEEEEEEEEEEEEEEE; //s=4 => 16 times 1110
//#pragma unroll
for (i = 1; i < 4; i++) {
tmp = ((a & m) >> i);
m &= m << 1; //shift mask to exclude all bit j': j' mod s = i
ifmask = -((b >> (64 - i)) & 1); //if the (64-i)th bit of b is 1
r[1] ^= (tmp & ifmask);
}
__m128i out;
((uint64_t*)&out)[0] = r[0];
((uint64_t*)&out)[1] = r[1];
return out;
}
int main()
{
//uint64_t junk[4] = { 0x23ab2293110083ad,0x23ab2293110083ad,0x23ab2293110083ad,0x23ab2293110083ad };
//uint64_t junk2[4] = { 0x23ab229110083ad,0x0,0x23ab2293110083ad,0x0 };
uint64_t junk[2] = { 0xff1, 0x0 };
uint64_t junk2[2] = { 0xff1, 0x0 };
init_tables();
const int product2 = 0x00;
__m128i result3 = gf2_128_mult(((__m128i*)&junk[0])[0], ((__m128i*)&junk2[0])[0]);
__m128i result1 = _mm_clmulepi64_si128_emu(((__m128i*)&junk[0])[0], ((__m128i*)&junk2[0])[0], product2);
__m128i result2 = _mm_clmulepi64_si128(((__m128i*)&junk[0])[0], ((__m128i*)&junk2[0])[0], 0x00);
printf_s("[Emulatad]0x%I64x times 0x%I64x without a carry bit : 0x%I64x%I64x\n",
junk[0], junk2[0], result1.m128i_u64[0], result1.m128i_u64[1]);
printf_s("[Intrinsic]0x%I64x times 0x%I64x without a carry bit : 0x%I64x%I64x\n",
junk[0], junk2[0], result2.m128i_u64[0], result2.m128i_u64[1]);
printf_s("[gfmul128]0x%I64x times 0x%I64x without a carry bit : 0x%I64x%I64x\n",
junk[0], junk2[0], result3.m128i_u64[0], result3.m128i_u64[1]);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment