Created
January 15, 2014 22:02
-
-
Save chris-gc/8445626 to your computer and use it in GitHub Desktop.
Efficient, simple GF(256) math
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//// Utility: GF(256) Multiply and Divide functions | |
/* | |
Branchless multiply and divide construction from | |
"Fast Software Implementations of Finite Field Operations (Extended Abstract)" | |
by Cheng Huang and Lihao Xu | |
Small corrections made to paper (Q = 255): | |
+ The EXP_TABLE needs to have 512*2+1 elements to handle 0*0 = 0 case. | |
+ Element 255*2 should be set to 1. | |
After these corrections it works properly and reduces the execution time | |
to 58% of the usual version that uses branches to handle zero input. | |
These tables were generated using polynomial 0x15F. Maybe it's voodoo but | |
random GF(256) matrices with this polynomial tended to be more invertible. | |
There are 16 generator polynomials for GF(256), and 0x1F5 was a close second | |
in terms of rate of invertibility. | |
INV_TABLE[x] was also generated to accelerate GF256Divide(1, x). | |
*/ | |
static const u16 LOG_TABLE[256] = { | |
512, 255, 1, 122, 2, 244, 123, 181, 3, 48, 245, 224, 124, 84, 182, 111, | |
4, 233, 49, 19, 246, 107, 225, 206, 125, 56, 85, 170, 183, 91, 112, 250, | |
5, 117, 234, 10, 50, 156, 20, 213, 247, 203, 108, 178, 226, 37, 207, 210, | |
126, 150, 57, 100, 86, 141, 171, 40, 184, 73, 92, 164, 113, 146, 251, 229, | |
6, 96, 118, 15, 235, 193, 11, 13, 51, 68, 157, 195, 21, 31, 214, 237, | |
248, 168, 204, 17, 109, 222, 179, 120, 227, 162, 38, 98, 208, 176, 211, 8, | |
127, 188, 151, 239, 58, 132, 101, 216, 87, 80, 142, 33, 172, 27, 41, 23, | |
185, 77, 74, 197, 93, 65, 165, 159, 114, 200, 147, 70, 252, 45, 230, 53, | |
7, 175, 97, 161, 119, 221, 16, 167, 236, 30, 194, 67, 12, 192, 14, 95, | |
52, 44, 69, 199, 158, 64, 196, 76, 22, 26, 32, 79, 215, 131, 238, 187, | |
249, 90, 169, 55, 205, 106, 18, 232, 110, 83, 223, 47, 180, 243, 121, 254, | |
228, 145, 163, 72, 39, 140, 99, 149, 209, 36, 177, 202, 212, 155, 9, 116, | |
128, 61, 189, 218, 152, 137, 240, 103, 59, 135, 133, 134, 102, 136, 217, 60, | |
88, 104, 81, 241, 143, 138, 34, 153, 173, 219, 28, 190, 42, 62, 24, 129, | |
186, 130, 78, 25, 75, 63, 198, 43, 94, 191, 66, 29, 166, 220, 160, 174, | |
115, 154, 201, 35, 148, 139, 71, 144, 253, 242, 46, 82, 231, 105, 54, 89 | |
}; | |
static const u8 EXP_TABLE[512*2+1] = { | |
1, 2, 4, 8, 16, 32, 64, 128, 95, 190, 35, 70, 140, 71, 142, 67, | |
134, 83, 166, 19, 38, 76, 152, 111, 222, 227, 153, 109, 218, 235, 137, 77, | |
154, 107, 214, 243, 185, 45, 90, 180, 55, 110, 220, 231, 145, 125, 250, 171, | |
9, 18, 36, 72, 144, 127, 254, 163, 25, 50, 100, 200, 207, 193, 221, 229, | |
149, 117, 234, 139, 73, 146, 123, 246, 179, 57, 114, 228, 151, 113, 226, 155, | |
105, 210, 251, 169, 13, 26, 52, 104, 208, 255, 161, 29, 58, 116, 232, 143, | |
65, 130, 91, 182, 51, 102, 204, 199, 209, 253, 165, 21, 42, 84, 168, 15, | |
30, 60, 120, 240, 191, 33, 66, 132, 87, 174, 3, 6, 12, 24, 48, 96, | |
192, 223, 225, 157, 101, 202, 203, 201, 205, 197, 213, 245, 181, 53, 106, 212, | |
247, 177, 61, 122, 244, 183, 49, 98, 196, 215, 241, 189, 37, 74, 148, 119, | |
238, 131, 89, 178, 59, 118, 236, 135, 81, 162, 27, 54, 108, 216, 239, 129, | |
93, 186, 43, 86, 172, 7, 14, 28, 56, 112, 224, 159, 97, 194, 219, 233, | |
141, 69, 138, 75, 150, 115, 230, 147, 121, 242, 187, 41, 82, 164, 23, 46, | |
92, 184, 47, 94, 188, 39, 78, 156, 103, 206, 195, 217, 237, 133, 85, 170, | |
11, 22, 44, 88, 176, 63, 126, 252, 167, 17, 34, 68, 136, 79, 158, 99, | |
198, 211, 249, 173, 5, 10, 20, 40, 80, 160, 31, 62, 124, 248, 175, 1, | |
2, 4, 8, 16, 32, 64, 128, 95, 190, 35, 70, 140, 71, 142, 67, 134, | |
83, 166, 19, 38, 76, 152, 111, 222, 227, 153, 109, 218, 235, 137, 77, 154, | |
107, 214, 243, 185, 45, 90, 180, 55, 110, 220, 231, 145, 125, 250, 171, 9, | |
18, 36, 72, 144, 127, 254, 163, 25, 50, 100, 200, 207, 193, 221, 229, 149, | |
117, 234, 139, 73, 146, 123, 246, 179, 57, 114, 228, 151, 113, 226, 155, 105, | |
210, 251, 169, 13, 26, 52, 104, 208, 255, 161, 29, 58, 116, 232, 143, 65, | |
130, 91, 182, 51, 102, 204, 199, 209, 253, 165, 21, 42, 84, 168, 15, 30, | |
60, 120, 240, 191, 33, 66, 132, 87, 174, 3, 6, 12, 24, 48, 96, 192, | |
223, 225, 157, 101, 202, 203, 201, 205, 197, 213, 245, 181, 53, 106, 212, 247, | |
177, 61, 122, 244, 183, 49, 98, 196, 215, 241, 189, 37, 74, 148, 119, 238, | |
131, 89, 178, 59, 118, 236, 135, 81, 162, 27, 54, 108, 216, 239, 129, 93, | |
186, 43, 86, 172, 7, 14, 28, 56, 112, 224, 159, 97, 194, 219, 233, 141, | |
69, 138, 75, 150, 115, 230, 147, 121, 242, 187, 41, 82, 164, 23, 46, 92, | |
184, 47, 94, 188, 39, 78, 156, 103, 206, 195, 217, 237, 133, 85, 170, 11, | |
22, 44, 88, 176, 63, 126, 252, 167, 17, 34, 68, 136, 79, 158, 99, 198, | |
211, 249, 173, 5, 10, 20, 40, 80, 160, 31, 62, 124, 248, 175, 1, 0, | |
}; | |
static const u8 INV_TABLE[256] = { | |
0, 1, 175, 202, 248, 70, 101, 114, 124, 46, 35, 77, 157, 54, 57, 247, | |
62, 152, 23, 136, 190, 244, 137, 18, 225, 147, 27, 26, 179, 59, 212, 32, | |
31, 213, 76, 10, 164, 182, 68, 220, 95, 144, 122, 113, 235, 195, 9, 125, | |
223, 253, 230, 189, 162, 120, 13, 156, 246, 14, 178, 29, 106, 84, 16, 153, | |
160, 119, 197, 198, 38, 221, 5, 249, 82, 159, 91, 207, 34, 11, 110, 166, | |
128, 104, 72, 158, 61, 107, 151, 201, 218, 116, 206, 74, 171, 155, 145, 40, | |
192, 139, 209, 134, 115, 6, 241, 180, 81, 129, 60, 85, 169, 176, 78, 167, | |
123, 43, 7, 100, 89, 219, 161, 65, 53, 163, 42, 112, 8, 47, 227, 187, | |
80, 105, 148, 232, 205, 214, 99, 208, 19, 22, 193, 97, 173, 229, 211, 238, | |
41, 94, 224, 25, 130, 233, 200, 86, 17, 63, 170, 93, 55, 12, 83, 73, | |
64, 118, 52, 121, 36, 183, 79, 111, 177, 108, 154, 92, 228, 140, 203, 2, | |
109, 168, 58, 28, 103, 240, 37, 165, 250, 217, 226, 127, 231, 51, 20, 245, | |
96, 138, 234, 45, 199, 66, 67, 196, 150, 87, 3, 174, 215, 132, 90, 75, | |
135, 98, 239, 142, 30, 33, 133, 204, 251, 185, 88, 117, 39, 69, 252, 48, | |
146, 24, 186, 126, 172, 141, 50, 188, 131, 149, 194, 44, 255, 243, 143, 210, | |
181, 102, 254, 237, 21, 191, 56, 15, 4, 71, 184, 216, 222, 49, 242, 236 | |
}; | |
#ifdef CAT_EXP_BIG_TABLES | |
#include <stdlib.h> // malloc | |
static u8 * CAT_RESTRICT GF_MUL_TABLE = 0; | |
static u8 * CAT_RESTRICT GF_DIV_TABLE = 0; | |
// Unpack 256x256 multiplication tables | |
static void GF256Init() { | |
// If not initialized yet, | |
if (!GF_MUL_TABLE) { | |
// Allocate table memory 65KB | |
GF_MUL_TABLE = (u8 *)malloc(256 * 256); | |
GF_DIV_TABLE = (u8 *)malloc(256 * 256); | |
// For each subtable, | |
u8 *ptr = GF_MUL_TABLE; | |
for (int ii = 0; ii < 256; ++ii) { | |
const u8 log_ii = LOG_TABLE[ii]; | |
// Calculate ii * jj | |
for (int jj = 0; jj < 256; ++jj) { | |
*ptr++ = EXP_TABLE[log_ii + LOG_TABLE[jj]]; | |
} | |
} | |
// For each subtable, | |
ptr = GF_DIV_TABLE; | |
for (int ii = 0; ii < 256; ++ii) { | |
const u8 log_ii = 255 - LOG_TABLE[ii]; | |
// Calculate ii * jj | |
for (int jj = 0; jj < 256; ++jj) { | |
*ptr++ = EXP_TABLE[LOG_TABLE[jj] + log_ii]; | |
} | |
} | |
} | |
} | |
static CAT_INLINE u8 GF256Multiply(u8 x, u8 y) | |
{ | |
return GF_MUL_TABLE[((u32)x << 8) + y]; | |
} | |
static CAT_INLINE u8 GF256Divide(u8 x, u8 y) | |
{ | |
// x /= y | |
return GF_DIV_TABLE[((u32)y << 8) + x]; | |
} | |
// Performs "dest[] += src[] * x" operation in GF(256) | |
static void GF256MemMulAdd(void * CAT_RESTRICT vdest, u8 x, const void * CAT_RESTRICT vsrc, int bytes) | |
{ | |
u8 * CAT_RESTRICT dest = reinterpret_cast<u8*>( vdest ); | |
const u8 * CAT_RESTRICT src = reinterpret_cast<const u8*>( vsrc ); | |
const u8 * CAT_RESTRICT table = GF_MUL_TABLE + ((u32)x << 8); | |
// For each block of 8 bytes, | |
while (bytes >= 8) | |
{ | |
#ifdef CAT_ENDIAN_LITTLE | |
// This optimization works because it reduces the number of memory | |
// accesses on desktops by almost half. | |
u64 x = table[src[0]]; | |
x |= (u64)table[src[1]] << 8; | |
x |= (u64)table[src[2]] << 16; | |
x |= (u64)table[src[3]] << 24; | |
x |= (u64)table[src[4]] << 32; | |
x |= (u64)table[src[5]] << 40; | |
x |= (u64)table[src[6]] << 48; | |
x |= (u64)table[src[7]] << 56; | |
*(u64*)dest ^= x; | |
#else | |
dest[0] ^= table[src[0]]; | |
dest[1] ^= table[src[1]]; | |
dest[2] ^= table[src[2]]; | |
dest[3] ^= table[src[3]]; | |
dest[4] ^= table[src[4]]; | |
dest[5] ^= table[src[5]]; | |
dest[6] ^= table[src[6]]; | |
dest[7] ^= table[src[7]]; | |
#endif // CAT_ENDIAN_LITTLE | |
src += 8; | |
dest += 8; | |
bytes -= 8; | |
} | |
// For each byte, | |
while (bytes-- > 0) | |
{ | |
// Multiply source byte by x and add it to destination byte | |
*dest++ ^= table[*src++]; | |
} | |
} | |
// Performs "dest[] /= x" operation in GF(256) | |
static void GF256MemDivide(void * CAT_RESTRICT vdest, u8 x, int bytes) | |
{ | |
u8 * CAT_RESTRICT dest = reinterpret_cast<u8*>( vdest ); | |
const u8 * CAT_RESTRICT table = GF_DIV_TABLE + ((u32)x << 8); | |
// For each block of 8 bytes, | |
while (bytes >= 8) | |
{ | |
#ifdef CAT_ENDIAN_LITTLE | |
// This optimization works because it reduces the number of memory | |
// accesses on desktops by almost half. | |
u64 x = table[dest[0]]; | |
x |= (u64)table[dest[1]] << 8; | |
x |= (u64)table[dest[2]] << 16; | |
x |= (u64)table[dest[3]] << 24; | |
x |= (u64)table[dest[4]] << 32; | |
x |= (u64)table[dest[5]] << 40; | |
x |= (u64)table[dest[6]] << 48; | |
x |= (u64)table[dest[7]] << 56; | |
*(u64*)dest = x; | |
#else | |
dest[0] = table[dest[0]]; | |
dest[1] = table[dest[1]]; | |
dest[2] = table[dest[2]]; | |
dest[3] = table[dest[3]]; | |
dest[4] = table[dest[4]]; | |
dest[5] = table[dest[5]]; | |
dest[6] = table[dest[6]]; | |
dest[7] = table[dest[7]]; | |
#endif // CAT_ENDIAN_LITTLE | |
dest += 8; | |
bytes -= 8; | |
} | |
// For each byte, | |
while (bytes-- > 0) | |
{ | |
// Multiply source byte by x and add it to destination byte | |
*dest = table[*dest]; | |
++dest; | |
} | |
} | |
#else // CAT_EXP_BIG_TABLES | |
static CAT_INLINE u8 GF256Multiply(u8 x, u8 y) | |
{ | |
return EXP_TABLE[LOG_TABLE[x] + LOG_TABLE[y]]; | |
} | |
static CAT_INLINE u8 GF256Divide(u8 x, u8 y) | |
{ | |
// Precondition: y != 0 | |
return EXP_TABLE[LOG_TABLE[x] + 255 - LOG_TABLE[y]]; | |
} | |
// Performs "dest[] += src[] * x" operation in GF(256) | |
static void GF256MemMulAdd(void * CAT_RESTRICT vdest, u8 x, const void * CAT_RESTRICT vsrc, int bytes) | |
{ | |
u8 * CAT_RESTRICT dest = reinterpret_cast<u8*>( vdest ); | |
const u8 * CAT_RESTRICT src = reinterpret_cast<const u8*>( vsrc ); | |
int log_x = LOG_TABLE[x]; | |
// For each block of 8 bytes, | |
while (bytes >= 8) | |
{ | |
/* | |
For smaller messages, this function takes | |
up 50% of execution time. Unfortunately I | |
do not see a way to make it run any faster. | |
*/ | |
dest[0] ^= EXP_TABLE[LOG_TABLE[src[0]] + log_x]; | |
dest[1] ^= EXP_TABLE[LOG_TABLE[src[1]] + log_x]; | |
dest[2] ^= EXP_TABLE[LOG_TABLE[src[2]] + log_x]; | |
dest[3] ^= EXP_TABLE[LOG_TABLE[src[3]] + log_x]; | |
dest[4] ^= EXP_TABLE[LOG_TABLE[src[4]] + log_x]; | |
dest[5] ^= EXP_TABLE[LOG_TABLE[src[5]] + log_x]; | |
dest[6] ^= EXP_TABLE[LOG_TABLE[src[6]] + log_x]; | |
dest[7] ^= EXP_TABLE[LOG_TABLE[src[7]] + log_x]; | |
src += 8; | |
dest += 8; | |
bytes -= 8; | |
} | |
// For each byte, | |
while (bytes-- > 0) | |
{ | |
// Multiply source byte by x and add it to destination byte | |
*dest++ ^= EXP_TABLE[LOG_TABLE[*src++] + log_x]; | |
} | |
} | |
// Performs "dest[] /= x" operation in GF(256) | |
static void GF256MemDivide(void * CAT_RESTRICT vdest, u8 x, int bytes) | |
{ | |
u8 * CAT_RESTRICT dest = reinterpret_cast<u8*>( vdest ); | |
int log_x = 255 - LOG_TABLE[x]; | |
// For each block of 8 bytes, | |
while (bytes >= 8) | |
{ | |
dest[0] = EXP_TABLE[LOG_TABLE[dest[0]] + log_x]; | |
dest[1] = EXP_TABLE[LOG_TABLE[dest[1]] + log_x]; | |
dest[2] = EXP_TABLE[LOG_TABLE[dest[2]] + log_x]; | |
dest[3] = EXP_TABLE[LOG_TABLE[dest[3]] + log_x]; | |
dest[4] = EXP_TABLE[LOG_TABLE[dest[4]] + log_x]; | |
dest[5] = EXP_TABLE[LOG_TABLE[dest[5]] + log_x]; | |
dest[6] = EXP_TABLE[LOG_TABLE[dest[6]] + log_x]; | |
dest[7] = EXP_TABLE[LOG_TABLE[dest[7]] + log_x]; | |
dest += 8; | |
bytes -= 8; | |
} | |
// For each byte, | |
while (bytes-- > 0) | |
{ | |
// Multiply source byte by x and add it to destination byte | |
*dest = EXP_TABLE[LOG_TABLE[*dest] + log_x]; | |
++dest; | |
} | |
} | |
#endif // CAT_EXP_BIG_TABLES |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment