Created
November 13, 2020 09:58
-
-
Save bjoto/dc22d593aa3ac63c2c90632de5ed82e0 to your computer and use it in GitHub Desktop.
csum_partial
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// https://lwn.net/Articles/679137/
// Build and run: make csum && ./csum
//            or: cc csum.c -o csum
// Expected output:
//   generic: ffe3 65507
//   x86 : ffffffe3 -29
#include <stdio.h>

/*
 * Checksum accumulator type taken and returned by the csum_partial()
 * variants in this file. Here it is simply an alias for unsigned int.
 */
typedef unsigned int __wsum;
/*
 * Fold a 32-bit sum into 16 bits with end-around carry.
 *
 * The first fold adds the upper and lower 16-bit halves, which can yield
 * a 17-bit value; the second fold adds that carry bit back in, so the
 * value returned always fits in 16 bits.
 */
static inline unsigned short from32to16(unsigned int x)
{
    /* add up 16-bit and 16-bit for 16+c bit */
    x = (x & 0xffff) + (x >> 16);
    /* add up carry.. */
    x = (x & 0xffff) + (x >> 16);
    return (unsigned short)x;
}
/*
 * Generic (plain C) checksum core.
 *
 * Adds up the `len` bytes at `buff` as 16-bit quantities with end-around
 * carry and returns the sum folded to 16 bits via from32to16().
 * Returns 0 when len <= 0.
 *
 * The buffer is consumed in stages so that every multi-byte load is
 * naturally aligned: an optional leading byte, an optional leading
 * 16-bit word, a run of 32-bit words, then the 2-byte and 1-byte tails.
 *
 * NOTE(review): the placement of the leading odd byte (shifted into the
 * high byte) and of the trailing byte (low byte) looks like the
 * little-endian variant of this routine - confirm before using on a
 * big-endian host.
 */
static unsigned int do_csum(const unsigned char *buff, int len)
{
    int odd = 0;
    unsigned int result = 0;

    if (len <= 0)
        return 0;

    /* Buffer starts on an odd address: consume one byte to reach
     * 16-bit alignment. The final sum is byte-swapped at the end to
     * compensate. */
    odd = 1 & (unsigned long) buff;
    if (odd) {
        result += (*buff << 8);
        len--;
        buff++;
    }
    if (len >= 2) {
        /* Consume one 16-bit word if needed to reach 32-bit alignment. */
        if (2 & (unsigned long) buff) {
            result += *(const unsigned short *) buff;
            len -= 2;
            buff += 2;
        }
        if (len >= 4) {
            /* `end` marks the last whole 32-bit word. */
            const unsigned char *end = buff + ((unsigned)len & ~3);
            unsigned int carry = 0;

            do {
                unsigned int w = *(const unsigned int *) buff;

                buff += 4;
                result += carry;
                result += w;
                /* unsigned wrap-around: the sum ended up below the addend */
                carry = (w > result);
            } while (buff < end);
            result += carry;
            /* Fold once so the tail additions below cannot overflow. */
            result = (result & 0xffff) + (result >> 16);
        }
        if (len & 2) {
            result += *(const unsigned short *) buff;
            buff += 2;
        }
    }
    if (len & 1)
        result += *buff;
    result = from32to16(result);
    /* Undo the one-byte shift introduced by the odd start address. */
    if (odd)
        result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
    return result;
}
/*
 * Generic csum_partial(): checksum `len` bytes at `buff` with do_csum()
 * and add the caller's running sum `wsum` to it with end-around carry.
 *
 * Because do_csum() folds its result to 16 bits, the value returned for
 * wsum == 0 fits in 16 bits.
 */
__wsum csum_partial(const void *buff, int len, __wsum wsum)
{
    unsigned int sum = (unsigned int)wsum;
    unsigned int result = do_csum(buff, len);

    /* add in old sum, and carry.. */
    result += sum;
    if (sum > result)
        result += 1;
    return (__wsum)result;
}
// x86-64 variant using inline assembly
/*
 * Add two 32-bit values with end-around carry: if a + b overflows
 * 32 bits, the carry is added back into the low bit of the result.
 *
 * On x86 this is an add followed by an add-with-carry of zero. Other
 * targets get an equivalent plain C implementation. `__asm__` is used
 * rather than `asm` so the code also builds in strict ISO modes
 * (-std=c99 / -std=c11).
 */
static inline unsigned opt_add32_with_carry(unsigned a, unsigned b)
{
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
    __asm__("addl %2,%0\n\t"
            "adcl $0,%0"
            : "=r" (a)
            : "0" (a), "rm" (b));
    return a;
#else
    unsigned sum = a + b;

    /* unsigned wrap-around means a carry occurred; add it back in */
    return sum + (sum < b);
#endif
}
/*
 * Fold a 32-bit value into 16 bits: add its upper and lower halves and
 * add any carry out of bit 15 back into the result.
 *
 * On x86 this is a 16-bit add followed by a 16-bit add-with-carry of
 * zero (the %w operand modifier selects the 16-bit sub-register).
 * Other targets get an equivalent plain C implementation. `__asm__` is
 * used rather than `asm` so the code also builds in strict ISO modes.
 */
static inline unsigned short opt_from32to16(unsigned a)
{
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
    unsigned short b = a >> 16;

    __asm__("addw %w2,%w0\n\t"
            "adcw $0,%w0\n"
            : "=r" (b)
            : "0" (b), "r" (a));
    return b;
#else
    unsigned sum = (a >> 16) + (a & 0xffff);

    /* add the carry out of bit 15 back in */
    sum = (sum & 0xffff) + (sum >> 16);
    return (unsigned short)sum;
#endif
}
/*
 * Do a 64-bit checksum on an arbitrary memory area.
 * Returns a 32bit checksum.
 *
 * This isn't as time critical as it used to be because many NICs
 * do hardware checksumming these days.
 *
 * Things tried and found to not make it faster:
 * Manual Prefetching
 * Unrolling to a 128-byte inner loop.
 * Using interleaving with more registers to break the carry chains.
 *
 * The buffer is consumed in stages so that every multi-byte load is
 * naturally aligned: an optional leading byte, 16-bit word and 32-bit
 * word, then 64-byte blocks, then single 64-bit words, then the 4-, 2-
 * and 1-byte tails.
 *
 * When the buffer starts on an odd address the result is additionally
 * folded to 16 bits and byte-swapped; otherwise the 32-bit sum is
 * returned unfolded.
 *
 * x86-64 only: the inline assembly uses 64-bit instructions and the
 * code relies on `unsigned long` being 64 bits wide (result >> 32).
 */
static unsigned opt_do_csum(const unsigned char *buff, unsigned len)
{
    unsigned odd, count;
    unsigned long result = 0;

    if (len == 0)
        return result;
    odd = 1 & (unsigned long) buff;
    if (odd) {
        result = *buff << 8;
        len--;
        buff++;
    }
    count = len >> 1;    /* nr of 16-bit words.. */
    if (count) {
        if (2 & (unsigned long) buff) {
            result += *(const unsigned short *)buff;
            count--;
            len -= 2;
            buff += 2;
        }
        count >>= 1;    /* nr of 32-bit words.. */
        if (count) {
            unsigned long zero;
            unsigned count64;

            if (4 & (unsigned long) buff) {
                result += *(const unsigned int *) buff;
                count--;
                len -= 4;
                buff += 4;
            }
            count >>= 1;    /* nr of 64-bit words.. */

            /* main loop using 64byte blocks */
            zero = 0;
            count64 = count >> 3;
            while (count64) {
                /*
                 * The block is read through the [src] register, so
                 * the compiler cannot see the memory access from the
                 * operands alone; the "memory" clobber tells it that
                 * the asm depends on memory contents.
                 */
                __asm__("addq 0*8(%[src]),%[res]\n\t"
                        "adcq 1*8(%[src]),%[res]\n\t"
                        "adcq 2*8(%[src]),%[res]\n\t"
                        "adcq 3*8(%[src]),%[res]\n\t"
                        "adcq 4*8(%[src]),%[res]\n\t"
                        "adcq 5*8(%[src]),%[res]\n\t"
                        "adcq 6*8(%[src]),%[res]\n\t"
                        "adcq 7*8(%[src]),%[res]\n\t"
                        "adcq %[zero],%[res]"
                        : [res] "=r" (result)
                        : [src] "r" (buff), [zero] "r" (zero),
                          "[res]" (result)
                        : "memory");
                buff += 64;
                count64--;
            }

            /* last up to 7 8byte blocks */
            count %= 8;
            while (count) {
                __asm__("addq %1,%0\n\t"
                        "adcq %2,%0\n"
                        : "=r" (result)
                        : "m" (*(const unsigned long *)buff),
                          "r" (zero), "0" (result));
                --count;
                buff += 8;
            }
            /* fold the 64-bit sum to 32 bits before adding the tails */
            result = opt_add32_with_carry(result>>32,
                                          result&0xffffffff);

            if (len & 4) {
                result += *(const unsigned int *) buff;
                buff += 4;
            }
        }
        if (len & 2) {
            result += *(const unsigned short *) buff;
            buff += 2;
        }
    }
    if (len & 1)
        result += *buff;
    result = opt_add32_with_carry(result>>32, result & 0xffffffff);
    if (odd) {
        /* undo the one-byte shift introduced by the odd start address */
        result = opt_from32to16(result);
        result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
    }
    return result;
}
/*
 * x86-64 csum_partial(): checksum `len` bytes at `buff` with
 * opt_do_csum() and add the caller's running sum `sum` to it with
 * end-around carry.
 *
 * Unlike the generic csum_partial(), the result is not folded to
 * 16 bits for even-aligned buffers; it is a 32-bit partial sum.
 *
 * A length <= 0 returns `sum` unchanged, matching the generic version.
 * Without the guard a negative length would be converted to a huge
 * unsigned count and opt_do_csum() would read far past the buffer.
 */
__wsum opt_csum_partial(const void *buff, int len, __wsum sum)
{
    if (len <= 0)
        return sum;
    return (__wsum)opt_add32_with_carry(opt_do_csum(buff, (unsigned)len),
                                        (unsigned int)sum);
}
int main() | |
{ | |
unsigned arr[] = {~0x1c}; | |
printf("generic: %x %d\n", csum_partial(arr, 4, 0), csum_partial(arr, 4, 0)); | |
printf("x86 : %x %d\n", opt_csum_partial(arr, 4, 0), opt_csum_partial(arr, 4, 0)); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment