Skip to content

Instantly share code, notes, and snippets.

@Sc00bz
Created March 4, 2023 02:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Sc00bz/fbc9939629f70a8f9b92a1ed07904328 to your computer and use it in GitHub Desktop.
Save Sc00bz/fbc9939629f70a8f9b92a1ed07904328 to your computer and use it in GitHub Desktop.
"z[] += x[] * y" with ADX instruction set *untested*
# Related to https://abyssdomain.expert/@filippo/109925743627302756
#
# Multiply-accumulate over a vector of 64-bit words using MULX/ADCX/ADOX,
# processing two words per loop iteration plus one trailing word when the
# length is odd.
#
# // len(z) == len(x)... or len(z) <= len(x) and everything is based on len(z)
# // z[] += x[] * y
# // c = overflow (ie c is "z[len(z)]")
#
# NOTE(review): [x], [z], [y], [c], "len(z)" and the bracketed jump targets
# look like placeholders rather than real operands; substitute them to match
# the actual calling convention / assembler before use.
#
# Register roles:
#   RSI = base of x            RDI = base of z
#   RCX = len(z) / 2           (count of two-word iterations, consumed by "loop")
#   R11 = len(z)               (kept for the zero and odd-length checks)
#   RDX = y                    (implicit multiplicand of mulx)
#   RAX = current word index   (scaled by 8 in every address)
#   R8  = low half of the current product, then the word written back to z
#   R9/R10 = high halves of the products; they alternate so the high half of
#            one word is still available when the next word is summed
#
# Two independent carry chains run side by side:
#   CF (adcx) carries "low product + z[i]"
#   OF (adox) carries "+ high half of the previous product"
# Both are folded into the result at "done".
#
# func addMulVVW(z, x []uint, y uint) (c uint)
# {
lea RSI,[x] # RSI = "&x"
lea RDI,[z] # RDI = "&z"
mov RCX,"len(z)" # RCX = len(z)
mov R11,RCX # R11 = RCX (full length, RCX is about to be halved)
shr RCX,1 # RCX /= 2 (number of two-word iterations)
mov RDX,[y] # RDX = y
xor RAX,RAX # RAX = 0 (word index)
xor R10,R10 # R10 = 0 (no high half pending yet)
# Every path below reaches its first adcx/adox right after a "test", which
# leaves CF = OF = 0, so both carry chains start empty.
test R11,R11 # if (R11 != 0)
jz [done] # {
test RCX,RCX # if (RCX != 0)
jz [skip] # {   // RCX == 0 with R11 != 0 means exactly one word: go straight to the tail
test R11,1 # // This is for "jz [done]" after the loop: ZF = (len(z) is even), and nothing inside the loop modifies ZF
# for (; RCX != 0; RCX--)
next: # {
mulx R9,R8,[RSI+8*RAX] # R9, R8 = bits.Mul(x[RAX], y) // R9 = high, R8 = low; flags untouched
adcx R8,[RDI+8*RAX] # R8, CF = bits.Add(R8, z[RAX], CF)
adox R8,R10 # R8, OF = bits.Add(R8, R10, OF) // R10 = high half of the previous word
mov [RDI+8*RAX],R8 # z[RAX] = R8
mulx R10,R8,[RSI+8*RAX+8] # R10, R8 = bits.Mul(x[RAX + 1], y) // R10 = high, R8 = low
adcx R8,[RDI+8*RAX+8] # R8, CF = bits.Add(R8, z[RAX + 1], CF)
adox R8,R9 # R8, OF = bits.Add(R8, R9, OF) // R9 = high half of x[RAX] * y
mov [RDI+8*RAX+8],R8 # z[RAX + 1] = R8
lea RAX,[RAX+2] # RAX += 2 // lea instead of add so CF/OF/ZF stay intact
loop [next] # } // decrements RCX and branches while it is nonzero, without touching flags
# }
jz [done] # if (R11 % 2 != 0) // ZF still comes from "test R11,1" above
skip: # {
mulx R9,R8,[RSI+8*RAX] # R9, R8 = bits.Mul(x[RAX], y)
adcx R8,[RDI+8*RAX] # R8, CF = bits.Add(R8, z[RAX], CF)
adox R8,R10 # R8, OF = bits.Add(R8, R10, OF)
mov [RDI+8*RAX],R8 # z[RAX] = R8
mov R10,R9 # R10 = R9 // so "done" always finds the pending high half in R10
# }
done: # }
# RCX is zero on every path here: "loop" ran it down to 0, or it was already
# 0 when the loop was skipped. Adding it therefore adds only the carry flag.
adcx R10,RCX # R10 += CF // Note RCX == 0
adox R10,RCX # R10 += OF // Also "R10 += CF + OF" can't overflow
mov [c],R10 # c = R10
# }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment