Created
July 27, 2016 22:06
-
-
Save valera-rozuvan/28a0ec1dd706a66cf1f08138939d0db1 to your computer and use it in GitHub Desktop.
All assembly code from thargor6/mb3d project
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Scans the bytes Where[Start .. BSize-1] for the low byte of What.
// Returns the offset of the first match, measured from the start of Where,
// or -1 when the byte does not occur or the range is empty/inverted.
// The scan runs upwards, i.e. it relies on the direction flag being clear.
function FastLocateByte(const Where; Start, BSize: Integer; What: Word): Integer; assembler; pascal;
asm
  push  edi
  mov   ecx, [bsize]
  sub   ecx, [start]          // ecx = number of bytes to examine
  jle   @notfound             // zero or negative length: nothing to search
  mov   edi, [where]
  add   edi, [start]          // edi -> first byte to examine
  mov   ax, [what]            // scasb compares against al only
  repne scasb
  je    @found
@notfound:
  mov   eax, -1
  jmp   @end
@found:
  mov   eax, edi              // edi points one past the matching byte
  dec   eax
  sub   eax, [where]          // address -> offset from Where
@end:
  pop   edi
end;
// Scans the bytes where[start .. bsize-1] for the two-byte sequence
// (low byte of what, high byte of what). Returns the offset of the first
// byte of the match, measured from the start of where, or -1 if none.
// Both bytes of a match must lie inside the range: a first-byte hit on the
// very last byte is rejected without reading past the range.
// The scan runs upwards, i.e. it relies on the direction flag being clear.
function FastLocate2Bytes(const where; start, bsize: integer; what: word):integer; assembler; pascal; far;
asm
  push  edi
  mov   ecx, [bsize]
  sub   ecx, [start]          // ecx = number of bytes to examine
  jle   @notfound             // zero or negative length: nothing to search
  mov   edi, [where]
  add   edi, [start]
  mov   ax, [what]            // al = first byte, ah = second byte
@search:
  repne scasb
  jne   @notfound             // range exhausted without a first-byte hit
  test  ecx, ecx
  jz    @notfound             // hit on the last byte: no room for the second
  cmp   [edi], ah             // edi already points at the following byte
  jne   @search               // ecx > 0 here, so the scan resumes
  mov   eax, edi
  dec   eax                   // back to the first byte of the pair
  sub   eax, [where]
  jmp   @end
@notfound:
  mov   eax, -1
@end:
  pop   edi
end;
// Scans BSize consecutive dwords starting at Where for the value What.
// BSize is used directly as the dword count of the scan.
// Returns the zero-based dword index of the first match, or -1 when there
// is no match or BSize <= 0.
// The scan runs upwards, i.e. it relies on the direction flag being clear.
function FastLocateDWord(var Where; BSize: Integer; What: LongInt): Integer; assembler; register;
asm
  push  edi
  mov   edi, eax              // edi -> Where
  mov   eax, ecx              // eax = What
  mov   ecx, edx              // ecx = element count
  mov   edx, edi              // edx keeps the base address
  test  ecx, ecx
  jle   @notfound             // repne with ecx = 0 would leave the flags undefined
  repne scasd
  jne   @notfound
  mov   eax, edi              // edi points one dword past the match
  sub   eax, edx
  shr   eax, 2                // bytes -> dwords
  dec   eax
  jmp   @end
@notfound:
  mov   eax, -1
@end:
  pop   edi
end;
// Fills sizeof bytes at dest with zero: whole dwords first, then an
// optional trailing word and an optional trailing byte.
procedure ZeroMem( var dest; sizeof: integer ); assembler; register;
asm
  push  edi
  mov   edi, eax              // edi -> dest
  mov   ecx, edx
  xor   eax, eax              // fill pattern = 0
  shr   ecx, 2                // number of whole dwords
  rep   stosd
  test  dl, 2                 // bit 1 of the size: one word left?
  jz    @tailbyte
  stosw
@tailbyte:
  test  dl, 1                 // bit 0 of the size: one byte left?
  jz    @done
  stosb
@done:
  pop   edi
end;
// Stores Value into Count consecutive dwords starting at Dest.
procedure FillDWord(var Dest; Count: Integer; Value: Cardinal); assembler; register;
asm
  xchg  eax, ecx              // eax = Value, ecx = @Dest
  push  edi
  mov   edi, ecx              // edi -> Dest
  mov   ecx, edx              // ecx = Count
  rep   stosd
  pop   edi
end;
// Fills sizeof bytes at dest with the byte value fill: whole dwords first,
// then an optional trailing word and an optional trailing byte.
procedure FastFillChar( var dest; sizeof: integer; fill: byte ); assembler; register;
asm
  push  edi
  mov   edi, eax              // edi -> dest
  movzx eax, cl
  imul  eax, eax, $01010101   // replicate the fill byte into all four bytes
  mov   ecx, edx
  shr   ecx, 2                // number of whole dwords
  rep   stosd
  test  dl, 2                 // bit 1 of the size: one word left?
  jz    @tailbyte
  stosw
@tailbyte:
  test  dl, 1                 // bit 0 of the size: one byte left?
  jz    @done
  stosb
@done:
  pop   edi
end;
// Returns A with its two bytes exchanged.
function GetSwap2(A: Word): Word; assembler; register;
asm
  xchg  al, ah
end;
// Reverses the byte order of the dword A in place.
procedure Swap4(var A: Cardinal); assembler; register;
asm
  mov   edx, [eax]
  bswap edx
  mov   [eax], edx
end;
// Returns A with the order of its four bytes reversed.
function GetSwap4(A: Cardinal): Cardinal; assembler; register;
asm
  bswap eax                   // argument and result both live in eax
end;
// Exchanges the first dword at A with the first dword at B.
procedure SwapDWords(var A,B); assembler; register;
asm
  push  dword ptr [eax]       // park the old A on the stack
  mov   ecx, [edx]
  mov   [eax], ecx            // A := B
  pop   dword ptr [edx]       // B := old A
end;
// Returns $FFFFFFFF when any bit is set in the first three dwords at sv,
// otherwise 0. The test is done on the raw bit patterns.
function NotZeroSVec(sv: TPSVec): LongBool; //eax 0, $FFFFFFFF
asm
  mov   edx, [eax]
  or    edx, [eax + 4]
  or    edx, [eax + 8]
  neg   edx                   // CF := (edx <> 0)
  sbb   eax, eax              // eax := 0 - CF
end;
// Replaces each of the four singles at sv1 by max(value, 0) using SSE.
// No SupportSSE check is made here.
procedure Clamp0SvecSSE(sv1: TPSVec);
asm
  xorps  xmm0, xmm0           // four zeros
  movups xmm1, [eax]
  maxps  xmm1, xmm0           // destination = value, source = zero
  movups [eax], xmm1
end;
// Exchanges the three doubles at V1 with the three doubles at V2.
// All six values are loaded onto the FPU stack before anything is stored.
procedure FlipVecs(V1, V2: TPVec3D);
asm
  fld   qword ptr [eax]
  fld   qword ptr [eax + 8]
  fld   qword ptr [eax + 16]
  fld   qword ptr [edx]
  fld   qword ptr [edx + 8]
  fld   qword ptr [edx + 16]  // V2[2], V2[1], V2[0], V1[2], V1[1], V1[0]
  fstp  qword ptr [eax + 16]
  fstp  qword ptr [eax + 8]
  fstp  qword ptr [eax]       // V1 now holds the old V2
  fstp  qword ptr [edx + 16]
  fstp  qword ptr [edx + 8]
  fstp  qword ptr [edx]       // V2 now holds the old V1
end;
// Result := sv[0] * s03 + sv[1] * s059 + sv[2] * s011,
// replaced by 0 when ftst reports the sum as below zero (C0 set).
function YofSVec(sv: TPSVec): Single;
asm
  fld   dword ptr [eax]
  fmul  s03
  fld   dword ptr [eax + 4]
  fmul  s059
  faddp st(1), st(0)
  fld   dword ptr [eax + 8]
  fmul  s011
  faddp st(1), st(0)
  ftst
  fnstsw ax
  shr   ah, 1                 // CF := C0
  jnc   @@keep
  fstp  st(0)
  fldz
@@keep:
end;
// Returns the largest of sv[0], sv[1], sv[2] in ST(0).
// Uses scalar SSE when SupportSSE is non-zero, x87 compares otherwise.
function MaxOfSVec(sv: TPSVec): Single;
asm
  cmp   SupportSSE, 0
  jz    @@fpu
  movss xmm0, [eax]
  maxss xmm0, [eax + 4]
  maxss xmm0, [eax + 8]
  sub   esp, 4                // scratch slot to move xmm0 onto the FPU stack
  movss [esp], xmm0
  fld   dword ptr [esp]
  add   esp, 4
  ret
@@fpu:
  mov   edx, eax              // fnstsw overwrites ax, keep the pointer in edx
  fld   dword ptr [eax]
  fcom  dword ptr [edx + 4]
  fnstsw ax
  test  ah, 41H               // C3 or C0 set: st(0) is not greater
  jz    @@second
  fstp  st(0)
  fld   dword ptr [edx + 4]
@@second:
  fcom  dword ptr [edx + 8]
  fnstsw ax
  test  ah, 41H
  jz    @@done
  fstp  st(0)
  fld   dword ptr [edx + 8]
@@done:
end;
// Compares the seven bytes addressed by eax (d1) with the seven bytes
// addressed by edx (d2). Returns $FFFFFFFF when all match, else 0.
// The code treats both arguments as addresses; presumably Double7B is a
// 7-byte type passed by reference - confirm against its declaration.
function D7Bequal(d1, d2: Double7B): LongBool;
asm
  push  ecx
  mov   ecx, [eax]
  sub   ecx, [edx]            // bytes 0..3
  jnz   @@done
  mov   cx, [eax + 4]         // ecx is 0 here, so only cx carries data
  sub   cx, [edx + 4]         // bytes 4..5
  jnz   @@done
  mov   cl, [eax + 6]
  sub   cl, [edx + 6]         // byte 6
@@done:
  cmp   ecx, 1                // CF := (ecx = 0), i.e. no difference found
  sbb   eax, eax              // eax := $FFFFFFFF if equal, 0 otherwise
  pop   ecx
end;
// Expands the seven bytes at D7B into a Double: the least significant byte
// of the double is zero, bytes 1..7 are D7B[0..6]. Result is left in ST(0).
function D7BtoDouble(const D7B: Double7B): Double;
asm
  sub   esp, 8                // 8-byte scratch for the rebuilt double
  mov   edx, [eax]
  shl   edx, 8                // 0, D7B[0], D7B[1], D7B[2]
  mov   [esp], edx
  mov   edx, [eax + 3]        // D7B[3..6]
  mov   [esp + 4], edx
  fld   qword ptr [esp]
  add   esp, 8
end;
// Copies bytes 1..7 of the double D (read from the stack frame) to the
// seven bytes at eax (the result); the least significant byte is dropped.
function DoubleToD7B(const D: Double): Double7B;
asm
  mov   ecx, [ebp + 12]       // bytes 4..7 of D
  mov   edx, [ebp + 9]        // bytes 1..4 of D
  mov   [eax + 3], ecx        // result[3..6]
  mov   [eax], edx            // result[0..3]; byte 3 gets the same value twice
end;
// Scales the double vector at PDVec to length d32767 and stores the three
// components, rounded by fistp, as 16-bit integers at PsiLight[0..2].
// d1em100 is added to the squared length before the square root.
procedure MakeWNormalsFromDVec(PsiLight: TPLNormals; PDVec: TPVec3D);
asm
  fld   qword ptr [edx]
  fld   st(0)
  fmul  st(0), st(0)          // x^2, x
  fld   qword ptr [edx + 8]
  fld   st(0)
  fmul  st(0), st(0)          // y^2, y, x^2, x
  faddp st(2), st(0)          // y, x^2+y^2, x
  fld   qword ptr [edx + 16]
  fld   st(0)
  fmul  st(0), st(0)          // z^2, z, y, x^2+y^2, x
  faddp st(3), st(0)          // z, y, sum, x
  fxch  st(2)                 // sum, y, z, x
  fadd  d1em100
  fsqrt
  fdivr d32767                // st(0) := d32767 / length
  fmul  st(3), st(0)
  fmul  st(2), st(0)
  fmulp st(1), st(0)          // y', z', x'
  fistp word ptr [eax + 2]
  fistp word ptr [eax + 4]
  fistp word ptr [eax]
end;
// Raises base to a power-of-two exponent by repeated squaring.
// Returns 0 when base compares below zero. Otherwise base is squared once
// per bit that remains after shifting expo right by one, and at least
// once, so expo values below 2 also yield base^2.
function FastIntPow(const base: Single; const expo: Integer): Single; //powers with expo in 2^x x in[1..much] for spec painting, if ipol, expo could be float!
asm
  fld   dword ptr [ebp + 8]   // base
  mov   edx, eax              // expo; fnstsw is about to overwrite ax
  ftst
  fnstsw ax
  shr   ah, 1                 // CF := C0
  jnc   @@square
  fstp  st(0)
  fldz
  jmp   @@done
@@square:
  shr   edx, 1
@@loop:
  fmul  st(0), st(0)
  shr   edx, 1
  jnz   @@loop
@@done:
end;
// Returns d, or 0 when ftst reports d as below zero (C0 set).
function Clamp0D(const d: Double): Double;
asm
  fld   qword ptr [ebp + 8]
  ftst
  fnstsw ax
  shr   ah, 1                 // CF := C0
  jnc   @@keep
  fstp  st(0)
  fldz
@@keep:
end;
// Limits sv to the range 0..1: values that compare below zero become 0,
// values above one become 1.
function Clamp01S(const sv: Single): Single;
asm
  fld   dword ptr [ebp + 8]
  ftst
  fnstsw ax
  shr   ah, 1                 // CF := C0 (value < 0)
  jnc   @@upper
  fstp  st(0)
  fldz
  jmp   @@done
@@upper:
  fld1
  fcomp st(1)                 // compare 1 with the value, then pop the 1
  fnstsw ax
  shr   ah, 1                 // CF := C0 (1 < value)
  jnc   @@done
  fstp  st(0)
  fld1
@@done:
end; //ret 4
// Limits dv to the range 0..1: values that compare below zero become 0,
// values above one become 1.
function Clamp01D(const dv: Double): Double;
asm
  fld   qword ptr [ebp + 8]
  ftst
  fnstsw ax
  shr   ah, 1                 // CF := C0 (value < 0)
  jnc   @@upper
  fstp  st(0)
  fldz
  jmp   @@done
@@upper:
  fld1
  fcomp st(1)                 // compare 1 with the value, then pop the 1
  fnstsw ax
  shr   ah, 1                 // CF := C0 (1 < value)
  jnc   @@done
  fstp  st(0)
  fld1
@@done:
end;
// Computes four single-precision weights from xs into the result at eax:
//   R3 = xs^3 * d1d6
//   R0 = d1d6 + s05 * xs * (xs - 1) - R3
//   R2 = xs + R0 - 2 * R3
//   R1 = 1 - R2 - R3 - R0
function MakeSplineCoeff(const xs: Double): TSVec;
asm
  fld   d1d6
  fld   qword ptr [ebp + 8]   // xs, d1d6
  fld   st(0)
  fmul  st(0), st(0)
  fmul  st(0), st(1)
  fmul  st(0), st(2)          // xs^3 * d1d6
  fst   dword ptr [eax + 12]  // R3, xs, d1d6
  fld1
  fsub  st(0), st(2)          // 1-xs, R3, xs, d1d6
  fmul  st(0), st(2)
  fmul  s05
  fsubp st(3), st(0)          // R3, xs, d1d6 + s05*xs*(xs-1)
  fsub  st(2), st(0)          // R3, xs, R0
  fxch  st(1)                 // xs, R3, R0
  fadd  st(0), st(2)
  fsub  st(0), st(1)
  fsub  st(0), st(1)          // R2, R3, R0
  fst   dword ptr [eax + 8]
  fld1
  fsubrp                      // 1-R2, R3, R0
  fsubrp                      // 1-R2-R3, R0
  fsub  st(0), st(1)          // R1, R0
  fstp  dword ptr [eax + 4]
  fstp  dword ptr [eax]
end;
// Result[i] := sv1[i] + sv2[i] * w2 for i = 0..2; Result[3] := 0.
function Add2SVecsWeight2(const sv1, sv2: TSVec; const w2: Single): TSVec;
asm
  fld   dword ptr [edx]
  fld   dword ptr [edx + 4]
  fld   dword ptr [edx + 8]
  fld   dword ptr [ebp + 8]   // w2, sv2[2], sv2[1], sv2[0]
  fmul  st(3), st(0)
  fmul  st(2), st(0)
  fmulp st(1), st(0)          // weighted sv2[2], sv2[1], sv2[0]
  fadd  dword ptr [eax + 8]
  fstp  dword ptr [ecx + 8]
  fadd  dword ptr [eax + 4]
  fstp  dword ptr [ecx + 4]
  fadd  dword ptr [eax]
  fstp  dword ptr [ecx]
  xor   eax, eax
  mov   [ecx + 12], eax
end;
// Result[i] := sv2[i] + (sv1[i] - sv2[i]) * w1 for all four components.
// Uses packed SSE when SupportSSE is non-zero; the x87 path occupies all
// eight FPU stack registers.
function LinInterpolate2SVecs(const sv1, sv2: TSVec; const w1: Single): TSVec;
asm
  cmp   SupportSSE, 0
  jz    @@fpu
  movss  xmm2, [ebp + 8]
  movups xmm0, [eax]
  movups xmm1, [edx]
  shufps xmm2, xmm2, 0        // broadcast w1
  subps  xmm0, xmm1
  mulps  xmm0, xmm2
  addps  xmm0, xmm1
  movups [ecx], xmm0
  pop   ebp                   // hand-written exit: undo the frame, drop w1
  ret   4
@@fpu:
  fld   dword ptr [edx]
  fld   dword ptr [edx + 4]
  fld   dword ptr [edx + 8]
  fld   dword ptr [edx + 12]
  fld   dword ptr [ebp + 8]
  fld   dword ptr [eax]
  fld   dword ptr [eax + 4]
  fld   dword ptr [eax + 8]   // s12,s11,s10,w1,s23,s22,s21,s20
  fsub  st(0), st(5)
  fmul  st(0), st(3)
  faddp st(5), st(0)          // s11,s10,w1,s23,r2,s21,s20
  fsub  st(0), st(5)
  fmul  st(0), st(2)
  faddp st(5), st(0)          // s10,w1,s23,r2,r1,s20
  fsub  st(0), st(5)
  fmul  st(0), st(1)
  faddp st(5), st(0)          // w1,s23,r2,r1,r0
  fld   dword ptr [eax + 12]
  fsub  st(0), st(2)          // s13-s23,w1,s23,r2,r1,r0
  fmulp st(1), st(0)          // (s13-s23)*w1,s23,r2,r1,r0
  faddp st(1), st(0)          // r3,r2,r1,r0
  fstp  dword ptr [ecx + 12]
  fstp  dword ptr [ecx + 8]
  fstp  dword ptr [ecx + 4]
  fstp  dword ptr [ecx]
end;
// Result[i] := sv1[i] * w1 + sv2[i] * w2 for i = 0..2.
// The x87 path stores 0 into Result[3]; the SSE path multiplies the fourth
// lanes by a zero weight (shuffle mask $C0 keeps lane 3 of the movss load).
function Add2SVecsWeight(const sv1, sv2: TSVec; const w1, w2: Single): TSVec;
asm
  cmp   SupportSSE, 0
  jz    @@fpu
  movss  xmm2, [ebp + 12]     // w1
  movss  xmm3, [ebp + 8]      // w2
  movups xmm0, [eax]
  movups xmm1, [edx]
  shufps xmm2, xmm2, $C0      // w1, w1, w1, 0
  shufps xmm3, xmm3, $C0      // w2, w2, w2, 0
  mulps  xmm0, xmm2
  mulps  xmm1, xmm3
  addps  xmm0, xmm1
  movups [ecx], xmm0
  pop   ebp                   // hand-written exit: undo the frame, drop w1/w2
  ret   8
@@fpu:
  fld   dword ptr [edx]
  fld   dword ptr [edx + 4]
  fld   dword ptr [edx + 8]
  fld   dword ptr [ebp + 8]   // w2
  fmul  st(3), st(0)
  fmul  st(2), st(0)
  fmulp st(1), st(0)          // s22,s21,s20 (weighted)
  fld   dword ptr [eax]
  fld   dword ptr [eax + 4]
  fld   dword ptr [eax + 8]
  fld   dword ptr [ebp + 12]  // w1
  fmul  st(3), st(0)
  fmul  st(2), st(0)
  fmulp st(1), st(0)          // s12,s11,s10,s22,s21,s20 (weighted)
  xor   eax, eax              // sv1 pointer no longer needed
  faddp st(3), st(0)          // s11,s10,r2,s21,s20
  faddp st(3), st(0)          // s10,r2,r1,s20
  faddp st(3), st(0)          // r2,r1,r0
  fstp  dword ptr [ecx + 8]
  fstp  dword ptr [ecx + 4]
  fstp  dword ptr [ecx]
  mov   [ecx + 12], eax
end;
// Sets the four dwords at sv to zero.
procedure ClearSVec(var sv: TSVec);
asm
  mov   dword ptr [eax], 0
  mov   dword ptr [eax + 4], 0
  mov   dword ptr [eax + 8], 0
  mov   dword ptr [eax + 12], 0
end;
// Sets the three doubles at dv to +0.0 (24 zero bytes).
procedure ClearDVec(var dv: TVec3D);
asm
  xor   edx, edx
  mov   [eax], edx
  mov   [eax + 4], edx
  mov   [eax + 8], edx
  mov   [eax + 12], edx
  mov   [eax + 16], edx
  mov   [eax + 20], edx
end;
// In place: v[3] := 0, then v[i] := sqrt(max(v[i], 0)).
// The SSE path processes all four lanes; the x87 path processes v[0..2],
// mapping components that compare below zero to 0.
procedure mClampSqrtSVecV(v: TPSVec);
asm
  xor   edx, edx
  mov   [eax + 12], edx
  cmp   SupportSSE, 0
  jz    @@fpu
  movups xmm0, [eax]
  xorps  xmm1, xmm1
  maxps  xmm0, xmm1
  sqrtps xmm0, xmm0
  movups [eax], xmm0
  ret
@@fpu:
  mov   edx, eax              // fnstsw overwrites ax, keep the pointer in edx
  fld   dword ptr [edx]
  ftst
  fnstsw ax
  shr   ah, 1                 // CF := C0
  jnc   @@root0
  fstp  st(0)
  fldz
  jmp   @@store0
@@root0:
  fsqrt
@@store0:
  fstp  dword ptr [edx]
  fld   dword ptr [edx + 4]
  ftst
  fnstsw ax
  shr   ah, 1
  jnc   @@root1
  fstp  st(0)
  fldz
  jmp   @@store1
@@root1:
  fsqrt
@@store1:
  fstp  dword ptr [edx + 4]
  fld   dword ptr [edx + 8]
  ftst
  fnstsw ax
  shr   ah, 1
  jnc   @@root2
  fstp  st(0)
  fldz
  jmp   @@store2
@@root2:
  fsqrt
@@store2:
  fstp  dword ptr [edx + 8]
end;
// In place: v[3] := 0, then v[i] := sqr(max(v[i], 0)).
// The SSE path processes all four lanes; the x87 path processes v[0..2],
// mapping components that compare below zero to 0 before squaring.
procedure mClampSqrSVecV(v: TPSVec);
asm
  xor   edx, edx
  mov   [eax + 12], edx
  cmp   SupportSSE, 0
  jz    @@fpu
  movups xmm0, [eax]
  xorps  xmm1, xmm1
  maxps  xmm0, xmm1
  mulps  xmm0, xmm0
  movups [eax], xmm0
  ret
@@fpu:
  mov   edx, eax              // fnstsw overwrites ax, keep the pointer in edx
  fld   dword ptr [edx]
  ftst
  fnstsw ax
  shr   ah, 1                 // CF := C0
  jnc   @@sqr0
  fstp  st(0)
  fldz
@@sqr0:
  fmul  st(0), st(0)
  fstp  dword ptr [edx]
  fld   dword ptr [edx + 4]
  ftst
  fnstsw ax
  shr   ah, 1
  jnc   @@sqr1
  fstp  st(0)
  fldz
@@sqr1:
  fmul  st(0), st(0)
  fstp  dword ptr [edx + 4]
  fld   dword ptr [edx + 8]
  ftst
  fnstsw ax
  shr   ah, 1
  jnc   @@sqr2
  fstp  st(0)
  fldz
@@sqr2:
  fmul  st(0), st(0)
  fstp  dword ptr [edx + 8]
end;
// Result := x / (y - x * y + x)
function FastPow(const x, y: Single): Single; //used by vis light 3
asm
  fld   dword ptr [ebp + 12]  // x
  fld   st(0)                 // x, x
  fmul  dword ptr [ebp + 8]   // x*y, x
  fsubr dword ptr [ebp + 8]   // y - x*y, x
  fadd  st(0), st(1)          // y - x*y + x, x
  fdivp                       // x / (y - x*y + x)
end;
// Reads three signed 16-bit integers at PsiLight, multiplies each by d3
// and stores them as singles in Result[0..2]; Result[3] := 0.
function MakeSVecFromNormalsD(PsiLight: Pointer): TSVec;
const d3: Double = 3.0518509476e-5;
asm
  fild  word ptr [eax]
  fild  word ptr [eax + 2]
  fild  word ptr [eax + 4]
  fld   d3                    // d3, n2, n1, n0
  fmul  st(3), st(0)
  fmul  st(2), st(0)
  fmulp st(1), st(0)          // n2*d3, n1*d3, n0*d3
  xor   eax, eax
  fstp  dword ptr [edx + 8]
  fstp  dword ptr [edx + 4]
  fstp  dword ptr [edx]
  mov   [edx + 12], eax
end;
// Result[i] := min(max(V1[i], smin), smax) on all four lanes, using SSE.
// No SupportSSE check is made here.
function MinMaxSVecSSE(const smin, smax: Single; const V1: TSVec): TSVec;
asm
  movups xmm0, [eax]
  movss  xmm1, [ebp + 12]     // smin
  shufps xmm1, xmm1, 0        // broadcast to all lanes
  maxps  xmm0, xmm1
  movss  xmm2, [ebp + 8]      // smax
  shufps xmm2, xmm2, 0
  minps  xmm0, xmm2
  movups [edx], xmm0
end;
// Result[i] := sqrt(V1[i]).
// The SSE path takes the root of all four lanes; the x87 path handles
// components 0..2 and stores 0 into Result[3].
function mSqrtSVec(const V1: TSVec): TSVec;
asm
  cmp   SupportSSE, 0
  jz    @@fpu
  movups xmm0, [eax]
  sqrtps xmm0, xmm0
  movups [edx], xmm0
  ret
@@fpu:
  fld   dword ptr [eax]
  fld   dword ptr [eax + 4]
  fld   dword ptr [eax + 8]   // v2, v1, v0
  fsqrt
  xor   eax, eax              // V1 pointer no longer needed
  fstp  dword ptr [edx + 8]
  fsqrt
  fstp  dword ptr [edx + 4]
  fsqrt
  fstp  dword ptr [edx]
  mov   [edx + 12], eax
end;
// Result := Sqrt(Sqr(V[0]) + Sqr(V[1]) + Sqr(V[2]))
function LengthOfVec(const V: TVec3D): Double;
asm
  fld   qword ptr [eax]
  fmul  st(0), st(0)
  fld   qword ptr [eax + 8]
  fmul  st(0), st(0)
  faddp st(1), st(0)
  fld   qword ptr [eax + 16]
  fmul  st(0), st(0)
  faddp st(1), st(0)
  fsqrt
end;
// Result := Sqr(V[0]) + Sqr(V[1]) + Sqr(V[2])
function SqrLengthOfVec(const V: TVec3D): Double;
asm
  fld   qword ptr [eax]
  fmul  st(0), st(0)
  fld   qword ptr [eax + 8]
  fmul  st(0), st(0)
  faddp st(1), st(0)
  fld   qword ptr [eax + 16]
  fmul  st(0), st(0)
  faddp st(1), st(0)
end;
// Result := Sqr(V[0]) + Sqr(V[1]) + Sqr(V[2]); V is read via eax,
// the result is left in ST(0).
function SqrLengthOfSVec(const V: TSVec): Single;
asm
  fld   dword ptr [eax]
  fmul  st(0), st(0)
  fld   dword ptr [eax + 4]
  fmul  st(0), st(0)
  faddp st(1), st(0)
  fld   dword ptr [eax + 8]
  fmul  st(0), st(0)
  faddp st(1), st(0)
end;
// Result := V^ / Sqrt(V[0]^2 + V[1]^2 + V[2]^2 + d1em100).
// V[2] is re-read from memory for the final multiply, so at most four FPU
// stack slots are in use at any time.
function NormaliseVector(V: TPVec3D): TVec3D;
asm
  fld   qword ptr [eax]
  fld   st(0)
  fmul  st(0), st(0)          // v0^2, v0
  fld   qword ptr [eax + 8]
  fld   st(0)
  fmul  st(0), st(0)          // v1^2, v1, v0^2, v0
  faddp st(2), st(0)          // v1, v0^2+v1^2, v0
  fld   qword ptr [eax + 16]
  fmul  st(0), st(0)          // v2^2, v1, v0^2+v1^2, v0
  fadd  d1em100
  faddp st(2), st(0)          // v1, sum, v0
  fxch  st(1)                 // sum, v1, v0
  fsqrt                       // len, v1, v0
  fld1
  fdivrp                      // 1/len, v1, v0
  fmul  st(2), st(0)
  fmul  st(1), st(0)
  fmul  qword ptr [eax + 16]  // v2', v1', v0'
  fstp  qword ptr [edx + 16]
  fstp  qword ptr [edx + 8]
  fstp  qword ptr [edx]
end;
// In place: V := V / Sqrt(V[0]^2 + V[1]^2 + V[2]^2 + d1em100).
procedure NormaliseVectorVar(var V: TVec3D);
asm
  fld   qword ptr [eax]
  fld   st(0)
  fmul  st(0), st(0)          // v0^2, v0
  fld   qword ptr [eax + 8]
  fld   st(0)
  fmul  st(0), st(0)          // v1^2, v1, v0^2, v0
  faddp st(2), st(0)          // v1, v0^2+v1^2, v0
  fld   qword ptr [eax + 16]
  fld   st(0)
  fmul  st(0), st(0)          // v2^2, v2, v1, v0^2+v1^2, v0
  fadd  d1em100
  faddp st(3), st(0)          // v2, v1, sum, v0
  fxch  st(2)                 // sum, v1, v2, v0
  fsqrt
  fld1
  fdivrp                      // 1/len, v1, v2, v0
  fmul  st(3), st(0)
  fmul  st(2), st(0)
  fmulp st(1), st(0)          // v1', v2', v0'
  fstp  qword ptr [eax + 8]
  fstp  qword ptr [eax + 16]
  fstp  qword ptr [eax]
end;
// In place, on components 0..2 of the single-precision vector:
// V := V / Sqrt(V[0]^2 + V[1]^2 + V[2]^2 + s1em30).
procedure NormaliseSVectorVar(var V: TSVec);
asm
  fld   dword ptr [eax]
  fld   st(0)
  fmul  st(0), st(0)          // v0^2, v0
  fld   dword ptr [eax + 4]
  fld   st(0)
  fmul  st(0), st(0)          // v1^2, v1, v0^2, v0
  faddp st(2), st(0)          // v1, v0^2+v1^2, v0
  fld   dword ptr [eax + 8]
  fld   st(0)
  fmul  st(0), st(0)          // v2^2, v2, v1, v0^2+v1^2, v0
  fadd  s1em30
  faddp st(3), st(0)          // v2, v1, sum, v0
  fxch  st(2)                 // sum, v1, v2, v0
  fsqrt
  fld1
  fdivrp                      // 1/len, v1, v2, v0
  fmul  st(3), st(0)
  fmul  st(2), st(0)
  fmulp st(1), st(0)          // v1', v2', v0'
  fstp  dword ptr [eax + 4]
  fstp  dword ptr [eax + 8]
  fstp  dword ptr [eax]
end;
// Result := V * n / Sqrt(V[0]^2 + V[1]^2 + V[2]^2 + d1em100),
// i.e. V rescaled to length n. n is read from the stack frame.
function NormaliseVectorTo(const n: Double; const V: TVec3D): TVec3D; overload;
asm
  fld   qword ptr [eax]
  fld   st(0)
  fmul  st(0), st(0)          // v0^2, v0
  fld   qword ptr [eax + 8]
  fld   st(0)
  fmul  st(0), st(0)          // v1^2, v1, v0^2, v0
  faddp st(2), st(0)          // v1, v0^2+v1^2, v0
  fld   qword ptr [eax + 16]
  fld   st(0)
  fmul  st(0), st(0)          // v2^2, v2, v1, v0^2+v1^2, v0
  fadd  d1em100
  faddp st(3), st(0)          // v2, v1, sum, v0
  fxch  st(2)                 // sum, v1, v2, v0
  fsqrt
  fld   qword ptr [ebp + 8]   // n
  fdivrp                      // n/len, v1, v2, v0
  fmul  st(3), st(0)
  fmul  st(2), st(0)
  fmulp st(1), st(0)          // v1', v2', v0'
  fstp  qword ptr [edx + 8]
  fstp  qword ptr [edx + 16]
  fstp  qword ptr [edx]
end;
// In place: V^ := V^ * n / Sqrt(V[0]^2 + V[1]^2 + V[2]^2 + d1em100),
// i.e. V^ rescaled to length n. n is read from the stack frame.
procedure NormaliseVectorTo(const n: Double; V: TPVec3D); overload;
asm
  fld   qword ptr [eax]
  fld   st(0)
  fmul  st(0), st(0)          // v0^2, v0
  fld   qword ptr [eax + 8]
  fld   st(0)
  fmul  st(0), st(0)          // v1^2, v1, v0^2, v0
  faddp st(2), st(0)          // v1, v0^2+v1^2, v0
  fld   qword ptr [eax + 16]
  fld   st(0)
  fmul  st(0), st(0)          // v2^2, v2, v1, v0^2+v1^2, v0
  fadd  d1em100
  faddp st(3), st(0)          // v2, v1, sum, v0
  fxch  st(2)                 // sum, v1, v2, v0
  fsqrt
  fld   qword ptr [ebp + 8]   // n
  fdivrp                      // n/len, v1, v2, v0
  fmul  st(3), st(0)
  fmul  st(2), st(0)
  fmulp st(1), st(0)          // v1', v2', v0'
  fstp  qword ptr [eax + 8]
  fstp  qword ptr [eax + 16]
  fstp  qword ptr [eax]
end;
// Result[0..2] := V[0..2] / Sqrt(V[0]^2 + V[1]^2 + V[2]^2 + d1em100).
// Result[3] is not written by this routine.
function NormaliseSVector(const V: TSVec): TSVec; //..in SSE
asm
  fld   dword ptr [eax]
  fld   st(0)
  fmul  st(0), st(0)          // v0^2, v0
  fld   dword ptr [eax + 4]
  fld   st(0)
  fmul  st(0), st(0)          // v1^2, v1, v0^2, v0
  faddp st(2), st(0)          // v1, v0^2+v1^2, v0
  fld   dword ptr [eax + 8]
  fld   st(0)
  fmul  st(0), st(0)          // v2^2, v2, v1, v0^2+v1^2, v0
  fadd  d1em100
  faddp st(3), st(0)          // v2, v1, sum, v0
  fxch  st(2)                 // sum, v1, v2, v0
  fsqrt
  fld1
  fdivrp                      // 1/len, v1, v2, v0
  fmul  st(3), st(0)
  fmul  st(2), st(0)
  fmulp st(1), st(0)          // v1', v2', v0'
  fstp  dword ptr [edx + 4]
  fstp  dword ptr [edx + 8]
  fstp  dword ptr [edx]
end;
// Rescales sv[0..2] to length d32767 and stores the components, rounded
// by fistp, as three 16-bit integers at pn. d1em100 is added to the
// squared length before the square root.
procedure SVecToNormals(const sv: TSVec; pn: Pointer);
const d32767: Double = 32767;
asm
  fld   dword ptr [eax]
  fld   st(0)
  fmul  st(0), st(0)          // v0^2, v0
  fld   dword ptr [eax + 4]
  fld   st(0)
  fmul  st(0), st(0)          // v1^2, v1, v0^2, v0
  faddp st(2), st(0)          // v1, v0^2+v1^2, v0
  fld   dword ptr [eax + 8]
  fld   st(0)
  fmul  st(0), st(0)          // v2^2, v2, v1, v0^2+v1^2, v0
  fadd  d1em100
  faddp st(3), st(0)          // v2, v1, sum, v0
  fxch  st(2)                 // sum, v1, v2, v0
  fsqrt
  fld   d32767
  fdivrp                      // d32767/len, v1, v2, v0
  fmul  st(3), st(0)
  fmul  st(2), st(0)
  fmulp st(1), st(0)          // v1', v2', v0'
  fistp word ptr [edx + 2]
  fistp word ptr [edx + 4]
  fistp word ptr [edx + 0]
end;
// In place 3x3 transform of the double vector at V. With M read as nine
// consecutive doubles m0..m8:
//   V0' = m0*V0 + m1*V1 + m2*V2
//   V1' = m3*V0 + m4*V1 + m5*V2
//   V2' = m6*V0 + m7*V1 + m8*V2
procedure RotateVector(V: TPVec3D; M: TPMatrix3); //is like reversed S version
asm
  fld   qword ptr [edx]
  fld   qword ptr [edx + 24]
  fld   qword ptr [edx + 48]
  fld   qword ptr [eax]       // V0, m6, m3, m0
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)          // partial sums for V2', V1', V0'
  fld   qword ptr [edx + 8]
  fld   qword ptr [edx + 32]
  fld   qword ptr [edx + 56]
  fld   qword ptr [eax + 8]   // V1, m7, m4, m1, ...
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  fld   qword ptr [edx + 16]
  fld   qword ptr [edx + 40]
  fld   qword ptr [edx + 64]
  fld   qword ptr [eax + 16]  // V2, m8, m5, m2, ...
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)          // V2', V1', V0'
  fstp  qword ptr [eax + 16]
  fstp  qword ptr [eax + 8]
  fstp  qword ptr [eax]
end;
// In place 3x3 transform of the double vector at V. With M read as nine
// consecutive doubles m0..m8:
//   V0' = m0*V0 + m3*V1 + m6*V2
//   V1' = m1*V0 + m4*V1 + m7*V2
//   V2' = m2*V0 + m5*V1 + m8*V2
procedure RotateVectorReverse(V: TPVec3D; M: TPMatrix3);
asm
  fld   qword ptr [edx]
  fld   qword ptr [edx + 8]
  fld   qword ptr [edx + 16]
  fld   qword ptr [eax]       // V0, m2, m1, m0
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)          // partial sums for V2', V1', V0'
  fld   qword ptr [edx + 24]
  fld   qword ptr [edx + 32]
  fld   qword ptr [edx + 40]
  fld   qword ptr [eax + 8]   // V1, m5, m4, m3, ...
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  fld   qword ptr [edx + 48]
  fld   qword ptr [edx + 56]
  fld   qword ptr [edx + 64]
  fld   qword ptr [eax + 16]  // V2, m8, m7, m6, ...
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)          // V2', V1', V0'
  fstp  qword ptr [eax + 16]
  fstp  qword ptr [eax + 8]
  fstp  qword ptr [eax]
end;
// In place 3x3 transform of the single-precision vector at V by the
// double-precision matrix M. With M read as nine consecutive doubles:
//   V0' = m0*V0 + m3*V1 + m6*V2
//   V1' = m1*V0 + m4*V1 + m7*V2
//   V2' = m2*V0 + m5*V1 + m8*V2
procedure RotateSVector(V: TPSVec; M: TPMatrix3);
asm
  fld   qword ptr [edx]
  fld   qword ptr [edx + 8]
  fld   qword ptr [edx + 16]
  fld   dword ptr [eax]       // V0, m2, m1, m0
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)          // partial sums for V2', V1', V0'
  fld   qword ptr [edx + 24]
  fld   qword ptr [edx + 32]
  fld   qword ptr [edx + 40]
  fld   dword ptr [eax + 4]   // V1, m5, m4, m3, ...
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  fld   qword ptr [edx + 48]
  fld   qword ptr [edx + 56]
  fld   qword ptr [edx + 64]
  fld   dword ptr [eax + 8]   // V2, m8, m7, m6, ...
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)          // V2', V1', V0'
  fstp  dword ptr [eax + 8]
  fstp  dword ptr [eax + 4]
  fstp  dword ptr [eax]
end;
// In place 3x3 transform of the single-precision vector at V by the
// double-precision matrix M. With M read as nine consecutive doubles:
//   V0' = m0*V0 + m1*V1 + m2*V2
//   V1' = m3*V0 + m4*V1 + m5*V2
//   V2' = m6*V0 + m7*V1 + m8*V2
procedure RotateSVectorReverse(V: TPSVec; M: TPMatrix3);
asm
  fld   qword ptr [edx]
  fld   qword ptr [edx + 24]
  fld   qword ptr [edx + 48]
  fld   dword ptr [eax]       // V0, m6, m3, m0
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)          // partial sums for V2', V1', V0'
  fld   qword ptr [edx + 8]
  fld   qword ptr [edx + 32]
  fld   qword ptr [edx + 56]
  fld   dword ptr [eax + 4]   // V1, m7, m4, m1, ...
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  fld   qword ptr [edx + 16]
  fld   qword ptr [edx + 40]
  fld   qword ptr [edx + 64]
  fld   dword ptr [eax + 8]   // V2, m8, m5, m2, ...
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)          // V2', V1', V0'
  fstp  dword ptr [eax + 8]
  fstp  dword ptr [eax + 4]
  fstp  dword ptr [eax]
end;
// In place transform of the single vector at V (eax) by the single matrix
// at M (edx), whose rows are read at a 16-byte stride:
//   Vc' = M[0,c]*V0 + M[1,c]*V1 + M[2,c]*V2
// The SSE path computes and stores four lanes (c = 0..3); the x87 path
// computes and stores c = 0..2 only.
procedure RotateSVectorS(V: TPSVec; M: TPSMatrix3); //in calcpixelcol
asm
  cmp   SupportSSE, 0
  jz    @@fpu
  movss  xmm0, [eax]
  movss  xmm1, [eax + 4]
  movss  xmm2, [eax + 8]
  shufps xmm0, xmm0, 0        // broadcast V0
  shufps xmm1, xmm1, 0        // broadcast V1
  shufps xmm2, xmm2, 0        // broadcast V2
  movups xmm4, [edx]
  movups xmm5, [edx + 16]
  movups xmm6, [edx + 32]
  mulps  xmm4, xmm0           // row0 * V0
  mulps  xmm5, xmm1           // row1 * V1
  mulps  xmm6, xmm2           // row2 * V2
  addps  xmm4, xmm5
  addps  xmm4, xmm6
  movups [eax], xmm4
  ret
@@fpu:
  fld   dword ptr [edx]
  fld   dword ptr [edx + 4]
  fld   dword ptr [edx + 8]   // M[0,2], M[0,1], M[0,0]
  fld   dword ptr [eax]       // V0 on top
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)          // M[0,2]*V0, M[0,1]*V0, M[0,0]*V0
  fld   dword ptr [edx + 16]
  fld   dword ptr [edx + 20]
  fld   dword ptr [edx + 24]
  fld   dword ptr [eax + 4]   // V1 on top of row 1
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)          // rows 0 and 1 accumulated
  fld   dword ptr [edx + 32]
  fld   dword ptr [edx + 36]
  fld   dword ptr [edx + 40]
  fld   dword ptr [eax + 8]   // V2 on top of row 2
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  fstp  dword ptr [eax + 8]   // V2' = M[0,2]*V0 + M[1,2]*V1 + M[2,2]*V2
  fstp  dword ptr [eax + 4]   // V1' = M[0,1]*V0 + M[1,1]*V1 + M[2,1]*V2
  fstp  dword ptr [eax]       // V0' = M[0,0]*V0 + M[1,0]*V1 + M[2,0]*V2
end;
// In place transform of the double vector at V by the single-precision
// matrix at M, whose rows are read at a 16-byte stride:
//   Vc' = M[0,c]*V0 + M[1,c]*V1 + M[2,c]*V2   for c = 0..2
procedure RotateVectorS(V: TPVec3D; M: TPSMatrix3);
asm
  fld   dword ptr [edx]
  fld   dword ptr [edx + 4]
  fld   dword ptr [edx + 8]   // M[0,2], M[0,1], M[0,0]
  fld   qword ptr [eax]       // V0 on top
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)          // M[0,2]*V0, M[0,1]*V0, M[0,0]*V0
  fld   dword ptr [edx + 16]
  fld   dword ptr [edx + 20]
  fld   dword ptr [edx + 24]
  fld   qword ptr [eax + 8]   // V1 on top of row 1
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)          // rows 0 and 1 accumulated
  fld   dword ptr [edx + 32]
  fld   dword ptr [edx + 36]
  fld   dword ptr [edx + 40]
  fld   qword ptr [eax + 16]  // V2 on top of row 2
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  fstp  qword ptr [eax + 16]  // V2' = M[0,2]*V0 + M[1,2]*V1 + M[2,2]*V2
  fstp  qword ptr [eax + 8]   // V1' = M[0,1]*V0 + M[1,1]*V1 + M[2,1]*V2
  fstp  qword ptr [eax]       // V0' = M[0,0]*V0 + M[1,0]*V1 + M[2,0]*V2
end;
// In place transform of the double vector at V by the single-precision
// matrix at M, whose rows are read at a 16-byte stride:
//   Vr' = M[r,0]*V0 + M[r,1]*V1 + M[r,2]*V2   for r = 0..2
procedure RotateVectorReverseS(V: TPVec3D; M: TPSMatrix3);
asm
  fld   dword ptr [edx]
  fld   dword ptr [edx + 16]
  fld   dword ptr [edx + 32]  // M[2,0], M[1,0], M[0,0]
  fld   qword ptr [eax]       // V0 on top
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)          // partial sums for V2', V1', V0'
  fld   dword ptr [edx + 4]
  fld   dword ptr [edx + 20]
  fld   dword ptr [edx + 36]
  fld   qword ptr [eax + 8]   // V1 on top of column 1
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  fld   dword ptr [edx + 8]
  fld   dword ptr [edx + 24]
  fld   dword ptr [edx + 40]
  fld   qword ptr [eax + 16]  // V2 on top of column 2
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)          // V2', V1', V0'
  fstp  qword ptr [eax + 16]
  fstp  qword ptr [eax + 8]
  fstp  qword ptr [eax]
end;
// In place transform of the single vector at V by the single-precision
// matrix at M, whose rows are read at a 16-byte stride:
//   Vr' = M[r,0]*V0 + M[r,1]*V1 + M[r,2]*V2   for r = 0..2
procedure RotateSVectorReverseS(V: TPSVec; M: TPSMatrix3);
asm
  fld   dword ptr [edx]
  fld   dword ptr [edx + 16]
  fld   dword ptr [edx + 32]  // M[2,0], M[1,0], M[0,0]
  fld   dword ptr [eax]       // V0 on top
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)          // V0*M[2,0], V0*M[1,0], V0*M[0,0]
  fld   dword ptr [edx + 4]
  fld   dword ptr [edx + 20]
  fld   dword ptr [edx + 36]
  fld   dword ptr [eax + 4]   // V1 on top of column 1
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)          // columns 0 and 1 accumulated
  fld   dword ptr [edx + 8]
  fld   dword ptr [edx + 24]
  fld   dword ptr [edx + 40]
  fld   dword ptr [eax + 8]   // V2 on top of column 2
  fmul  st(1), st(0)
  fmul  st(2), st(0)
  fmulp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  faddp st(3), st(0)
  fstp  dword ptr [eax + 8]   // V2' = V0*M[2,0] + V1*M[2,1] + V2*M[2,2]
  fstp  dword ptr [eax + 4]   // V1' = V0*M[1,0] + V1*M[1,1] + V2*M[1,2]
  fstp  dword ptr [eax]       // V0' = V0*M[0,0] + V1*M[0,1] + V2*M[0,2]
end;
// Result[i] := V1[i] + V2[i] for i = 0..2; Result[3] := 0.
function AddSVectors(const V1, V2: TSVec): TSVec; overload;
asm
  fld   dword ptr [eax]
  fadd  dword ptr [edx]
  fstp  dword ptr [ecx]
  fld   dword ptr [eax + 4]
  fadd  dword ptr [edx + 4]
  fstp  dword ptr [ecx + 4]
  fld   dword ptr [eax + 8]
  fadd  dword ptr [edx + 8]
  fstp  dword ptr [ecx + 8]
  xor   eax, eax
  mov   [ecx + 12], eax
end;
// In place: V1[i] := V1[i] + V2[i] for i = 0..2. V1[3] is left untouched.
procedure AddSVectors(V1: TPSVec; const V2: TSVec); overload;
asm
  fld   dword ptr [eax]
  fadd  dword ptr [edx]
  fstp  dword ptr [eax]
  fld   dword ptr [eax + 4]
  fadd  dword ptr [edx + 4]
  fstp  dword ptr [eax + 4]
  fld   dword ptr [eax + 8]
  fadd  dword ptr [edx + 8]
  fstp  dword ptr [eax + 8]
end;
// Result[i] := clamp(1 - sv[i] * s1d255, 0, 1) for i = 0..2; Result[3] := 0.
// sv is read through eax and the result is written through edx.
// The original opened with a compare against SupportSSE2 followed by a
// jump to the very next instruction; that no-op has been removed.
function MakeSVecMultiplierFromDynFogCol(sv: TSVec): TSVec; //not used
asm
  fld   dword ptr [eax]
  fld   dword ptr [eax + 4]
  fld   dword ptr [eax + 8]
  fld   s1d255
  fmul  st(3), st(0)
  fmul  st(2), st(0)
  fmulp st(1), st(0)          // c2, c1, c0 (scaled by s1d255)
  fld1
  fsubr st(3), st(0)
  fsubr st(2), st(0)
  fsubr st(1), st(0)          // 1, 1-c2, 1-c1, 1-c0
  fxch  st(3)                 // vs0, vs2, vs1, 1
  // component 0: lower bound, then upper bound against the 1 in st(3)
  ftst
  fnstsw ax
  shr   ah, 1                 // CF := C0 (value < 0)
  jnc   @up1
  fstp  st(0)
  fldz
@up1:
  fcom  st(3)
  fnstsw ax
  shr   ah, 1                 // CF := C0 (value < 1)
  jc    @skip1
  fstp  st(0)
  fld1
@skip1:
  fstp  dword ptr [edx]       // vs2, vs1, 1
  // component 2
  ftst
  fnstsw ax
  shr   ah, 1
  jnc   @up2
  fstp  st(0)
  fldz
@up2:
  fcom  st(2)
  fnstsw ax
  shr   ah, 1
  jc    @skip2
  fstp  st(0)
  fld1
@skip2:
  fstp  dword ptr [edx + 8]   // vs1, 1
  // component 1
  ftst
  fnstsw ax
  shr   ah, 1
  jnc   @up3
  fstp  st(0)
  fldz
@up3:
  fcom  st(1)
  fnstsw ax
  shr   ah, 1
  jc    @skip3
  fstp  st(0)
  fld1
@skip3:
  fstp  dword ptr [edx + 4]
  fstp  st(0)                 // discard the constant 1
  xor   eax, eax
  mov   dword ptr [edx + 12], eax
end;
// In place: V1[i] := V1[i] + V2[i] * W.
// The SSE path updates all four lanes; the x87 path updates i = 0..2.
procedure AddSVecWeightS(V1, V2: TPSVec; const W: Single); overload;
asm
  cmp   SupportSSE, 0
  jz    @@fpu
  movss  xmm2, [ebp + 8]
  shufps xmm2, xmm2, 0        // broadcast W
  movups xmm0, [edx]
  movups xmm1, [eax]
  mulps  xmm0, xmm2
  addps  xmm0, xmm1
  movups [eax], xmm0
  pop   ebp                   // hand-written exit: undo the frame, drop W
  ret   4
@@fpu:
  fld   dword ptr [edx]
  fld   dword ptr [edx + 4]
  fld   dword ptr [edx + 8]
  fld   dword ptr [ebp + 8]   // W, V2[2], V2[1], V2[0]
  fmul  st(3), st(0)
  fmul  st(2), st(0)
  fmulp st(1), st(0)          // weighted V2[2], V2[1], V2[0]
  fadd  dword ptr [eax + 8]
  fstp  dword ptr [eax + 8]
  fadd  dword ptr [eax + 4]
  fstp  dword ptr [eax + 4]
  fadd  dword ptr [eax]
  fstp  dword ptr [eax]
end; //ret 4
// In place: V1[i] := V1[i] + V2[i] * W.
// The SSE path updates all four lanes; the x87 path updates i = 0..2.
procedure AddSVecWeightS(var V1: TSVec; const V2: TSVec; const W: Single); overload;
asm
  cmp   SupportSSE, 0
  jz    @@fpu
  movss  xmm2, [ebp + 8]
  shufps xmm2, xmm2, 0        // broadcast W
  movups xmm0, [edx]
  movups xmm1, [eax]
  mulps  xmm0, xmm2
  addps  xmm0, xmm1
  movups [eax], xmm0
  pop   ebp                   // hand-written exit: undo the frame, drop W
  ret   4
@@fpu:
  fld   dword ptr [edx]
  fld   dword ptr [edx + 4]
  fld   dword ptr [edx + 8]
  fld   dword ptr [ebp + 8]   // W, V2[2], V2[1], V2[0]
  fmul  st(3), st(0)
  fmul  st(2), st(0)
  fmulp st(1), st(0)          // weighted V2[2], V2[1], V2[0]
  fadd  dword ptr [eax + 8]
  fstp  dword ptr [eax + 8]
  fadd  dword ptr [eax + 4]
  fstp  dword ptr [eax + 4]
  fadd  dword ptr [eax]
  fstp  dword ptr [eax]
end;
// Result := V1[0]*V2[0] + V1[1]*V2[1] + V1[2]*V2[2]
function DotOfSVectors(const V1, V2: TSVec): Single;
asm
  fld   dword ptr [eax]
  fmul  dword ptr [edx]
  fld   dword ptr [eax + 4]
  fmul  dword ptr [edx + 4]
  faddp st(1), st(0)
  fld   dword ptr [eax + 8]
  fmul  dword ptr [edx + 8]
  faddp st(1), st(0)
end;
// Result[i] := Single(V1[i] - V2[i]) for i = 0..2; Result[3] := 0.
// The inputs are doubles, the result is stored as singles.
function SubtractVectors2s(const V1, V2: TVec3D): TSVec;
asm
  fld   qword ptr [eax]
  fsub  qword ptr [edx]
  fstp  dword ptr [ecx]
  fld   qword ptr [eax + 8]
  fsub  qword ptr [edx + 8]
  fstp  dword ptr [ecx + 4]
  fld   qword ptr [eax + 16]
  fsub  qword ptr [edx + 16]
  fstp  dword ptr [ecx + 8]
  xor   eax, eax
  mov   [ecx + 12], eax
end;
// Result[i] := V1[i] - V2[i] for i = 0..2 (doubles).
function SubtractVectors(const V1, V2: TVec3D): TVec3D; overload;
asm
  fld   qword ptr [eax]
  fsub  qword ptr [edx]
  fstp  qword ptr [ecx]
  fld   qword ptr [eax + 8]
  fsub  qword ptr [edx + 8]
  fstp  qword ptr [ecx + 8]
  fld   qword ptr [eax + 16]
  fsub  qword ptr [edx + 16]
  fstp  qword ptr [ecx + 16]
end;
// Result[i] := V1[i] - V2[i] for i = 0..2 (doubles).
// Registers used: eax = V1, edx = V2, ecx = Result.
function SubtractVectors(V1: TPVec3D; const V2: TVec3D): TVec3D; overload;
asm
  fld   qword ptr [eax]
  fsub  qword ptr [edx]
  fstp  qword ptr [ecx]
  fld   qword ptr [eax + 8]
  fsub  qword ptr [edx + 8]
  fstp  qword ptr [ecx + 8]
  fld   qword ptr [eax + 16]
  fsub  qword ptr [edx + 16]
  fstp  qword ptr [ecx + 16]
end;
// Result[i] := V1[i] - V2[i] for i = 0..2 (doubles).
function SubtractVectors(const V1: TVec3D; V2: TPVec3D): TVec3D; overload;
asm
  fld   qword ptr [eax]
  fsub  qword ptr [edx]
  fstp  qword ptr [ecx]
  fld   qword ptr [eax + 8]
  fsub  qword ptr [edx + 8]
  fstp  qword ptr [ecx + 8]
  fld   qword ptr [eax + 16]
  fsub  qword ptr [edx + 16]
  fstp  qword ptr [ecx + 16]
end;
// Result[i] := V1[i] - V2[i] for i = 0..2 (singles); Result[3] := 0.
function SubtractSVectors(V1: TPSVec; const V2: TSVec): TSVec;
asm
  fld   dword ptr [eax]
  fsub  dword ptr [edx]
  fstp  dword ptr [ecx]
  fld   dword ptr [eax + 4]
  fsub  dword ptr [edx + 4]
  fstp  dword ptr [ecx + 4]
  fld   dword ptr [eax + 8]
  fsub  dword ptr [edx + 8]
  fstp  dword ptr [ecx + 8]
  xor   eax, eax
  mov   [ecx + 12], eax
end;
// Result[i] := V1[i] + s for i = 0..2; Result[3] := 0.
// s is read from the stack frame; no locals are allocated, so
// [ebp + 8] addresses the same slot as [esp + 8].
function AddSVecS(const V1: TSVec; const s: Single): TSVec;
asm
  fld   dword ptr [eax]
  fld   dword ptr [eax + 4]
  fld   dword ptr [eax + 8]
  fld   dword ptr [ebp + 8]   // s, v2, v1, v0
  fadd  st(3), st(0)
  fadd  st(2), st(0)
  faddp st(1), st(0)          // v2+s, v1+s, v0+s
  fstp  dword ptr [edx + 8]
  fstp  dword ptr [edx + 4]
  fstp  dword ptr [edx]
  xor   eax, eax
  mov   [edx + 12], eax
end;
// In place: V1[i] := V1[i] * s for i = 0..2. V1[3] is left untouched.
// s is read from the stack frame; no locals are allocated, so
// [ebp + 8] addresses the same slot as [esp + 8].
procedure ScaleSVectorV(V1: TPSVec; const s: Single);
asm
  fld   dword ptr [eax]
  fld   dword ptr [eax + 4]
  fld   dword ptr [eax + 8]
  fld   dword ptr [ebp + 8]   // s, v2, v1, v0
  fmul  st(3), st(0)
  fmul  st(2), st(0)
  fmulp st(1), st(0)          // v2*s, v1*s, v0*s
  fstp  dword ptr [eax + 8]
  fstp  dword ptr [eax + 4]
  fstp  dword ptr [eax]
end;
function MultiplySVectors(const V1, V2: TSVec): TSVec; | |
// Component-wise product of the first three components: Result[i] = V1[i] * V2[i]; | |
// Result[3] is set to 0. eax = @V1, edx = @V2, ecx = @Result. | |
asm | |
// all three V1 components are loaded before the first store to Result | |
fld dword [eax + 8] | |
fld dword [eax + 4] | |
fld dword [eax] | |
fmul dword [edx] | |
fstp dword [ecx] | |
fmul dword [edx + 4] | |
fstp dword [ecx + 4] | |
fmul dword [edx + 8] | |
fstp dword [ecx + 8] | |
xor eax, eax | |
mov [ecx + 12], eax | |
end; | |
procedure MultiplySVectorsV(V1, V2: TPSVec); overload; | |
// In-place component-wise product: V1^[i] := V1^[i] * V2^[i] for i = 0..2. | |
// eax = V1, edx = V2. The dword at offset 12 of V1^ is not touched. | |
asm | |
// load all three V1 components first, then multiply/store one by one | |
fld dword [eax + 8] | |
fld dword [eax + 4] | |
fld dword [eax] | |
fmul dword [edx] | |
fstp dword [eax] | |
fmul dword [edx + 4] | |
fstp dword [eax + 4] | |
fmul dword [edx + 8] | |
fstp dword [eax + 8] | |
end; | |
procedure MultiplySVectorsV(V1: TPSVec; const V2: TSVec); overload; | |
// In-place component-wise product: V1^[i] := V1^[i] * V2[i] for i = 0..2. | |
// eax = V1, edx = @V2. The dword at offset 12 of V1^ is not touched. | |
asm | |
// load all three V1 components first, then multiply/store one by one | |
fld dword [eax + 8] | |
fld dword [eax + 4] | |
fld dword [eax] | |
fmul dword [edx] | |
fstp dword [eax] | |
fmul dword [edx + 4] | |
fstp dword [eax + 4] | |
fmul dword [edx + 8] | |
fstp dword [eax + 8] | |
end; | |
function ScaleSVector(const V1: TSVec; const s: Single): TSVec; | |
// Result[0..2] = V1[0..2] * s. eax = @V1, edx = @Result, s is read from [esp + 8]. | |
// NOTE(review): the fourth Result component ([edx + 12]) is not written by this | |
// routine, so it keeps whatever the caller's buffer held - confirm that is intended. | |
asm | |
fld dword [eax] | |
fld dword [eax + 4] | |
fld dword [eax + 8] | |
fld dword [esp + 8] | |
// FPU stack (top first): s, V1[2], V1[1], V1[0] | |
fmul st(3), st | |
fmul st(2), st | |
fmulp | |
fstp dword [edx + 8] | |
fstp dword [edx + 4] | |
fstp dword [edx] | |
end; | |
function ScaleSVector4(const V1: TSVec; const s: Single): TSVec; | |
// Returns all four components of V1 multiplied by s. | |
// eax = @V1, edx = @Result, s is read from [esp + 8]. | |
// Uses one packed SSE multiply when SupportSSE is non-zero, otherwise the x87 path. | |
// Fix: the SSE path used to store the product back into the const input ([eax]) | |
// and to leave through its own "ret 4"; it now writes Result ([edx]) like the x87 | |
// path and leaves through the routine's common exit at "end". | |
asm | |
cmp SupportSSE, 0 | |
jz @1 | |
movss xmm1, [esp + 8] // s in lane 0 | |
movups xmm0, [eax] // V1[0..3], no alignment assumed | |
shufps xmm1, xmm1, 0 // broadcast s to all four lanes | |
mulps xmm0, xmm1 | |
movups [edx], xmm0 // write Result, leave V1 untouched | |
jmp @done | |
@1: fld dword [eax] | |
fld dword [eax + 4] | |
fld dword [eax + 8] | |
fld dword [eax + 12] | |
fld dword [esp + 8] | |
// FPU stack (top first): s, V1[3], V1[2], V1[1], V1[0] | |
fmul st(4), st | |
fmul st(3), st | |
fmul st(2), st | |
fmulp | |
fstp dword [edx + 12] | |
fstp dword [edx + 8] | |
fstp dword [edx + 4] | |
fstp dword [edx] | |
@done: | |
end; | |
function ScaleSVectorD(V1: TPSVec; const d: Double): TSVec; | |
// Result[0..2] = V1^[0..2] * d, with d a Double read from [esp + 8]; Result[3] = 0. | |
// eax = V1, edx = @Result. Products are rounded to Single on store. | |
asm | |
fld dword [eax] | |
fld dword [eax + 4] | |
fld dword [eax + 8] | |
fld qword [esp + 8] | |
// FPU stack (top first): d, V1[2], V1[1], V1[0] | |
fmul st(3), st | |
fmul st(2), st | |
fmulp | |
fstp dword [edx + 8] | |
fstp dword [edx + 4] | |
fstp dword [edx] | |
xor eax, eax | |
mov [edx + 12], eax | |
end; | |
procedure BuildViewVectorDFOV(var xa, ya: Double; v: TPVec3D); | |
// Builds a unit-length double-precision vector from two angles. | |
// eax = @xa (called X in the comments), edx = @ya (called Y), ecx = v. | |
// Before normalisation the vector is (-sinY, sinX, cosX*cosY); it is then multiplied | |
// by 1/sqrt(sum of squares) and stored as v[0] = -sinY, v[1] = sinX, v[2] = cosX*cosY. | |
asm // -sinY, sinX, cosX*cosY ...pano: sinX*cosY, sinY, -cosX*cosY | |
fld qword [eax] | |
fsincos //cosX,sinX | |
fld qword [edx] | |
fsincos //cosY,sinY,cosX,sinX | |
fmulp st(2), st(0) //sinY,cosX*cosY,sinX | |
fchs | |
// accumulate the squared length of the three values on top of the stack | |
fld st(0) //normalize | |
fmul st(0), st(1) | |
fld st(2) | |
fmul st(0), st(3) | |
faddp | |
fld st(3) | |
fmul st(0), st(4) | |
faddp | |
fsqrt | |
// st(0) := 1 / length | |
fld1 | |
fdivrp | |
// scale all three values by 1 / length; fmulp also pops the factor | |
fmul st(3), st(0) | |
fmul st(2), st(0) | |
fmulp | |
fstp qword [ecx] //cosX*cosY,sinX | |
fstp qword [ecx + 16] //sinX | |
fstp qword [ecx + 8] | |
end; | |
procedure BuildViewVectorDSphereFOV(var xa, ya: Double; v: TPVec3D); | |
// Builds a double-precision direction vector from two angles without an explicit | |
// normalisation step. eax = @xa (X in the comments), edx = @ya (Y), ecx = v. | |
// Stores v[0] = -(sinY*cosX), v[1] = sinX, v[2] = cosX*cosY. | |
asm //x<->y | |
fld qword [edx] | |
fsincos //cosY,sinY | |
fld qword [eax] | |
fsincos //cosX,sinX,cosY,sinY | |
fmul st(2), st(0) //cosX,sinX,cosX*cosY,sinY // pano: sinX*cosY, sinY, cosX*cosY | |
fmulp st(3), st(0) //sinX,cosX*cosY,sinY*cosX | |
fstp qword [ecx + 8] //cosX*cosY,sinX*cosY | |
fstp qword [ecx + 16] | |
fchs | |
fstp qword [ecx] | |
end; | |
procedure BuildViewVectorSphereFOV(var xa, ya: Double; v: TPSVec); | |
// Single-precision output variant: builds a direction vector from two angles. | |
// eax = @xa, edx = @ya, ecx = v. Note the inline comments swap the X/Y names: | |
// there "X" is the angle at [edx] and "Y" the angle at [eax]. | |
// In terms of the parameters: v[0] = -(sin(ya)*cos(xa)), v[1] = sin(xa), | |
// v[2] = cos(xa)*cos(ya), v[3] = 0. | |
asm | |
fld qword [edx] | |
fsincos //cosX,sinX X<->Y | |
fld qword [eax] | |
fsincos //cosY,sinY,cosX,sinX | |
fmul st(2), st(0) //cosY,sinY,cosX*cosY,sinX // pano: sinX*cosY, sinY, cosX*cosY | |
fmulp st(3), st(0) //sinY,cosX*cosY,sinX*cosY | |
fstp dword [ecx + 4] //cosX*cosY,sinX*cosY | |
fstp dword [ecx + 8] | |
fchs | |
fstp dword [ecx] | |
fldz | |
fstp dword [ecx + 12] | |
end; | |
procedure BuildViewVectorFOV(var xa, ya: Double; v: TPSVec); | |
// Single-precision output variant of the normalised view vector. | |
// eax = @xa (X in the comments), edx = @ya (Y), ecx = v. | |
// The vector (-sinY, sinX, cosX*cosY) is scaled by 1/sqrt(sum of squares) and stored | |
// as v[0] = -sinY, v[1] = sinX, v[2] = cosX*cosY, v[3] = 0. | |
asm // -sinY, sinX, cosX*cosY | |
fld qword [eax] | |
fsincos //cosX,sinX | |
fld qword [edx] | |
fsincos //cosY,sinY,cosX,sinX | |
fmulp st(2), st(0) //sinY,cosX*cosY,sinX | |
fchs //x,z,y | |
// accumulate the squared length of the three values on top of the stack | |
fld st(0) //normalize | |
fmul st(0), st(1) | |
fld st(2) | |
fmul st(0), st(3) | |
faddp | |
fld st(3) | |
fmul st(0), st(4) | |
faddp | |
fsqrt | |
// st(0) := 1 / length | |
fld1 | |
fdivrp | |
// scale the three values; the last multiply also pops the factor | |
fmul st(1), st(0) | |
fmul st(2), st(0) | |
fmulp st(3), st(0) | |
fstp dword [ecx] //cosX*cosY,sinX | |
fstp dword [ecx + 8] //sinX | |
fstp dword [ecx + 4] | |
fldz | |
fstp dword [ecx + 12] | |
end; | |
procedure SVectorChangeSign(V1: TPSVec); | |
// Negates the first three Single components of V1^ in place by toggling bit 31 | |
// of each dword (its IEEE sign bit). eax = V1. The dword at offset 12 is untouched. | |
asm | |
// bit 31 of a little-endian dword is bit 7 of its highest byte | |
xor byte ptr [eax + 3], $80 | |
xor byte ptr [eax + 7], $80 | |
xor byte ptr [eax + 11], $80 | |
end; | |
procedure mAddVecWeight(V1, V2: TPVec3D; const W: Double); | |
// V1^ := V1^ + V2^ * W for three doubles. eax = V1, edx = V2, W is read from [ebp + 8]. | |
// Takes the SSE2 path when SupportSSE2 is non-zero, otherwise the x87 path at @@1. | |
asm | |
cmp SupportSSE2, 0 | |
jz @@1 | |
// SSE2: components 0/1 as a packed pair, component 2 as a scalar | |
movlpd xmm1, [ebp + 8] | |
movupd xmm2, [edx] | |
unpcklpd xmm1, xmm1 | |
movupd xmm0, [eax] | |
mulpd xmm2, xmm1 | |
mulsd xmm1, [edx + 16] | |
addpd xmm0, xmm2 | |
addsd xmm1, [eax + 16] | |
movupd [eax], xmm0 | |
movsd [eax + 16], xmm1 | |
// early exit: restore ebp and drop the 8-byte W argument by hand | |
pop ebp | |
ret 8 | |
@@1: | |
fld qword [edx] | |
fld qword [edx + 8] | |
fld qword [edx + 16] | |
fld qword [ebp + 8] | |
// FPU stack (top first): W, V2[2], V2[1], V2[0] | |
fmul st(3), st | |
fmul st(2), st | |
fmulp | |
fadd qword [eax + 16] | |
fstp qword [eax + 16] | |
fadd qword [eax + 8] | |
fstp qword [eax + 8] | |
fadd qword [eax] | |
fstp qword [eax] | |
end; | |
procedure mCopyAddVecWeight(V1, V2, V3: TPVec3D; const W: Double); | |
// V1^ := V2^ + V3^ * W for three doubles. eax = V1 (destination), edx = V2 (source), | |
// ecx = V3 (weighted addend), W is read from [ebp + 8]. | |
// Takes the SSE2 path when SupportSSE2 is non-zero, otherwise the x87 path at @@1. | |
asm //dest,src,add weight | |
cmp SupportSSE2, 0 | |
jz @@1 | |
// SSE2: components 0/1 as a packed pair, component 2 as a scalar | |
movlpd xmm1, [ebp + 8] | |
movupd xmm2, [ecx] | |
unpcklpd xmm1, xmm1 | |
movupd xmm0, [edx] | |
mulpd xmm2, xmm1 | |
mulsd xmm1, [ecx + 16] | |
addpd xmm0, xmm2 | |
addsd xmm1, [edx + 16] | |
movupd [eax], xmm0 | |
movsd [eax + 16], xmm1 | |
// early exit: restore ebp and drop the 8-byte W argument by hand | |
pop ebp | |
ret 8 | |
@@1: | |
fld qword [ecx] | |
fld qword [ecx + 8] | |
fld qword [ecx + 16] | |
fld qword [ebp + 8] | |
// FPU stack (top first): W, V3[2], V3[1], V3[0] | |
fmul st(3), st(0) | |
fmul st(2), st(0) | |
fmulp | |
fadd qword [edx + 16] | |
fstp qword [eax + 16] | |
fadd qword [edx + 8] | |
fstp qword [eax + 8] | |
fadd qword [edx] | |
fstp qword [eax] | |
end; | |
procedure mCopyVec(Vd, Vs: TPVec3D); | |
// Copies three doubles from Vs^ to Vd^ through the x87 stack. eax = Vd, edx = Vs. | |
// All three values are read before the first one is written. | |
asm | |
// push source components in ascending order ... | |
fld qword [edx] | |
fld qword [edx + 8] | |
fld qword [edx + 16] | |
// ... and pop them into the destination in descending order | |
fstp qword [eax + 16] | |
fstp qword [eax + 8] | |
fstp qword [eax] | |
end; | |
procedure CopyVecSSE2(V1, V2: TPVec3D); //not used | |
// Copies three doubles from V2^ to V1^ with SSE2. eax = V1 (dest), edx = V2 (source). | |
// Both loads are issued before either store. | |
asm | |
movlpd xmm1, [edx + 16] // third double | |
movupd xmm0, [edx] // first two doubles, unaligned | |
movlpd [eax + 16], xmm1 | |
movupd [eax], xmm0 | |
end; | |
procedure CopyVec4SSE2(V1, V2: TPVec4D); | |
// Copies four doubles (32 bytes) from V2^ to V1^ with two unaligned SSE2 moves. | |
// eax = V1 (dest), edx = V2 (source). Both loads are issued before either store. | |
asm | |
movupd xmm1, [edx + 16] // upper pair | |
movupd xmm0, [edx] // lower pair | |
movupd [eax + 16], xmm1 | |
movupd [eax], xmm0 | |
end; | |
procedure AddSubVecWeightSSE2(V1, V2, V3: TPVec3D; const W: Double); | |
// V1^ := V1^ + (V2^ - V3^) * W for three doubles, SSE2 only (no capability check here). | |
// eax = V1, edx = V2, ecx = V3, W is read from [ebp + 8]. | |
// Uses xmm0, xmm1, xmm2, xmm4 and xmm7 as scratch. | |
asm | |
// xmm7 := (W, W) | |
movlpd xmm7, [ebp + 8] | |
movhpd xmm7, [ebp + 8] | |
movupd xmm2, [ecx] | |
movupd xmm0, [edx] | |
movlpd xmm1, [edx + 16] | |
subpd xmm0, xmm2 | |
subsd xmm1, [ecx + 16] | |
movupd xmm4, [eax] | |
mulpd xmm0, xmm7 | |
mulsd xmm1, xmm7 | |
addpd xmm0, xmm4 | |
addsd xmm1, [eax + 16] | |
movupd [eax], xmm0 | |
movlpd [eax + 16], xmm1 | |
end; | |
function MaxCS(s1, s2: Single): Single; | |
// Returns the larger of the two Single arguments in st(0). | |
// Going by the label name, [ebp + 8] holds s2 and [ebp + 12] holds s1. | |
asm | |
fld dword [ebp + 8] | |
fcomp dword [ebp + 12] | |
// shr ah, 1 moves status bit C0 (set when the loaded value was below the operand) into CF | |
fnstsw ax | |
shr ah, 1 | |
jc @S2isSmallerThanS1 | |
fld dword [ebp + 8] | |
jmp @end | |
@S2isSmallerThanS1: | |
fld dword [ebp + 12] | |
@end: | |
end; | |
function Max0S(s: Single): Single; | |
// Returns s when it is not below zero, otherwise 0.0; result is left in st(0). | |
asm | |
fld dword [ebp + 8] | |
ftst | |
// C0 -> CF: set when s < 0 | |
fnstsw ax | |
shr ah, 1 | |
jnc @@1 | |
// negative: discard s and return 0 instead | |
fstp st | |
fldz | |
@@1: | |
end; | |
function MinCS(const s1, s2: Single): Single; | |
// Returns the smaller of the two Single arguments in st(0). | |
// Going by the label name, [ebp + 8] holds s2 and [ebp + 12] holds s1. | |
asm | |
fld dword [ebp + 8] | |
fcomp dword [ebp + 12] | |
// C0 -> CF: set when [ebp + 8] < [ebp + 12] | |
fnstsw ax | |
shr ah, 1 | |
jc @S2isSmallerThanS1 | |
fld dword [ebp + 12] | |
jmp @end | |
@S2isSmallerThanS1: | |
fld dword [ebp + 8] | |
@end: | |
end; | |
procedure MinMaxSvar(const smin, smax: Single; var s: Single); | |
// Clamps s in place: values below the bound at [ebp + 12] are raised to it, values | |
// not below the bound at [ebp + 8] are set to it. eax = @s. | |
// Presumably [ebp + 12] is smin and [ebp + 8] is smax (matches the max/min order). | |
// Takes the SSE path when SupportSSE is non-zero, otherwise the x87 path at @@1. | |
asm | |
cmp SupportSSE, 0 | |
jz @@1 | |
movss xmm0, [eax] | |
maxss xmm0, [ebp + 12] | |
minss xmm0, [ebp + 8] | |
movss [eax], xmm0 | |
// early exit: restore ebp and drop the two stack arguments by hand | |
pop ebp | |
ret 8 | |
@@1: | |
// keep the pointer in edx because fnstsw overwrites ax | |
mov edx, eax | |
fld dword [eax] | |
fcom dword [ebp + 12] | |
fnstsw ax | |
shr ah, 1 | |
// despite the label name, this branch is taken when s is below [ebp + 12] | |
jc @SminIsSmallerThanS | |
fcom dword [ebp + 8] | |
fnstsw ax | |
shr ah, 1 | |
jc @end | |
fstp st(0) | |
fld dword [ebp + 8] | |
jmp @end | |
@SminIsSmallerThanS: | |
fstp st(0) | |
fld dword [ebp + 12] | |
@end: | |
fstp dword [edx] | |
end; | |
function MinMaxCS(const smin, s, smax: Single): Single; | |
// Returns the value at [ebp + 12] clamped between the bounds at [ebp + 16] (lower) | |
// and [ebp + 8] (upper); result is left in st(0). | |
// Presumably these are s, smin and smax respectively. | |
asm | |
fld dword [ebp + 12] | |
fcom dword [ebp + 16] | |
fnstsw ax | |
shr ah, 1 | |
// despite the label name, this branch is taken when the value is below [ebp + 16] | |
jc @SminIsSmallerThanS | |
fcom dword [ebp + 8] | |
fnstsw ax | |
shr ah, 1 | |
jc @end | |
fstp st(0) | |
fld dword [ebp + 8] | |
jmp @end | |
@SminIsSmallerThanS: | |
fstp st(0) | |
fld dword [ebp + 16] | |
@end: | |
end; | |
function Min0MaxCS(const s, smax: Single): Single; | |
// Returns the value at [ebp + 12] clamped to the range 0 .. [ebp + 8]; result in st(0). | |
// Presumably [ebp + 12] is s and [ebp + 8] is smax. | |
asm | |
fld dword [ebp + 12] | |
ftst | |
fnstsw ax | |
shr ah, 1 | |
// taken when the value is below zero | |
jc @SminIsSmallerThanS | |
fcom dword [ebp + 8] | |
fnstsw ax | |
shr ah, 1 | |
jc @end | |
fstp st(0) | |
fld dword [ebp + 8] | |
jmp @end | |
@SminIsSmallerThanS: | |
fstp st(0) | |
fldz | |
@end: | |
end; | |
procedure MaxCDvar(var ds, ddest: Double); | |
// ddest := ds unless ds is below ddest, i.e. ddest ends up holding the larger value. | |
// eax = @ds, edx = @ddest. | |
asm | |
fld qword [eax] | |
fcom qword [edx] | |
// C0 -> CF: set when ds < ddest (ax is overwritten, the pointer in edx stays valid) | |
fnstsw ax | |
shr ah, 1 | |
jc @@1 | |
fstp qword [edx] | |
ret | |
@@1: | |
// ds is smaller: just drop it from the FPU stack | |
fstp st | |
end; | |
procedure Clamp1Svar(var s: Single); | |
// Caps s at 1.0 in place: s is overwritten with 1.0 only when 1.0 < s. eax = @s. | |
asm | |
fld1 | |
// keep the pointer in edx because fnstsw overwrites ax | |
mov edx, eax | |
fcom dword [eax] | |
fnstsw ax | |
shr ah, 1 | |
// CF set means 1.0 < s | |
jnc @@1 | |
fstp dword [edx] | |
ret | |
@@1: | |
fstp st | |
end; | |
function Min0MaxCD(const d, dmax: Double): Double; | |
// Returns the Double at [ebp + 16] clamped to the range 0 .. [ebp + 8]; result in st(0). | |
// Presumably [ebp + 16] is d and [ebp + 8] is dmax. | |
asm | |
fld qword [ebp + 16] | |
ftst | |
fnstsw ax | |
shr ah, 1 | |
// taken when the value is below zero | |
jc @@1 | |
fcom qword [ebp + 8] | |
fnstsw ax | |
shr ah, 1 | |
jc @end | |
fstp st(0) | |
fld qword [ebp + 8] | |
jmp @end | |
@@1: | |
fstp st(0) | |
fldz | |
@end: | |
end; | |
function MinCD(const s1, s2: Double): Double; | |
// Returns the smaller of the two Double arguments in st(0). | |
// Going by the label name, [ebp + 8] holds s2 and [ebp + 16] holds s1. | |
asm | |
fld qword [ebp + 8] | |
fcomp qword [ebp + 16] | |
// C0 -> CF: set when [ebp + 8] < [ebp + 16] | |
fnstsw ax | |
shr ah, 1 | |
jc @S2isSmallerThanS1 | |
fld qword [ebp + 16] | |
jmp @end | |
@S2isSmallerThanS1: | |
fld qword [ebp + 8] | |
@end: | |
end; | |
function MaxCD(const s1, s2: Double): Double; | |
// Returns the larger of the two Double arguments in st(0). | |
// Going by the label name, [ebp + 8] holds s2 and [ebp + 16] holds s1. | |
asm | |
fld qword [ebp + 8] | |
fcomp qword [ebp + 16] | |
// C0 -> CF: set when [ebp + 8] < [ebp + 16] | |
fnstsw ax | |
shr ah, 1 | |
jc @S2isSmallerThanS1 | |
fld qword [ebp + 8] | |
jmp @end | |
@S2isSmallerThanS1: | |
fld qword [ebp + 16] | |
@end: | |
end; | |
function MaxAbsCD(const s1, s2: Double): Double; | |
// Returns, with its original sign, whichever argument has the larger absolute value. | |
// Going by the label name, [ebp + 8] holds s2 and [ebp + 16] holds s1. | |
asm | |
fld qword [ebp + 16] | |
fabs | |
fld qword [ebp + 8] | |
fabs | |
// compares |[ebp + 8]| with |[ebp + 16]| and pops both temporaries | |
fcompp | |
fnstsw ax | |
shr ah, 1 | |
jc @S2isSmallerThanS1 | |
fld qword [ebp + 8] | |
jmp @end | |
@S2isSmallerThanS1: | |
fld qword [ebp + 16] | |
@end: | |
end; | |
function MinAbsCD(const s1, s2: Double): Double; | |
// Returns, with its original sign, whichever argument has the smaller absolute value. | |
// Going by the label name, [ebp + 8] holds s2 and [ebp + 16] holds s1. | |
asm | |
fld qword [ebp + 16] | |
fabs | |
fld qword [ebp + 8] | |
fabs | |
// compares |[ebp + 8]| with |[ebp + 16]| and pops both temporaries | |
fcompp | |
fnstsw ax | |
shr ah, 1 | |
jc @S2isSmallerThanS1 | |
fld qword [ebp + 16] | |
jmp @end | |
@S2isSmallerThanS1: | |
fld qword [ebp + 8] | |
@end: | |
end; | |
procedure SinCosD(const a: Double; var Sin, Cos: Double); | |
// Computes sine and cosine of a with a single fsincos. | |
// The cosine (top of the FPU stack) is stored through edx, the sine through eax. | |
asm | |
fld a | |
fsincos | |
fstp qword ptr [edx] // Cos | |
fstp qword ptr [eax] // Sin | |
end; | |
procedure SinCosS(const a: Double; var Sin, Cos: Single); | |
// Computes sine and cosine of the Double a with a single fsincos and stores both | |
// rounded to Single: cosine through edx, sine through eax. | |
asm | |
fld a | |
fsincos | |
fstp dword ptr [edx] // Cos | |
fstp dword ptr [eax] // Sin | |
end; | |
function FracSingle(const s: Single): Single; | |
// Returns s minus s rounded toward zero (the fractional part, keeping the sign of s). | |
// The FPU control word is switched to truncation for the frndint and restored afterwards. | |
asm | |
fld s //ebp+8 | |
fld st(0) | |
// 4 bytes of stack scratch: [esp] = saved control word, [esp + 2] = modified copy | |
sub esp, 4 | |
fnstcw [esp].word // save | |
fnstcw [esp + 2].word // scratch | |
or [esp + 2].word, $0F00 // trunc toward zero, full precision | |
fldcw [esp + 2].word | |
frndint | |
fldcw [esp].word | |
add esp, 4 | |
// st(1) := s - trunc(s), then pop | |
fsubp | |
end; | |
function MonitorComponent(Component: TComponent): Boolean; | |
// ... | |
// Fragment: copies the dword at [ebp + 4] into Addr (declared in the elided part). | |
// NOTE(review): with a standard ebp frame [ebp + 4] is the routine's return address - confirm. | |
asm | |
mov eax,[ebp+4] | |
mov Addr,eax | |
end; | |
// ... | |
constructor TMonitorObject.Create; | |
// ... | |
// Fragment: copies the dword at [ebp + 4] into Addr (declared in the elided part). | |
// NOTE(review): with a standard ebp frame [ebp + 4] is the routine's return address - confirm. | |
asm | |
mov eax,[ebp+4] | |
mov Addr,eax | |
end; | |
// ... | |
procedure GetMem(var P; Size: Integer); | |
// ... | |
// Fragment: copies the dword at [ebp + 4] into Addr (declared in the elided part). | |
// NOTE(review): with a standard ebp frame [ebp + 4] is the routine's return address - confirm. | |
asm | |
mov eax,[ebp+4] | |
mov Addr,eax | |
end; | |
// ... | |
procedure BuildATlevels(MWidth, MHeight: Integer); | |
// ... | |
// Fragment 1: averages 16-bit words four at a time with MMX/pavgw. | |
// For each 8-byte group the sources are PATL2 - iStep2, PATL2 and PATL2 + iStep2 | |
// (byte offsets); the two outer ones are averaged first, then averaged with the | |
// middle one, and the result is written to PATL. Runs (MWidth - iStep2) shr 2 | |
// groups, adds 4 * that count to x2, and writes the advanced PATL / PATL2 back. | |
// NOTE(review): the loop is dec/jnz, so a group count of 0 would not terminate | |
// promptly - presumably callers guarantee MWidth - iStep2 >= 4. | |
asm | |
push eax | |
push ebx | |
push ecx | |
push esi | |
push edi | |
mov ebx, iStep2 | |
mov ecx, MWidth | |
mov esi, PATL2 | |
sub ecx, ebx | |
mov edi, PATL | |
shr ecx, 2 | |
sub esi, ebx | |
mov eax, ecx | |
// edi holds (dest - src) so that [edi + esi] addresses the destination | |
sub edi, esi | |
shl eax, 2 | |
add x2, eax | |
@ll: movq mm0, [esi] // calculate 4 words at once | |
pavgw mm0, [esi + ebx * 2] | |
pavgw mm0, [esi + ebx] | |
movq [edi + esi], mm0 | |
add esi, 8 | |
dec ecx | |
jnz @ll | |
add edi, esi | |
mov PATL, edi | |
add esi, ebx | |
mov PATL2, esi | |
pop edi | |
pop esi | |
pop ecx | |
pop ebx | |
pop eax | |
end; | |
// ... | |
// Fragment 2: same three-tap average, stepping by 2 * MWidth bytes per iteration, | |
// in three phases: iStep iterations where the first tap is replaced by W4tmp, | |
// then MHeight - iStep2 iterations with all three taps read from memory (skipped | |
// when that count is not positive), then iStep iterations where the last tap is | |
// replaced by W4tmp2. Tap spacing is MWidth2step bytes. | |
asm | |
push eax | |
push ebx | |
push ecx | |
push edx | |
push esi | |
push edi | |
movq mm1, W4tmp | |
mov ebx, MWidth2step | |
mov esi, PATL2 | |
mov edi, PATL | |
mov ecx, iStep | |
mov edx, MWidth | |
sub esi, ebx | |
add edx, edx | |
sub edi, esi | |
dec ecx | |
@l1: movq mm0, [esi + ebx * 2] | |
pavgw mm0, mm1 | |
pavgw mm0, [esi + ebx] | |
movq [edi + esi], mm0 | |
add esi, edx | |
dec ecx | |
jns @l1 | |
mov ecx, MHeight | |
sub ecx, iStep2 | |
dec ecx | |
js @u2 | |
@l2: movq mm0, [esi] | |
pavgw mm0, [esi + ebx * 2] | |
pavgw mm0, [esi + ebx] | |
movq [edi + esi], mm0 | |
add esi, edx | |
dec ecx | |
jns @l2 | |
@u2: | |
movq mm1, W4tmp2 | |
mov ecx, iStep | |
dec ecx | |
@l3: movq mm0, [esi] | |
pavgw mm0, mm1 | |
pavgw mm0, [esi + ebx] | |
movq [edi + esi], mm0 | |
add esi, edx | |
dec ecx | |
jns @l3 | |
pop edi | |
pop esi | |
pop edx | |
pop ecx | |
pop ebx | |
pop eax | |
end; | |
// ... | |
// Fragment 3: leaves MMX state so that x87 code can run again. | |
asm | |
emms | |
end; | |
// ... | |
procedure TAmbShadowCalc.Execute; | |
// ... | |
// Fragment: stores the current SSE control/status register (MXCSR) into x. | |
asm // stmxcsr i | |
stmxcsr x | |
end; // if i<>$1f80 then i:=0; //=8064 | |
// ... | |
function BuildATlevels(PsiLight, MWidth, MHeight: Integer; PATlevel: TPATlevel; var CorrMul: Single; var Zsub: Integer): Integer; | |
// ... | |
// Fragment 1: leaves MMX state so that x87 code can run again. | |
asm | |
emms | |
end; | |
// ... | |
// Fragment 2: averages 16-bit words four at a time with MMX/pavgw. | |
// For each 8-byte group the sources are PATL2 - iStep2, PATL2 and PATL2 + iStep2 | |
// (byte offsets); the two outer ones are averaged first, then averaged with the | |
// middle one, and the result is written to PATL. Runs (MWidth - iStep2) shr 2 | |
// groups, adds 4 * that count to x2, and writes the advanced PATL / PATL2 back. | |
// NOTE(review): the loop is dec/jnz, so a group count of 0 would not terminate | |
// promptly - presumably callers guarantee MWidth - iStep2 >= 4. | |
asm | |
push eax | |
push ebx | |
push ecx | |
push esi | |
push edi | |
mov ebx, iStep2 | |
mov ecx, MWidth | |
mov esi, PATL2 | |
sub ecx, ebx | |
mov edi, PATL | |
shr ecx, 2 | |
sub esi, ebx | |
mov eax, ecx | |
// edi holds (dest - src) so that [edi + esi] addresses the destination | |
sub edi, esi | |
shl eax, 2 | |
add x2, eax | |
@ll: movq mm0, [esi] // calculate 4 words at once | |
pavgw mm0, [esi + ebx * 2] | |
pavgw mm0, [esi + ebx] | |
movq [edi + esi], mm0 | |
add esi, 8 | |
dec ecx | |
jnz @ll | |
add edi, esi | |
mov PATL, edi | |
add esi, ebx | |
mov PATL2, esi | |
pop edi | |
pop esi | |
pop ecx | |
pop ebx | |
pop eax | |
end; | |
// ... | |
// Fragment 3: same three-tap average, stepping by 2 * MWidth bytes per iteration, | |
// in three phases: iStep iterations where the first tap is replaced by W4tmp, | |
// then MHeight - iStep2 iterations with all three taps read from memory (skipped | |
// when that count is not positive), then iStep iterations where the last tap is | |
// replaced by W4tmp2. Tap spacing is MWidth2step bytes. | |
asm | |
push eax | |
push ebx | |
push ecx | |
push edx | |
push esi | |
push edi | |
movq mm1, W4tmp | |
mov ebx, MWidth2step | |
mov esi, PATL2 | |
mov edi, PATL | |
mov ecx, iStep | |
mov edx, MWidth | |
sub esi, ebx | |
add edx, edx | |
sub edi, esi | |
dec ecx | |
@l1: movq mm0, [esi + ebx * 2] | |
pavgw mm0, mm1 | |
pavgw mm0, [esi + ebx] | |
movq [edi + esi], mm0 | |
add esi, edx | |
dec ecx | |
jns @l1 | |
mov ecx, MHeight | |
sub ecx, iStep2 | |
dec ecx | |
js @u2 | |
@l2: movq mm0, [esi] | |
pavgw mm0, [esi + ebx * 2] | |
pavgw mm0, [esi + ebx] | |
movq [edi + esi], mm0 | |
add esi, edx | |
dec ecx | |
jns @l2 | |
@u2: | |
movq mm1, W4tmp2 | |
mov ecx, iStep | |
dec ecx | |
@l3: movq mm0, [esi] | |
pavgw mm0, mm1 | |
pavgw mm0, [esi + ebx] | |
movq [edi + esi], mm0 | |
add esi, edx | |
dec ecx | |
jns @l3 | |
pop edi | |
pop esi | |
pop edx | |
pop ecx | |
pop ebx | |
pop eax | |
end; | |
// ... | |
// Fragment 4: leaves MMX state so that x87 code can run again. | |
asm | |
emms | |
end; | |
// ... | |
procedure TAmbShadowCalc.Execute; | |
// ... | |
// Fragment 1: stores the current SSE control/status register (MXCSR) into x. | |
asm | |
stmxcsr x //set roundingmode sse | |
end; // if i<>$1f80 then i:=0; //=8064 } | |
// ... | |
// Fragment 2: builds four floats d[i] = (word PATL[i] - zp4[i]) * rsqrt(RadS), | |
// caps each at sZRTLev (minps), then max-merges the 4-float vector into | |
// consecutive 16-byte entries of AngMaxArr4. iC = round(RM * rsqrt(RadS)); | |
// the first entry index is (iAngC - iC shr 1) and 31 and iC + 1 entries are updated. | |
// NOTE(review): the run is not wrapped at index 31 inside the loop - presumably | |
// AngMaxArr4 has spare entries past 31; confirm against its declaration. | |
asm | |
push eax | |
push ebx | |
push ecx | |
push esi | |
push edi | |
movss xmm7, RM | |
mov esi, PATL | |
lea edi, zp4 | |
cvtsi2ss xmm4, RadS | |
movss xmm5, sZRTLev //sZRT | |
rsqrtss xmm4, xmm4 | |
movzx eax, word [esi] | |
movzx ebx, word [esi + 2] | |
mulss xmm7, xmm4 | |
sub eax, [edi] | |
sub ebx, [edi + 4] | |
cvtss2si ecx, xmm7 //iC | |
shufps xmm4, xmm4, 0 //R1d | |
cvtsi2ss xmm0, eax | |
cvtsi2ss xmm1, ebx | |
movzx eax, word [esi + 4] | |
movzx ebx, word [esi + 6] | |
sub eax, [edi + 8] | |
sub ebx, [edi + 12] | |
cvtsi2ss xmm2, eax | |
cvtsi2ss xmm3, ebx | |
// pack the four scalars into xmm0 = (d0, d1, d2, d3) | |
shufps xmm0, xmm1, 0 | |
shufps xmm2, xmm3, 0 | |
shufps xmm5, xmm5, 0 | |
shufps xmm0, xmm2, $88 | |
mov eax, iAngC | |
mov ebx, ecx | |
shr ebx, 1 | |
sub eax, ebx | |
and eax, 31 | |
// eax * 2 * 8 = 16 bytes per AngMaxArr4 entry | |
add eax, eax | |
mulps xmm0, xmm4 | |
lea esi, [AngMaxArr4 + eax * 8] | |
minps xmm0, xmm5 | |
@ll: movups xmm1, [esi] | |
maxps xmm1, xmm0 | |
movups [esi], xmm1 | |
add esi, 16 | |
dec ecx | |
jns @ll | |
pop edi | |
pop esi | |
pop ecx | |
pop ebx | |
pop eax | |
end | |
// ... | |
// Fragment 3: same d[i] computation and sZRTLev cap, but max-merges into the | |
// single AngMaxArr4 entry at index iAngC (byte offset iAngC * 16). | |
asm | |
push eax | |
push ebx | |
push esi | |
push edi | |
mov esi, PATL | |
lea edi, zp4 | |
cvtsi2ss xmm4, RadS | |
movzx eax, word [esi] | |
movzx ebx, word [esi + 2] | |
sub eax, [edi] | |
sub ebx, [edi + 4] | |
shufps xmm4, xmm4, 0 | |
movss xmm5, sZRTLev | |
cvtsi2ss xmm0, eax | |
cvtsi2ss xmm1, ebx | |
rsqrtps xmm4, xmm4 //only 4..6 clocks, not slower than scalar | |
movzx eax, word [esi + 4] | |
movzx ebx, word [esi + 6] | |
sub eax, [edi + 8] | |
sub ebx, [edi + 12] | |
cvtsi2ss xmm2, eax | |
cvtsi2ss xmm3, ebx | |
shufps xmm5, xmm5, 0 | |
shufps xmm0, xmm1, 0 | |
shufps xmm2, xmm3, 0 | |
mov eax, iAngC | |
shufps xmm0, xmm2, $88 | |
add eax, eax | |
mulps xmm0, xmm4 | |
movups xmm1, dqword [AngMaxArr4 + eax * 8] | |
minps xmm0, xmm5 | |
maxps xmm1, xmm0 | |
movups dqword [AngMaxArr4 + eax * 8], xmm1 | |
pop edi | |
pop esi | |
pop ebx | |
pop eax | |
end | |
// ... | |
function BuildATlevelsT0(PsiLight, MWidth, MHeight: Integer; PATlevel: TPATlevel; sZRT: Single): Integer; | |
// ... | |
// Fragment 1: processes groups of four 16-bit words while xa + 4 <= iwids. | |
// Taps are read at PATL2 - iStep2 (it1), PATL2 (centre) and PATL2 + iStep2 (it2). | |
// it1 and it2 are each limited to centre + iTh4 using a signed word minimum (the | |
// values are biased by sub32k first because pminsw is signed), then averaged | |
// together and averaged with the centre; the result is written to PATL. | |
// xa, PATL and PATL2 are written back advanced by the amount processed. | |
asm | |
push eax | |
push ebx | |
push ecx | |
push edx | |
mov ecx, xa | |
mov edx, PATL2 | |
mov eax, iStep2 | |
add ecx, 4 | |
mov ebx, PATL | |
sub edx, eax | |
@@1: cmp ecx, iwids | |
jg @@3 | |
movq mm4, [edx + eax] | |
movq mm1, [edx] //it1 | |
paddw mm4, iTh4 //PATL2^ + iTh | |
psubw mm4, sub32k | |
movq mm2, [edx + eax * 2] //it2 | |
psubw mm1, sub32k | |
psubw mm2, sub32k | |
pminsw mm1, mm4 //only signed word, therefore first sub, afterwards add | |
pminsw mm2, mm4 | |
paddw mm1, sub32k | |
paddw mm2, sub32k | |
pavgw mm1, mm2 //Average unsigned words | |
pavgw mm1, [edx + eax] | |
movq [ebx], mm1 | |
add ebx, 8 | |
add edx, 8 | |
add ecx, 4 | |
jmp @@1 | |
// undo the look-ahead of 4 on xa and the -iStep2 bias on the source pointer | |
@@3: sub ecx, 4 | |
add edx, eax | |
mov xa, ecx | |
mov PATL, ebx | |
mov PATL2, edx | |
pop edx | |
pop ecx | |
pop ebx | |
pop eax | |
end; // xa | |
// ... | |
// Fragment 2: leaves MMX state so that x87 code can run again. | |
asm | |
emms | |
end; | |
// ... | |
procedure TAmbShadowCalcT0.Execute; | |
// ... | |
// Fragment 1: stores the current SSE control/status register (MXCSR) into x. | |
asm | |
stmxcsr x | |
end; // if i<>$1f80 then i:=0; //=8064 } | |
// ... | |
// Fragment 2: builds four floats d[i] = (word PATL[i] - zp4[i]) * rsqrt(RadS), | |
// caps each at sZRTLev, then applies d := d - r^3 * d with r = d * rcp(sZRTLev) | |
// (rcpps is an approximate reciprocal). The result is max-merged into consecutive | |
// 16-byte entries of AngMaxArr4: iC = round(RM * rsqrt(RadS)), the first index is | |
// (iAngC - iC shr 1) and 31, and iC + 1 entries are updated. | |
// NOTE(review): the run is not wrapped at index 31 inside the loop - presumably | |
// AngMaxArr4 has spare entries past 31; confirm against its declaration. | |
asm | |
push eax | |
push ebx | |
push ecx | |
push esi | |
push edi | |
movss xmm7, RM | |
mov esi, PATL | |
lea edi, zp4 | |
cvtsi2ss xmm4, RadS | |
movss xmm5, sZRTLev | |
rsqrtss xmm4, xmm4 | |
movzx eax, word [esi] | |
movzx ebx, word [esi + 2] | |
mulss xmm7, xmm4 | |
sub eax, [edi] | |
sub ebx, [edi + 4] | |
cvtss2si ecx, xmm7 //iC | |
shufps xmm4, xmm4, 0 //R1d | |
cvtsi2ss xmm0, eax | |
cvtsi2ss xmm1, ebx | |
movzx eax, word [esi + 4] | |
movzx ebx, word [esi + 6] | |
sub eax, [edi + 8] | |
sub ebx, [edi + 12] | |
cvtsi2ss xmm2, eax | |
cvtsi2ss xmm3, ebx | |
// pack the four scalars into xmm0 = (d0, d1, d2, d3) | |
shufps xmm0, xmm1, 0 | |
shufps xmm2, xmm3, 0 | |
shufps xmm5, xmm5, 0 | |
shufps xmm0, xmm2, $88 | |
mov eax, iAngC | |
mov ebx, ecx | |
shr ebx, 1 | |
sub eax, ebx | |
and eax, 31 | |
// eax * 2 * 8 = 16 bytes per AngMaxArr4 entry | |
add eax, eax | |
mulps xmm0, xmm4 | |
minps xmm0, xmm5 | |
rcpps xmm2, xmm5 //approx 1/x | |
lea esi, [AngMaxArr4 + eax * 8] | |
// xmm2 = r, then r^2, r^3, r^3 * d; finally d := d - r^3 * d | |
mulps xmm2, xmm0 | |
movaps xmm4, xmm2 | |
mulps xmm2, xmm2 | |
mulps xmm2, xmm4 | |
mulps xmm2, xmm0 | |
subps xmm0, xmm2 | |
@ll: movups xmm1, [esi] | |
maxps xmm1, xmm0 | |
movups [esi], xmm1 | |
add esi, 16 | |
dec ecx | |
jns @ll | |
pop edi | |
pop esi | |
pop ecx | |
pop ebx | |
pop eax | |
end | |
// ... | |
// Fragment 3: same d[i] computation, cap and d - r^3 * d shaping, but max-merges | |
// into the single AngMaxArr4 entry at index iAngC (byte offset iAngC * 16). | |
asm | |
push eax | |
push ebx | |
push esi | |
push edi | |
mov esi, PATL | |
lea edi, zp4 | |
cvtsi2ss xmm4, RadS | |
movzx eax, word [esi] | |
movzx ebx, word [esi + 2] | |
sub eax, [edi] | |
sub ebx, [edi + 4] | |
shufps xmm4, xmm4, 0 | |
movss xmm5, sZRTLev | |
cvtsi2ss xmm0, eax | |
cvtsi2ss xmm1, ebx | |
rsqrtps xmm4, xmm4 //only 4..6 clocks, not slower than scalar | |
movzx eax, word [esi + 4] | |
movzx ebx, word [esi + 6] | |
sub eax, [edi + 8] | |
sub ebx, [edi + 12] | |
cvtsi2ss xmm2, eax | |
cvtsi2ss xmm3, ebx | |
shufps xmm5, xmm5, 0 | |
shufps xmm0, xmm1, 0 | |
shufps xmm2, xmm3, 0 | |
mov eax, iAngC | |
shufps xmm0, xmm2, $88 | |
add eax, eax | |
mulps xmm0, xmm4 | |
minps xmm0, xmm5 | |
movups xmm1, dqword [AngMaxArr4 + eax * 8] | |
rcpps xmm2, xmm5 | |
mulps xmm2, xmm0 | |
movaps xmm4, xmm2 | |
mulps xmm2, xmm2 | |
mulps xmm2, xmm4 | |
mulps xmm2, xmm0 | |
subps xmm0, xmm2 | |
@up: maxps xmm1, xmm0 | |
movups dqword [AngMaxArr4 + eax * 8], xmm1 | |
pop edi | |
pop esi | |
pop ebx | |
pop eax | |
end | |
// ... | |
function ColToSVecFlipRBc(c: Cardinal): TSVec; | |
// Unpacks a 32-bit colour into three Singles in the range 0..255: | |
// Result[0] = bits 16..23, Result[1] = bits 8..15, Result[2] = bits 0..7. | |
// eax = c, edx = @Result. Result[3] is not written. | |
asm | |
// one dword of stack scratch, needed because fild only reads from memory | |
add esp, -4 | |
mov ecx, eax | |
shr ecx, 16 | |
and ecx, $FF | |
mov [esp], ecx | |
fild dword [esp] | |
fstp dword [edx] | |
mov ecx, eax | |
shr ecx, 8 | |
and ecx, $FF | |
mov [esp], ecx | |
fild dword [esp] | |
fstp dword [edx + 4] | |
and eax, $FF | |
mov [esp], eax | |
fild dword [esp] | |
fstp dword [edx + 8] | |
// release the scratch dword (the popped value is discarded) | |
pop edx | |
end; | |
function ColAToSVecFlipRBc(c: Cardinal): TSVec; | |
// Unpacks a 32-bit colour into four Singles in the range 0..255: | |
// Result[0] = bits 16..23, Result[1] = bits 8..15, Result[2] = bits 0..7, | |
// Result[3] = bits 24..31. eax = c, edx = @Result. | |
asm | |
mov ecx, eax | |
shr ecx, 24 | |
// the push both allocates the scratch dword and stores the top byte in it | |
push ecx | |
fild dword [esp] | |
fstp dword [edx + 12] | |
mov ecx, eax | |
shr ecx, 16 | |
and ecx, $FF | |
mov [esp], ecx | |
fild dword [esp] | |
fstp dword [edx] | |
mov ecx, eax | |
shr ecx, 8 | |
and ecx, $FF | |
mov [esp], ecx | |
fild dword [esp] | |
fstp dword [edx + 4] | |
and eax, $FF | |
mov [esp], eax | |
fild dword [esp] | |
fstp dword [edx + 8] | |
// release the scratch dword (the popped value is discarded) | |
pop edx | |
end; | |
function SVecToColNoScale(sv: TSVec): Cardinal; | |
// Clamps sv to 0..255 via mMinMaxSVec and packs components 0, 1, 2 into bytes | |
// 0, 1, 2 of the returned Cardinal. | |
// A 16-byte buffer is reserved on the stack for the clamped vector; the pushes are | |
// the Single bit patterns 0.0 and 255.0 ($437f0000). eax is passed on unchanged | |
// and edx points at the buffer. The code relies on the callee removing the two | |
// pushed arguments, so that esp points at the buffer after the call. | |
asm | |
add esp, -16 | |
push 0 | |
push $437f0000 | |
lea edx, [esp + 8] | |
call [mMinMaxSVec] | |
// each fistp writes a 16-bit integer one byte further along; the next store | |
// overwrites the previous high byte, so [esp] ends up as three packed low bytes | |
fld dword [esp] | |
fistp word [esp] | |
fld dword [esp + 4] | |
fistp word [esp + 1] | |
fld dword [esp + 8] | |
fistp word [esp + 2] | |
mov eax, [esp] | |
add esp, 16 | |
end; | |
function SVecToColNoScaleFlipXZ(var sv: TSVec): Cardinal; | |
// Clamps sv to 0..255 via mMinMaxSVec and packs the components in reversed order: | |
// component 2 into byte 0, component 1 into byte 1, component 0 into byte 2 of the | |
// returned Cardinal. The packed bytes are assembled at [esp + 8], i.e. over the | |
// slot of component 2 after it has been loaded. | |
asm | |
add esp, -16 | |
push 0 | |
push $437f0000 | |
lea edx, [esp + 8] //2x pushed, +8 is esp..esp+16 for svec | |
call [mMinMaxSVec] //mMinMaxSVec(const smin, smax: Single; const V1: TSVec): TSVec; ret8 | |
// each fistp writes a 16-bit integer one byte further along; the next store | |
// overwrites the previous high byte | |
fld dword [esp + 8] // ebp+12, ebp+8, eax edx | |
fistp word [esp + 8] | |
fld dword [esp + 4] | |
fistp word [esp + 9] | |
fld dword [esp] | |
fistp word [esp + 10] | |
mov eax, [esp + 8] | |
add esp, 16 | |
end; | |
procedure MinMaxClip15bit(var s: Single; var w: Word); | |
// Converts s to an integer clamped to 0..32767 and stores it in w. | |
// eax = @s, edx = @w. Takes the SSE path when SupportSSE is non-zero, otherwise | |
// the x87 path at @@1. | |
const s32767: Single = 32767; | |
asm | |
cmp SupportSSE, 0 | |
jz @@1 | |
movss xmm0, [eax] | |
xorps xmm1, xmm1 | |
minss xmm0, s32767 | |
maxss xmm0, xmm1 | |
cvtss2si eax, xmm0 | |
mov word [edx], ax | |
ret | |
@@1: | |
fld dword [eax] | |
ftst | |
fnstsw ax | |
// 41H selects C3 and C0: either set means s is zero, negative or unordered | |
and ah, 41H | |
jz @biggerThanZero | |
fstp st(0) | |
mov word [edx], 0 | |
jmp @e | |
@biggerThanZero: | |
fcom s32767 | |
fnstsw ax | |
shr ah, 1 | |
jc @SmallerThanS3 | |
// s >= 32767: saturate | |
fstp st(0) | |
mov word [edx], 32767 | |
jmp @e | |
@SmallerThanS3: | |
fistp word [edx] | |
@e: | |
end; | |
function CPUID_Supported: Boolean; | |
// Tests whether bit 21 ($200000, the ID flag) of EFLAGS can be toggled; being able | |
// to change it is the documented way to detect CPUID support. | |
// Returns non-zero in al when the bit changed. The toggled flag value is left in EFLAGS. | |
asm | |
pushfd | |
pop eax | |
// edx keeps the original flags for the comparison below | |
mov edx, eax | |
xor eax, $200000 | |
push eax | |
popfd | |
pushfd | |
pop eax | |
xor eax, edx | |
setnz al | |
end; | |
function GetCPUID(AInfoRequired: Integer): TRegisters; | |
// Executes CPUID for leaf AInfoRequired (in eax) and stores the four output | |
// registers into Result (pointer in edx). | |
// Fix: ECX is now cleared before CPUID. Leaves that take a sub-leaf index in ECX | |
// (for example 4, 7, 0Bh, 0Dh) previously ran with whatever the caller left in | |
// ECX; they now always query sub-leaf 0. Leaves that ignore ECX are unaffected. | |
asm | |
push ebx // cpuid overwrites ebx, which must be preserved | |
push esi | |
mov esi, edx // keep @Result; cpuid overwrites edx | |
xor ecx, ecx // sub-leaf 0 | |
cpuid | |
mov TRegisters[esi].RegEAX, eax | |
mov TRegisters[esi].RegEBX, ebx | |
mov TRegisters[esi].RegECX, ecx | |
mov TRegisters[esi].RegEDX, edx | |
pop esi | |
pop ebx | |
end; | |
procedure FastMove(const Source; var Dest; count: Integer); | |
// Copies count bytes from Source to Dest. eax = @Source, edx = @Dest, ecx = count. | |
// Does nothing when Source and Dest are the same address or count is negative. | |
// 0..8 bytes go through a jump table, 9..32 bytes through up to four 8-byte | |
// fild/fistp transfers, larger sizes through an 8-byte loop that runs forward or | |
// backward depending on how the two regions are positioned. | |
// Every path loads the data it may overwrite before storing over it. | |
asm | |
cmp eax, edx | |
je @@Exit | |
cmp ecx, 32 | |
ja @@LargeMove //Count > 32 or Count < 0 | |
sub ecx, 8 | |
jg @@SmallMove | |
// ecx is now count - 8 (-8..0); +32 rebases it onto table entries 0..8 | |
@@TinyMove: //0..8 Byte Move | |
jmp dword [@@JumpTable + 32 + ecx * 4] | |
// ecx = count - 8: the last 8 bytes and the first 8 bytes are loaded first | |
@@SmallMove: //9..32 Byte Move | |
fild qword [eax + ecx] | |
fild qword [eax] | |
cmp ecx, 8 | |
jle @@Small16 | |
fild qword [eax + 8] | |
cmp ecx, 16 | |
jle @@Small24 | |
fild qword [eax + 16] | |
fistp qword [edx + 16] | |
@@Small24: | |
fistp qword [edx + 8] | |
@@Small16: | |
fistp qword [edx] | |
fistp qword [edx + ecx] | |
@@Exit: | |
ret | |
nop //4-Byte Align JumpTable | |
nop | |
@@JumpTable: | |
dd @@Exit, @@M01, @@M02, @@M03, @@M04, @@M05, @@M06, @@M07, @@M08 | |
// Forward copy: first and last 8 source bytes are held on the FPU stack, the | |
// middle is copied in 8-byte steps using a negative index in ecx that counts up | |
// to zero, then the two held qwords are stored last. | |
@@LargeForwardMove: | |
push edx | |
fild qword [eax] | |
lea eax, [eax + ecx - 8] | |
lea ecx, [ecx + edx - 8] | |
fild qword [eax] //fp stack check error | |
push ecx | |
neg ecx | |
and edx, -8 | |
lea ecx, [ecx + edx + 8] | |
pop edx | |
@FwdLoop: | |
fild qword [eax + ecx] | |
fistp qword [edx + ecx] | |
add ecx, 8 | |
jl @FwdLoop | |
fistp qword [edx] | |
pop edx | |
fistp qword [edx] | |
ret | |
// Flags here still come from "cmp ecx, 32" above | |
@@LargeMove: | |
jng @@LargeDone // Count < 0 | |
cmp eax, edx | |
ja @@LargeForwardMove | |
// Source is below Dest: copy forward only if Source <= Dest - count | |
sub edx, ecx | |
cmp eax, edx | |
lea edx, [edx + ecx] | |
jna @@LargeForwardMove | |
// Backward copy: last and first 8 source bytes are held on the FPU stack, the | |
// middle is copied from high to low addresses | |
sub ecx, 8 | |
push ecx | |
fild qword [eax + ecx] | |
fild qword [eax] | |
add ecx, edx | |
and ecx, -8 | |
sub ecx, edx | |
@BwdLoop: | |
fild qword [eax + ecx] | |
fistp qword [edx + ecx] | |
sub ecx, 8 | |
jg @BwdLoop | |
pop ecx | |
fistp qword [edx] | |
fistp qword [edx + ecx] | |
@@LargeDone: | |
ret | |
// Fixed-size copies for 1..8 bytes, reached through the jump table | |
@@M01: | |
movzx ecx, [eax] | |
mov [edx], cl | |
ret | |
@@M02: | |
movzx ecx, word [eax] | |
mov [edx], cx | |
ret | |
@@M03: | |
mov cx, [eax] | |
mov al, [eax + 2] | |
mov [edx], cx | |
mov [edx + 2], al | |
ret | |
@@M04: | |
mov ecx, [eax] | |
mov [edx], ecx | |
ret | |
@@M05: | |
mov ecx, [eax] | |
mov al, [eax + 4] | |
mov [edx], ecx | |
mov [edx + 4], al | |
ret | |
@@M06: | |
mov ecx, [eax] | |
mov ax, [eax + 4] | |
mov [edx], ecx | |
mov [edx + 4], ax | |
ret | |
// 7 bytes: two overlapping dwords (bytes 0..3 and 3..6) | |
@@M07: | |
mov ecx, [eax] | |
mov eax, [eax + 3] | |
mov [edx], ecx | |
mov [edx + 3], eax | |
ret | |
@@M08: | |
fild qword [eax] | |
fistp qword [edx] | |
end; | |
procedure fill0bytes(const p: Pointer; const anz: Integer; const useSSE: Boolean); | |
// ... | |
// Fragment: writes x4 zeroed 16-byte blocks starting at p1 and stores the advanced | |
// pointer back into p1. movaps requires p1 to be 16-byte aligned. | |
// NOTE(review): the loop is sub/jnz, so x4 = 0 would not terminate promptly - | |
// presumably the elided code guarantees x4 >= 1. | |
asm | |
push eax | |
push ecx | |
mov ecx, x4 | |
mov eax, p1 | |
xorps xmm0, xmm0 | |
@loop: movaps [eax], xmm0 | |
add eax, 16 | |
sub ecx, 1 | |
jnz @loop | |
mov p1, eax | |
pop ecx | |
pop eax | |
end; | |
// ... | |
procedure doFFT(const d: Double); | |
// ... | |
// Fragment: SSE2 loop nest over the split real/imaginary arrays pFFTreal (ebx) | |
// and pFFTimag (ecx), following the Pascal loop structure noted in the comments | |
// (while l <= fl2 / for m := 0 to l - 1 / inner loop over i with stride 2 * l). | |
// Per pair (i, j = i + l): element j is multiplied by the factor (cos, sin * d) | |
// taken from pFFTcos / pFFTsin at index tabnr, giving tmp; then | |
// element j := element i - tmp and element i := element i + tmp. | |
// tabnr advances by fl3 per m; after each outer pass fl3 is halved and l doubled. | |
// Register roles: edi = 2 * l (step), eax = m, edx = i, esi = j or table pointer, | |
// xmm7 = (d, d), xmm3 = (cos, cos), xmm4 = (sin * d, sin * d). | |
asm | |
push edx | |
push ecx | |
push ebx | |
push eax | |
push esi | |
push edi | |
mov ebx, pFFTreal | |
mov ecx, pFFTimag | |
movsd xmm7, d | |
shufpd xmm7, xmm7, 0 | |
mov eax, l | |
@loo0: shl eax, 1 // while l<=fl2 | |
mov edi, eax // war: ischritt, eax | |
xor eax, eax // eax=m | |
mov tabnr, eax | |
@loo1: mov edx, tabnr // for m:=0 to l-1 | |
mov esi, pFFTcos | |
movlpd xmm3, [esi + edx * 8] | |
mov esi, pFFTsin | |
movlpd xmm4, [esi + edx * 8] | |
shufpd xmm3, xmm3, 0 // xmm3 = [wichreal, wichreal] | |
shufpd xmm4, xmm4, 0 // xmm4 = [wichimag, wichimag] | |
mulpd xmm4, xmm7 // xorpd xmm4, [sign] | |
mov edx, eax // edx=i=m | |
@loo2: mov esi, edx | |
add esi, l // j=i+l | |
movlpd xmm0, [ebx + esi * 8] // hi lo | |
movhpd xmm0, [ecx + esi * 8] // xmm0 = [imag, real] | |
movapd xmm1, xmm0 | |
shufpd xmm1, xmm1, 1 // xmm1 = [real, imag] (,1=swap) | |
mulpd xmm0, xmm3 // xmm0 = [imag*wichreal, real*wichreal] | |
mulpd xmm1, xmm4 // xmm1 = [real*wichimag, imag*wichimag] | |
movapd xmm2, xmm0 | |
addpd xmm0, xmm1 // xmm0 = [i*wr+r*wi, r*wr+i*wi] | |
subpd xmm2, xmm1 // xmm2 = [i*wr-r*wi, r*wr-i*wi] | |
shufpd xmm2, xmm0, 2 // xmm2 = [i*wr+r*wi, r*wr-i*wi]? | |
// tmpimag tmpreal | |
movlpd xmm0, [ebx + edx * 8] | |
movhpd xmm0, [ecx + edx * 8] // xmm0 = [imag_i, real_i] | |
movapd xmm1, xmm0 | |
subpd xmm0, xmm2 | |
addpd xmm1, xmm2 | |
movlpd [ebx + esi * 8], xmm0 | |
movhpd [ecx + esi * 8], xmm0 | |
movlpd [ebx + edx * 8], xmm1 | |
movhpd [ecx + edx * 8], xmm1 | |
add edx, edi | |
cmp edx, fftlength | |
jl @loo2 | |
mov esi, fl3 | |
add tabnr, esi | |
add eax, 1 | |
cmp eax, l // for m:=0 to l-1 | |
jl @loo1 | |
shr fl3, 1 | |
mov eax, edi // ischritt | |
mov l, eax | |
cmp eax, fl2 // while l<=fl2 | |
jle @loo0 | |
pop edi | |
pop esi | |
pop eax | |
pop ebx | |
pop ecx | |
pop edx | |
end; | |
// ... | |
procedure FirstATlevelCAO(PIA: TPCardinalArray; PsiLight: TPsiLight5; Leng: Integer); | |
// Fills PIA[0 .. Leng - 1] from Leng consecutive 18-byte records at PsiLight. | |
// eax = PIA, edx = PsiLight, ecx = Leng. Does nothing when Leng <= 0. | |
// Per record: if the word at record offset 8 is >= $8000 (unsigned) the output is | |
// 0; otherwise it is the dword at record offset 6 with its low byte cleared, | |
// shifted right by one. | |
asm | |
push esi | |
// dec/js/inc: skip the loop for Leng <= 0 while leaving ecx unchanged otherwise | |
dec ecx | |
js @@out | |
inc ecx | |
add edx, 8 | |
@@1: | |
cmp word [edx], $8000 | |
jnb @@2 | |
mov esi, [edx-2] | |
and esi, $ffffff00 | |
shr esi, 1 | |
jmp @@3 | |
@@2: | |
xor esi, esi | |
@@3: | |
mov [eax], esi | |
add edx, 18 | |
add eax, 4 | |
dec ecx | |
jnz @@1 | |
@@out: | |
pop esi | |
end; | |
procedure SmoothH(PIA, SA: TPCardinalArray; ya, Step: Integer); | |
// For i = 0 .. ya: PIA[i] := (((SA[hi] + SA[lo]) shr 1) + PIA[i]) shr 1, where | |
// lo = max(i - Step, 0) and hi = min(i + Step, ya). Does nothing when ya < 0. | |
// eax = PIA (advanced 4 bytes per element), edx = SA, ecx = ya, Step = [ebp + 8]. | |
// Locals: [ebp - 8] = ya, [ebp - 12] = remaining iteration count. | |
asm | |
add esp, -12 | |
push ebx | |
push esi | |
push edi | |
mov [ebp-8], ecx | |
mov ebx, edx | |
mov edi, [ebp+8] | |
mov edx, ecx | |
test edx, edx | |
jl @@2 | |
inc edx | |
mov [ebp-12], edx | |
// esi = i | |
xor esi, esi | |
@@1: | |
// edx = lo index, clamped at 0 | |
mov edx, esi | |
sub edx, edi | |
test edx, edx | |
jnl @@3 | |
xor edx, edx | |
@@3: | |
// ecx = hi index, clamped at ya | |
mov ecx, edi | |
add ecx, esi | |
cmp ecx, [ebp-8] | |
jle @@4 | |
mov ecx, [ebp-8] | |
@@4: | |
mov ecx, [ebx+ecx*4] | |
add ecx, [ebx+edx*4] | |
shr ecx, 1 | |
add ecx, [eax] | |
shr ecx, 1 | |
mov [eax], ecx | |
inc esi | |
add eax, 4 | |
dec dword [ebp-12] | |
jnz @@1 | |
@@2: | |
pop edi | |
pop esi | |
pop ebx | |
add esp, 12 | |
end; | |
procedure SmoothV(PIA, SA: TPCardinalArray; ye, Step, wid: Integer); | |
// For i = 0 .. ye: dest := (((SA[hi] + SA[lo]) shr 1) + dest) shr 1, where | |
// lo = max(i - Step, 0), hi = min(i + Step, ye) and dest is the dword at the | |
// current PIA pointer. Does nothing when ye < 0. | |
// eax = PIA (advanced by [ebp + 8] bytes per element), edx = SA, ecx = ye. | |
// Presumably [ebp + 12] is Step and [ebp + 8] is wid (a byte stride). | |
// Locals: [ebp - 8] = ye, [ebp - 12] = remaining iteration count. | |
asm | |
add esp, -12 | |
push ebx | |
push esi | |
push edi | |
mov [ebp-8], ecx | |
mov ebx, edx | |
mov edi, [ebp+12] | |
mov edx, ecx | |
test edx, edx | |
jl @@2 | |
inc edx | |
mov [ebp-12], edx | |
// esi = i | |
xor esi, esi | |
@@1: | |
// edx = lo index, clamped at 0 | |
mov edx, esi | |
sub edx, edi | |
test edx, edx | |
jnl @@3 | |
xor edx, edx | |
@@3: | |
// ecx = hi index, clamped at ye | |
mov ecx, edi | |
add ecx, esi | |
cmp ecx, [ebp-8] | |
jle @@4 | |
mov ecx, [ebp-8] | |
@@4: | |
mov ecx, [ebx+ecx*4] | |
add ecx, [ebx+edx*4] | |
shr ecx, 1 | |
add ecx, [eax] | |
shr ecx, 1 | |
mov [eax], ecx | |
inc esi | |
add eax, dword [ebp+8] | |
dec dword [ebp-12] | |
jnz @@1 | |
@@2: | |
pop edi | |
pop esi | |
pop ebx | |
add esp, 12 | |
end; | |
procedure MinSI(var SI: SmallInt; var i: Integer); | |
// When SI < i, SI is overwritten with i, saturated to $7FFF; otherwise SI is left | |
// unchanged. eax = @SI, edx = @i. | |
// NOTE(review): despite the name, the code as written raises SI towards i (it | |
// keeps the larger value) - confirm the intended semantics with the callers. | |
asm | |
movsx ecx, word [eax] | |
cmp ecx, [edx] | |
jnl @@1 | |
cmp dword [edx], $7FFF | |
jl @@2 | |
mov word [eax], $7FFF | |
ret | |
@@2: | |
mov edx, [edx] | |
mov word [eax], dx | |
@@1: | |
end; | |
function NotOnlyBackGround4(p: Pointer): Integer; | |
// ANDs together the dwords at p + 0, p + 18, p + 36 and p + 54 and returns only | |
// bit 31 of that product: $80000000 when the top bit is set in all four dwords, | |
// otherwise 0. eax = p on entry, result in eax. | |
asm | |
mov ecx, $80000000 // start from the mask, then fold in each dword | |
and ecx, [eax] | |
and ecx, [eax + 18] | |
and ecx, [eax + 36] | |
and ecx, [eax + 54] | |
mov eax, ecx | |
end; | |
procedure MakeZP4(p: Pointer; var zp: array of Integer); | |
// Fills zp[0..3] from the dwords at p + 0, p + 18, p + 36 and p + 54: each dword | |
// has its low byte dropped and the remaining 24 bits placed at bits 7..30, which | |
// is the same value as (dword and $FFFFFF00) shr 1. | |
// eax = p, edx = @zp[0]; ecx is used as scratch. | |
asm | |
// entry 3 | |
mov ecx, [eax + 54] | |
shr ecx, 8 | |
shl ecx, 7 | |
mov [edx + 12], ecx | |
// entry 2 | |
mov ecx, [eax + 36] | |
shr ecx, 8 | |
shl ecx, 7 | |
mov [edx + 8], ecx | |
// entry 1 | |
mov ecx, [eax + 18] | |
shr ecx, 8 | |
shl ecx, 7 | |
mov [edx + 4], ecx | |
// entry 0 | |
mov ecx, [eax] | |
shr ecx, 8 | |
shl ecx, 7 | |
mov [edx], ecx | |
end; | |
procedure isMemberQuat(PIteration3D: TPIteration3D); | |
// ... | |
// Fragment: SSE2 iteration loop on four doubles X1..X4 (see the per-line comments | |
// for the update formulas). | |
// Record offsets used through esi = PIteration3D: +0/+8/+16 = C1/C2/C3 (also the | |
// start values X1..X3, X4 starts at 0), +48 = pointer loaded into edi, | |
// +56 = Rout (written), +64 = ItResultI (written), +68 = iteration limit. | |
// Each pass stores the previous squared radius in Rold, computes the new X1..X4 | |
// and Rout = X1^2 + X2^2 + X3^2 + X4^2, and increments ecx. The loop ends when | |
// ecx >= [esi + 68] or when Rout is not below the double at [edi + 160]. | |
// The squares needed at @a are prepared both before the loop and at the end of | |
// each pass, which is why the back-edge targets @a rather than @u. | |
asm | |
push esi | |
push edi | |
push ecx | |
mov esi, PIteration3D | |
xor ecx, ecx | |
mov edi, [esi + 48] | |
@u: movupd xmm0, [esi] // C1, C2 = X1, X2 | |
movsd xmm1, [esi + 16] // C3, 0 = X3, X4 | |
movapd xmm2, xmm0 | |
movapd xmm3, xmm1 | |
mulpd xmm2, xmm0 // X1*X1, X2*X2 | |
mulpd xmm3, xmm1 // X3*X3, X4*X4 | |
movapd xmm4, xmm2 | |
addpd xmm4, xmm3 // X1*X1 + X3*X3, X2*X2 + X4*X4 | |
pshufd xmm5, xmm4, $4E // X2*X2 + X4*X4, X1*X1 + X3*X3 | |
addsd xmm4, xmm5 // Rout | |
@a: addsd xmm3, xmm5 // X3*X3 + X2*X2 + X4*X4 | |
movlpd Rold, xmm4 | |
pshufd xmm7, xmm0, $4E // X2, X1 | |
subsd xmm2, xmm3 // X1*X1 - X2*X2 - X3*X3 - X4*X4 | |
movapd xmm5, xmm0 // X1, X2 | |
mulsd xmm7, xmm0 // X2*X1 | |
pshufd xmm6, xmm1, $4E // X4, X3 | |
addsd xmm2, [esi] | |
movapd xmm3, xmm6 // X4, X3 | |
movapd xmm0, xmm2 // X1 = X1*X1 - X2*X2 - X3*X3 - X4*X4 + C1; | |
mulpd xmm3, xmm5 // X4*X1, X3*X2 | |
mulsd xmm6, xmm1 // X4*X3 | |
mulpd xmm5, xmm1 // X1*X3, X2*X4 | |
addsd xmm7, xmm6 // X2*X1 + X4*X3 | |
pshufd xmm1, xmm5, $4E // X2*X4, X1*X3 | |
addsd xmm7, xmm7 // 2 * (X2*X1 + X4*X3) | |
addsd xmm7, [esi + 8] // X2 = 2 * (X2*X1 + X3*X4) + C2 | |
subsd xmm5, xmm1 // X2*X4*O1 + X1*X3 sub | |
shufpd xmm0, xmm7, 0 // X1, X2 | |
addsd xmm5, xmm5 // 2 * (X2*X4*O1 + X1*X3) | |
pshufd xmm6, xmm3, $4E // X3*X2, X4*X1 | |
addsd xmm5, [esi + 16] // X3 = 2 * (X2*X4*O1 + X1*X3) + C3 | |
addsd xmm6, xmm3 // X3*X2 + X4*X1 | |
movsd xmm1, xmm5 | |
addsd xmm6, xmm6 // X4 = 2 * (X4*X1 + X3*X2) | |
shufpd xmm1, xmm6, 0 // X3, X4 | |
movapd xmm2, xmm0 | |
movapd xmm3, xmm1 | |
mulpd xmm2, xmm0 // X1*X1, X2*X2 | |
mulpd xmm3, xmm1 // X3*X3, X4*X4 | |
movapd xmm4, xmm2 | |
addpd xmm4, xmm3 // X1*X1 + X3*X3, X2*X2 + X4*X4 | |
pshufd xmm5, xmm4, $4E // | |
addsd xmm4, xmm5 // Rout | |
inc ecx | |
cmp ecx, [esi + 68] | |
jge @c | |
ucomisd xmm4, [edi + 160] //>8? | |
jb @a | |
@c: movlpd [esi + 56], xmm4 // Rout = double | |
mov [esi + 64], ecx // ItResultI | |
pop ecx | |
pop edi | |
pop esi | |
end | |
// ... | |
// UpdateScaledImage: box-filter downscaling kernels. Only the inline-assembly fragments are | |
// visible here; the Pascal code between them ('// ...') is elided, including the declarations | |
// of wid, PB1, PBh, PC1, mFSIoffset, a, b, w2, y2 and itmp and whatever selects a fragment. | |
// Every fragment saves and restores eax, ebx, ecx, edx, edi and esi. | |
// All loops are counted with 'dec / jnz', so wid (and ImageScale - 1 in the last two | |
// fragments) has to be non-zero when a fragment is entered. | |
procedure UpdateScaledImage(StartYh, EndYh: Integer); | |
// ... | |
// Fragment 1 - 2x2 average. For each of wid output pixels and each byte k = 0..2: | |
//   PBh[k] := (PB1[k] + PB1[k + 4] + PB1[k + mFSIoffset] + PB1[k + mFSIoffset + 4] + 2) shr 2 | |
// The source advances 8 bytes and the destination 4 bytes per pixel; destination byte 3 | |
// is not written. | |
asm | |
push eax | |
push ebx | |
push ecx | |
push edx | |
push edi | |
push esi | |
mov ecx, wid | |
mov esi, PB1 | |
mov edi, PBh | |
mov ebx, mFSIoffset | |
@ll: movzx eax, byte ptr [esi] | |
movzx edx, byte ptr [esi + 4] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx + 4] | |
lea eax, [eax + edx + 2] | |
shr eax, 2 | |
mov [edi], al | |
movzx eax, byte ptr [esi + 1] | |
movzx edx, byte ptr [esi + 5] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx + 1] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx + 5] | |
lea eax, [eax + edx + 2] | |
shr eax, 2 | |
mov [edi + 1], al | |
movzx eax, byte ptr [esi + 2] | |
movzx edx, byte ptr [esi + 6] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx + 2] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx + 6] | |
lea eax, [eax + edx + 2] | |
shr eax, 2 | |
mov [edi + 2], al | |
add esi, 8 | |
add edi, 4 | |
dec ecx | |
jnz @ll | |
pop esi | |
pop edi | |
pop edx | |
pop ecx | |
pop ebx | |
pop eax | |
end | |
// ... | |
// Fragment 2 - 3x3 average. For each byte k = 0..2 it sums the nine source bytes at columns | |
// k, k + 4, k + 8 of the rows at +0, +mFSIoffset and +2 * mFSIoffset, adds 4 and divides by b. | |
// NOTE(review): edx still holds the last sample when 'div b' runs and only al is stored, so | |
// b is presumably a Byte (8-bit divide of ax) holding 9 - confirm against the declarations. | |
// The source advances 12 bytes and the destination 4 bytes per pixel. | |
asm | |
push eax | |
push ebx | |
push ecx | |
push edx | |
push edi | |
push esi | |
mov ecx, wid | |
mov esi, PB1 | |
mov edi, PBh | |
mov ebx, mFSIoffset | |
@ll: movzx eax, byte ptr [esi] | |
movzx edx, byte ptr [esi + 4] | |
add eax, edx | |
movzx edx, byte ptr [esi + 8] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx + 4] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx + 8] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx * 2] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx * 2 + 4] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx * 2 + 8] | |
lea eax, [eax + edx + 4] | |
div b | |
mov [edi], al | |
movzx eax, byte ptr [esi + 1] | |
movzx edx, byte ptr [esi + 5] | |
add eax, edx | |
movzx edx, byte ptr [esi + 9] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx + 1] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx + 5] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx + 9] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx * 2 + 1] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx * 2 + 5] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx * 2 + 9] | |
lea eax, [eax + edx + 4] | |
div b | |
mov [edi + 1], al | |
movzx eax, byte ptr [esi + 2] | |
movzx edx, byte ptr [esi + 6] | |
add eax, edx | |
movzx edx, byte ptr [esi + 10] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx + 2] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx + 6] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx + 10] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx * 2 + 2] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx * 2 + 6] | |
add eax, edx | |
movzx edx, byte ptr [esi + ebx * 2 + 10] | |
lea eax, [eax + edx + 4] | |
div b | |
mov [edi + 2], al | |
add esi, 12 | |
add edi, 4 | |
dec ecx | |
jnz @ll | |
pop esi | |
pop edi | |
pop edx | |
pop ecx | |
pop ebx | |
pop eax | |
end | |
// ... | |
// Fragment 3 - horizontal sums for a general ImageScale. For the source rows | |
// y2 = ImageScale - 1 down to 0 (row start = PB1 + mFSIoffset * y2) and for each of wid | |
// output pixels, the three byte channels of ImageScale consecutive 4-byte pixels are summed | |
// and stored as three dwords at the PC1 cursor in edi (12 bytes per output pixel; edi is | |
// never reset, so the rows of sums follow each other in memory). | |
// itmp = (ImageScale - 1) * 4 - 1 moves esi back to the next channel of the first pixel. | |
// 'mul y2' overwrites edx, which is only used as scratch afterwards. | |
asm //sum rows to buf | |
push eax | |
push ebx | |
push ecx | |
push edx | |
push edi | |
push esi | |
mov ebx, ImageScale | |
dec ebx | |
mov y2, ebx | |
mov edi, PC1 | |
lea eax, ebx * 4 - 1 | |
mov itmp, eax | |
@@0: mov ecx, wid | |
mov w2, ecx | |
mov esi, PB1 | |
mov eax, mFSIoffset | |
mul y2 | |
add esi, eax | |
@ll: mov ecx, ebx | |
movzx eax, byte ptr [esi] | |
@@1: add esi, 4 | |
movzx edx, byte ptr [esi] | |
add eax, edx | |
dec ecx | |
jnz @@1 | |
mov [edi], eax | |
sub esi, itmp | |
mov ecx, ebx | |
movzx eax, byte ptr [esi] | |
@@2: add esi, 4 | |
movzx edx, byte ptr [esi] | |
add eax, edx | |
dec ecx | |
jnz @@2 | |
mov [edi + 4], eax | |
sub esi, itmp | |
mov ecx, ebx | |
movzx eax, byte ptr [esi] | |
@@3: add esi, 4 | |
movzx edx, byte ptr [esi] | |
add eax, edx | |
dec ecx | |
jnz @@3 | |
mov [edi + 8], eax | |
add edi, 12 | |
// esi sits on channel 2 of the last summed pixel; +2 moves it to the next group of pixels. | |
add esi, 2 | |
dec w2 | |
jnz @ll | |
dec y2 | |
jns @@0 | |
pop esi | |
pop edi | |
pop edx | |
pop ecx | |
pop ebx | |
pop eax | |
end; | |
// ... | |
// Fragment 4 - vertical sums and final division. ebx = wid * 12 is the byte distance between | |
// two rows of dword sums at PC1. For each output pixel and channel, ImageScale row sums | |
// are added together, 'a' is added, the total is divided by b and al is stored to PBh[0..2]. | |
// The destination advances 4 bytes per pixel; destination byte 3 is not written. | |
// NOTE(review): edx holds the PC1 cursor while 'div b' executes, so as in fragment 2 this | |
// presumably is an 8-bit divide whose quotient must fit in al - confirm the types of a and b. | |
asm //sum columns | |
push eax | |
push ebx | |
push ecx | |
push edx | |
push edi | |
push esi | |
mov eax, ImageScale | |
dec eax | |
mov y2, eax | |
mov edx, PC1 | |
mov ebx, wid | |
mov w2, ebx | |
shl ebx, 2 | |
lea ebx, ebx * 2 + ebx | |
mov edi, PBh | |
@ll: mov ecx, y2 | |
mov esi, edx | |
mov eax, [esi] | |
@@1: add esi, ebx | |
add eax, [esi] | |
dec ecx | |
jnz @@1 | |
add eax, a | |
div b | |
mov [edi], al | |
add edx, 4 | |
mov ecx, y2 | |
mov esi, edx | |
mov eax, [esi] | |
@@2: add esi, ebx | |
add eax, [esi] | |
dec ecx | |
jnz @@2 | |
add eax, a | |
div b | |
mov [edi + 1], al | |
add edx, 4 | |
mov ecx, y2 | |
mov esi, edx | |
mov eax, [esi] | |
@@3: add esi, ebx | |
add eax, [esi] | |
dec ecx | |
jnz @@3 | |
add eax, a | |
div b | |
mov [edi + 2], al | |
add edx, 4 | |
add edi, 4 | |
dec w2 | |
jnz @ll | |
pop esi | |
pop edi | |
pop edx | |
pop ecx | |
pop ebx | |
pop eax | |
end; | |
// ... | |
function ColToSVecFlipRBc4(c: T4Cardinal): T4SVec;
// Converts four packed 8-bit-per-channel colours into four vectors of Singles, storing the
// channels in reversed byte order (byte 2 first).
//   eax = c        four pointers; each is dereferenced to reach one 4-byte colour
//   edx = @Result  four 16-byte vectors; vector i belongs to colour i
// Vector i receives [byte 2, byte 1, byte 0, 0] of colour i, each converted to Single.
// Every channel is fetched with a byte-wide movzx, so nothing outside the 4-byte colour is
// read (a dword load at offset +1 or +2 followed by 'and $FF' touches bytes past its end).
// ebx, esi and edi are preserved; eax and ecx are overwritten. The x87 stack is left empty.
asm
  push  ebx
  push  esi
  push  edi
  sub   esp, 16                      // four dword slots: fild needs a memory operand

  // ---- byte 2 of every colour goes to component 0 (offsets 0, 16, 32, 48) ----
  mov   ebx, [eax]
  mov   ecx, [eax + 4]
  mov   esi, [eax + 8]
  mov   edi, [eax + 12]
  movzx ebx, byte ptr [ebx + 2]
  movzx ecx, byte ptr [ecx + 2]
  movzx esi, byte ptr [esi + 2]
  movzx edi, byte ptr [edi + 2]
  mov   [esp], ebx
  mov   [esp + 4], ecx
  mov   [esp + 8], esi
  mov   [esp + 12], edi
  fild  dword [esp]
  fild  dword [esp + 4]
  fild  dword [esp + 8]
  fild  dword [esp + 12]             // st0 = colour 3 ... st3 = colour 0
  fstp  dword [edx + 48]
  fstp  dword [edx + 32]
  fstp  dword [edx + 16]
  fstp  dword [edx]

  // ---- byte 1 goes to component 1 (offsets 4, 20, 36, 52) ----
  mov   ebx, [eax]
  mov   ecx, [eax + 4]
  mov   esi, [eax + 8]
  mov   edi, [eax + 12]
  movzx ebx, byte ptr [ebx + 1]
  movzx ecx, byte ptr [ecx + 1]
  movzx esi, byte ptr [esi + 1]
  movzx edi, byte ptr [edi + 1]
  mov   [esp], ebx
  mov   [esp + 4], ecx
  mov   [esp + 8], esi
  mov   [esp + 12], edi
  fild  dword [esp]
  fild  dword [esp + 4]
  fild  dword [esp + 8]
  fild  dword [esp + 12]
  fstp  dword [edx + 52]
  fstp  dword [edx + 36]
  fstp  dword [edx + 20]
  fstp  dword [edx + 4]

  // ---- byte 0 goes to component 2 (offsets 8, 24, 40, 56); component 3 is zeroed ----
  mov   ebx, [eax]
  mov   ecx, [eax + 4]
  mov   esi, [eax + 8]
  mov   edi, [eax + 12]
  movzx ebx, byte ptr [ebx]
  movzx ecx, byte ptr [ecx]
  movzx esi, byte ptr [esi]
  movzx edi, byte ptr [edi]
  xor   eax, eax                     // c is no longer needed; eax = bit pattern of 0.0
  mov   [esp], ebx
  mov   [esp + 4], ecx
  mov   [esp + 8], esi
  mov   [esp + 12], edi
  fild  dword [esp]
  fild  dword [esp + 4]
  fild  dword [esp + 8]
  fild  dword [esp + 12]
  mov   [edx + 12], eax
  mov   [edx + 28], eax
  mov   [edx + 44], eax
  mov   [edx + 60], eax
  fstp  dword [edx + 56]
  fstp  dword [edx + 40]
  fstp  dword [edx + 24]
  fstp  dword [edx + 8]

  add   esp, 16
  pop   edi
  pop   esi
  pop   ebx
end;
// ColToSVecSSE2: weighted sum of four 8-bit-per-channel colours, computed with SSE2. | |
//   eax = c      four pointers, each dereferenced for one 32-bit colour | |
//   edx = sv     four singles (loaded unaligned), one weight per colour | |
//   ecx = svout  16 bytes are written (unaligned store) | |
// With k = s1d255 (constant declared elsewhere) the final stores produce | |
//   svout[0] = sum over i of byte2(colour i) * sv[i] * k | |
//   svout[1] = the same sum for byte 1 | |
//   svout[2] = svout[3] = the same sum for byte 0 | |
// sva1 is a 16-byte constant declared elsewhere; presumably $000000FF in every dword so | |
// that the ANDs keep one byte per lane - confirm. | |
// eax and edx are overwritten; 16 bytes of stack are used to gather the four colours. | |
procedure ColToSVecSSE2(c: T4Cardinal; sv, svout: TPSVec); | |
asm //CVTDQ2PS: sse2 - 4 ints to 4 singles | |
MOVDQU xmm5, [edx] //PSRLDQ: sse2 - xmm1, imm8 Shift xmm1 right by imm8 while shifting in 0s. | |
add esp, -16 | |
// Gather the four colours into the stack buffer (edx is free: sv is already in xmm5). | |
mov edx, [eax] | |
mov edx, [edx] | |
mov [esp], edx | |
mov edx, [eax + 4] | |
mov edx, [edx] | |
mov [esp + 4], edx | |
mov edx, [eax + 8] | |
mov eax, [eax + 12] | |
mov edx, [edx] | |
mov eax, [eax] | |
mov [esp + 8], edx | |
mov [esp + 12], eax | |
movss xmm0, s1d255 | |
MOVDQU xmm1, [esp] //[eax] 4 cardinal colors | |
MOVDQU xmm4, sva1 | |
MOVDQA xmm2, xmm1 //todo: use input pointers, load vals before | |
MOVDQA xmm3, xmm1 | |
// Byte shifts of the whole register bring byte 1 / byte 2 of each colour to the bottom | |
// of its dword lane; the ANDs below then drop everything else in the lane. | |
PSRLDQ xmm2, 1 //green | |
PSRLDQ xmm3, 2 //blue | |
shufps xmm0, xmm0, 0 | |
andps xmm1, xmm4 //red or $FF000000FF000000FF000000FF | |
andps xmm2, xmm4 | |
andps xmm3, xmm4 | |
// xmm5 = sv[i] * s1d255: the scale factor is folded into the weights once. | |
mulps xmm5, xmm0 | |
CVTDQ2PS xmm1, xmm1 | |
CVTDQ2PS xmm2, xmm2 | |
CVTDQ2PS xmm3, xmm3 | |
mulps xmm1, xmm5 | |
mulps xmm2, xmm5 | |
mulps xmm3, xmm5 | |
// Horizontal add: xmm4 and xmm0 are reused as scratch to fold the four lanes of | |
// xmm1 / xmm3 (together) and xmm2 (separately) into per-channel sums. | |
MOVLHPS xmm4, xmm1 //HADDD L1,.. (H,L) | |
movhlps xmm4, xmm3 //L1,H3 | |
shufps xmm3, xmm1, $E4 //H1,L3 | |
MOVHLPS xmm0, xmm2 //..,H2 | |
addps xmm4, xmm3 //11,33 | |
addps xmm0, xmm2 //..,22 | |
pshufd xmm5, xmm4, $B1 //can't copy 1 dw to more than 1 dest! | |
pshufd xmm2, xmm0, $B1 | |
addps xmm5, xmm4 //3,1 | |
addss xmm2, xmm0 //.,2 | |
// The 16-byte store is followed by a scalar store that replaces element 1. | |
movups [ecx], xmm5 // r,.,b | |
movss [ecx + 4], xmm2 // .,g,. | |
add esp, 16 | |
end; | |
// ColToSVecSqrSSE2: like a weighted colour sum, but every 8-bit channel value is squared | |
// before it is weighted. | |
//   eax = c      four pointers, each dereferenced for one 32-bit colour | |
//   edx = sv     four singles (loaded unaligned), one weight per colour | |
//   ecx = svout  16 bytes are written (unaligned store) | |
// With scmul = 1 / 65025 (= 1 / 255^2) the final stores produce | |
//   svout[0] = sum over i of byte2(colour i)^2 * sv[i] * scmul | |
//   svout[1] = the same sum for byte 1 | |
//   svout[2] = svout[3] = the same sum for byte 0 | |
// sva1 is a 16-byte constant declared elsewhere; presumably $000000FF in every dword - confirm. | |
// eax and edx are overwritten; 16 bytes of stack are used to gather the four colours. | |
procedure ColToSVecSqrSSE2(c: T4Cardinal; sv, svout: TPSVec); //svout := sumof([0..3] cardinal colors * sv[0..3]) | |
const scmul: Single = 1 / 65025; | |
asm // eax edx ecx | |
MOVDQU xmm5, [edx] | |
add esp, -16 | |
// Gather the four colours into the stack buffer. | |
mov edx, [eax] | |
mov edx, [edx] | |
mov [esp], edx | |
mov edx, [eax + 4] | |
mov edx, [edx] | |
mov [esp + 4], edx | |
mov edx, [eax + 8] | |
mov eax, [eax + 12] | |
mov edx, [edx] | |
mov eax, [eax] | |
mov [esp + 8], edx | |
mov [esp + 12], eax | |
movss xmm0, scmul | |
MOVDQU xmm1, [esp] | |
MOVDQU xmm4, sva1 | |
MOVDQA xmm2, xmm1 | |
MOVDQA xmm3, xmm1 | |
// xmm1 / xmm2 / xmm3 = byte 0 / byte 1 / byte 2 of each colour after shift and mask. | |
PSRLDQ xmm2, 1 | |
PSRLDQ xmm3, 2 | |
shufps xmm0, xmm0, 0 | |
andps xmm1, xmm4 | |
andps xmm2, xmm4 | |
andps xmm3, xmm4 | |
// xmm5 = sv[i] * scmul: the scale factor is folded into the weights once. | |
mulps xmm5, xmm0 | |
CVTDQ2PS xmm1, xmm1 | |
CVTDQ2PS xmm2, xmm2 | |
CVTDQ2PS xmm3, xmm3 | |
// Square each channel value, then apply the scaled weights. | |
mulps xmm1, xmm1 | |
mulps xmm2, xmm2 | |
mulps xmm3, xmm3 | |
mulps xmm1, xmm5 | |
mulps xmm2, xmm5 | |
mulps xmm3, xmm5 | |
// Horizontal add: xmm4 and xmm0 are reused as scratch to fold the four lanes. | |
MOVLHPS xmm4, xmm1 //HADDD L1,.. (H,L) | |
movhlps xmm4, xmm3 //L1,H3 | |
shufps xmm3, xmm1, $E4 //H1,L3 | |
MOVHLPS xmm0, xmm2 //..,H2 | |
addps xmm4, xmm3 //11,33 | |
addps xmm0, xmm2 //..,22 | |
pshufd xmm5, xmm4, $B1 //can't copy 1 dw to more than 1 dest! | |
pshufd xmm2, xmm0, $B1 | |
addps xmm5, xmm4 //3,1 | |
addss xmm2, xmm0 //.,2 | |
// The 16-byte store is followed by a scalar store that replaces element 1. | |
movups [ecx], xmm5 // r,.,b | |
movss [ecx + 4], xmm2 // .,g,. | |
add esp, 16 | |
end; | |
procedure ColToSVecSqrSSE2_16(c: T4Cardinal; sv, svout: TPSVec); //svout := sumof([0..3] cardinal colors * sv[0..3])
// Weighted sum of the squared 16-bit channels of four colours.
//   eax = c      four pointers; the dwords at +0 and +4 of each target are read
//   edx = sv     four singles (loaded unaligned), one weight per colour
//   ecx = svout  16 bytes are written (unaligned store)
// With w0 / w1 / w2 = the 16-bit values at byte offsets 0 / 2 / 4 of a colour:
//   svout[0] = sum over i of w2(i)^2 * sv[i] * csmul
//   svout[1] = the same sum for w1
//   svout[2] = svout[3] = the same sum for w0
// sva16 is a 16-byte constant declared elsewhere; presumably $0000FFFF in every dword so
// that the ANDs keep one word per lane - confirm.
// eax and edx are overwritten; 16 bytes of stack serve as gather buffer (used twice).
// Compared with the earlier version, a duplicated 'movdqa xmm2, xmm1' that was executed
// before the second gather (and repeated, unchanged, after it) has been dropped.
const csmul: Single = 1 {255.0} / (65535.0 * 65535.0);
asm
  movdqu   xmm5, [edx]               // weights sv[0..3]; edx is free from here on
  add      esp, -16

  // First gather: dword at +0 of every colour (holds w0 and w1).
  mov      edx, [eax]
  mov      edx, [edx]
  mov      [esp], edx
  mov      edx, [eax + 4]
  mov      edx, [edx]
  mov      [esp + 4], edx
  mov      edx, [eax + 8]
  mov      edx, [edx]
  mov      [esp + 8], edx
  mov      edx, [eax + 12]
  mov      edx, [edx]
  mov      [esp + 12], edx
  movss    xmm0, csmul
  movdqu   xmm1, [esp]
  movdqu   xmm4, sva16

  // Second gather: dword at +4 of every colour (its low word is w2). eax is consumed last.
  mov      edx, [eax]
  mov      edx, [edx + 4]
  mov      [esp], edx
  mov      edx, [eax + 4]
  mov      edx, [edx + 4]
  mov      [esp + 4], edx
  mov      edx, [eax + 8]
  mov      eax, [eax + 12]
  mov      edx, [edx + 4]
  mov      eax, [eax + 4]
  mov      [esp + 8], edx
  mov      [esp + 12], eax

  movdqa   xmm2, xmm1
  movdqu   xmm3, [esp]
  psrldq   xmm2, 2                   // bring w1 to the bottom of each dword lane
  shufps   xmm0, xmm0, 0             // broadcast csmul
  andps    xmm1, xmm4                // w0
  andps    xmm2, xmm4                // w1
  andps    xmm3, xmm4                // w2
  mulps    xmm5, xmm0                // weights * csmul
  cvtdq2ps xmm1, xmm1
  cvtdq2ps xmm2, xmm2
  cvtdq2ps xmm3, xmm3
  mulps    xmm1, xmm1                // square the channel values
  mulps    xmm2, xmm2
  mulps    xmm3, xmm3
  mulps    xmm1, xmm5
  mulps    xmm2, xmm5
  mulps    xmm3, xmm5

  // Horizontal add; xmm4 and xmm0 are reused as scratch.
  movlhps  xmm4, xmm1                // xmm4 high = lanes 0,1 of xmm1
  movhlps  xmm4, xmm3                // xmm4 low  = lanes 2,3 of xmm3
  shufps   xmm3, xmm1, $E4           // xmm3 = lanes 0,1 of xmm3 | lanes 2,3 of xmm1
  movhlps  xmm0, xmm2                // xmm0 low  = lanes 2,3 of xmm2
  addps    xmm4, xmm3                // pairwise partial sums: w2 in low half, w0 in high half
  addps    xmm0, xmm2                // pairwise partial sums of w1 in the low half
  pshufd   xmm5, xmm4, $B1           // swap neighbours
  pshufd   xmm2, xmm0, $B1
  addps    xmm5, xmm4                // [sum w2, sum w2, sum w0, sum w0]
  addss    xmm2, xmm0                // low lane = sum w1
  movups   [ecx], xmm5
  movss    [ecx + 4], xmm2           // replace element 1 with the w1 sum
  add      esp, 16
end;
procedure ColToSVecSSE2_16(c: T4Cardinal; sv, svout: TPSVec); //svout := sumof([0..3] cardinal colors * sv[0..3])
// Weighted sum of the 16-bit channels of four colours.
//   eax = c      four pointers; the dwords at +0 and +4 of each target are read
//   edx = sv     four singles (loaded unaligned), one weight per colour
//   ecx = svout  16 bytes are written (unaligned store)
// With w0 / w1 / w2 = the 16-bit values at byte offsets 0 / 2 / 4 of a colour:
//   svout[0] = sum over i of w2(i) * sv[i] * csmul
//   svout[1] = the same sum for w1
//   svout[2] = svout[3] = the same sum for w0
// sva16 is a 16-byte constant declared elsewhere; presumably $0000FFFF in every dword so
// that the ANDs keep one word per lane - confirm.
// eax and edx are overwritten; 16 bytes of stack serve as gather buffer (used twice).
// Compared with the earlier version, a duplicated 'movdqa xmm2, xmm1' that was executed
// before the second gather (and repeated, unchanged, after it) has been dropped.
const csmul: Single = 1 {255.0} / 65535.0;
asm
  movdqu   xmm5, [edx]               // weights sv[0..3]; edx is free from here on
  add      esp, -16

  // First gather: dword at +0 of every colour (holds w0 and w1).
  mov      edx, [eax]
  mov      edx, [edx]
  mov      [esp], edx
  mov      edx, [eax + 4]
  mov      edx, [edx]
  mov      [esp + 4], edx
  mov      edx, [eax + 8]
  mov      edx, [edx]
  mov      [esp + 8], edx
  mov      edx, [eax + 12]
  mov      edx, [edx]
  mov      [esp + 12], edx
  movss    xmm0, csmul
  movdqu   xmm1, [esp]
  movdqu   xmm4, sva16

  // Second gather: dword at +4 of every colour (its low word is w2). eax is consumed last.
  mov      edx, [eax]
  mov      edx, [edx + 4]
  mov      [esp], edx
  mov      edx, [eax + 4]
  mov      edx, [edx + 4]
  mov      [esp + 4], edx
  mov      edx, [eax + 8]
  mov      eax, [eax + 12]
  mov      edx, [edx + 4]
  mov      eax, [eax + 4]
  mov      [esp + 8], edx
  mov      [esp + 12], eax

  movdqa   xmm2, xmm1
  movdqu   xmm3, [esp]
  psrldq   xmm2, 2                   // bring w1 to the bottom of each dword lane
  shufps   xmm0, xmm0, 0             // broadcast csmul
  andps    xmm1, xmm4                // w0
  andps    xmm2, xmm4                // w1
  andps    xmm3, xmm4                // w2
  mulps    xmm5, xmm0                // weights * csmul
  cvtdq2ps xmm1, xmm1
  cvtdq2ps xmm2, xmm2
  cvtdq2ps xmm3, xmm3
  mulps    xmm1, xmm5
  mulps    xmm2, xmm5
  mulps    xmm3, xmm5

  // Horizontal add; xmm4 and xmm0 are reused as scratch.
  movlhps  xmm4, xmm1                // xmm4 high = lanes 0,1 of xmm1
  movhlps  xmm4, xmm3                // xmm4 low  = lanes 2,3 of xmm3
  shufps   xmm3, xmm1, $E4           // xmm3 = lanes 0,1 of xmm3 | lanes 2,3 of xmm1
  movhlps  xmm0, xmm2                // xmm0 low  = lanes 2,3 of xmm2
  addps    xmm4, xmm3                // pairwise partial sums: w2 in low half, w0 in high half
  addps    xmm0, xmm2                // pairwise partial sums of w1 in the low half
  pshufd   xmm5, xmm4, $B1           // swap neighbours
  pshufd   xmm2, xmm0, $B1
  addps    xmm5, xmm4                // [sum w2, sum w2, sum w0, sum w0]
  addss    xmm2, xmm0                // low lane = sum w1
  movups   [ecx], xmm5
  movss    [ecx + 4], xmm2           // replace element 1 with the w1 sum
  add      esp, 16
end;
function ColToSVecFlipRBc4sqr(c: T4Cardinal): T4SVec;
// Converts four packed 8-bit-per-channel colours into four vectors of Singles, squaring
// each channel and scaling it by s1d255, with the channels stored in reversed byte order.
//   eax = c        four pointers; each is dereferenced to reach one 4-byte colour
//   edx = @Result  four 16-byte vectors; vector i belongs to colour i
// Vector i receives [b2^2 * k, b1^2 * k, b0^2 * k, 0] where bN is byte N of colour i and
// k = s1d255 (Single constant declared elsewhere; the name suggests 1/255 - confirm).
// Every channel is fetched with a byte-wide movzx, so nothing outside the 4-byte colour is
// read (a dword load at offset +1 or +2 followed by 'and $FF' touches bytes past its end).
// ebx, esi and edi are preserved; eax and ecx are overwritten. The x87 stack is left empty.
asm
  push  ebx
  push  esi
  push  edi
  sub   esp, 16                      // four dword slots: fild needs a memory operand

  // ---- byte 2 of every colour goes to component 0 (offsets 0, 16, 32, 48) ----
  mov   ebx, [eax]
  mov   ecx, [eax + 4]
  mov   esi, [eax + 8]
  mov   edi, [eax + 12]
  movzx ebx, byte ptr [ebx + 2]
  movzx ecx, byte ptr [ecx + 2]
  movzx esi, byte ptr [esi + 2]
  movzx edi, byte ptr [edi + 2]
  fld   s1d255                       // k stays at the bottom of the x87 stack until the end
  mov   [esp], ebx
  mov   [esp + 4], ecx
  mov   [esp + 8], esi
  mov   [esp + 12], edi
  fild  dword [esp]
  fild  dword [esp + 4]
  fild  dword [esp + 8]
  fild  dword [esp + 12]             // st0 = colour 3 ... st3 = colour 0, st4 = k
  fmul  st, st(0)                    // square
  fmul  st, st(4)                    // * k
  fstp  dword [edx + 48]
  fmul  st, st(0)
  fmul  st, st(3)                    // k moves one slot closer after every pop
  fstp  dword [edx + 32]
  fmul  st, st(0)
  fmul  st, st(2)
  fstp  dword [edx + 16]
  fmul  st, st(0)
  fmul  st, st(1)
  fstp  dword [edx]

  // ---- byte 1 goes to component 1 (offsets 4, 20, 36, 52) ----
  mov   ebx, [eax]
  mov   ecx, [eax + 4]
  mov   esi, [eax + 8]
  mov   edi, [eax + 12]
  movzx ebx, byte ptr [ebx + 1]
  movzx ecx, byte ptr [ecx + 1]
  movzx esi, byte ptr [esi + 1]
  movzx edi, byte ptr [edi + 1]
  mov   [esp], ebx
  mov   [esp + 4], ecx
  mov   [esp + 8], esi
  mov   [esp + 12], edi
  fild  dword [esp]
  fild  dword [esp + 4]
  fild  dword [esp + 8]
  fild  dword [esp + 12]
  fmul  st, st(0)
  fmul  st, st(4)
  fstp  dword [edx + 52]
  fmul  st, st(0)
  fmul  st, st(3)
  fstp  dword [edx + 36]
  fmul  st, st(0)
  fmul  st, st(2)
  fstp  dword [edx + 20]
  fmul  st, st(0)
  fmul  st, st(1)
  fstp  dword [edx + 4]

  // ---- byte 0 goes to component 2 (offsets 8, 24, 40, 56); component 3 is zeroed ----
  mov   ebx, [eax]
  mov   ecx, [eax + 4]
  mov   esi, [eax + 8]
  mov   edi, [eax + 12]
  movzx ebx, byte ptr [ebx]
  movzx ecx, byte ptr [ecx]
  movzx esi, byte ptr [esi]
  movzx edi, byte ptr [edi]
  xor   eax, eax                     // c is no longer needed; eax = bit pattern of 0.0
  mov   [esp], ebx
  mov   [esp + 4], ecx
  mov   [esp + 8], esi
  mov   [esp + 12], edi
  fild  dword [esp]
  fild  dword [esp + 4]
  fild  dword [esp + 8]
  fild  dword [esp + 12]
  mov   [edx + 12], eax
  mov   [edx + 28], eax
  mov   [edx + 44], eax
  mov   [edx + 60], eax
  fmul  st, st(0)
  fmul  st, st(4)
  fstp  dword [edx + 56]
  fmul  st, st(0)
  fmul  st, st(3)
  fstp  dword [edx + 40]
  fmul  st, st(0)
  fmul  st, st(2)
  fstp  dword [edx + 24]
  fmul  st, st(0)
  fmulp st(1), st                    // last use of k: multiply and pop it in one step
  fstp  dword [edx + 8]

  add   esp, 16
  pop   edi
  pop   esi
  pop   ebx
end;
function ColToSVecFlipRBc4sqr16(c: T4Cardinal): T4SVec;
// Converts four 16-bit-per-channel colours into four vectors of Singles, squaring each
// channel and scaling it by cdmul, with the channels stored in reversed order.
//   eax = c        four pointers; each is dereferenced to reach one colour
//   edx = @Result  four 16-byte vectors; vector i belongs to colour i
// Vector i receives [w2^2 * cdmul, w1^2 * cdmul, w0^2 * cdmul, 0] where w0 / w1 / w2 are
// the 16-bit values at byte offsets 0 / 2 / 4 of colour i.
// Every channel is fetched with a word-wide movzx, so only the channel itself is read
// (a dword load at offset +4 followed by 'and $FFFF' also touches the bytes at +6 and +7).
// ebx, esi and edi are preserved; eax and ecx are overwritten. The x87 stack is left empty.
const cdmul: Double = 255.0 / (65535.0 * 65535.0);
asm
  push  ebx
  push  esi
  push  edi
  sub   esp, 16                      // four dword slots for fild

  // ---- word at +4 of every colour goes to component 0 (offsets 0, 16, 32, 48) ----
  mov   ebx, [eax]
  mov   ecx, [eax + 4]
  mov   esi, [eax + 8]
  mov   edi, [eax + 12]
  movzx ebx, word ptr [ebx + 4]
  movzx ecx, word ptr [ecx + 4]
  movzx esi, word ptr [esi + 4]
  movzx edi, word ptr [edi + 4]
  fld   cdmul                        // stays at the bottom of the x87 stack until the end
  mov   [esp], ebx
  mov   [esp + 4], ecx
  mov   [esp + 8], esi
  mov   [esp + 12], edi
  // fild treats its operand as signed, so the zero-extended dword is loaded rather than
  // the 16-bit value itself (values of $8000 and above would otherwise turn negative).
  fild  dword [esp]
  fild  dword [esp + 4]
  fild  dword [esp + 8]
  fild  dword [esp + 12]             // st0 = colour 3 ... st3 = colour 0, st4 = cdmul
  fmul  st, st(0)                    // square
  fmul  st, st(4)                    // * cdmul
  fstp  dword [edx + 48]
  fmul  st, st(0)
  fmul  st, st(3)                    // cdmul moves one slot closer after every pop
  fstp  dword [edx + 32]
  fmul  st, st(0)
  fmul  st, st(2)
  fstp  dword [edx + 16]
  fmul  st, st(0)
  fmul  st, st(1)
  fstp  dword [edx]

  // ---- word at +2 goes to component 1 (offsets 4, 20, 36, 52) ----
  mov   ebx, [eax]
  mov   ecx, [eax + 4]
  mov   esi, [eax + 8]
  mov   edi, [eax + 12]
  movzx ebx, word ptr [ebx + 2]
  movzx ecx, word ptr [ecx + 2]
  movzx esi, word ptr [esi + 2]
  movzx edi, word ptr [edi + 2]
  mov   [esp], ebx
  mov   [esp + 4], ecx
  mov   [esp + 8], esi
  mov   [esp + 12], edi
  fild  dword [esp]
  fild  dword [esp + 4]
  fild  dword [esp + 8]
  fild  dword [esp + 12]
  fmul  st, st(0)
  fmul  st, st(4)
  fstp  dword [edx + 52]
  fmul  st, st(0)
  fmul  st, st(3)
  fstp  dword [edx + 36]
  fmul  st, st(0)
  fmul  st, st(2)
  fstp  dword [edx + 20]
  fmul  st, st(0)
  fmul  st, st(1)
  fstp  dword [edx + 4]

  // ---- word at +0 goes to component 2 (offsets 8, 24, 40, 56); component 3 is zeroed ----
  mov   ebx, [eax]
  mov   ecx, [eax + 4]
  mov   esi, [eax + 8]
  mov   edi, [eax + 12]
  movzx ebx, word ptr [ebx]
  movzx ecx, word ptr [ecx]
  movzx esi, word ptr [esi]
  movzx edi, word ptr [edi]
  xor   eax, eax                     // c is no longer needed; eax = bit pattern of 0.0
  mov   [esp], ebx
  mov   [esp + 4], ecx
  mov   [esp + 8], esi
  mov   [esp + 12], edi
  fild  dword [esp]
  fild  dword [esp + 4]
  fild  dword [esp + 8]
  fild  dword [esp + 12]
  mov   [edx + 12], eax
  mov   [edx + 28], eax
  mov   [edx + 44], eax
  mov   [edx + 60], eax
  fmul  st, st(0)
  fmul  st, st(4)
  fstp  dword [edx + 56]
  fmul  st, st(0)
  fmul  st, st(3)
  fstp  dword [edx + 40]
  fmul  st, st(0)
  fmul  st, st(2)
  fstp  dword [edx + 24]
  fmul  st, st(0)
  fmulp st(1), st                    // last use of cdmul: multiply and pop it in one step
  fstp  dword [edx + 8]

  add   esp, 16
  pop   edi
  pop   esi
  pop   ebx
end;
function ColToSVecFlipRBc416(c: T4Cardinal): T4SVec;
// Converts four 16-bit-per-channel colours into four vectors of Singles, scaling each
// channel by d1d256, with the channels stored in reversed order.
//   eax = c        four pointers; each is dereferenced to reach one colour
//   edx = @Result  four 16-byte vectors; vector i belongs to colour i
// Vector i receives [w2 * k, w1 * k, w0 * k, 0] where w0 / w1 / w2 are the 16-bit values
// at byte offsets 0 / 2 / 4 of colour i and k = d1d256 (constant declared elsewhere; the
// name suggests 1/256 - confirm).
// Every channel is fetched with a word-wide movzx, so only the channel itself is read
// (a dword load at offset +4 followed by 'and $FFFF' also touches the bytes at +6 and +7).
// ebx, esi and edi are preserved; eax and ecx are overwritten. The x87 stack is left empty.
asm
  push  ebx
  push  esi
  push  edi
  sub   esp, 16                      // four dword slots for fild

  // ---- word at +4 of every colour goes to component 0 (offsets 0, 16, 32, 48) ----
  mov   ebx, [eax]
  mov   ecx, [eax + 4]
  mov   esi, [eax + 8]
  mov   edi, [eax + 12]
  movzx ebx, word ptr [ebx + 4]
  movzx ecx, word ptr [ecx + 4]
  movzx esi, word ptr [esi + 4]
  movzx edi, word ptr [edi + 4]
  fld   d1d256                       // k stays at the bottom of the x87 stack until the end
  mov   [esp], ebx
  mov   [esp + 4], ecx
  mov   [esp + 8], esi
  mov   [esp + 12], edi
  // fild treats its operand as signed, so the zero-extended dword is loaded rather than
  // the 16-bit value itself (values of $8000 and above would otherwise turn negative).
  fild  dword [esp]
  fild  dword [esp + 4]
  fild  dword [esp + 8]
  fild  dword [esp + 12]             // st0 = colour 3 ... st3 = colour 0, st4 = k
  fmul  st, st(4)
  fstp  dword [edx + 48]
  fmul  st, st(3)                    // k moves one slot closer after every pop
  fstp  dword [edx + 32]
  fmul  st, st(2)
  fstp  dword [edx + 16]
  fmul  st, st(1)
  fstp  dword [edx]

  // ---- word at +2 goes to component 1 (offsets 4, 20, 36, 52) ----
  mov   ebx, [eax]
  mov   ecx, [eax + 4]
  mov   esi, [eax + 8]
  mov   edi, [eax + 12]
  movzx ebx, word ptr [ebx + 2]
  movzx ecx, word ptr [ecx + 2]
  movzx esi, word ptr [esi + 2]
  movzx edi, word ptr [edi + 2]
  mov   [esp], ebx
  mov   [esp + 4], ecx
  mov   [esp + 8], esi
  mov   [esp + 12], edi
  fild  dword [esp]
  fild  dword [esp + 4]
  fild  dword [esp + 8]
  fild  dword [esp + 12]
  fmul  st, st(4)
  fstp  dword [edx + 52]
  fmul  st, st(3)
  fstp  dword [edx + 36]
  fmul  st, st(2)
  fstp  dword [edx + 20]
  fmul  st, st(1)
  fstp  dword [edx + 4]

  // ---- word at +0 goes to component 2 (offsets 8, 24, 40, 56); component 3 is zeroed ----
  mov   ebx, [eax]
  mov   ecx, [eax + 4]
  mov   esi, [eax + 8]
  mov   edi, [eax + 12]
  movzx ebx, word ptr [ebx]
  movzx ecx, word ptr [ecx]
  movzx esi, word ptr [esi]
  movzx edi, word ptr [edi]
  xor   eax, eax                     // c is no longer needed; eax = bit pattern of 0.0
  mov   [esp], ebx
  mov   [esp + 4], ecx
  mov   [esp + 8], esi
  mov   [esp + 12], edi
  fild  dword [esp]
  fild  dword [esp + 4]
  fild  dword [esp + 8]
  fild  dword [esp + 12]
  mov   [edx + 12], eax
  mov   [edx + 28], eax
  mov   [edx + 44], eax
  mov   [edx + 60], eax
  fmul  st, st(4)
  fstp  dword [edx + 56]
  fmul  st, st(3)
  fstp  dword [edx + 40]
  fmul  st, st(2)
  fstp  dword [edx + 24]
  fmulp st(1), st                    // last use of k: multiply and pop it in one step
  fstp  dword [edx + 8]

  add   esp, 16
  pop   edi
  pop   esi
  pop   ebx
end;
// HybridCustomIFStest: takes no Pascal parameters; it reads esi and edi as left by its | |
// caller and addresses all data relative to them. | |
// Part 1 (always): | |
//   [esi - 32] := dot((x,y,z) at [esi - 120 .. -104], vector at [edi - 32 .. -16]) - [edi - 40] | |
//   When the value at [edi - 68] is zero, the result is first ANDed with the 16 bytes at | |
//   [edi] (presumably a sign-clearing mask, i.e. abs() - confirm). | |
//   edi has to be 16-byte aligned: mulpd / andpd take [edi - 32] and [edi] as memory operands. | |
// Part 2 (only when the dword at [edi - 52] is non-zero; original note: 'otrap coloring'): | |
//   builds a unit vector vo perpendicular to n = (nx,ny,nz) and the cross product n x vo, | |
//   projects (x,y,z) onto both, scales each projection by [edi - 48] and stores the two | |
//   doubles at [esp] and [esp + 8]; calls the routine at [esi + 268] with eax = ecx = esp; | |
//   then stores (double number ([edi - 56] and 3) of the stack area + [edi - 60]) * [edi - 64] | |
//   into [esi + 128]. | |
// edx is always overwritten; eax is overwritten in part 2; ecx is saved around part 2. | |
// The x87 stack is empty again before the call and at exit. | |
// NOTE(review): 'cmp [edi - 68], 0' carries no size specifier - check which width BASM picks. | |
procedure HybridCustomIFStest; | |
asm | |
movupd xmm0, [esi - 120] //x,y | |
movsd xmm1, [esi - 104] //z | |
mulpd xmm0, [edi - 32] | |
mulsd xmm1, [edi - 16] | |
addsd xmm1, xmm0 | |
unpckhpd xmm0, xmm0 | |
addsd xmm1, xmm0 | |
subsd xmm1, [edi - 40] | |
cmp [edi - 68], 0 | |
jne @up | |
andpd xmm1, [edi] | |
@up: movsd [esi - 32], xmm1 //Rout: Double; //+56 | |
mov edx, [edi - 52] | |
test edx, edx | |
jz @out | |
push ecx //otrap coloring | |
// 32 bytes of stack: room for four doubles, indexed with ecx * 8 after the call. | |
add esp, -32 | |
fld qword [edi - 16] | |
fld qword [edi - 24] | |
fld qword [edi - 32] //nx,ny,nz | |
// Choose the construction of vo: the jump to @@1 is taken when Abs(nx) is not above s011 | |
// (C0 or C3 set after the compare; s011 is a constant declared elsewhere). | |
fld st //makeorthovecs | |
fabs | |
fcomp s011 | |
fnstsw ax | |
and ah, 41H | |
jnz @@1 | |
// Abs(nx) above s011: vo = (nz, 0, -nx) / Sqrt(nz*nz + nx*nx) | |
fld st(2) | |
fmul st, st | |
fld st(1) | |
fmul st, st | |
faddp | |
fsqrt | |
fld1 | |
fdivrp //1/Sqrt(rr) | |
fldz | |
fld st(4) | |
fmul st, st(2) | |
fld st(3) | |
fchs | |
fmulp st(2), st //vo[0],0,vo[2],nx,ny,nz | |
jmp @@2 | |
@@1: | |
// Otherwise: vo = (0, -nz, ny) / Sqrt(nz*nz + ny*ny) | |
fld st(2) | |
fmul st, st | |
fld st(2) | |
fmul st, st | |
faddp | |
fsqrt | |
fld1 | |
fdivrp //1/Sqrt(rr) | |
fld st(3) | |
fchs | |
fmul st, st(1) | |
fld st(3) | |
fmulp st(2), st //0,vo[1],vo[2],nx,ny,nz | |
fldz | |
@@2: | |
// x87 stack here: vo[0], vo[1], vo[2], nx, ny, nz. | |
// First output: dot(vo, (x,y,z)) * [edi - 48], stored at [esp]. | |
fld st | |
fmul qword [esi - 120] //x | |
fld st(2) | |
fmul qword [esi - 112] //y | |
faddp | |
fld st(3) | |
fmul qword [esi - 104] //z | |
faddp | |
fmul qword [edi - 48] | |
fstp qword [esp] | |
// Second output: r = n x vo (r0 = ny*vo[2] - nz*vo[1], r1 = nz*vo[0] - nx*vo[2], | |
// r2 = nx*vo[1] - ny*vo[0]), then dot(r, (x,y,z)) * [edi - 48], stored at [esp + 8]. | |
// The sequence consumes vo and n while it runs. | |
fld st(5) | |
fmul st, st(2) | |
fld st(5) | |
fmul st, st(4) | |
fsubrp //r0,vo[0],vo[1],vo[2],nx,ny,nz | |
fxch | |
fmul st(6), st //vo[0],r0,vo[1],vo[2],nx,ny,nz*vo[0] | |
fxch st(4) | |
fmul st(3), st //nx,r0,vo[1],vo[2]*nx,vo[0],ny,nz*vo[0] | |
fmulp st(2), st //r0, vo[1]*nx, vo[2]*nx, vo[0], ny, nz*vo[0] | |
fxch st(4) //ny, vo[1]*nx, vo[2]*nx, vo[0], r0, nz*vo[0] | |
fmulp st(3), st //vo[1]*nx, vo[2]*nx, vo[0]*ny, r0, nz*vo[0] | |
fsubrp st(2), st //vo[2]*nx, vo[1]*nx-vo[0]*ny=r2, r0, nz*vo[0] | |
fsubp st(3), st //r2, r0, nz*vo[0] - vo[2]*nx = r1 | |
fmul qword [esi - 104] //z | |
fxch | |
fmul qword [esi - 120] //x | |
faddp | |
fxch | |
fmul qword [esi - 112] //y | |
faddp | |
fmul qword [edi - 48] | |
fstp qword [esp + 8] | |
// Hand the stack area to the routine at [esi + 268]; both eax and ecx point at it. | |
// NOTE(review): the read below may index up to [esp + 24], so the callee presumably | |
// fills all four doubles - confirm against the callee. | |
mov eax, esp | |
mov ecx, esp | |
call [esi + 268] //+356 - 88 = 268 | |
mov ecx, [edi - 56] | |
and ecx, 3 | |
fld qword [esp + ecx * 8] //col of map | |
fadd dword [edi - 60] | |
fmul dword [edi - 64] | |
fstp qword [esi + 128] | |
add esp, 32 | |
pop ecx | |
@out: | |
end; | |
procedure ipow2(var x, y: Double);
// In-place complex square of (x + i*y):  x := x*x - y*y,  y := 2*x*y.
//   eax = @x, edx = @y. Both inputs are loaded before either result is stored.
// Uses three x87 registers and leaves the x87 stack as it found it.
asm
  fld   qword [edx]         // y
  fld   qword [eax]         // x, y
  fld   st(0)               // x, x, y
  fmul  st(0), st(2)        // x*y, x, y
  fadd  st(0), st(0)        // 2*x*y, x, y
  fstp  qword [edx]         // y := 2*x*y          stack: x, y
  fmul  st(0), st(0)        // x*x, y
  fxch                      // y, x*x
  fmul  st(0), st(0)        // y*y, x*x
  fsubp st(1), st(0)        // x*x - y*y
  fstp  qword [eax]         // x := x*x - y*y
end;
procedure ComplexSqr(var xy: TComplex);
// In-place complex square of the pair of doubles at xy:
//   first double (re) := re*re - im*im,  second double (im, at +8) := 2*re*im.
//   eax = @xy. Both parts are loaded before either result is stored.
// Uses three x87 registers and leaves the x87 stack as it found it.
asm
  fld   qword [eax + 8]     // im
  fld   qword [eax]         // re, im
  fld   st(0)               // re, re, im
  fmul  st(0), st(2)        // re*im, re, im
  fadd  st(0), st(0)        // 2*re*im, re, im
  fstp  qword [eax + 8]     // im := 2*re*im       stack: re, im
  fmul  st(0), st(0)        // re*re, im
  fxch                      // im, re*re
  fmul  st(0), st(0)        // im*im, re*re
  fsubp st(1), st(0)        // re*re - im*im
  fstp  qword [eax]         // re := re*re - im*im
end;
// doInterpolHybridSSE2: iteration loop that runs two formula callbacks from the same point | |
// on every pass and blends their results. | |
//   eax = PIteration3D on entry; it is copied to edi, and esi = eax + 256. | |
// Per pass: | |
//   1. Rold := Rout; the current x,y,z,w ([edi - 32 .. -8]) is saved in a stack buffer. | |
//   2. The callback at [edi + 124] is called (eax/edx/ecx = @x/@y/@z, stack = @w and edi). | |
//   3. Its result is moved to the buffer, x,y,z,w is restored, and the callback at | |
//      [edi + 128] is called the same way. | |
//   4. With s1, s2 = the two singles at [edi + 76] (widened to double in xmm7): | |
//        XX  := s1 * length(result 1) + s2 * length(result 2)      (lengths over x,y,z) | |
//        v   := s1 * result 1 + s2 * result 2                      (all four components) | |
//        x,y,z := v.xyz * XX / Sqrt(v.x^2 + v.y^2 + v.z^2 + d1em40);  w := v.w (not rescaled) | |
//        Rout := XX * XX;  ItResultI is incremented;  OTrap := Min(Rout, OTrap) | |
//   The loop repeats while ItResultI is below [edi + 68] and Rout is below RStop. | |
// Afterwards CalcSmoothIterations(edi, 0) is called when the byte at [esi - 108] is non-zero. | |
// Stack: 72 bytes; [esp] holds a pointer to a 16-byte-aligned 32-byte buffer inside them | |
// (needed because movapd / mulpd access it). All six pushed registers are restored. | |
// The code reads [esp] straight after each callback, so the callbacks are expected to | |
// remove their two stack arguments themselves. | |
// NOTE(review): xmm7 (the weights) is not reloaded inside the loop, so both callbacks are | |
// assumed to leave xmm7 untouched - confirm for every formula that can be plugged in. | |
procedure doInterpolHybridSSE2(PIteration3D: TPIteration3D); // new ext version | |
asm | |
push eax | |
push ebx | |
push ecx | |
push edx | |
push esi | |
push edi //x = edi-32 y = edi-24 .. Rold = edi - 48, Rstop = edi - 40, (i = edi + 212 = btmp = esi - 44) | |
add esp, -72 | |
mov edi, eax //was: Rold = esp, Rstop = esp + 8, aligned16: esp + 16, X1 = a16 X2 = a16+8.. Y1 = a16+32 .. | |
lea esi, eax + 256 | |
// (esp + 35) and not 15 lies between esp + 20 and esp + 35, so the 32-byte buffer stays | |
// inside the 72 reserved bytes and clear of the pointer slot at [esp]. | |
mov eax, esp | |
add eax, 35 | |
and eax, $FFFFFFF0 | |
mov [esp], eax // aligned 16 Ybuf aligned16: esp, X1 = a16.. = Y1 = (aligned) | |
cvtps2pd xmm7, [edi + 76] //nHybrid[0] +76 weights in double for s1,s2 (lo,hi part) | |
// Start point: x,y,z := the three doubles at [edi]; w := 0 (movsd clears the upper half). | |
movupd xmm0, [edi] | |
movsd xmm1, [edi + 16] | |
movupd [edi - 32], xmm0 //xyz=C | |
movupd [edi - 16], xmm1 | |
cmp dword [esi - 104], 0 //DoJulia:+152 | |
jz @sjup | |
movupd xmm2, [esi + 64] | |
movsd xmm3, [esi + 80] | |
movupd [edi + 24], xmm2 //J=Ju | |
movsd [edi + 40], xmm3 | |
jmp @skipIfJulia | |
@sjup: | |
movupd [edi + 24], xmm0 //J=C | |
movsd [edi + 40], xmm1 | |
@skipIfJulia: | |
// Initial Rout = OTrap = x*x + y*y + z*z; RStop is widened from the single at [edi + 72]. | |
mulpd xmm0, xmm0 | |
mulsd xmm1, xmm1 | |
CVTSS2SD xmm5, [edi + 72] //RStop in double | |
addsd xmm1, xmm0 | |
unpckhpd xmm0, xmm0 | |
movsd [edi - 40], xmm5 | |
addsd xmm1, xmm0 | |
xor ebx, ebx | |
movsd [esi - 64], xmm1 //OTrap=Rout | |
movsd [edi + 56], xmm1 //Rout | |
mov [esi - 48], ebx //bFirstIt := 0; +208 | |
mov [edi + 64], ebx //ItresultI :=0 +64 | |
@Repeat: | |
// Step 1: remember Rout and the current point, select the first parameter block. | |
movsd xmm2, [edi + 56] | |
mov ebx, [edi + 100] //fHPVar[0] +100 | |
mov eax, [esp] | |
mov [edi + 48], ebx //PVars: +48 | |
movsd [edi - 48], xmm2 //Rold := Rout | |
movupd xmm0, [edi - 32] //Y:=xyz | |
movupd xmm1, [edi - 16] | |
movapd [eax], xmm0 | |
movapd [eax + 16], xmm1 | |
// Step 2: first callback. | |
lea eax, edi - 32 // x | |
lea edx, edi - 24 // y | |
lea ecx, edi - 16 // z | |
lea ebx, edi - 8 // w | |
push ebx | |
push edi | |
call [edi + 124] //fHybrid[0] of ThybridIteration2 | |
// Step 3: swap - the buffer receives result 1, x,y,z,w gets the saved point back. | |
mov eax, [esp] | |
movupd xmm0, [edi - 32] // mCopyVec4(@x1, @x); | |
movupd xmm1, [edi - 16] // mCopyVec4(@x, @Y1); | |
movapd xmm2, [eax] | |
movapd xmm3, [eax + 16] | |
movapd [eax], xmm0 | |
movapd [eax + 16], xmm1 | |
movupd [edi - 32], xmm2 //xyz=Y1 | |
movupd [edi - 16], xmm3 | |
mov ebx, [edi + 104] //fHPVar[1] | |
mov [edi + 48], ebx //PVars: +48 | |
lea eax, edi - 32 // x | |
lea edx, edi - 24 // y | |
lea ecx, edi - 16 // z | |
lea ebx, edi - 8 // w | |
push ebx | |
push edi | |
call [edi + 128] //fHybrid[1] of ThybridIteration2 | |
// Step 4: blend. Here [edi - 32 ..] holds result 2 and the buffer at eax holds result 1. | |
mov eax, [esp] | |
movupd xmm0, [edi - 32] //x,y was: y1 | |
movapd xmm2, [eax] //x[0,1] | |
movupd xmm1, [edi - 16] //z,w | |
movapd xmm3, [eax + 16] //x[2,3] | |
movapd xmm5, xmm0 //x,y | |
movapd xmm6, xmm2 //x[0,1] | |
mulpd xmm0, xmm0 //x²,y² | |
mulpd xmm2, xmm2 //x[0]²,x[1]² | |
mulsd xmm1, xmm1 //z²,w | |
mulsd xmm3, xmm3 //x[2]² | |
addsd xmm1, xmm0 //z²+x² | |
addsd xmm3, xmm2 //x[2]²+x[0]² | |
unpckhpd xmm0, xmm0 //y² | |
unpckhpd xmm2, xmm2 //x[1]² | |
addsd xmm1, xmm0 //x²+y²+z² | |
addsd xmm3, xmm2 //x[0]²+x[1]²+x[2]² | |
unpcklpd xmm3, xmm1 //x[0]²+x[1]²+x[2]²,x²+y²+z² | |
sqrtpd xmm0, xmm3 //xx,yy | |
mulpd xmm0, xmm7 //xx*s1,yy*s2 | |
pshufd xmm2, xmm0, $4E | |
addsd xmm0, xmm2 //XX = xx*s1+yy*s2 | |
pshufd xmm3, xmm7, $4E //wy | |
movsd xmm2, xmm7 //wx | |
unpcklpd xmm3, xmm3 //s2,s2 | |
unpcklpd xmm2, xmm2 //s1,s1 | |
movupd xmm1, [edi - 16] //z,w | |
mulpd xmm5, xmm3 //x,y *s2 | |
mulpd xmm6, xmm2 //x[0,1] *s1 | |
mulpd xmm3, xmm1 //z,w *s2 | |
mulpd xmm2, [eax + 16] //x[2,3] *s1 | |
addpd xmm5, xmm6 //x,y | |
addpd xmm3, xmm2 //z,w | |
// Rescale the blended x,y,z to length XX; d1em40 (declared elsewhere) guards the division. | |
movapd xmm4, xmm5 //x,y | |
movsd xmm2, xmm3 //z | |
mulpd xmm4, xmm4 //x²,y² | |
mulsd xmm2, xmm2 //z² 4D: mulpd | |
addsd xmm2, xmm4 //z²+x² 4D: addpd ... | |
unpckhpd xmm4, xmm4 //y² | |
addsd xmm4, xmm2 //x²+y²+z² | |
addsd xmm4, d1em40 | |
sqrtsd xmm4, xmm4 | |
movsd xmm2, xmm0 //XX | |
divsd xmm2, xmm4 //YY := XX / Sqrt(x * x + y * y + z * z + 1e-40); | |
unpcklpd xmm2, xmm2 //YY,YY | |
mulpd xmm5, xmm2 | |
// mulsd scales only z; the blended w in the upper half is stored unchanged. | |
mulsd xmm3, xmm2 | |
movupd [edi - 32], xmm5 //x,y | |
movupd [edi - 16], xmm3 //z,w | |
mulsd xmm0, xmm0 | |
movsd [edi + 56], xmm0 //Rout := XX * XX; | |
movsd xmm1, xmm0 | |
inc dword [edi + 64] //Inc(ItResultI) | |
minsd xmm0, [esi - 64] | |
movsd [esi - 64], xmm0 //OTrap := Min(Rout, OTrap); | |
// Leave when the iteration limit is reached (signed compare) or Rout is not below RStop. | |
mov eax, [edi + 64] | |
cmp eax, [edi + 68] //maxIt: +68 | |
jnl @out | |
comisd xmm1, [edi - 40] //RStop | |
jc @Repeat | |
@out: | |
cmp byte [esi - 108], 0 //CalcSIT: +148 | |
jz @NoCalcSITout | |
mov eax, edi | |
xor edx, edx | |
call CalcSmoothIterations //(PIt3D: TPIteration3D; n: Integer); | |
@NoCalcSITout: | |
add esp, 72 | |
pop edi | |
pop esi | |
pop edx | |
pop ecx | |
pop ebx | |
pop eax | |
end; | |
// doInterpolHybridDESSE2: same blended two-callback iteration as the non-DE variant, plus a | |
// distance-estimate result that is returned on the x87 stack (function result: Double). | |
//   eax = PIteration3D on entry; it is copied to edi, and esi = eax + 256. | |
// Before the loop: w := Rout when (DEoption and $18) = 16, otherwise w := 1. | |
// Per pass: | |
//   1. Rold := Rout; the current x,y,z,w ([edi - 32 .. -8]) is saved in a stack buffer. | |
//   2. The callback at [edi + 124] is called (eax/edx/ecx = @x/@y/@z, stack = @w and edi). | |
//   3. Its result is moved to the buffer, x,y,z,w is restored, and the callback at | |
//      [edi + 128] is called the same way. | |
//   4. With s1, s2 = the two singles at [edi + 76] (widened to double in xmm7): | |
//        XX  := s1 * length(result 1) + s2 * length(result 2)      (lengths over x,y,z) | |
//        v   := s1 * result 1 + s2 * result 2                      (all four components) | |
//        x,y,z := v.xyz * XX / Sqrt(v.x^2 + v.y^2 + v.z^2 + d1em40);  w := v.w (not rescaled) | |
//        Rout := XX * XX;  ItResultI is incremented;  OTrap := Min(Rout, OTrap) | |
//   The loop repeats while ItResultI is below [edi + 68] and Rout is below RStop. | |
// Result, selected by (DEoption and 7): | |
//   4    : Abs(z) * Ln(Abs(z)) / w | |
//   7    : Sqrt(Rout / RStop) * Power(double at [PVars - 16], -ItResultI) | |
//   else : Sqrt(Rout) / Abs(w) | |
// Afterwards CalcSmoothIterations(edi, 0) is called when the byte at [esi - 108] is non-zero; | |
// the result is still on the x87 stack during that call. | |
// Stack: 72 bytes; [esp] holds a pointer to a 16-byte-aligned 32-byte buffer inside them. | |
// All six pushed registers are restored. The code reads [esp] straight after each callback, | |
// so the callbacks are expected to remove their two stack arguments themselves. | |
// NOTE(review): xmm7 (the weights) is not reloaded inside the loop, so both callbacks are | |
// assumed to leave xmm7 untouched - confirm for every formula that can be plugged in. | |
function doInterpolHybridDESSE2(PIteration3D: TPIteration3D): Double; // new ext version | |
asm | |
push eax | |
push ebx | |
push ecx | |
push edx | |
push esi | |
push edi //x = edi-32 y = edi-24 .. Rold = edi - 48, Rstop = edi - 40, i = edi + 212 = btmp = esi - 44 | |
add esp, -72 | |
mov edi, eax //was: Rold = esp, Rstop = esp + 8, aligned16: esp + 16, X1 = a16 X2 = a16+8.. Y1 = a16+32 .. | |
lea esi, eax + 256 | |
// (esp + 35) and not 15 lies between esp + 20 and esp + 35, so the 32-byte buffer stays | |
// inside the 72 reserved bytes and clear of the pointer slot at [esp]. | |
mov eax, esp | |
add eax, 35 | |
and eax, $FFFFFFF0 | |
mov [esp], eax // aligned 16 Ybuf aligned16: esp, X1 = a16.. = Y1 = (aligned) | |
cvtps2pd xmm7, [edi + 76] //nHybrid[0] +76 weights in double for s1,s2 (lo,hi part) | |
// Start point: x,y,z := the three doubles at [edi]; w := 0 for now (set properly below). | |
movupd xmm0, [edi] | |
movsd xmm1, [edi + 16] | |
movupd [edi - 32], xmm0 //xyz=C | |
movupd [edi - 16], xmm1 | |
cmp dword [esi - 104], 0 //DoJulia:+152 | |
jz @sjup | |
movupd xmm2, [esi + 64] | |
movsd xmm3, [esi + 80] | |
movupd [edi + 24], xmm2 //J=Ju | |
movsd [edi + 40], xmm3 | |
jmp @skipIfJulia | |
@sjup: | |
movupd [edi + 24], xmm0 //J=C | |
movsd [edi + 40], xmm1 | |
@skipIfJulia: | |
// Initial Rout = OTrap = x*x + y*y + z*z; RStop is widened from the single at [edi + 72]. | |
mulpd xmm0, xmm0 | |
mulsd xmm1, xmm1 | |
CVTSS2SD xmm5, [edi + 72] //RStop in double | |
addsd xmm1, xmm0 | |
unpckhpd xmm0, xmm0 | |
movsd [edi - 40], xmm5 | |
addsd xmm1, xmm0 | |
xor ebx, ebx | |
movsd [esi - 64], xmm1 //OTrap=Rout | |
movsd [edi + 56], xmm1 //Rout | |
mov [esi - 48], ebx //bFirstIt := 0; +208 | |
mov [edi + 64], ebx //ItresultI :=0 +64 | |
// Initial w, chosen by bits 3..4 of DEoption. | |
mov eax, [esi - 96] //DEoption +160 | |
and eax, $18 | |
sub eax, 16 | |
jnz @UU1 | |
fld qword [edi + 56] | |
jmp @UU2 | |
@UU1: | |
fld1 | |
@UU2: | |
fstp qword [edi - 8] // if (DEoption and $18) = 16 then w := Rout else w := 1; | |
@Repeat: | |
// Step 1: remember Rout and the current point, select the first parameter block. | |
movsd xmm2, [edi + 56] | |
mov ebx, [edi + 100] //fHPVar[0] +100 | |
mov eax, [esp] | |
mov [edi + 48], ebx //PVars: +48 | |
movsd [edi - 48], xmm2 //Rold := Rout | |
movupd xmm0, [edi - 32] //Y:=xyz | |
movupd xmm1, [edi - 16] | |
movapd [eax], xmm0 | |
movapd [eax + 16], xmm1 | |
// Step 2: first callback. | |
lea eax, edi - 32 // x | |
lea edx, edi - 24 // y | |
lea ecx, edi - 16 // z | |
lea ebx, edi - 8 // w | |
push ebx | |
push edi | |
call [edi + 124] //fHybrid[0] of ThybridIteration2 | |
// Step 3: swap - the buffer receives result 1, x,y,z,w gets the saved point back. | |
mov eax, [esp] | |
movupd xmm0, [edi - 32] // mCopyVec4(@x1, @x); | |
movupd xmm1, [edi - 16] // mCopyVec4(@x, @Y1); | |
movapd xmm2, [eax] | |
movapd xmm3, [eax + 16] | |
movapd [eax], xmm0 | |
movapd [eax + 16], xmm1 | |
movupd [edi - 32], xmm2 //xyz=Y1 | |
movupd [edi - 16], xmm3 | |
mov ebx, [edi + 104] //fHPVar[1] | |
mov [edi + 48], ebx //PVars: +48 | |
lea eax, edi - 32 // x | |
lea edx, edi - 24 // y | |
lea ecx, edi - 16 // z | |
lea ebx, edi - 8 // w | |
push ebx | |
push edi | |
call [edi + 128] //fHybrid[1] of ThybridIteration2 | |
// Step 4: blend. Here [edi - 32 ..] holds result 2 and the buffer at eax holds result 1. | |
mov eax, [esp] | |
movupd xmm0, [edi - 32] //x,y was: y1 | |
movapd xmm2, [eax] //x[0,1] | |
movupd xmm1, [edi - 16] //z,w | |
movapd xmm3, [eax + 16] //x[2,3] | |
movapd xmm5, xmm0 //x,y | |
movapd xmm6, xmm2 //x[0,1] | |
mulpd xmm0, xmm0 //x²,y² | |
mulpd xmm2, xmm2 //x[0]²,x[1]² | |
mulsd xmm1, xmm1 //z²,w | |
mulsd xmm3, xmm3 //x[2]² | |
addsd xmm1, xmm0 //z²+x² | |
addsd xmm3, xmm2 //x[2]²+x[0]² | |
unpckhpd xmm0, xmm0 //y² | |
unpckhpd xmm2, xmm2 //x[1]² | |
addsd xmm1, xmm0 //x²+y²+z² | |
addsd xmm3, xmm2 //x[0]²+x[1]²+x[2]² | |
unpcklpd xmm3, xmm1 //x[0]²+x[1]²+x[2]²,x²+y²+z² | |
sqrtpd xmm0, xmm3 //xx,yy | |
mulpd xmm0, xmm7 //xx*s1,yy*s2 | |
pshufd xmm2, xmm0, $4E | |
addsd xmm0, xmm2 //XX = xx*s1+yy*s2 | |
pshufd xmm3, xmm7, $4E //wy | |
movsd xmm2, xmm7 //wx | |
unpcklpd xmm3, xmm3 //s2,s2 | |
unpcklpd xmm2, xmm2 //s1,s1 | |
movupd xmm1, [edi - 16] //z,w | |
mulpd xmm5, xmm3 //x,y *s2 | |
mulpd xmm6, xmm2 //x[0,1] *s1 | |
mulpd xmm3, xmm1 //z,w *s2 | |
mulpd xmm2, [eax + 16] //x[2,3] *s1 | |
addpd xmm5, xmm6 //x,y | |
addpd xmm3, xmm2 //z,w | |
// Rescale the blended x,y,z to length XX; d1em40 (declared elsewhere) guards the division. | |
movapd xmm4, xmm5 //x,y | |
movsd xmm2, xmm3 //z | |
mulpd xmm4, xmm4 //x²,y² | |
mulsd xmm2, xmm2 //z² 4D: mulpd | |
addsd xmm2, xmm4 //z²+x² 4D: addpd ... | |
unpckhpd xmm4, xmm4 //y² | |
addsd xmm4, xmm2 //x²+y²+z² | |
addsd xmm4, d1em40 | |
sqrtsd xmm4, xmm4 | |
movsd xmm2, xmm0 //XX | |
divsd xmm2, xmm4 //YY := XX / Sqrt(x * x + y * y + z * z + 1e-40); | |
unpcklpd xmm2, xmm2 //YY,YY | |
mulpd xmm5, xmm2 | |
// mulsd scales only z; the blended w in the upper half is stored unchanged. | |
mulsd xmm3, xmm2 | |
movupd [edi - 32], xmm5 //x,y | |
movupd [edi - 16], xmm3 //z,w | |
mulsd xmm0, xmm0 | |
movsd [edi + 56], xmm0 //Rout := XX * XX; | |
movsd xmm1, xmm0 | |
inc dword [edi + 64] //Inc(ItResultI) | |
minsd xmm0, [esi - 64] | |
movsd [esi - 64], xmm0 //OTrap := Min(Rout, OTrap); | |
// Leave when the iteration limit is reached (signed compare) or Rout is not below RStop. | |
mov eax, [edi + 64] | |
cmp eax, [edi + 68] //maxIt: +68 | |
jnl @out | |
comisd xmm1, [edi - 40] //RStop | |
jc @Repeat | |
@out: | |
// Result selection on the low three bits of DEoption; the value is left in st(0). | |
mov eax, [esi - 96] //DEoption +160 | |
and eax, 7 | |
sub eax, 4 | |
jnz @UU3 //Result := Abs(z) * Ln(Abs(z)) / w; | |
fld qword [edi - 16] | |
fabs | |
// fldln2 + fyl2x gives ln2 * log2(Abs(z)) = Ln(Abs(z)). | |
fldln2 | |
fld st(1) | |
fyl2x | |
fmulp | |
fdiv qword [edi - 8] //Result | |
jmp @UU6 | |
@UU3: | |
// eax already holds (DEoption and 7) - 4, so this branch is taken for the value 7. | |
sub eax, 3 // / intPower faster? | |
jnz @UU4 //Result := Sqrt(Rout/RStop) * Power(PDouble(Integer(PVar) - 16)^, -ItResultI); | |
mov eax, [edi + 48] | |
fild dword [edi + 64] //ItResultI | |
fchs //-ItresultI | |
fld qword [eax - 16] //(Pvar-16)^ (= scale or something) | |
// Power(base, e) computed as 2^(e * log2(base)): the exponent is split into its rounded | |
// integer part (applied with fscale) and the remainder (applied with f2xm1 + 1). | |
fldln2 //power function base,expo -> st, st(1) | |
fxch | |
fyl2x | |
fxch | |
fmulp | |
fldl2e | |
fmulp | |
fld st(0) | |
frndint | |
fsub st(1), st(0) | |
fxch | |
f2xm1 | |
fld1 | |
faddp | |
fscale | |
fstp st(1) //end of power function | |
fld qword [edi + 56] | |
fdiv dword [edi + 72] //rout/rstop,pow | |
fsqrt | |
fmulp | |
jmp @UU6 | |
@UU4: // else Result := Sqrt(Rout) / Abs(w); | |
fld qword [edi + 56] | |
fsqrt | |
fld qword [edi - 8] | |
fabs | |
fdivp | |
@UU6: | |
cmp byte [esi - 108], 0 //CalcSIT: +148 | |
jz @NoCalcSITout | |
mov eax, edi | |
xor edx, edx | |
call CalcSmoothIterations //(PIt3D: TPIteration3D; n: Integer); | |
@NoCalcSITout: | |
add esp, 72 | |
pop edi | |
pop esi | |
pop edx | |
pop ecx | |
pop ebx | |
pop eax | |
end; | |
// doHybridIFS3D
// Iteration loop for the distance-estimating hybrid formulas. Copies C into the
// working vector, sets up J (C, or the Julia seed when DoJulia is non-zero), then
// calls the selected fHybrid entries while recording the smallest value of
// Rout / VaryScale seen and the iteration index at which it occurred.
//
// In:   eax = PIteration3D
// Out:  st(0) = the smallest recorded value (the OTrap slot before it is overwritten)
//       ItResultI and SmoothItD are set to the iteration index recorded at the minimum;
//       the OTrap slot receives the value saved in the Dfree2 slot.
// Regs: eax, ebx, ecx, edx, esi, edi are pushed on entry and restored on exit.
//       Inside the loop: esi = eax + 88 (base for all record accesses),
//       ebx = formula slot n, ecx = remaining repeats of slot n, edi = fHPVar[n].
//       Working vector x = esi-120 (eax-32), y = esi-112, z = esi-104;
//       bTmp = esi+124 (eax+212).
//
// Changes against the earlier revision:
//   * bTmp ([esi+124]) is cleared before the loop, as doHybridIFS3DnoVecIni does.
//     @out reads it unconditionally, so without this a run in which the minimum
//     was never updated would return whatever a previous call left there.
//   * The sign test on the nHybrid entry now states its operand size (dword),
//     matching the 32-bit loads of the same table a few lines above it.
function doHybridIFS3D(PIteration3D: TPIteration3D): Double;
asm
    push    eax
    push    ebx
    push    ecx
    push    edx
    push    esi
    push    edi
    lea     esi, eax + 88
    movupd  xmm0, [eax]
    movsd   xmm1, [eax + 16]
    movupd  [eax - 32], xmm0                   //X=Cx
    movsd   [eax - 16], xmm1
    lea     edx, esi + 128
    cmp     dword [esi + 64], 0                //DoJulia:+152
    jz      @sjup
    movupd  xmm0, [edx + 104]                  //J=Ju  +320 -88=+232 -128=104
    movsd   xmm1, [edx + 120]
@sjup:
    // xmm0/xmm1 hold either C (fall-through from jz) or the Julia seed loaded above
    movupd  [eax + 24], xmm0                   //J=C or J=Ju
    movsd   [eax + 40], xmm1
@skipIfJulia:
    xor     ebx, ebx                           //n:=0
    mov     eax, [esi + 296]
    mov     [esi + 120], ebx                   //bFirstIt := 0;  +208
    mov     [esi + 124], ebx                   //bTmp := 0;      +212 (read at @out)
    mov     [esi - 24], ebx                    //ItresultI:=0 +64
    mov     [esi - 36], eax                    //bIsInsideRender tmp in SmothIts
    movzx   ebx, word [esi + 102]
    // Push 0.0, d65535, 1.0 and pop them in reverse order into the three slots below
    fldz
    fld     d65535                             //minDE ini
    fld1
    fstp    qword [esi + 112]                  //VaryScale: +200 absScale, must be changed in formulas
    fstp    qword [esi + TIteration3Dext.OTrap - 144]         //OTrap: +192 running minimum
    fstp    qword [edx + TIteration3Dext.Dfree1 - 144 - 128]  //Dfree1 := 0 (edx = esi + 128)
    mov     edi, [esi + ebx * 4 + 12]          //fHPVar[n] +100
    mov     ecx, [esi + ebx * 4 - 12]          //i:=nHybrid[n] +76
    and     ecx, $7FFFFFFF                     //strip the top (flag) bit from the count
@Repeat:
    cmp     ecx, 0
    jnle    @up2                               //repeats left for this slot
@While:
    inc     ebx
    cmp     bx, word [esi + 62]                //wEndTo: Word; +150
    jle     @up3
    movzx   ebx, word [esi + 100]              //n := iRepeatFrom +188
@up3:
    mov     ecx, [esi + ebx * 4 - 12]          //i := nHybrid[n]; +76
    and     ecx, $7FFFFFFF
    jle     @While                             //flags from AND: taken only when the count is 0
    mov     edi, [esi + ebx * 4 + 12]          //fHPVar:array[0..5] of Pointer; +100
@up2:
    call    [esi + ebx * 4 + 36]               //fHybrid[0..5] of ThybridIteration2; +124
    dec     ecx                                //Dec(i)
    cmp     dword [esi + ebx * 4 - 12], 0
    jl      @Repeat                            //negative entry (top bit set): skip the checks below
    movsd   xmm0, [esi - 32]                   //DEout relative; Rout: Double; +56
    inc     dword [esi - 24]                   //Inc(ItResultI) +64
    divsd   xmm0, [esi + 112]                  //divide by VaryScale +200
    mov     eax, [esi - 24]
    ucomisd xmm0, [esi + 104]                  //memorize the smallest DE for itresult
    jnc     @skip                              //not below the stored minimum
    lea     edx, esi + 104
    mov     [esi + 124], eax                   //bTmp := ItResultI +212
    fld     qword [edx + TIteration3Dext.Dfree1 - 144-104]
    movsd   [edx], xmm0                        //new minimum
    fstp    qword [edx + TIteration3Dext.Dfree2 - 144-104]    //Dfree2 := Dfree1
    cmp     dword [esi - 36], 0                //saved copy of bIsInsideRender
    // NOTE(review): doHybridIFS3DnoVecIni uses jne at this point, this routine uses js;
    // left unchanged because the intended test cannot be determined from this file.
    js      @skip
    ucomisd xmm0, [esi - 128]                  //compare with RstopD (the DEstop condition)
    jc      @out                               //nearer than the stop distance: finish
@skip:
    cmp     eax, [esi - 20]                    //maxIt: +68
    jl      @Repeat
@out:
    fild    dword [esi + 124]
    mov     eax, [esi + 124]                   //iteration index at the minimum
    fstp    dword [esi - 36]                   //SmoothItD: Single; +52
    mov     [esi - 24], eax                    //ItResultI
    add     esi, 104
    fld     qword [esi]                        //minimum held in the OTrap slot -> result in st(0)
    fld     qword [esi + 32]                   //Dfree2
    fstp    qword [esi]                        //OTrap := Dfree2
    pop     edi
    pop     esi
    pop     edx
    pop     ecx
    pop     ebx
    pop     eax
end;
// doHybridIFS3DnoVecIni
// Same slot-selection / minimum-tracking loop as doHybridIFS3D, but this body does not
// copy C into the working vector and does not set up J: it starts directly with the
// counter and constant initialisation.
// In:   eax = PIteration3D
// Out:  st(0) = value read from the OTrap slot at @out (the recorded minimum);
//       ItResultI and SmoothItD are set from bTmp; the OTrap slot receives the Dfree2 slot.
// Regs: the six registers pushed on entry are restored on exit.
//       esi = eax + 88, ebx = formula slot n, ecx = remaining repeats, edi = fHPVar[n].
// NOTE(review): the register map in the push-edi comment below looks stale: with
// esi = eax + 88, bTmp (eax+212) is esi+124, and [esi-128] is used further down as
// the stop distance, not as x.
function doHybridIFS3DnoVecIni(PIteration3D: TPIteration3D): Double; //to use behind common fractals, use the new vec for it | |
asm | |
push eax | |
push ebx | |
push ecx | |
push edx | |
push esi | |
push edi //x = esi-128 y = esi-120 .. btmp = esi+116 (eax+212) | |
lea esi, eax + 88 // | |
xor ebx, ebx //n:=0 | |
mov eax, [esi + 296] | |
mov [esi + 120], ebx //bFirstIt := 0; +208 | |
mov [esi + 124], ebx | |
mov [esi - 24], ebx //ItresultI:=0 +64 | |
mov [esi - 36], eax //bIsInsideRender tmp in SmothIts | |
movzx ebx, word [esi + 102] //n := iStartFrom | |
// 0.0, d65535 and 1.0 are pushed, then popped in reverse order into the three slots below
fldz | |
fld d65535 //minDE ini | |
fld1 | |
fstp qword [esi + 112] //VaryScale: //+200 absScale, must be changed in formulas | |
fstp qword [esi + TIteration3Dext.OTrap - 144] // 104 OTrap: Double; //+192 min of AbsScale | |
fstp qword [esi + TIteration3Dext.Dfree1 - 144] //+248 +56 | |
mov edi, [esi + ebx * 4 + 12] //fHPVar[0] +100 | |
mov ecx, [esi + ebx * 4 - 12] //i:=nHybrid[n] +76 | |
and ecx, $7FFFFFFF | |
// ecx > 0: keep using slot n; otherwise advance n (wrapping to iRepeatFrom after wEndTo)
// until a slot with a non-zero masked count is found.
@Repeat: | |
cmp ecx, 0 | |
jnle @up2 | |
@While: | |
inc ebx | |
cmp bx, word [esi + 62] //5 wEndTo: Word; //+150 | |
jle @up3 | |
movzx ebx, word [esi + 100] //n := iRepeatFrom //+188 | |
@up3: | |
mov ecx, [esi + ebx * 4 - 12] //i := nHybrid[n]; +76 | |
and ecx, $7FFFFFFF | |
// flags come from the AND above; with the top bit cleared jle is taken only for zero
jle @While | |
mov edi, [esi + ebx * 4 + 12] //fHPVar:array[0..5] of Pointer; //+100 | |
@up2: | |
call [esi + ebx * 4 + 36] //fHybrid[0..5] of ThybridIteration2; //+124 | |
dec ecx //Dec(i) | |
// A negative nHybrid entry loops back without the minimum / maxIt checks.
// NOTE(review): this cmp has no operand-size keyword; the table is read as dwords
// elsewhere, so confirm the assembler encodes a 32-bit compare here.
cmp [esi + ebx * 4 - 12], 0 | |
jl @Repeat | |
movsd xmm0, [esi - 32] //DEout relative; Rout: Double; //+56 | |
inc dword [esi - 24] //Inc(ItResultI) //+64 | |
divsd xmm0, [esi + 112] //abs Scale VaryScale: Double; //+200 | |
mov eax, [esi - 24] | |
ucomisd xmm0, [esi + 104] // memorize the smallest DE for itresult | |
// CF clear means xmm0 is not below the stored minimum: nothing to record
jnc @skip | |
lea edx, esi + 104 | |
mov [esi + 124], eax // bTmp: Integer; //+212 | |
fld qword [edx + TIteration3Dext.Dfree1 - 144-104] //+128 otrap color option | |
movsd [edx], xmm0 //result DE output | |
fstp qword [edx + TIteration3Dext.Dfree2 - 144-104] //+136 | |
cmp dword [esi - 36], 0 //was: +384 -88=296 bIsInsideRender | |
// The early-out test below only runs when the saved flag is exactly zero.
// NOTE(review): doHybridIFS3D uses js (sign test) at the same point - confirm which is intended.
jne @skip //if outside, compare if DE is lower than minDE | |
ucomisd xmm0, [esi - 128] //compare with RstopD, that contains the DEstop condition. Stop if nearer. | |
jc @out | |
@skip: | |
cmp eax, [esi - 20] //maxIt: +68 | |
jl @Repeat | |
// Publish results: iteration index at the minimum goes to SmoothItD (as single) and
// ItResultI; the minimum stays in st(0) as the function result while the OTrap slot
// is overwritten with the Dfree2 slot.
@out: | |
fild dword [esi + 124] | |
mov eax, [esi + 124] //it on minDE | |
fstp dword [esi - 36] //SmoothItD: Single; //+52 | |
mov [esi - 24], eax //ItResultI | |
add esi, 104 | |
fld qword [esi] //MinDE in OTrap | |
fld qword [esi + 32] //Dfree2 | |
fstp qword [esi] //OTrap | |
pop edi | |
pop esi | |
pop edx | |
pop ecx | |
pop ebx | |
pop eax | |
end; | |
// CalcSmoothIterations
// Writes PIt3D.SmoothItD (stored as a single) from ItResultI, Rout, Rold, LNRStop
// and fHln[n].
// In:   eax = PIt3D, edx = n (index into fHln, 4-byte entries; only used on the middle path)
// Out:  nothing in registers; eax is left biased by $34. The FPU stack is balanced on
//       every path.
// Paths (d = ln(s05 * ln(Rout)); s05 and d1em100 are constants defined elsewhere,
// presumably 0.5 and 1e-100):
//   1. high dword of Rout <= $3FF00000 (signed):  SmoothItD := ItResultI
//   2. high dword of Rold <  $3FF00000 (unsigned): SmoothItD := ItResultI + LNRStop - d * fHln[n]
//   3. otherwise: SmoothItD := ItResultI + (LNRStop - d) / (d - ln(s05 * ln(Rold)) + d1em100)
// $3FF00000 is the upper half of the IEEE double 1.0, so both tests only look at the
// top 32 bits of the value.
// NOTE(review): the first test branches with signed jg, the second with unsigned jnb;
// they only disagree when the sign bit of the double is set.
procedure CalcSmoothIterations(PIt3D: TPIteration3D; n: Integer); | |
asm | |
add eax, $34 | |
cmp dword [eax + TIteration3D.Rout + 4 - $34], $3FF00000 //Rout <= 1? [Rout+4] //+$3c cmp with $3FF0.. does not work always!!! | |
jg @@1 | |
// Path 1: no smoothing, store the integer count
fild dword [eax + TIteration3D.ItResultI - $34] //+$40 | |
fstp dword [eax + TIteration3D.SmoothItD - $34] //+$34 | |
ret | |
@@1: | |
fld qword [eax + TIteration3D.Rout - $34] //+$38 Rout | |
cmp dword [eax + TIteration3Dext.Rold - 56 + 4 - $34], $3FF00000 //Rold <= 1? -$2c | |
jnb @@2 | |
// Path 2. fldln2 / fxch / fyl2x computes ln2 * log2(st0) = ln(st0)
fldln2 | |
fxch //Rout,ln | |
fyl2x | |
fmul s05 //ln(Rout)*0.5 | |
fldln2 | |
fxch | |
fyl2x | |
fmul dword [eax + edx * 4 + TIteration3D.fHln - $34] // PIt3D.fHln[n] +$00a4 | |
fild dword [eax + TIteration3D.ItResultI - $34] //+$40 | |
fadd dword [eax + TIteration3D.LNRStop - $34] //+$009c | |
// st1 := st0 - st1, i.e. (ItResultI + LNRStop) - d*fHln[n]
fsubrp | |
fstp dword [eax + TIteration3D.SmoothItD - $34] //+$34 | |
ret | |
@@2: | |
// Path 3: uses both Rout and Rold
fldln2 | |
fxch | |
fyl2x //ln(Rout) | |
fmul s05 | |
fldln2 | |
fxch | |
fyl2x //d | |
fldln2 //ln2,d | |
fld qword [eax + TIteration3Dext.Rold - 56 - $34] //Rold,ln2,d | |
fyl2x | |
fmul s05 | |
fldln2 | |
fxch | |
fyl2x | |
fsubr st, st(1) //d - Ln(0.5 * Ln(PIt3D.Rold)), d | |
fld dword [eax + TIteration3D.LNRStop - $34] //+$009c | |
fsubrp st(2), st //d - Ln(0.5 * Ln(PIt3D.Rold)), PIt3D.LNRStop - d | |
// small constant added to the divisor before the division below
fadd d1em100 //test | |
fdivp //div0 sometimes | |
fiadd dword [eax + TIteration3D.ItResultI - $34] //+$40 | |
fstp dword [eax + TIteration3D.SmoothItD - $34] //+$34 | |
end; | |
// doHybrid4DSSE2
// 4D escape-time hybrid loop. Calls Rotate4Dex to fill the working vector, sets up J
// (C or the Julia seed), then repeatedly calls the selected fHybrid entries until
// ItResultI reaches maxIt or Rout = x*x + y*y + z*z + w*w is no longer below RStop.
// The OTrap slot tracks the minimum Rout. When CalcSIT is non-zero,
// CalcSmoothIterations(PIteration3D, n) is called at the end.
// In:   eax = PIteration3D
// Out:  results are written into the record; no result register is set by this body.
// Regs: the six registers pushed on entry are restored on exit.
//       edi = PIteration3D, esi = edi + 256, ebx = formula slot n,
//       [esi-44] (bTmp) = remaining repeats of slot n, xmm7 = current Rout.
// Formula call: eax = @x, edx = @y, ecx = @z, and @w plus PIteration3D are pushed.
// No stack adjustment follows the call, so the callee has to remove those two dwords.
procedure doHybrid4DSSE2(PIteration3D: TPIteration3D); //new ext version | |
asm | |
push eax | |
push ebx | |
push ecx | |
push edx | |
push esi | |
push edi //x = edi-32 y = edi-24 .. Rold = edi - 48, Rstop = edi - 40, i = edi + 212 = btmp = esi - 44 | |
mov edi, eax | |
lea esi, eax + 256 | |
lea edx, edi -32 | |
mov ecx, esi | |
call Rotate4Dex //(@C1, @x, SMatrix4); C1=It3D=eax | |
movupd xmm6, [edi - 32] | |
movupd xmm7, [edi - 16] | |
cmp dword [esi - 104], 0 //DoJulia:+152 | |
jz @sjup | |
// Julia: J1..J3 go to [edi+24..+40], the fourth component to [edi-56]
movupd xmm2, [esi + 64] | |
movupd xmm3, [esi + 80] | |
movupd [edi + 24], xmm2 //J=Ju | |
movlpd [edi + 40], xmm3 | |
movhpd [edi - 56], xmm3 | |
jmp @skipIfJulia | |
@sjup: | |
movupd [edi + 24], xmm6 //J=C | |
movlpd [edi + 40], xmm7 | |
movhpd [edi - 56], xmm7 //J4 = edi - 56 | |
@skipIfJulia: | |
// Initial Rout = sum of the four squared components; RStop is widened from single to double
mulpd xmm6, xmm6 | |
mulpd xmm7, xmm7 | |
CVTSS2SD xmm5, [edi + 72] //RStop in double | |
addpd xmm7, xmm6 | |
pshufd xmm6, xmm7, $4E | |
movsd [edi - 40], xmm5 | |
addsd xmm7, xmm6 //xmm7=Rout | |
movsd [esi - 64], xmm7 //OTrap=Rout | |
movsd [edi + 56], xmm7 //Rout | |
xor ebx, ebx //n:=0 | |
mov [esi - 48], ebx //bFirstIt := 0; +208 | |
mov [edi + 64], ebx //ItresultI:=0 +64 | |
movzx ebx, word [esi - 66] //n:=iStartFrom | |
mov eax, [edi + ebx * 4 + 100] //fHPVar[0] +100 | |
mov [edi + 48], eax //PVars: +48 | |
mov eax, [edi + ebx * 4 + 76] //i:=nHybrid[0] +76 | |
and eax, $7FFFFFFF | |
mov [esi - 44], eax //i(=It3D.btmp) | |
// NOTE(review): when @Repeat is reached through the jl right after the formula call,
// xmm7 has not been recomputed, so Rold only gets the previous Rout if the callee
// left xmm7 untouched.
@Repeat: | |
movsd [edi - 48], xmm7 //Rold := Rout | |
cmp dword [esi - 44], 0 | |
jnle @up2 | |
@While: | |
inc ebx | |
cmp bx, word [esi - 106] //5 wEndTo: Word; //+150 | |
jle @up3 | |
movzx ebx, word [esi - 68] //n := iRepeatFrom | |
@up3: | |
mov eax, [edi + ebx * 4 + 76] //i := nHybrid[n]; +76 | |
and eax, $7FFFFFFF | |
// flags from the AND: taken only when the masked count is zero
jle @While | |
mov [esi - 44], eax | |
mov eax, [edi + ebx * 4 + 100] //fHPVar:array[0..5] of Pointer; | |
mov [edi + 48], eax //PVars: +48 | |
@up2: | |
lea eax, edi - 8 //was: esp + 24 w | |
push eax | |
push edi | |
lea edx, edi - 24 //was: esp + 16 y | |
lea ecx, edi - 16 //was: esp + 24 z | |
add eax, -24 // x | |
call [edi + ebx * 4 + 124] //fHybrid[0..5] of ThybridIteration2; //+124 | |
// NOTE(review): the dec and cmp below carry no operand-size keyword although the same
// locations are accessed as dwords elsewhere in this routine - confirm the encoding.
dec [esi - 44] //Dec(i) write at addr... false dIFS?? | |
cmp [edi + ebx * 4 + 76], 0 //nHybrid[fnr] | |
jl @Repeat //SkipMaxItTest | |
// Recompute Rout from the updated vector and fold it into the running minimum
movupd xmm6, [edi - 32] | |
movupd xmm7, [edi - 16] | |
mulpd xmm6, xmm6 | |
mulpd xmm7, xmm7 | |
addpd xmm7, xmm6 | |
pshufd xmm6, xmm7, $4E | |
addsd xmm7, xmm6 //xmm7=Rout | |
movsd xmm5, xmm7 | |
minsd xmm5, qword [esi - 64] | |
movsd [edi + 56], xmm7 //Rout | |
movsd [esi - 64], xmm5 //OTrap | |
inc dword [edi + 64] //Inc(ItResultI) | |
mov eax, [edi + 64] | |
cmp eax, [edi + 68] //maxIt: +68 | |
jnl @out | |
comisd xmm7, [edi - 40] //RStop | |
// CF set: Rout is below RStop (or the compare was unordered), keep iterating
jc @Repeat | |
@out: | |
cmp byte [esi - 108], 0 //CalcSIT: +148 | |
jz @NoCalcSITout | |
mov eax, edi | |
mov edx, ebx | |
call CalcSmoothIterations //(PIt3D: TPIteration3D; n: Integer); | |
@NoCalcSITout: | |
pop edi | |
pop esi | |
pop edx | |
pop ecx | |
pop ebx | |
pop eax | |
end; | |
// doHybridSSE2
// 3D escape-time hybrid loop. Copies C (three doubles at [eax]) into the working
// vector (w is stored as 0 because movsd from memory clears the upper half of xmm7),
// sets up J (C or the Julia seed), then repeatedly calls the selected fHybrid entries
// until ItResultI reaches maxIt or Rout = x*x + y*y + z*z is no longer below RStop.
// The OTrap slot tracks the minimum Rout. When CalcSIT is non-zero,
// CalcSmoothIterations(PIteration3D, n) is called at the end.
// In:   eax = PIteration3D
// Out:  results are written into the record; no result register is set by this body.
// Regs: the six registers pushed on entry are restored on exit.
//       edi = PIteration3D, esi = edi + 256, ebx = formula slot n,
//       [esi-44] = remaining repeats of slot n, xmm7 = current Rout.
// Formula call: eax = @x, edx = @y, ecx = @z, and @w plus PIteration3D are pushed.
// No stack adjustment follows the call, so the callee has to remove those two dwords.
procedure doHybridSSE2(PIteration3D: TPIteration3D); //new ext version | |
asm | |
push eax | |
push ebx | |
push ecx | |
push edx | |
push esi | |
push edi //x = edi-32 y = edi-24 .. Rold = edi - 48, Rstop = edi - 40, (i = edi + 212 = btmp = esi - 44) | |
mov edi, eax // = [edi - 32] | |
lea esi, eax + 256 | |
movupd xmm6, [edi] //Iteration3D by calcMissed not aligned16?! | |
movsd xmm7, [edi + 16] | |
movupd [edi - 32], xmm6 //X=C | |
movupd [edi - 16], xmm7 | |
cmp dword [esi - 104], 0 //DoJulia:+152 | |
jz @sjup | |
movupd xmm2, [esi + 64] | |
movsd xmm3, [esi + 80] | |
movupd [edi + 24], xmm2 //J=Ju | |
movsd [edi + 40], xmm3 | |
jmp @skipIfJulia | |
@sjup: | |
movupd [edi + 24], xmm6 //J=C | |
movsd [edi + 40], xmm7 | |
@skipIfJulia: | |
// Initial Rout = x*x + y*y + z*z; RStop is widened from single to double
mulpd xmm6, xmm6 | |
mulsd xmm7, xmm7 | |
CVTSS2SD xmm5, [edi + 72] //RStop in double | |
addsd xmm7, xmm6 | |
shufpd xmm6, xmm6, 1 | |
movsd [edi - 40], xmm5 | |
addsd xmm7, xmm6 //xmm7=Rout | |
movsd [esi - 64], xmm7 //OTrap=Rout | |
movsd [edi + 56], xmm7 //Rout | |
xor ebx, ebx | |
mov [esi - 48], ebx //bFirstIt := 0; +208 | |
mov [edi + 64], ebx //ItresultI:=0 +64 | |
movzx ebx, word [esi - 66] //n := iStartFrom | |
mov eax, [edi + ebx * 4 + 100] //fHPVar[0] +100 | |
mov [edi + 48], eax //PVars: +48 | |
mov eax, [edi + ebx * 4 + 76] //i:=nHybrid[0] +76 | |
and eax, $7FFFFFFF | |
mov [esi - 44], eax //btmp | |
// NOTE(review): when @Repeat is reached through the jl right after the formula call,
// xmm7 has not been recomputed, so Rold only gets the previous Rout if the callee
// left xmm7 untouched.
@Repeat: | |
movsd [edi - 48], xmm7 //Rold := Rout | |
cmp dword [esi - 44], 0 | |
jnle @up2 | |
@While: | |
inc ebx | |
cmp bx, word [esi - 106] //5 wEndTo: Word; //+150 | |
jle @up3 | |
movzx ebx, word [esi - 68] //n := iRepeatFrom | |
@up3: | |
mov eax, [edi + ebx * 4 + 76] //i := nHybrid[n]; +76 | |
and eax, $7FFFFFFF | |
// flags from the AND: taken only when the masked count is zero
jle @While | |
mov [esi - 44], eax //was btmp, now own var | |
mov eax, [edi + ebx * 4 + 100] //fHPVar:array[0..5] of Pointer; | |
mov [edi + 48], eax //PVars: +48 | |
@up2: | |
lea eax, edi - 8 // w | |
push eax | |
push edi | |
lea edx, edi - 24 | |
lea ecx, edi - 16 | |
// eax: @w - 24 = edi - 32 = @x
add eax, -24 | |
call [edi + ebx * 4 + 124] //fHybrid[0..5] of ThybridIteration2; //+124 fp overflow: it3dex.z > 1eXXX ! | |
// NOTE(review): the dec and cmp below carry no operand-size keyword although the same
// locations are accessed as dwords elsewhere in this routine - confirm the encoding.
dec [esi - 44] //Dec(i) | |
cmp [edi + ebx * 4 + 76], 0 | |
jl @Repeat //SkipMaxItTest | |
// Recompute Rout from the updated vector (w is loaded but only z is squared and summed)
movupd xmm6, [edi - 32] | |
movupd xmm7, [edi - 16] | |
mulpd xmm6, xmm6 | |
mulsd xmm7, xmm7 //4D: mulpd | |
addsd xmm7, xmm6 //4D: addpd | |
shufpd xmm6, xmm6, 1 //4D: pshufd xmm6, xmm7, $4E | |
addsd xmm7, xmm6 //xmm7=Rout | |
movsd xmm5, xmm7 | |
minsd xmm5, qword [esi - 64] | |
movsd [edi + 56], xmm7 //Rout | |
movsd [esi - 64], xmm5 //OTrap | |
inc dword [edi + 64] //Inc(ItResultI) | |
mov eax, [edi + 64] | |
cmp eax, [edi + 68] //maxIt: +68 | |
jnl @out | |
comisd xmm7, [edi - 40] //RStop | |
// CF set: Rout is below RStop (or the compare was unordered), keep iterating
jc @Repeat | |
@out: | |
cmp byte [esi - 108], 0 //CalcSIT: +148 | |
jz @NoCalcSITout | |
mov eax, edi | |
mov edx, ebx | |
call CalcSmoothIterations //(PIt3D: TPIteration3D; n: Integer); | |
@NoCalcSITout: | |
pop edi | |
pop esi | |
pop edx | |
pop ecx | |
pop ebx | |
pop eax | |
end; | |
// doHybridDESSE2
// 3D escape-time hybrid loop that also produces an analytic distance estimate.
// Copies C into the working vector, sets up J (C or the Julia seed), seeds w / the
// derivative slots from DEoption, iterates the selected fHybrid entries until
// ItResultI reaches maxIt or Rout = x*x + y*y + z*z is no longer below RStop, then
// evaluates one of four DE expressions. When CalcSIT is non-zero,
// CalcSmoothIterations(PIteration3D, n) is called afterwards.
//
// In:   eax = PIteration3D
// Out:  st(0) = distance estimate; Rout, Rold, OTrap, ItResultI etc. are updated
//       in the record.
// Regs: the six registers pushed on entry are restored on exit.
//       edi = PIteration3D, esi = edi + 256, ebx = formula slot n,
//       [esi-44] = remaining repeats of slot n, xmm7 = current Rout.
//       x = edi-32, y = edi-24, z = edi-16, w = edi-8, Rold = edi-48, RStop = edi-40.
// Formula call: eax = @x, edx = @y, ecx = @z, and @w plus PIteration3D are pushed.
// No stack adjustment follows the call, so the callee has to remove those two dwords.
//
// Changes against the earlier revision:
//   * @out masked DEoption with decimal 38 although the comment on that line and
//     the dispatch before the loop both use $38. With the decimal mask the
//     derivative-based result was selected on bits 1,2,5 instead of bits 3..5, so
//     it could disagree with the branch that initialised Deriv1..3.
//   * The Double read at [eax - 16] in the power branch now says qword, as the
//     Pascal expression in the comment (PDouble) and the 4D sibling routine do.
//   * dec / cmp on the repeat counter and nHybrid entry now state dword, matching
//     the other accesses to the same locations.
function doHybridDESSE2(PIteration3D: TPIteration3D): Double;  //result in st(0) new ext version
asm
    push    eax
    push    ebx
    push    ecx
    push    edx
    push    esi
    push    edi
    mov     edi, eax
    lea     esi, eax + 256
    movupd  xmm6, [edi]                        //Iteration3D by calcMissed not aligned16?!
    movsd   xmm7, [edi + 16]                   //upper half cleared, so w is stored as 0 below
    movupd  [edi - 32], xmm6                   //X=C
    movupd  [edi - 16], xmm7
    cmp     dword [esi - 104], 0               //DoJulia:+152
    jz      @sjup
    movupd  xmm2, [esi + 64]
    movsd   xmm3, [esi + 80]
    movupd  [edi + 24], xmm2                   //J=Ju
    movsd   [edi + 40], xmm3
    jmp     @skipIfJulia
@sjup:
    movupd  [edi + 24], xmm6                   //J=C
    movsd   [edi + 40], xmm7
@skipIfJulia:
    // Initial Rout = x*x + y*y + z*z; RStop is widened from single to double
    mulpd   xmm6, xmm6
    mulsd   xmm7, xmm7
    CVTSS2SD xmm5, [edi + 72]                  //RStop in double
    addsd   xmm7, xmm6
    shufpd  xmm6, xmm6, 1
    movsd   [edi - 40], xmm5
    addsd   xmm7, xmm6                         //xmm7=Rout
    movsd   [esi - 64], xmm7                   //OTrap=Rout
    movsd   [edi + 56], xmm7                   //Rout
    movsd   [edi - 48], xmm7                   //Rold := Rout
    xor     ebx, ebx                           //n:=0
    mov     [edi + 208], ebx                   //bFirstIt := 0; +208 (same address as esi - 48)
    mov     [edi + 64], ebx                    //ItresultI:=0 +64
    movzx   ebx, word [esi - 66]               //n := iStartFrom
    mov     eax, [edi + ebx * 4 + 100]         //fHPVar[n] +100
    mov     [edi + 48], eax                    //PVars: +48
    mov     eax, [edi + ebx * 4 + 76]          //i:=nHybrid[n] +76
    and     eax, $7FFFFFFF                     //strip the top (flag) bit from the count
    mov     [esi - 44], eax
    // Seed w (and the derivative slots) according to DEoption and $38
    mov     eax, [esi - 96]                    //DEoption +160
    and     eax, $38                           // case (DEoption and $38) of
    sub     eax, 16
    jnz     @UU1
    fld     qword [edi + 56]                   // 16: w := Rout;
    jmp     @UU2
@UU1:
    sub     eax, 16
    jnz     @UU
    fld1
    fstp    qword [esi - 24]                   // 32: Deriv1 := 1
    fldz                                       //     Deriv2 := 0; Deriv3 := 0; w := 0
    fst     qword [esi - 16]
    fst     qword [esi - 8]
    jmp     @UU2
@UU:
    fld1                                       // else w := 1;
@UU2:
    fstp    qword [edi - 8]                    //w := Rout,1,0
@Repeat:
    // Reached again through the jl after the formula call without xmm7 being
    // recomputed; Rold then holds the previous Rout only if the callee kept xmm7.
    movsd   [edi - 48], xmm7                   //Rold := Rout
    cmp     dword [esi - 44], 0
    jnle    @up2                               //repeats left for this slot
@While:
    inc     ebx
    cmp     bx, word [esi - 106]               //wEndTo: Word; +150
    jle     @up3
    movzx   ebx, word [esi - 68]               //n := iRepeatFrom
@up3:
    mov     eax, [edi + ebx * 4 + 76]          //i := nHybrid[n]; +76
    and     eax, $7FFFFFFF
    jle     @While                             //flags from AND: taken only when the count is 0
    mov     [esi - 44], eax
    mov     eax, [edi + ebx * 4 + 100]         //fHPVar:array[0..5] of Pointer;
    mov     [edi + 48], eax                    //PVars: +48
@up2:
    lea     eax, edi - 8                       //@w
    push    eax
    push    edi
    lea     edx, edi - 24                      //@y
    lea     ecx, edi - 16                      //@z
    add     eax, -24                           //@x
    call    [edi + ebx * 4 + 124]              //fHybrid[0..5] of ThybridIteration2; +124
    dec     dword [esi - 44]                   //Dec(i)
    cmp     dword [edi + ebx * 4 + 76], 0
    jl      @Repeat                            //negative entry (top bit set): SkipMaxItTest
    // Recompute Rout from the updated vector and fold it into the running minimum
    movupd  xmm6, [edi - 32]
    movupd  xmm7, [edi - 16]
    mulpd   xmm6, xmm6
    mulsd   xmm7, xmm7
    addsd   xmm7, xmm6
    shufpd  xmm6, xmm6, 1
    addsd   xmm7, xmm6                         //xmm7=Rout
    movsd   xmm5, xmm7
    minsd   xmm5, qword [esi - 64]
    movsd   [edi + 56], xmm7                   //Rout
    movsd   [esi - 64], xmm5                   //OTrap
    inc     dword [edi + 64]                   //Inc(ItResultI)
    mov     eax, [edi + 64]
    cmp     eax, [edi + 68]                    //maxIt: +68
    jnl     @out
    comisd  xmm7, [edi - 40]                   //RStop
    jc      @Repeat                            //Rout below RStop (or unordered): keep iterating
@out:
    mov     eax, [esi - 96]                    //DEoption +160   if (DEoption and $38) = 32 then
    and     eax, $38
    sub     eax, 32
    jnz     @JU1
    fld     qword [edi + 56]                   //Result := Sqrt(Rout) * 0.5 * Ln(Rout) / RoutDeriv
    fldln2
    fld     st(1)                              //rout,ln2,rout
    fyl2x                                      //ln(rout),rout
    fxch
    fsqrt
    fmulp
    fmul    cs05
    fdiv    qword [esi - 24]                   //Deriv1
    jmp     @UU6
@JU1:
    mov     eax, [esi - 96]                    //DEoption +160
    and     eax, 7
    sub     eax, 4
    jnz     @UU3                               //4: Result := Abs(X3) * Ln(Abs(X3)) / X4;
    fld     qword [edi - 16]                   //X3
    fabs
    fldln2
    fld     st(1)                              //absX3,ln2,absX3
    fyl2x                                      //ln(absX3),absX3
    fmulp
    fdiv    qword [edi - 8]                    //Result
    jmp     @UU6
@UU3:
    sub     eax, 3
    jnz     @UU4                               //7: Result := Sqrt(Rout/RStop) * Power(PDouble(Integer(PVar) - 16)^, -ItResultI);
    mov     eax, [edi + 48]
    fild    dword [edi + 64]                   //ItResultI
    fchs                                       //-ItresultI
    fld     qword [eax - 16]                   //(Pvar-16)^ (= scale or something)
    // power function: 2^(expo * log2(base)) via fyl2x, frndint, f2xm1, fscale
    fldln2
    fxch
    fyl2x
    fxch
    fmulp
    fldl2e
    fmulp
    fld     st(0)
    frndint
    fsub    st(1), st(0)
    fxch
    f2xm1
    fld1
    faddp
    fscale
    fstp    st(1)                              //end of power function
    fld     qword [edi + 56]
    fdiv    dword [edi + 72]                   //rout/rstop,pow
    fsqrt
    fmulp
    jmp     @UU6
@UU4:                                          // else Result := Sqrt(Rout) / Abs(X4);
    fld     qword [edi + 56]
    fsqrt
    fld     qword [edi - 8]
    fabs
    fdivp
@UU6:
    // The result stays in st(0) across the optional smoothing call
    cmp     byte [esi - 108], 0                //CalcSIT: +148
    jz      @NoCalcSITout
    mov     eax, edi
    mov     edx, ebx
    call    CalcSmoothIterations               //(PIt3D: TPIteration3D; n: Integer);
@NoCalcSITout:
    pop     edi
    pop     esi
    pop     edx
    pop     ecx
    pop     ebx
    pop     eax
end;
// HybridItTricorn
// One iteration step on (x, y, z), computed on the x87 stack:
//   x' = x*x - y*y - z*z + [esi+24]
//   y' = 2*x*y + [esi+32]
//   z' = z*x*[PVars-16] + [esi+40]*[PVars-24]
// where esi = PIteration3D and PVars = [esi+48]. w is not read or written.
// In:   eax = @x, edx = @y, ecx = @z; PIteration3D is read from [ebp+8]. The body
//       never sets ebp itself, so it depends on the frame the compiler emits for
//       this routine.
// Regs: esi and edi are saved and restored; the FPU stack is balanced on exit.
procedure HybridItTricorn(var x, y, z, w: Double; PIteration3D: TPIteration3D); | |
asm | |
push esi | |
push edi | |
fld qword [edx] | |
fld st(0) | |
fmul st(0), st(1) // y*y, y | |
fld qword [eax] // x, y*y, y | |
mov esi, [ebp + 8] // PIteration3D | |
fld st(0) // x, x, y*y, y | |
fmul st(0), st(1) // x*x, x, y*y, y | |
fld qword [ecx] // z, x*x, x, y*y, y | |
fld st(0) | |
mov edi, [esi + 48] | |
fmul st(0), st(1) // z*z, z, x*x, x, y*y, y | |
faddp st(4), st(0) // z, x*x, x, y*y+z*z, y | |
fmul st(0), st(2) // z*x, x*x, x, y*y+z*z, y | |
fmul qword [edi - 16] | |
// second term of z': [esi+40] scaled by [PVars-24]
fld qword [esi + 40] | |
fmul qword [edi - 24] | |
faddp | |
fstp qword [ecx] // x*x, x, y*y+z*z, y | |
fsubrp st(2), st(0) // x, x*x-y*y-z*z, y | |
fmulp st(2), st(0) // x*x-y*y-z*z, y*x | |
fadd qword [esi + 24] | |
fstp qword [eax] // y*x | |
fadd st(0), st(0) | |
fadd qword [esi + 32] | |
fstp qword [edx] | |
pop edi | |
pop esi | |
end; | |
// HybridQuatSSE2
// One 4D iteration step on (x, y, z, w) using SSE2:
//   x' = x*x - y*y - z*z - w*w + [esi+24]
//   y' = 2*(x*y + z*w) + [esi+32]
//   z' = 2*(x*z + y*w*[PVars-16]) + [esi+40]
//   w' = 2*(y*z + x*w) + [PVars-24] + [esi-56]
// where esi = PIteration3D and PVars = [esi+48].
// In:   eax = @x, edx = @y, ecx = @z; PIteration3D is read from [ebp+8] (frame set up
//       outside this body). 16 bytes are loaded from each of [eax], [edx] and [ecx] and
//       16 bytes are stored to [eax] and [ecx], so x, y, z, w must be four consecutive
//       doubles; the w parameter itself is never dereferenced.
// Regs: esi and edi are saved and restored; xmm0..xmm6 are overwritten.
procedure HybridQuatSSE2(var x, y, z, w: Double; PIteration3D: TPIteration3D); | |
asm | |
push esi | |
push edi | |
mov esi, [ebp + 8] | |
mov edi, [esi + 48] //PVars | |
movupd xmm0, [eax] //x,y | |
movupd xmm1, [ecx] //z,w | |
movapd xmm6, xmm0 //x,y | |
movapd xmm5, xmm1 //z,w | |
movapd xmm3, xmm1 //z,w | |
xorpd xmm4, xmm4 //0,0 | |
mulpd xmm6, xmm6 //xx,yy | |
mulpd xmm5, xmm5 //zz,ww | |
movupd xmm2, [edx] //y,z | |
subsd xmm4, xmm5 //-zz | |
shufpd xmm3, xmm0, 1 //w,x | |
shufpd xmm4, xmm5, 2 //-zz, ww | |
mulpd xmm2, xmm0 //yx, zy | |
addpd xmm4, xmm6 //xx-zz, yy+ww | |
mulpd xmm0, xmm1 //xz, yw | |
mulpd xmm3, xmm1 //wz, xw | |
pshufd xmm6, xmm0, $4E //yw, xz | |
pshufd xmm1, xmm4, $4E //yy+ww, xx-zz | |
mulsd xmm6, [edi - 16] //ywMul, xz | |
addpd xmm2, xmm3 //yx+wz, zy+xw -> y, w | |
addsd xmm6, xmm0 //ywMul + xz -> z | |
subpd xmm4, xmm1 //xx-zz-yy-ww -> x | |
addpd xmm2, xmm2 //y,w | |
addsd xmm6, xmm6 //z | |
shufpd xmm2, xmm2, 1 //w, y | |
movupd xmm5, [esi + 24] //J1,J2 | |
// the low lane of xmm2 is w': add the [PVars-24] term and the fourth J component
addsd xmm2, [edi - 24] | |
addsd xmm2, [esi - 56] //+J4 | |
shufpd xmm6, xmm2, 0 //z, w | |
shufpd xmm4, xmm2, 2 //x, y | |
addsd xmm6, [esi + 40] //+J3 | |
addpd xmm4, xmm5 //+J1,2 | |
movupd [eax], xmm4 | |
movupd [ecx], xmm6 | |
pop edi | |
pop esi | |
end; | |
// HybridItIntPow2
// One power-2 iteration step on (x, y, z), computed on the x87 stack.
// With R = x*x + y*y and a = (R - z*z) / R:
//   z' = 2*z*[PVars-16]*sqrt(R) + [esi+40]
//   x' = a*(x*x - y*y) + [esi+24]
//   y' = 2*x*y*a + [esi+32]
// where esi = PIteration3D and PVars = [esi+48]. w is not read or written.
// In:   eax = @x, edx = @y, ecx = @z; PIteration3D is read from [ebp+8] (frame set up
//       outside this body).
// Regs: esi and edi are saved and restored; up to seven x87 registers are in use at
//       the deepest point and the stack is balanced on exit.
// NOTE(review): the division by R has no small-constant guard here, unlike the
// higher-power variants in this file that add [PVars+24] to the divisor.
procedure HybridItIntPow2(var x, y, z, w: Double; PIteration3D: TPIteration3D); //sine bulb | |
asm | |
push esi | |
push edi | |
fld qword [ecx] | |
fld qword [edx] | |
fld qword [eax] //x,y,z | |
mov esi, [ebp + 8] //PIteration3D | |
fld st(1) //y,x,y,z | |
fmul st(0), st(2) // y*y,x,y,z | |
fld st(1) // x,y*y,x,y,z | |
fmul st(0), st(2) // x*x, y*y,x,y,z | |
fld st(0) // x*x, x*x, y*y,x,y,z | |
fadd st(0), st(2) // xx+yy, xx, yy,x,y,z | |
fld st(0) // xx+yy, xx+yy, xx, yy,x,y,z | |
fsqrt | |
mov edi, [esi + 48] | |
fmul qword [edi - 16] //*dOption1=Zmul | |
fmul st(0), st(6) //*z | |
fadd st(0), st(0) //*2 | |
fadd qword [esi + 40] //+cz nly for test | |
fstp qword [ecx] //xx+yy, xx, yy,x,y,z | |
// z' is already stored; the old z is still on the stack and is squared in place next
fld st(5) //z, xx+yy, xx, yy,x,y,z | |
fmulp st(6), st(0) //xx+yy, xx, yy,x,y,z*z | |
fld st(0) //xx+yy, xx+yy, xx, yy,x,y,z*z | |
fsubrp st(6), st(0) //xx+yy, xx, yy,x,y, a - z*z | |
fdivp st(5), st(0) //xx, yy,x,y, a - z*z / a = a | |
fsubrp //xx-yy,x,y, a | |
fmul st(0), st(3) //a(xx-yy),x,y, a | |
fadd qword [esi + 24] | |
fstp qword [eax] //x,y, a | |
fmulp | |
fmulp //x*y*a | |
fadd st(0), st(0) //*2 | |
fadd qword [esi + 32] //+ cy only for test | |
fstp qword [edx] | |
pop edi | |
pop esi //SineP2 | |
end; | |
// HybridItIntPow2SSE2
// SSE2 version of the power-2 step. With S1 = x*x, S2 = y*y, S3 = z*z and
// XT = ((S1+S2) - S3) / (S1+S2):
//   z' = 2*z*[PVars-16]*sqrt(S1+S2) + [esi+40]
//   x' = (S1 - S2)*XT + [esi+24]
//   y' = 2*x*y*XT + [esi+32]
// where esi = PIteration3D and PVars = [esi+48] (held in ebx). w is not read or written.
// In:   eax = @x, edx = @y, ecx = @z; PIteration3D is read from [ebp+8] (frame set up
//       outside this body). x, y and z are loaded and stored individually, 8 bytes each.
// Regs: esi and ebx are saved and restored; xmm0..xmm6 are overwritten.
procedure HybridItIntPow2SSE2(var x, y, z, w: Double; PIteration3D: TPIteration3D); | |
asm | |
push esi | |
push ebx | |
mov esi, [ebp + 8] | |
movlpd xmm0, [eax] // x | |
movhpd xmm0, [edx] // x, y | |
movlpd xmm1, [ecx] // z | |
movapd xmm2, xmm0 | |
mov ebx, [esi + 48] //Pvars | |
movsd xmm3, xmm1 | |
mulpd xmm2, xmm2 // S1, S2 | |
mulsd xmm3, xmm3 // S3 | |
pshufd xmm5, xmm2, $4E // S2, S1 | |
movapd xmm4, xmm5 | |
addpd xmm5, xmm2 // S1+S2 | |
subsd xmm2, xmm4 // S1-S2 | |
movapd xmm6, xmm5 | |
mulsd xmm1, [ebx - 16] // z*dZmul | |
sqrtsd xmm4, xmm6 // Sqrt(S2+S1) | |
addsd xmm1, xmm1 // z*dZmul*2 | |
subsd xmm6, xmm3 // (S1+S2)-S3 | |
mulsd xmm1, xmm4 // z*dZmul*2*Sqrt(S2+S1) | |
movsd xmm3, [edx] // y | |
addsd xmm1, [esi + 40] // z*dZmul*Sqrt(S2+S1)+J3 = z | |
divsd xmm6, xmm5 // (XT-S3)/XT = XT | |
addsd xmm3, xmm3 // y*2 | |
movsd [ecx], xmm1 // z | |
// the low lane of xmm0 still holds the original x
mulsd xmm3, xmm0 // y*2*x | |
mulsd xmm2, xmm6 // (S1-S2)*XT | |
mulsd xmm3, xmm6 // y*2*x*XT | |
addsd xmm2, [esi + 24] // (S1-S2)*XT+J1 = x | |
addsd xmm3, [esi + 32] // y*2*x*XT+J2 = y | |
movsd [eax], xmm2 // x | |
movsd [edx], xmm3 // y | |
pop ebx | |
pop esi | |
end; | |
// HybridFloatPow
// One iteration step with a non-integer power, in polar form on the x87 stack.
// With pow = [PVars-16]:
//   theta = atan2(y, x) * pow
//   phi   = atan2(z, sqrt(x*x + y*y)) * pow
//   r     = [esi+56] ^ (pow * [PVars-8])      ([esi+56] is the squared radius Rout)
//   x' = r*cos(theta)*cos(phi) + [esi+24]
//   y' = r*sin(theta)*cos(phi) + [esi+32]
//   z' = r*sin(phi)*[PVars-24] + [esi+40]
// where esi = PIteration3D and PVars = [esi+48]. w is not read or written.
// In:   eax = @x, edx = @y, ecx = @z; PIteration3D is read from [ebp+8] (frame set up
//       outside this body).
// Regs: esi and edi are saved and restored; the FPU stack is balanced on exit.
procedure HybridFloatPow(var x, y, z, w: Double; PIteration3D: TPIteration3D); | |
asm | |
push esi | |
push edi | |
mov esi, [ebp + 8] //PIteration3D | |
mov edi, [esi + 48] | |
fld qword [edi - 16] | |
fld qword [edx] | |
fld qword [eax] | |
fld st(1) | |
fld st(1) | |
fpatan //theta, x, y, pow | |
fmul st, st(3) | |
fsincos //Costheta, Sintheta, x, y, pow | |
fld qword [ecx] //z,Costheta, Sintheta,x,y,pow | |
fxch st(3) //x,Costheta, Sintheta,z,y,pow | |
fmul st, st | |
fxch st(4) //y,Costheta, Sintheta,z,xx,pow | |
fmul st, st | |
faddp st(4), st //Costheta,Sintheta,z,xx+yy,pow | |
fxch st(2) //z,Sintheta,Costheta,xx+yy,pow | |
fxch //Sintheta,z,Costheta,xx+yy,pow | |
fxch st(3) //xx+yy,z,Costheta,Sintheta,pow | |
fsqrt | |
fpatan //phi,Costheta, Sintheta,pow | |
fmul st, st(3) | |
fsincos //Cosphi,Sinphi,Costheta,Sintheta,pow | |
fxch st(4) //pow,Sinphi,Costheta,Sintheta,Cosphi | |
fmul qword [edi - 8] //*0.5 because of Rout=sqr(R) | |
fld qword [esi + 56] //SqrRadius, pow*0.5,Sinphi,Costheta,Sintheta,Cosphi | |
// Inline power: base^expo = 2^(expo*log2(base)); the exponent is split with frndint
// into integer and fractional parts for f2xm1 / fscale.
fldln2 //power function x,pow | |
fxch | |
fyl2x | |
fxch | |
fmulp | |
fldl2e | |
fmulp | |
fld st(0) | |
frndint | |
fsub st(1), st(0) | |
fxch | |
f2xm1 | |
fld1 | |
faddp | |
fscale | |
fstp st(1) //NewRadius,Sinphi,Costheta,Sintheta,Cosphi | |
fxch st(2) //Costheta,Sinphi,NewRadius,Sintheta,Cosphi | |
fmul st, st(4) | |
fmul st, st(2) | |
fadd qword [esi + 24] | |
fstp qword [eax] //Sinphi,NewRadius,Sintheta,Cosphi | |
fxch st(3) //Cosphi,NewRadius,Sintheta,Sinphi | |
fmulp st(2), st //NewRadius,Sintheta*Cosphi,Sinphi | |
fmul st(1), st | |
fmulp st(2), st //Sintheta*Cosphi*r,Sinphi*r | |
fadd qword [esi + 32] | |
fstp qword [edx] | |
// remaining entry is sin(phi)*r: scale by [PVars-24] before adding the z offset
fmul qword [edi - 24] | |
fadd qword [esi + 40] | |
fstp qword [ecx] | |
pop edi | |
pop esi | |
end; | |
// HybridItIntPow3
// One power-3 iteration step on (x, y, z), computed on the x87 stack.
// With sx = x*x, sy = y*y, sz = z*z, R = sx + sy, c = [PVars+120] (commented as 3)
// and A = 1 - c*sz / (R + [PVars+24]):
//   x' = x*A*(sx - c*sy) + [esi+24]
//   z' = [esi+40] - z*[PVars-16]*(sz - c*R)
//   y' = y*A*(c*sx - sy) + [esi+32]
// where esi = PIteration3D and PVars = [esi+48]. w is not read or written.
// x', z', y' are stored in that order; each uses the not-yet-overwritten value of its
// own coordinate read back from memory.
// In:   eax = @x, edx = @y, ecx = @z; PIteration3D is read from [ebp+8] (frame set up
//       outside this body).
// Regs: esi and edi are saved and restored; the FPU stack is balanced on exit.
procedure HybridItIntPow3(var x, y, z, w: Double; PIteration3D: TPIteration3D); | |
asm | |
push esi | |
push edi | |
mov esi, [ebp + 8] //PIteration3D | |
fld qword [edx] | |
fmul st, st // y*y | |
fld qword [eax] // x, y*y | |
mov edi, [esi + 48] // PVars | |
fmul st, st // x*x, y*y | |
fld st(0) // x*x, x*x, y*y | |
fadd st(0), st(2) // x*x+y*y = R, x*x = sx, y*y = sy | |
fld qword [ecx] | |
fmul st, st // sz, R, sx, sy | |
fld qword [edi + 120] // 3, sz, R, sx, sy | |
fld st(1) // sz, 3, sz, R, sx, sy | |
fmul st(0), st(1) // 3*sz, 3, sz, R, sx, sy | |
fld st(3) | |
// [PVars+24] is added to R before it is used as the divisor
fadd qword [edi + 24] | |
fdivp // 3*sz/R, 3, sz, R, sx, sy | |
fld1 | |
// A := 1 - 3*sz/R
fsubrp | |
fld st(1) // 3, A, 3, sz, R, sx, sy | |
fmul st(0), st(6) // 3*sy, .. | |
fsubr st(0), st(5) // sx-3*sy, .. | |
fmul st(0), st(1) // A*(sx-3*sy), A, 3, sz, R, sx, sy | |
fmul qword [eax] | |
fadd qword [esi + 24] | |
fstp qword [eax] // A, 3, sz, R, sx, sy | |
fxch st(4) // sx, 3, sz, R, A, sy | |
fmul st(0), st(1) // 3*sx, 3, sz, R, A, sy | |
fsubrp st(5), st(0) // 3, sz, R, A, 3*sx-sy was: sy-3*sx! | |
fmulp st(2), st(0) // sz, 3*R, A, 3*sx-sy | |
fsubrp // sz-3*R, A, 3*sx-sy | |
fmul qword [ecx] | |
fmul qword [edi - 16] //*dZmul | |
// reversed subtract: st0 := [esi+40] - st0
fsubr qword [esi + 40] | |
fstp qword [ecx] // A, 3*sx-sy | |
fmulp // A*(3*sx-sy) | |
fmul qword [edx] | |
fadd qword [esi + 32] | |
fstp qword [edx] | |
pop edi | |
pop esi | |
end; | |
// HybridItIntPow4
// One power-4 iteration step on (x, y, z), computed on the x87 stack.
// With sx = x*x, sy = y*y, sz = z*z, R = sx + sy, c6 = [PVars+144] (commented as 6),
// c4 = [PVars+128] (commented as 4) and A = 1 + sz*(sz - c6*R) / (R*R + [PVars+24]):
//   x' = A*(sy*sy + sx*(sx - c6*sy)) + [esi+24]
//   z' = c4*z*[PVars-16]*sqrt(R)*(R - sz) + [esi+40]
//   y' = c4*x*y*A*(sx - sy) + [esi+32]
// where esi = PIteration3D and PVars = [esi+48]. w is not read or written.
// The original x is kept on the FPU stack, so y' still uses it after x' was stored.
// In:   eax = @x, edx = @y, ecx = @z; PIteration3D is read from [ebp+8] (frame set up
//       outside this body).
// Regs: esi and edi are saved and restored; the FPU stack is balanced on exit.
procedure HybridItIntPow4(var x, y, z, w: Double; PIteration3D: TPIteration3D); | |
asm | |
push esi | |
push edi | |
mov esi, [ebp + 8] //PIteration3D | |
fld qword [edx] | |
fmul st, st // y*y | |
fld qword [eax] // x, y*y | |
mov edi, [esi + 48] // PVars | |
fld st | |
fmul st, st // x*x, x, y*y | |
fld st(0) // x*x, x*x, x, y*y | |
fadd st(0), st(3) // x*x+y*y = R, sx, x, sy | |
fld qword [ecx] | |
fmul st, st // sz, R, sx, x, sy | |
fld qword [edi + 144] // 6, sz, R, sx, x, sy | |
fmul st, st(2) // 6*R, sz, R, sx, x, sy | |
fsubr st, st(1) // sz - 6*R, sz, R, sx, x, sy | |
fmul st(0), st(1) // sz * (sz - 6 * R), sz, R, sx, x, sy | |
fld st(2) | |
fmul st, st // R*R, sz * (sz - 6 * R), sz, R, sx, x, sy | |
fadd qword [edi + 24] // 24-112 +1e-40 | |
fdivp // sz * (sz - 6 * R) / R*R, sz, R, sx, x, sy | |
fld1 | |
faddp // A, sz, R, sx, x, sy | |
fld st(5) // sy, A, sz, R, sx, x, sy | |
fmul qword [edi + 144] // 6*sy, A, sz, R, sx, x, sy | |
fsubr st, st(4) // sx-6*sy, A, sz, R, sx, x, sy | |
fmul st(0), st(4) // sx*(sx-6*sy), A, sz, R, sx, x, sy | |
fld st(6) | |
fmul st, st | |
faddp // sy*sy + sx*(sx-6*sy), A, sz, R, sx, x, sy | |
fmul st, st(1) | |
fadd qword [esi + 24] | |
fstp qword [eax] // A, sz, R, sx, x, sy | |
fxch st(2) // R, sz, A, sx, x, sy | |
fsubr st(1), st // R, R-sz, A, sx, x, sy | |
fsqrt | |
fmulp // sqrt(R)*(R-sz), A, sx, x, sy | |
// [ecx] still holds the original z here
fmul qword [ecx] | |
fmul qword [edi + 128] //*4 | |
fmul qword [edi - 16] //*dZmul | |
fadd qword [esi + 40] | |
fstp qword [ecx] // A, sx, x, sy | |
fxch // sx, A, x, sy y := 4 * x * y * A * (sx - sy) + J2; | |
fsubrp st(3), st // A, x, sx-sy | |
fmulp // A*x, sx-sy | |
fmulp | |
fmul qword [edi + 128] //*4 | |
fmul qword [edx] //*y | |
fadd qword [esi + 32] | |
fstp qword [edx] | |
pop edi | |
pop esi | |
end; | |
// HybridIntP5
// One power-5 iteration step on (x, y, z), computed on the x87 stack.
// With sx = x*x, sy = y*y, sz = z*z, R = sx + sy, c5 = [PVars+136] (commented as 5),
// c10 = [PVars+168] (commented as 10) and
// A = 1 + c5*(sz*sz - 2*sz*R) / (R*R + [PVars+24]):
//   y' = y*A*(c5*sx*sx - sy*(c10*sx - sy)) + [esi+32]
//   z' = z*[PVars-16]*(sz*(sz - 2*c5*R) + c5*R*R) + [esi+40]
//   x' = x*A*(sx*(sx - 2*c5*sy) + c5*sy*sy) + [esi+24]
// where esi = PIteration3D and PVars = [esi+48]. w is not read or written.
// y', z', x' are stored in that order; each multiplies by its own coordinate read from
// memory before that coordinate is overwritten.
// In:   eax = @x, edx = @y, ecx = @z; PIteration3D is read from [ebp+8] (frame set up
//       outside this body).
// Regs: esi and edi are saved and restored; up to eight x87 registers are in use at
//       the deepest point and the stack is balanced on exit.
procedure HybridIntP5(var x, y, z, w: Double; PIteration3D: TPIteration3D); | |
asm | |
push esi | |
push edi | |
mov esi, [ebp + 8] //PIteration3D | |
fld qword [edx] | |
fmul st, st // y*y | |
fld qword [eax] // x, y*y | |
mov edi, [esi + 48] // PVars | |
fmul st, st // x*x, y*y | |
fld st // x*x, x*x, y*y | |
fadd st, st(2) // x*x+y*y = R, sx, sy | |
fld qword [ecx] | |
fmul st, st // sz, R, sx, sy | |
fld qword [edi + 136] // 5, sz, R, sx, sy | |
fld st // 5, 5, sz, R, sx, sy | |
fld st(2) | |
fmul st, st(4) | |
fadd st, st // sz*R*2, 5, 5, sz, R, sx, sy | |
fld st(3) | |
fmul st, st | |
fsubrp // sz*sz - sz*R*2, 5, 5, sz, R, sx, sy | |
fmulp // (sz*sz - sz*R*2) * 5, 5, sz, R, sx, sy | |
fld st(3) | |
fmul st, st // R*R, (sz*sz - sz*R*2) * 5, 5, sz, R, sx, sy | |
fadd qword [edi + 24] // 24-112 +1e-40 | |
fdivp // (sz*sz - sz*R*2) * 5 / R*R, 5, sz, R, sx, sy | |
fld1 | |
faddp // A, 5, sz, R, sx, sy | |
// --- y' ---
fld st(4) // sx, A, 5, sz, R, sx, sy | |
fmul qword [edi + 168] // 10*sx, A, 5, sz, R, sx, sy | |
fsub st, st(6) // 10*sx - sy, A, 5, sz, R, sx, sy | |
fmul st, st(6) // sy*(10*sx - sy), A, 5, sz, R, sx, sy | |
fld st(5) | |
fmul st, st | |
fmul st, st(3) // 5*sx*sx, sy*(10*sx - sy), A, 5, sz, R, sx, sy | |
fsubrp // 5*sx*sx - sy*(10*sx - sy), A, 5, sz, R, sx, sy | |
fmul st, st(1) | |
fmul qword [edx] | |
fadd qword [esi + 32] | |
fstp qword [edx] // A, 5, sz, R, sx, sy | |
// --- z' --- (10*R is formed as 5*R doubled)
fld st(3) | |
fmul st, st(2) | |
fadd st, st // 10*R, A, 5, sz, R, sx, sy | |
fsubr st, st(3) // sz-10*R, A, 5, sz, R, sx, sy | |
fmulp st(3), st // A, 5, sz*(sz-10*R), R, sx, sy | |
fxch st(3) // R, 5, sz*(sz-10*R), A, sx, sy | |
fmul st, st // | |
fmul st, st(1) // | |
faddp st(2), st // 5, sz*(sz-10*R)+5*R*R, A, sx, sy | |
fld st(4) | |
fmul st, st | |
fmul st, st(1) // 5*sy*sy, 5, sz*(sz-10*R)+5*R*R, A, sx, sy | |
fxch st(5) // sy, 5, sz*(sz-10*R)+5*R*R, A, sx, 5*sy*sy | |
fmulp | |
fadd st, st // 10*sy, sz*(sz-10*R)+5*R*R, A, sx, 5*sy*sy | |
fsubr st, st(3) // sx-10*sy, sz*(sz-10*R)+5*R*R, A, sx, 5*sy*sy | |
fmulp st(3), st // sz*(sz-10*R)+5*R*R, A, sx*(sx-10*sy), 5*sy*sy | |
fmul qword [ecx] | |
fmul qword [edi - 16] //*dZmul | |
fadd qword [esi + 40] | |
fstp qword [ecx] // A, sx*(sx-10*sy), 5*sy*sy | |
// --- x' ---
fmul qword [eax] | |
fxch // sx*(sx-10*sy), A*x, 5*sy*sy | |
faddp st(2), st // A*x, sx*(sx-10*sy)+5*sy*sy | |
fmulp | |
fadd qword [esi + 24] | |
fstp qword [eax] | |
pop edi | |
pop esi | |
end; | |
// HybridIntP6 - one in-place x87 update of x, y and z for the "IntP6" hybrid formula. | |
// In:  eax = @x, edx = @y, ecx = @z (each read and overwritten as a Double); | |
//      [ebp + 8] = PIteration3D. w is not referenced by this routine. | |
// Uses: esi = PIteration3D; edi = the pointer read from [esi + 48] (PVars) advanced by 112, | |
//       used as the base for the constant loads. esi and edi are saved and restored. | |
// The new y, z and x are stored in that order, after adding the qwords at | |
// [esi + 32], [esi + 40] and [esi + 24] respectively. | |
// The trailing comments track the x87 register stack, top of stack first. | |
procedure HybridIntP6(var x, y, z, w: Double; PIteration3D: TPIteration3D); | |
asm | |
push esi | |
push edi | |
mov esi, [ebp + 8] //PIteration3D | |
fld qword [edx] | |
fmul st, st // y*y | |
fld qword [eax] // x, y*y | |
mov edi, [esi + 48] // PVars | |
fmul st, st // x*x, y*y | |
fld st // x*x, x*x, y*y | |
fadd st, st(2) // x*x+y*y = R, sx, sy | |
fld qword [ecx] | |
add edi, 112 | |
fmul st, st // sz, R, sx, sy | |
fld qword [edi + 176-112] // 15, sz, R, sx, sy | |
fld st // 15, 15, sz, R, sx, sy | |
fmul st, st(3) // 15*R, | |
fsubr st, st(2) // sz-R*15, 15, sz, R, sx, sy | |
fmul st, st(2) | |
fld st(3) | |
fmul st, st // R*R, sz*(sz-R*15), 15, sz, R, sx, sy | |
fxch | |
fld st(1) // R*R, sz*(sz-R*15), R*R, 15, sz, R, sx, sy | |
fmulp st(3), st // sz*(sz-R*15), R*R, 15*R*R, sz, R, sx, sy | |
faddp st(2), st // R*R, 15*R*R+sz*(sz-R*15), sz, R, sx, sy | |
fxch // 15*R*R+sz*(sz-R*15), R*R, sz, R, sx, sy | |
fmul st, st(2) // sz*(15*R*R+sz*(sz-R*15)), R*R, sz, R, sx, sy | |
fld st(1) | |
fmul st, st(4) // R*R*R, sz*(15*R*R+sz*(sz-R*15)), R*R, sz, R, sx, sy | |
fadd qword [edi + 24-112] // 24-112 +1e-40 | |
fdivp // sz*(15*R*R+sz*(sz-R*15)) / R*R*R, R*R, sz, R, sx, sy | |
fld1 | |
fsubrp // 1 - sz*(15*R*R+sz*(sz-R*15)) / R*R*R, R*R, sz, R, sx, sy | |
fld st(5) // sy, A, R*R, sz, R, sx, sy | |
fmul qword [edi + 168-112] // 10*sy, A, R*R, sz, R, sx, sy | |
fld st(5) | |
fmul qword [edi + 120-112] // 3*sx, 10*sy, A, R*R, sz, R, sx, sy | |
fsubrp // 3*sx-10*sy, A, R*R, sz, R, sx, sy | |
fmul st, st(5) // sx*(3*sx-10*sy), A, R*R, sz, R, sx, sy | |
fld st(6) // sy, | |
fmul st, st | |
fmul qword [edi + 120-112] // 3*sy*sy, sx*(3*sx-10*sy), A, R*R, sz, R, sx, sy | |
faddp // 3*sy*sy+sx*(3*sx-10*sy), A, R*R, sz, R, sx, sy | |
fmul st, st(1) | |
fmul qword [edx] | |
fmul qword [eax] // z := PDouble(Integer(PVar) - 16)^*2*z*Sqrt(R)*(sz*(3*sz - 10*R) + 3*R*R) + J3; | |
fadd st, st // x := A*(S1*S1*(S1 - 15*S2) + S2*S2*(15*S1 - S2)) + J1; | |
// new y: add [esi + 32] and store through edx | |
fadd qword [esi + 32] | |
fstp qword [edx] // A, R*R, sz, R, sx, sy | |
fld st(3) | |
fmul qword [edi + 168-112] | |
fld st(3) // sz, 10*R, A, R*R, sz, R, sx, sy | |
fmul qword [edi + 120-112] | |
fsubrp // 3*sz-10*R, A, R*R, sz, R, sx, sy | |
fmulp st(3), st // A, R*R, sz*(3*sz-10*R), R, sx, sy | |
fxch | |
fmul qword [edi + 120-112] // 3*R*R, A, sz*(3*sz-10*R), R, sx, sy | |
faddp st(2), st // A, sz*(3*sz-10*R)+3*R*R, R, sx, sy | |
fld qword [edi + 176-112] // 15 | |
fld st // 15, 15, A, sz*(3*sz-10*R)+3*R*R, R, sx, sy | |
fmul st, st(5) | |
fsub st, st(6) | |
fmul st, st(6) | |
fmul st, st(6) // S2*S2*(15*S1-S2), 15, A, sz*(3*sz-10*R)+3*R*R, R, sx, sy | |
fxch | |
fmulp st(6), st // S2*S2*(15*S1-S2), A, sz*(3*sz-10*R)+3*R*R, R, sx, 15*sy | |
fxch st(5) // 15*sy, A, sz*(3*sz-10*R)+3*R*R, R, sx, S2*S2*(15*S1-S2) | |
fsubr st, st(4) // sx-15*sy, A, sz*(3*sz-10*R)+3*R*R, R, sx, S2*S2*(15*S1-S2) | |
fmul st, st(4) | |
fmulp st(4), st // A, sz*(3*sz-10*R)+3*R*R, R, sx*sx*(sx-15*sy), S2*S2*(15*S1-S2) | |
fxch st(4) // S2*S2*(15*S1-S2), sz*(3*sz-10*R)+3*R*R, R, sx*sx*(sx-15*sy), A | |
faddp st(3), st // sz*(3*sz-10*R)+3*R*R, R, S2*S2*(15*S1-S2)+sx*sx*(sx-15*sy), A | |
fxch | |
fsqrt | |
fmulp // (sz*(3*sz-10*R)+3*R*R)*sqrt(R), S2*S2*(15*S1-S2)+sx*sx*(sx-15*sy), A | |
fmul qword [ecx] | |
fmul qword [edi - 16-112] //*dZmul | |
fadd st, st | |
// new z: add [esi + 40] and store through ecx | |
fadd qword [esi + 40] | |
fstp qword [ecx] // S2*S2*(15*S1-S2)+sx*sx*(sx-15*sy), A | |
fmulp | |
// new x: add [esi + 24] and store through eax | |
fadd qword [esi + 24] | |
fstp qword [eax] | |
pop edi | |
pop esi | |
end; | |
// HybridIntP7 - one in-place x87 update of x, y and z for the "IntP7" hybrid formula. | |
// In:  eax = @x, edx = @y, ecx = @z (each read and overwritten as a Double); | |
//      [ebp + 8] = PIteration3D. w is not referenced by this routine. | |
// Uses: esi = PIteration3D; edi = the pointer read from [esi + 48] (PVars) advanced by 112, | |
//       used as the base for the constant loads. esi and edi are saved and restored. | |
// y is stored first (after adding [esi + 32]), then z (computed as [esi + 40] minus the | |
// product, via fsubr), then x (after adding [esi + 24]). | |
// The trailing comments track the x87 register stack, top of stack first. | |
procedure HybridIntP7(var x, y, z, w: Double; PIteration3D: TPIteration3D); | |
asm | |
push esi | |
push edi | |
mov esi, [ebp + 8] //PIteration3D | |
fld qword [edx] | |
fmul st, st // y*y | |
fld qword [eax] // x, y*y | |
mov edi, [esi + 48] // PVars | |
fmul st, st // x*x, y*y | |
fld st // x*x, x*x, y*y | |
fadd st, st(2) // x*x+y*y = R, sx, sy | |
fld qword [ecx] | |
add edi, 112 | |
fmul st, st // sz, R, sx, sy | |
fld st(1) // R, sz, R, sx, sy | |
fmul qword [edi + 136-112] // 5R, | |
fsubr st, st(1) // sz-5R, sz, R, sx, sy | |
fmul st, st(1) // sz(sz-5R), sz, R, sx, sy | |
fld st(2) | |
fmul st, st // R*R, sz(sz-5R), sz, R, sx, sy | |
fxch // sz(sz-5R), R*R, sz, R, sx, sy | |
fld st(1) // R*R, sz(sz-5R), R*R, sz, R, sx, sy | |
fmul qword [edi + 120-112] // 3*R*R, sz(sz-5R), R*R, sz, R, sx, sy | |
faddp // 3RR+sz(sz-5R), R*R, sz, R, sx, sy | |
fmul st, st(2) // sz(3RR+sz(sz-5R)), R*R, sz, R, sx, sy | |
fmul qword [edi + 152-112] | |
fld st(1) | |
fmul st, st(4) // R*R*R, 7sz(3RR+sz(sz-5R)), R*R, sz, R, sx, sy | |
fadd qword [edi + 24-112] // 24-112 +1e-40 | |
fdivp // 7sz(3RR+sz(sz-5R))/RRR, R*R, sz, R, sx, sy | |
fld1 | |
fsubrp // A, R*R, sz, R, sx, sy | |
fld st(5) // sy, A, R*R, sz, R, sx, sy | |
fmul qword [edi + 200-112] // 35*sy, A, R*R, sz, R, sx, sy | |
fld st(5) | |
fmul qword [edi + 152-112] // 7*sx, 35*sy, A, R*R, sz, R, sx, sy | |
fsubrp // 7*sx-35*sy, A, R*R, sz, R, sx, sy | |
fmul st, st(5) // sx*(7*sx-35*sy), A, R*R, sz, R, sx, sy | |
fld st(6) // sy, | |
fmul st, st | |
fmul qword [edi + 184-112] // 21*sy*sy, sx*(7*sx-35*sy), A, R*R, sz, R, sx, sy | |
faddp // 21sysy+sx(7sx-35sy), A, R*R, sz, R, sx, sy | |
fmul st, st(5) | |
fld st(6) | |
fmul st, st | |
fmul st, st(7) // sysysy, sx(21sysy+sx(7sx-35sy)), A, R*R, sz, R, sx, sy | |
fsubp // sx(21sysy+sx(7sx-35sy))-sysysy, A, R*R, sz, R, sx, sy | |
fmul st, st(1) | |
fmul qword [edx] | |
// new y: add [esi + 32] and store through edx | |
fadd qword [esi + 32] | |
fstp qword [edx] // A, R*R, sz, R, sx, sy | |
fmul qword [eax] // z := J3 - PDouble(Integer(PVar) - 16)^*z*(sz*sz*sz - 7*R*(sz*(3*sz - 5*R) + R*R)); | |
// x := A*x*(sx*(sx*(sx - 21*sy) + 35*sy*sy) - 7*sy*sy*sy) + J1; | |
fld st(3) | |
fmul qword [edi + 136-112] // 5R, A*x, R*R, sz, R, sx, sy | |
fld st(3) // sz, 5R, A*x, R*R, sz, R, sx, sy | |
fmul qword [edi + 120-112] | |
fsubrp // 3sz-5R, A*x, R*R, sz, R, sx, sy | |
fmul st, st(3) // sz(3sz-5R), A*x, R*R, sz, R, sx, sy | |
faddp st(2), st // A*x, RR+sz(3sz-5R), sz, R, sx, sy | |
fxch | |
fmul qword [edi + 152-112] // 7(RR+sz(3sz-5R)), A*x, sz, R, sx, sy | |
fmulp st(3), st // A*x, sz, 7R(sz(3sz-5R)+RR), sx, sy | |
fld st(1) | |
fmul st, st | |
fmulp st(2), st // A*x, szszsz, 7R(RR+sz(3sz-5R)), sx, sy | |
fxch // szszsz, A*x, 7R(RR+sz(3sz-5R)), sx, sy | |
fsubrp st(2), st // A*x, szszsz-7R(RR+sz(3sz-5R)), sx, sy | |
fld st(3) // sy, A*x, szszsz-7R(RR+sz(3sz-5R)), sx, sy | |
fmul qword [edi + 184-112] // 21sy, A*x, szszsz-7R(RR+sz(3sz-5R)), sx, sy | |
fsubr st, st(3) // sx-21sy, A*x, szszsz-7R(RR+sz(3sz-5R)), sx, sy | |
fmul st, st(3) // sx(sx-21sy) | |
fld st(4) // sy, sx(sx-21sy), A*x, szszsz-7R(RR+sz(3sz-5R)), sx, sy | |
fmul st, st // sysy, sx(sx-21sy), A*x, szszsz-7R(RR+sz(3sz-5R)), sx, sy | |
fmul qword [edi + 200-112] // 35sysy, sx(sx-21sy), A*x, szszsz-7R(RR+sz(3sz-5R)), sx, sy | |
faddp // 35sysy+sx(sx-21sy), A*x, szszsz-7R(RR+sz(3sz-5R)), sx, sy | |
fmulp st(3), st // A*x, szszsz-7R(RR+sz(3sz-5R)), sx(35sysy+sx(sx-21sy)), sy | |
fxch st(3) // sy, szszsz-7R(RR+sz(3sz-5R)), sx(35sysy+sx(sx-21sy)), A*x | |
fld st | |
fmul st, st | |
fmulp | |
fmul qword [edi + 152-112] // 7sysysy, szszsz-7R(RR+sz(3sz-5R)), sx(35sysy+sx(sx-21sy)), A*x | |
fsubp st(2), st // szszsz-7R(RR+sz(3sz-5R)), sx(35sysy+sx(sx-21sy))-7sysysy, A*x | |
fmul qword [ecx] | |
fmul qword [edi - 16-112] //*dZmul | |
// new z: fsubr yields [esi + 40] minus the product; store through ecx | |
fsubr qword [esi + 40] | |
fstp qword [ecx] // sx(35sysy+sx(sx-21sy))-7sysysy, A*x | |
fmulp | |
// new x: add [esi + 24] and store through eax | |
fadd qword [esi + 24] | |
fstp qword [eax] | |
pop edi | |
pop esi | |
end; | |
// HybridIntP8 - one in-place x87 update of x, y and z for the power-8 formula. | |
// In:  eax = @x, edx = @y, ecx = @z (each read and overwritten as a Double); | |
//      [ebp + 8] = PIteration3D. w is not referenced by this routine. | |
// Uses: esi = PIteration3D; edi = the pointer read from [esi + 48] (PVars) advanced by 88, | |
//       used as the base for the constant loads. esi and edi are saved and restored. | |
// z is computed and stored first, then the shared factor "a", then y, then x. | |
// The trailing comments track the x87 register stack, top of stack first. | |
procedure HybridIntP8(var x, y, z, w: Double; PIteration3D: TPIteration3D); //P8 white's formula | |
asm | |
push esi | |
push edi | |
mov esi, [ebp + 8] //PIteration3D | |
fld qword [eax] //x | |
mov edi, [esi + 48] //PVars | |
fmul st(0), st(0) //xx | |
fld qword [edx] //y | |
add edi, 88 | |
fmul st(0), st(0) //yy,xx | |
fld qword [ecx] //z,yy,xx | |
fmul st(0), st(0) //zz,yy,xx | |
fld st(2) //xx,zz,yy,xx | |
fadd st(0), st(2) //xx+yy=r,zz,yy,xx | |
fld st(0) //r,r,zz,yy,xx | |
fmul st(0), st(1) //rr,r,zz,yy,xx | |
fld st(2) | |
fmul st(0), st(0) //zzzz(S3*S3),rr,r,zz,yy,xx | |
fld st(2) //r,zzzz(S3*S3),rr,r,zz,yy,xx z calculation | |
fmul st(0), st(4) //r*zz | |
fmul qword [edi + 56] //6*r*zz,zzzz(S3*S3),rr,r,zz,yy,xx | |
fsubr st(0), st(1) //zzzz-6rzz,zzzz,rr,r,zz,yy,xx | |
fadd st(0), st(2) //zzzz-6rzz+rr,zzzz,rr,r,zz,yy,xx | |
fld st(4) //zz,zzzz-6rzz+rr,zzzz,rr,r,zz,yy,xx | |
fsub st(0), st(4) //zz-r,zzzz-6rzz+rr,zzzz,rr,r,zz,yy,xx | |
fmulp //(zz-r)*(zzzz-6rzz+rr),zzzz,rr,r,zz,yy,xx | |
fld st(3) //r,(zz-r)*(zzzz-6rzz+rr),zzzz,rr,r,zz,yy,xx | |
fsqrt | |
fmulp //sqrt(r)*(zz-r)*(zzzz-6rzz+rr),zzzz,rr,r,zz,yy,xx | |
fmul qword [ecx] //*z | |
fmul qword [edi + 72] //*8 | |
fmul qword [edi - 104] //*dZmul | |
fchs | |
fadd qword [esi + 40] //+J3 | |
fstp qword [ecx] //zzzz,rr,r,zz,yy,xx | |
fld st(0) //zzzz,zzzz,rr,r,zz,yy,xx a calculation | |
fadd st(0), st(2) //zzzz+rr,zzzz,rr,r,zz,yy,xx | |
fmulp st(3), st(0) //zzzz,rr,r*(zzzz+rr),zz,yy,xx | |
fld st(1) //rr,zzzz,rr,r*(zzzz+rr),zz,yy,xx | |
fmul qword [edi + 120] //rr*70,zzzz,rr,r*(zzzz+rr),zz,yy,xx | |
fadd st(0), st(1) | |
fmulp //(rr*70+zzzz)*zzzz,rr,r*(zzzz+rr),zz,yy,xx | |
fxch st(2) //r*(zzzz+rr),rr,(rr*70+zzzz)*zzzz,zz,yy,xx | |
fmulp st(3), st(0) //rr,(rr*70+zzzz)*zzzz,zz*r*(zzzz+rr),yy,xx | |
fxch st(2) //zz*r*(zzzz+rr),(rr*70+zzzz)*zzzz,rr,yy,xx | |
fmul qword [edi + 104] //28*zz*r*(zzzz+rr),(rr*70+zzzz)*zzzz,rr,yy,xx | |
fsubp //(rr*70+zzzz)*zzzz-28*zz*r*(zzzz+rr),rr,yy,xx | |
fxch st(1) | |
fmul st(0), st(0) //rrrr,(rr*70+zzzz)*zzzz-28*zz*r*(zzzz+rr),yy,xx | |
fadd qword [edi - 64] // 24-88 +1e-40 | |
fdivp //(zzzz*(rr*70+zzzz)-28*zz*r*(zzzz+rr))/rrrr,yy,xx | |
fadd qword [edi - 56] //a,yy,xx +1 | |
fld st(1) //yy,a,yy,xx y calculation | |
fmul qword [edi + 64] //7*yy,a,yy,xx + 152-128=24 | |
fld st(3) //xx,7*yy,a,yy,xx | |
fmul qword [edi + 64] //7*xx,7*yy,a,yy,xx | |
fsub st(0), st(3) //7*xx-yy,7*yy,a,yy,xx | |
fld st(4) //xx,7*xx-yy,7*yy,a,yy,xx | |
fsubr st(2), st(0) //xx,7*xx-yy,xx-7*yy,a,yy,xx | |
fmul st(0), st(0) //xxxx,7*xx-yy,xx-7*yy,a,yy,xx | |
fmul st(2), st(0) //xxxx,7xx-yy,xxxx(xx-7yy),a,yy,xx | |
fld st(4) //yy,xxxx,7xx-yy,xxxx(xx-7yy),a,yy,xx | |
fmul st(0), st(0) //yyyy,xxxx,7xx-yy,xxxx(xx-7yy),a,yy,xx | |
fmul st(2), st(0) //yyyy,xxxx,yyyy(7xx-yy),xxxx(xx-7yy),a,yy,xx | |
fxch st(2) //yyyy(7xx-yy),xxxx,yyyy,xxxx(xx-7yy),a,yy,xx | |
faddp st(3), st(0) //xxxx,yyyy,yyyy(7xx-yy)+xxxx(xx-7yy),a,yy,xx | |
fxch st(2) //yyyy(7xx-yy)+xxxx(xx-7yy),yyyy,xxxx,a,yy,xx | |
fmul qword [edi + 72] //*8 | |
fmul qword [eax] //*x | |
fmul qword [edx] //*y | |
fmul st(0), st(3) //*a | |
fadd qword [esi + 32] //+J2 | |
fstp qword [edx] //yyyy,xxxx,a,yy,xx | |
fld st(1) //xxxx,yyyy,xxxx,a,yy,xx | |
fmul qword [edi + 120] //70xxxx,yyyy,xxxx,a,yy,xx | |
fadd st(0), st(1) //70xxxx+yyyy,yyyy,xxxx,a,yy,xx | |
fmul st(0), st(1) //yyyy(70xxxx+yyyy),yyyy,xxxx,a,yy,xx | |
fxch st(1) //yyyy,yyyy(70xxxx+yyyy),xxxx,a,yy,xx | |
fadd st(0), st(2) //yyyy+xxxx,yyyy(70xxxx+yyyy),xxxx,a,yy,xx | |
fmulp st(4), st(0) //yyyy(70xxxx+yyyy),xxxx,a,yy(yyyy+xxxx),xx | |
fxch st(4) //xx,xxxx,a,yy(yyyy+xxxx),yyyy(70xxxx+yyyy) | |
fmulp st(3), st(0) //xxxx,a,xxyy(yyyy+xxxx),yyyy(70xxxx+yyyy) | |
fmul st(0), st(0) //xxxx*xxxx,a,xxyy(yyyy+xxxx),yyyy(70xxxx+yyyy) | |
faddp st(3), st(0) //a,xxyy(yyyy+xxxx),xxxx*xxxx+yyyy(70xxxx+yyyy) | |
fxch st(1) //xxyy(yyyy+xxxx),a,xxxx*xxxx+yyyy(70xxxx+yyyy) | |
fmul qword [edi + 104] | |
fsubp st(2), st(0) //a,xxxx*xxxx+yyyy(70xxxx+yyyy)-28xxyy(yyyy+xxxx) | |
fmulp | |
// new x: add [esi + 24] and store through eax | |
fadd qword [esi + 24] | |
fstp qword [eax] | |
pop edi | |
pop esi | |
end; | |
// HybridCubeSSE2 - SSE2 fold-and-scale step on x, y and z (w is not referenced). | |
// In: eax = @x (x and the following qword are loaded together as one 16-byte pair), | |
//     edx = @y, ecx = @z, [ebp + 8] = PIteration3D. | |
// Steps: 1. clamp each coordinate between the bounds at [ebx - 64] and [ebx - 48], | |
//           then form 2*clamped - original; | |
//        2. r2 = x*x + y*y + z*z of the folded values; | |
//        3. multiplier: r2 < [ebx - 32]  -> [ebx - 24] | |
//                       r2 < [ebx + 32]  -> [ebx - 16] / r2 | |
//                       otherwise        -> [ebx - 16]; | |
//        4. store coordinate*multiplier + (J1, J2, J3) through eax, edx and ecx. | |
// maxpd/minpd use memory operands, which SSE2 requires to be 16-byte aligned. | |
// esi and ebx are saved and restored. | |
procedure HybridCubeSSE2(var x, y, z, w: Double; PIteration3D: TPIteration3D); // is used in alt hybrid without DE on w | |
asm | |
push esi | |
push ebx | |
mov esi, [ebp + 8] //PIteration3D | |
mov ebx, [esi + 48] | |
movupd xmm2, [eax] //[x,y] | |
movsd xmm4, [ecx] //[z] | |
movapd xmm0, xmm2 | |
maxpd xmm0, [ebx - 64] //const:-1,-1,1,1 | |
maxsd xmm4, [ebx - 64] | |
minpd xmm0, [ebx - 48] | |
minsd xmm4, [ebx - 48] | |
addpd xmm0, xmm0 | |
addsd xmm4, xmm4 | |
subpd xmm0, xmm2 | |
subsd xmm4, [ecx] | |
movapd xmm1, xmm0 //x, y | |
movsd xmm5, xmm4 | |
mulpd xmm1, xmm1 //x*x, y*y | |
mulsd xmm5, xmm5 //z*z | |
pshufd xmm6, xmm1, $4E //y*y, x*x copies and swaps hi<>lo | |
addsd xmm1, xmm5 //x*x + z*z | |
addsd xmm1, xmm6 // w = sqr(r) | |
ucomisd xmm1, [ebx - 32] //<dOption2 was:dOpt3 | |
jnb @u1 | |
movsd xmm3, [ebx - 24] //dOption1 | |
jmp @u3 | |
// movsd does not change the flags, so the jnb below still tests the ucomisd result | |
@u1:ucomisd xmm1, [ebx + 32] //<1 ? ucomisd slow? | |
movsd xmm3, [ebx - 16] //dPow = scale //Was:dOpt2 | |
jnb @u3 | |
divsd xmm3, xmm1 | |
// broadcast the multiplier to both lanes | |
@u3:shufpd xmm3, xmm3, 0 | |
movupd xmm5, [esi + 24] //[J1,J2] | |
mulpd xmm0, xmm3 | |
mulsd xmm4, xmm3 | |
addpd xmm0, xmm5 | |
addsd xmm4, [esi + 40] //J3 | |
movlpd [eax], xmm0 | |
movhpd [edx], xmm0 | |
movsd [ecx], xmm4 | |
pop ebx | |
pop esi | |
end; | |
// HybridCube - x87 fold-and-scale step on x, y and z (w is not referenced). | |
// In: eax = @x (copied to ebx because fnstsw ax overwrites ax), edx = @y, ecx = @z, | |
//     [ebp + 8] = PIteration3D. | |
// Fold:  v := abs(v + fold) - abs(v - fold) - v for each coordinate, fold = qword [esi - 40]. | |
// Scale: with r = x*x + y*y + z*z of the folded values, | |
//        r < [esi - 32]  -> multiplier = [esi - 24] | |
//        r <= 1          -> multiplier = [esi - 16] / r | |
//        otherwise       -> multiplier = [esi - 16]. | |
// z, y and x are then multiplied, increased by [esi + 40], [esi + 32], [esi + 24] | |
// (esi reloaded with PIteration3D) and stored. eax is restored from ebx on exit. | |
procedure HybridCube(var x, y, z, w: Double; PIteration3D: TPIteration3D); | |
asm | |
push esi //Amazing box x87 with options fold fold x2 | |
push ebx | |
mov esi, [ebp + 8] //PIteration3D | |
mov esi, [esi + 48] //was:PAligned16 | |
mov ebx, eax | |
fld qword [esi - 40] //fold | |
fld qword [eax] //x,fold | |
fld st(0) //x,x,fold folding with x = abs(x+fold) - abs(x-fold) - x | |
fsub st(0), st(2) | |
fabs | |
fadd st(0), st(1) //abs(x-fold)+x,x,fold | |
fxch //x,abs(x-fold)+x,fold | |
fadd st(0), st(2) | |
fabs | |
fsubrp //abs(x+fold)-(abs(x-fold)+x),fold | |
fld qword [edx] //y,x',fold | |
fld st(0) | |
fsub st(0), st(3) | |
fabs | |
fadd st(0), st(1) | |
fxch | |
fadd st(0), st(3) | |
fabs | |
fsubrp | |
fld qword [ecx] //z,y',x',fold | |
fld st(0) | |
fsub st(0), st(4) | |
fabs | |
fadd st(0), st(1) | |
fxch | |
fadd st(0), st(4) | |
fabs | |
fsubrp //z,y,x,fold | |
fld st(0) //7 | |
fmul st(0), st(1) | |
fld st(2) //8 | |
fmul st(0), st(3) | |
faddp //7 | |
fld st(3) //8 | |
fmul st(0), st(4) | |
faddp //r,z,y,x,fold | |
fcom qword [esi - 32] | |
fnstsw ax | |
shr ah, 1 | |
jnc @@7 | |
fstp st(0) | |
fld qword [esi - 24] | |
jmp @@9 | |
@@7: //r,z,y,x,fold | |
fld1 | |
fcom st(1) | |
fnstsw ax | |
shr ah, 1 | |
jc @@8 | |
fstp st(0) | |
fdivr qword [esi - 16] | |
jmp @@9 | |
@@8: | |
fcompp | |
fld qword [esi - 16] | |
@@9: | |
fmul st(3), st(0) //mul,z,y,x*mul,fold | |
fmul st(2), st(0) | |
fmulp //z*mul,y*mul,x*mul,fold | |
mov esi, [ebp + 8] | |
fadd qword [esi + 40] | |
fstp qword [ecx] | |
fadd qword [esi + 32] | |
fstp qword [edx] | |
fadd qword [esi + 24] | |
fstp qword [ebx] | |
fstp st(0) | |
mov eax, ebx | |
pop ebx | |
pop esi | |
end; | |
// HybridCubeDE - x87 fold-and-scale step on x, y and z that also scales the qword at | |
// [ecx + 8] (labelled w in the stack comments) by the same multiplier. | |
// In: eax = @x (copied to ebx because fnstsw ax overwrites ax), edx = @y, ecx = @z, | |
//     [ebp + 8] = PIteration3D. | |
// Fold:  v := abs(v + fold) - abs(v - fold) - v, fold = qword [esi - 40]; a negated copy | |
//        of fold is kept on the x87 stack so both terms are formed with fadd. | |
// Scale: with r = x*x + y*y + z*z of the folded values, | |
//        r < [esi - 32]  -> multiplier = [esi - 24] | |
//        r <= 1          -> multiplier = [esi - 16] / r | |
//        otherwise       -> multiplier = [esi - 16]. | |
// z, y and x are then multiplied, increased by [esi + 40], [esi + 32], [esi + 24] | |
// (esi reloaded with PIteration3D) and stored. The final fcompp pops the two fold values. | |
procedure HybridCubeDE(var x, y, z, w: Double; PIteration3D: TPIteration3D); | |
asm | |
push esi //Amazing box without adding c x87 with option fold | |
push ebx | |
mov esi, [ebp + 8] //PIteration3D | |
mov ebx, eax | |
mov esi, [esi + 48] //was:PAligned16 | |
fld qword [esi - 40] //fold | |
fld st(0) | |
fchs //-fold,fold | |
fld qword [ebx] //x,-fold,fold | |
fld st(0) //x,x,-fold,fold folding with x = abs(x+fold) - abs(x-fold) - x | |
fadd st(0), st(2) | |
fabs | |
fadd st(0), st(1) | |
fxch //x,abs(x-fold)+x,-fold,fold | |
fadd st(0), st(3) | |
fabs | |
fsubrp //abs(x+fold)-(abs(x-fold)+x),-fold,fold | |
fld qword [edx] //y,x,-fold,fold | |
fld st(0) | |
fadd st(0), st(3) | |
fabs | |
fadd st(0), st(1) | |
fxch | |
fadd st(0), st(4) | |
fabs | |
fsubrp | |
fld qword [ecx] //z,y,x,-fold,fold | |
fld st(0) | |
fadd st(0), st(4) | |
fabs | |
fadd st(0), st(1) | |
fxch | |
fadd st(0), st(5) | |
fabs | |
fsubrp | |
fld st(0) //7 | |
fmul st(0), st(1) | |
fld st(2) //8 | |
fmul st(0), st(3) | |
faddp //7 | |
fld st(3) //8 | |
fmul st(0), st(4) | |
faddp //r,z,y,x,-fold,fold | |
fcom qword [esi - 32] | |
fnstsw ax | |
shr ah, 1 | |
jnc @@7 | |
fstp st(0) | |
fld qword [esi - 24] | |
jmp @@9 | |
@@7: //r,z,y,x,-fold,fold | |
fld1 | |
fcom st(1) | |
fnstsw ax | |
shr ah, 1 | |
jc @@8 | |
fstp st(0) | |
fdivr qword [esi - 16] | |
jmp @@9 | |
@@8: | |
fcompp | |
fld qword [esi - 16] | |
@@9: | |
fld qword [ecx + 8] //w,mul,zr,yr,xr,-fold,fold | |
fmul st(0), st(1) | |
fstp qword [ecx + 8] | |
fmul st(3), st(0) //mul,zr,yr,xr,-fold,fold | |
fmul st(2), st(0) | |
fmulp //zr,yr,xr,-fold,fold | |
mov esi, [ebp + 8] | |
fadd qword [esi + 40] | |
fstp qword [ecx] | |
fadd qword [esi + 32] | |
fstp qword [edx] | |
fadd qword [esi + 24] | |
fstp qword [ebx] | |
fcompp | |
mov eax, ebx | |
pop ebx | |
pop esi | |
end; | |
// HybridCubeSSE2DE - SSE2 fold-and-scale step on x, y and z that also scales the qword | |
// at [ecx + 8] (presumably w, stored right after z - confirm against the caller). | |
// In: eax = @x (x and the following qword are loaded and stored as one 16-byte pair), | |
//     ecx = @z, [ebp + 8] = PIteration3D. edx is not used here. | |
// Steps: 1. clamp each coordinate between the bounds at [ebx - 64] and [ebx - 48], | |
//           then form 2*clamped - original; | |
//        2. r2 = x*x + y*y + z*z of the folded values; | |
//        3. multiplier: r2 < [ebx - 32]  -> [ebx - 24] | |
//                       r2 < [ebx + 32]  -> [ebx - 16] / r2 | |
//                       otherwise        -> [ebx - 16]; | |
//        4. x, y, z are multiplied and increased by J1, J2, J3; the high lane of xmm4 | |
//           ([ecx + 8]) is only multiplied. | |
// maxpd/minpd use memory operands, which SSE2 requires to be 16-byte aligned. | |
procedure HybridCubeSSE2DE(var x, y, z, w: Double; PIteration3D: TPIteration3D); | |
asm | |
push esi | |
push ebx | |
mov esi, [ebp + 8] //PIteration3D | |
mov ebx, [esi + 48] //was:PAligned16 | |
movupd xmm2, [eax] //[x,y] | |
movsd xmm4, [ecx] //[z] | |
movapd xmm0, xmm2 | |
maxpd xmm0, [ebx - 64] //const:-R,-R,R,R | |
maxsd xmm4, [ebx - 64] | |
minpd xmm0, [ebx - 48] | |
minsd xmm4, [ebx - 48] | |
addpd xmm0, xmm0 | |
addsd xmm4, xmm4 | |
subpd xmm0, xmm2 | |
subsd xmm4, [ecx] | |
movapd xmm1, xmm0 //x, y | |
movsd xmm5, xmm4 | |
mulpd xmm1, xmm1 //x*x, y*y | |
mulsd xmm5, xmm5 //z*z | |
pshufd xmm2, xmm1, $4E //y*y, x*x copies and swaps hi<>lo | |
addsd xmm1, xmm5 | |
addsd xmm1, xmm2 // w = sqr(r) | |
// the movsd loads below do not change the flags, so each branch tests the preceding ucomisd | |
ucomisd xmm1, [ebx - 32] //<dOption2 //7/6 clocks ucomisd latency :-( | |
movsd xmm3, [ebx - 24] //dOption1 | |
jb @u3 | |
ucomisd xmm1, [ebx + 32] //<1 ? | |
movsd xmm3, [ebx - 16] //dPow = scale | |
jnb @u3 | |
divsd xmm3, xmm1 | |
@u3: | |
movhpd xmm4, [ecx + 8] | |
shufpd xmm3, xmm3, 0 //r, r | |
movupd xmm5, [esi + 24] //[J1,J2] | |
mulpd xmm0, xmm3 | |
mulpd xmm4, xmm3 | |
addpd xmm0, xmm5 | |
addsd xmm4, [esi + 40] //J3 | |
movupd [eax], xmm0 | |
movupd [ecx], xmm4 | |
pop ebx | |
pop esi | |
end; | |
// HybridItIntPow2scale - power-2 x87 step with an extra scale factor s = qword [esi - 72]. | |
// In: eax = @x, edx = @y, ecx = @z, [ebp + 8] = PIteration3D (kept in edi); | |
//     esi = pointer read from [edi + 48]. w is not referenced. | |
// x, y and z are first divided by s. With a = xx + yy and A = (a - zz) / a the stores are: | |
//     z := -2 * sqrt(a) * z * s + [edi + 40] | |
//     x := A * (xx - yy) * s    + [edi + 24] | |
//     y := 2 * x * y * A * s    + [edi + 32] | |
// (all right-hand sides use the divided values). esi and edi are saved and restored. | |
procedure HybridItIntPow2scale(var x, y, z, w: Double; PIteration3D: TPIteration3D); //sine bulb with scaling | |
asm | |
push esi | |
push edi | |
mov edi, [ebp + 8] | |
mov esi, [edi + 48] | |
fld qword [ecx] | |
fld qword [edx] | |
fld qword [eax] //x,y,z | |
fld qword [esi - 72] // scaling | |
fld1 | |
fdivrp | |
fmul st(3), st(0) | |
fmul st(2), st(0) | |
fmulp | |
fld st(1) //y,x,y,z | |
fmul st(0), st(2) // y*y,x,y,z | |
fld st(1) // x,y*y,x,y,z | |
fmul st(0), st(2) // x*x, y*y,x,y,z | |
fld st(0) // x*x, x*x, y*y,x,y,z | |
fadd st(0), st(2) // xx+yy, xx, yy,x,y,z | |
fld st(0) // xx+yy, xx+yy, xx, yy,x,y,z | |
fsqrt | |
fmul st(0), st(6) //*z | |
fadd st(0), st(0) //*2 | |
fchs | |
fmul qword [esi - 72] | |
fadd qword [edi + 40] | |
fstp qword [ecx] //xx+yy, xx, yy,x,y,z | |
fld st(5) //z, xx+yy, xx, yy,x,y,z | |
fmulp st(6), st(0) //xx+yy, xx, yy,x,y,z*z | |
fld st(0) //xx+yy, xx+yy, xx, yy,x,y,z*z | |
fsubrp st(6), st(0) //xx+yy, xx, yy,x,y, a - z*z | |
fdivp st(5), st(0) //xx, yy,x,y, a - z*z / a = a | |
fsubrp //xx-yy,x,y, a | |
fmul st(0), st(3) //a(xx-yy),x,y, a | |
fmul qword [esi - 72] | |
fadd qword [edi + 24] | |
fstp qword [eax] //x,y, a | |
fmulp | |
fmulp //x*y*a | |
fadd st(0), st(0) //*2 | |
fmul qword [esi - 72] | |
fadd qword [edi + 32] | |
fstp qword [edx] | |
pop edi | |
pop esi | |
end; | |
// HybridFolding - fold x, y and z in place, then call the routine whose address is | |
// stored at [edi - 52]. | |
// In: eax = @x, edx = @y, ecx = @z, [ebp + 8] = PIteration3D, [ebp + 12] = the other | |
//     stack argument (w under the register calling convention). | |
// Each coordinate becomes abs(v + fold) - abs(v - fold) - v, with fold = qword [edi - 24]. | |
// The indirect call is made with eax, edx and ecx unchanged and with [ebp + 12] and | |
// PIteration3D pushed in that order. No stack cleanup follows the call, so the callee | |
// is expected to remove both pushed arguments itself. | |
procedure HybridFolding(var x, y, z, w: Double; PIteration3D: TPIteration3D); | |
asm | |
push esi | |
push edi | |
push ebx | |
mov esi, [ebp + 8] //PIteration3D | |
mov edi, [esi + 48] | |
fld qword [edi - 24] //fold | |
fld qword [eax] //x,fold | |
fld st(0) //x,x,fold folding with x = abs(x+fold) - abs(x-fold) - x | |
fsub st(0), st(2) | |
fabs | |
fadd st(0), st(1) //abs(x-fold)+x,x,fold | |
fxch //x,abs(x-fold)+x,fold | |
fadd st(0), st(2) | |
fabs | |
fsubrp //abs(x+fold)-(abs(x-fold)+x),fold | |
fstp qword [eax] | |
fld qword [edx] //y,fold | |
fld st(0) | |
fsub st(0), st(2) | |
fabs | |
fadd st(0), st(1) | |
fxch | |
fadd st(0), st(2) | |
fabs | |
fsubrp | |
fstp qword [edx] | |
fld qword [ecx] //z,fold | |
fld st(0) | |
fsub st(0), st(2) //z-fold,z,fold | |
fabs | |
fadd st(0), st(1) //z+abs(z-fold),z,fold | |
// last use of fold: swap it to the top and consume it with faddp | |
fxch st(2) | |
faddp //z+fold,z+abs(z-fold) | |
fabs | |
fsubrp //z' | |
fstp qword [ecx] | |
// chain to the routine at [edi - 52], passing the two stack arguments along | |
mov ebx, [ebp + 12] | |
push ebx | |
push esi | |
call [edi - 52] | |
pop ebx | |
pop edi | |
pop esi | |
end; | |
// HybridCustomIFS - routine with an empty asm body; as written it does nothing but return. | |
// NOTE(review): the trailing comment says it is entered with esi/edi already set up | |
// rather than with the usual parameters; how it is used cannot be seen from here. | |
procedure HybridCustomIFS; //for IFS, different calling convention! esi+edi is @it3dext.x+128 and @Pvar | |
asm | |
end; | |
// AexionC - x87 power step in angle/radius form, followed by an optional rewrite of the | |
// three added constants at [esi + 24], [esi + 32], [esi + 40]. | |
// In: eax = @x, edx = @y, ecx = @z, [ebp + 8] = PIteration3D (esi); | |
//     edi = pointer read from [esi + 48]. | |
// Part 1: r1 = xx + yy + zz, th = fpatan of sqrt(xx + zz) over y, ph = fpatan of z over x. | |
//         Both angles are multiplied by [edi - 16]; r1 is raised to [edi - 16] * [edi - 8]; | |
//         x, y, z are rebuilt from fsincos of the two angles, with z also multiplied | |
//         by [edi - 24]. | |
// Part 2 runs only when dword [edi - 28] <> 0. It recomputes the three constants from a | |
//         radius and two angles, with mode bits taken from dword [edi - 56]. | |
// ebx, esi, edi and ecx are saved and restored; eax is overwritten when part 2 runs. | |
procedure AexionC(var x, y, z, w: Double; PIteration3D: TPIteration3D); | |
asm | |
push esi | |
push edi | |
push ebx | |
push ecx | |
mov esi, [ebp + 8] //PIteration3D | |
mov edi, [esi + 48] | |
fld qword [ecx] | |
fld qword [edx] | |
fld qword [eax] //x,y,z | |
fld st(1) | |
fmul st, st //yy,x,y,z | |
fxch st(2) //y,x,yy,z | |
fld st(3) | |
fmul st, st //zz,y,x,yy,z | |
fld st(2) | |
fmul st, st //xx,zz,y,x,yy,z | |
fld st(1) //zz,xx,zz,y,x,yy,z | |
fadd st, st(1) | |
faddp st(5), st //xx,zz,y,x,r1,z | |
faddp | |
fsqrt //sqrt(xx+zz),y,x,r1,z | |
fxch | |
fpatan //th,x,r1,z | |
fxch st(3) | |
fxch //x,z,r1,th | |
fpatan //ph,r1,th | |
fld qword [edi - 16] //pow,ph,r1,th | |
fmul st(3), st | |
fmul st(1), st | |
fmul qword [edi - 8] //pow*0.5,ph,r1,th | |
fxch //ph,pow',r1,th | |
fxch st(2) //r1,pow',ph,th | |
// r1 := r1 raised to pow' : fyl2x forms the logarithm, f2xm1/fscale rebuild 2^(int + frac) | |
fldln2 //power function base,expo -> st, st(1) | |
fxch | |
fyl2x | |
fxch | |
fmulp | |
fldl2e | |
fmulp | |
fld st(0) | |
frndint | |
fsub st(1), st(0) | |
fxch | |
f2xm1 | |
fld1 | |
faddp | |
fscale | |
fstp st(1) //r1',ph,th | |
fxch st(2) //th, ph, r1 | |
fsincos //ct,st, ph, r1 | |
fxch st(2) //ph, st,ct, r1 | |
fsincos //cosP,sinP, sinT,cosT, r1 | |
fmul st, st(3) | |
fmul st, st(4) | |
fadd qword [esi + 24] | |
fstp qword [eax] //sinP, sinT,cosT, r1 | |
fmulp st(2), st //sinT,cosT*SinP, r1 | |
fmul st, st(2) | |
fmul qword [edi - 24] | |
fadd qword [esi + 40] | |
fstp qword [ecx] //cosT*SinP, r1 | |
fmulp | |
fadd qword [esi + 32] | |
fstp qword [edx] | |
// part 2: skipped entirely when dword [edi - 28] is zero | |
cmp dword [edi - 28], 0 | |
jz @@1 | |
fld qword [edi - 40] //pd^ | |
cmp dword [edi - 52], 0 | |
jz @@2 | |
// optional: multiply pd by the length of (new x, y, z) minus the three constants | |
fld qword [eax] | |
fsub qword [esi + 24] | |
fmul st, st | |
fld qword [edx] | |
fsub qword [esi + 32] | |
fmul st, st | |
faddp | |
fld qword [ecx] | |
fsub qword [esi + 40] | |
fmul st, st | |
faddp | |
fsqrt | |
fmulp | |
@@2: //pd^ | |
fld qword [esi + 24] | |
fmul st, st | |
fld qword [esi + 32] | |
fmul st, st | |
faddp | |
fld qword [esi + 40] | |
fmul st, st | |
faddp | |
fsqrt //r1, pd^ | |
mov ebx, [edi - 56] | |
test ebx, 16 //Modus Bit1: Flip atan theta, 2: Flip atan phi, 3: Flip theta and phi, 4: Flip CxCy, 5: diffs | |
jz @@4 // r,y z,x y<>x/z y<>x | |
fld qword [eax] | |
fsub qword [esi + 24] | |
fld qword [ecx] | |
fsub qword [esi + 40] | |
fld qword [edx] | |
fsub qword [esi + 32] | |
jmp @@5 | |
@@4: | |
fld qword [esi + 24] | |
fld qword [esi + 40] | |
fld qword [esi + 32] | |
@@5: //Cy, Cz, Cx, r1, pd^ | |
// from here on eax and ecx hold byte offsets for the sign test at @@7, not pointers | |
xor eax, eax //offset for cond phi test, normally x, or z if only flip-at2 | |
xor ecx, ecx | |
add ecx, 8 | |
test ebx, 8 //Modus bit4: flip CYCX | |
jz @@6 | |
fxch //st(2) //(Cx,Cz,Cy) | |
add ecx, 8 //(Cz,Cx,Cy) test: Flip Cy<>Cz | |
@@6: //y, z, x, r1, pd^ | |
fld st(1) | |
fmul st, st | |
fld st(3) | |
fmul st, st | |
faddp //xx+zz, y, z, x, r1, pd^ | |
fsqrt //sqrt(sqr(j1)+sqr(j3)), y, z, x, r1, pd | |
test ebx, 1 //flip AT theta | |
jnz @@8 | |
fxch | |
@@8: | |
fpatan //th, Cz, Cx, r1, pd | |
fxch st(2) //Cx, Cz, th, r1, pd | |
test ebx, 2 //flip AT phi | |
jz @@9 | |
fxch | |
add eax, 24 | |
sub eax, ecx | |
@@9: | |
fpatan //ph, th, r1, pd | |
test ebx, 4 | |
jz @@7 | |
fxch | |
mov eax, ecx | |
@@7: | |
cmp dword [edi - 32], 0 | |
jz @@10 | |
// tests the top bit of the dword at edx + eax - 4; the top of stack is negated when it is clear | |
test dword [edx + eax - 4], $80000000 | |
jnz @@10 | |
fchs | |
@@10: | |
fmul st, st(3) | |
fxch st(3) //pd, th, r1, ph | |
fmulp //th, r1, ph | |
fsincos //costh,sinth,r1,ph | |
fxch st(3) //ph,sinth,r1,costh | |
fsincos //Cx,Sx,Sy,r1,Cy | |
fmul st, st(4) | |
fmul st, st(3) | |
fstp qword [esi + 24] //Sx,Sy,r1,Cy | |
fmulp st(3), st //Sy,r1,Cy*Sx | |
fmul st, st(1) | |
fmul qword [edi - 48] | |
fstp qword [esi + 40] //r1,Cy*Sx | |
fmulp | |
fstp qword [esi + 32] | |
@@1: | |
pop ecx | |
pop ebx | |
pop edi | |
pop esi | |
end; | |
// TCrc32Stream.add - fold datasize bytes starting at data into the running CRC kept in
// the curcrc field, one byte at a time, using the CrcTable lookup table.
// In:  eax = Self, edx = @data, ecx = datasize.
// Out: curcrc updated; every general register is restored by pushad/popad.
// A datasize of zero or less leaves curcrc untouched. The signed guard matters: with a
// negative count a count-down-to-zero loop would wrap and run far past the buffer.
procedure TCrc32Stream.add(var data; datasize:longint); assembler; register;
asm
  pushad
  test ecx, ecx
  jle @done                                // nothing to do for datasize <= 0
  mov edi, eax                             // edi = Self
  mov esi, edx                             // esi = read cursor
  mov ebx, OFFSET CrcTable                 // table base, loaded once outside the loop
  mov edx, [TCrc32Stream(edi).curcrc]      // edx = running crc
@nextByte:
  movzx eax, byte [esi]                    // eax = next input byte, zero-extended
  inc esi
  xor al, dl                               // table index = byte xor low byte of crc
  shr edx, 8
  xor edx, [ebx + eax * 4]                 // crc = (crc shr 8) xor CrcTable[index]
  dec ecx
  jnz @nextByte
  mov [TCrc32Stream(edi).curcrc], edx
@done:
  popad
end;
// DotOf2VecNormalize - returns dot(view - 2*dot(norm, view)*norm, light) on the x87 stack. | |
// In:  eax = norm, edx = light, ecx = view; three consecutive Singles are read from each. | |
// Out: st(0) = result (the Single function result). | |
// No vector is normalised inside this routine, despite the name. | |
function DotOf2VecNormalize(norm, light, view: TPSVec): Single; | |
asm | |
fld dword [eax] | |
fld dword [eax + 4] | |
fld dword [eax + 8] //norm2, norm1, norm0 | |
// d2 = 2 * dot(norm, view) | |
fld dword [ecx] | |
fmul st, st(3) | |
fld dword [ecx + 4] | |
fmul st, st(3) | |
faddp | |
fld dword [ecx + 8] | |
fmul st, st(2) | |
faddp | |
fadd st, st //d2, norm2, norm1, norm0 | |
fmul st(3), st | |
fmul st(2), st | |
fmulp //norm2', norm1', norm0' | |
// per component: (view[i] - norm'[i]) * light[i], summed into st(0) | |
fsubr dword [ecx + 8] | |
fmul dword [edx + 8] | |
fxch | |
fsubr dword [ecx + 4] | |
fmul dword [edx + 4] | |
faddp | |
fxch | |
fsubr dword [ecx] | |
fmul dword [edx] | |
faddp | |
end; | |
// calcAmbshadow - compute an ambient-shadow value and store it in dAmbS. | |
// In: eax = @dAmbS (moved to edx before fnstsw ax overwrites ax), edx = @sAmplitude, | |
//     ecx = PsiLight; only the 16-bit word at [ecx + 12] is read. | |
// Shadow = 1 when that word is >= 16383 (signed compare), otherwise word * (1/16383). | |
// With Ampl = sAmplitude: | |
//     Ampl <  1 : dAmbS := 1 - Ampl * Shadow | |
//     Ampl >= 1 : d := 1 - Shadow; dAmbS := d + (Ampl - 1) * (d*d - d) | |
// The Ampl >= 1 path leaves through an explicit ret, so it relies on the routine having | |
// no compiler-generated stack frame. | |
procedure calcAmbshadow(var dAmbS, sAmplitude: Single; PsiLight: TPsiLight5); | |
const s1d16383: Single = 1/16383; | |
asm | |
fld1 | |
cmp word [ecx + 12], 16383 | |
jl @@2 | |
fld1 | |
jmp @@3 | |
@@2: | |
fild word [ecx + 12] | |
fmul s1d16383 | |
@@3: | |
fld dword [edx] //Ampl, Shadow, 1 | |
mov edx, eax | |
fcom st(2) | |
fnstsw ax | |
shr ah, 1 | |
jc @@1 | |
fxch | |
fsubr st, st(2) //dAmbS, Ampl, 1 | |
fxch //Ampl,dAmbS,1 | |
fsubrp st(2), st //dAmbS,Ampl-1 | |
fld st | |
fmul st, st | |
fsub st, st(1) //Sqr(dAmbS)-dAmbS,dAmbS,Ampl-1 | |
fmulp st(2), st | |
faddp | |
fstp dword [edx] | |
ret | |
@@1: | |
fmulp | |
fsubp | |
fstp dword [edx] | |
end; | |
// SqrSV255 - Result[i] := sv[i] * sv[i] * s1d255 for the first three Single components.
// In:  eax = @sv, edx = @Result (hidden result pointer).
// All three inputs are loaded before the first store, so sv and Result may overlap.
// The stores name their operand size explicitly, like every other x87 memory access
// in this file, instead of relying on the assembler's default.
function SqrSV255(const sv: TSVec): TSVec;
asm
  fld dword [eax]
  fmul st, st
  fld dword [eax + 4]
  fmul st, st
  fld dword [eax + 8]
  fmul st, st              // sq2, sq1, sq0
  fld s1d255
  fmul st(3), st
  fmul st(2), st
  fmulp                    // sq2*k, sq1*k, sq0*k
  fstp dword [edx + 8]
  fstp dword [edx + 4]
  fstp dword [edx]
end;
// ConvertVLight - expand a packed 10-bit value into a plain integer.
// In:  eax = Win; only the low 10 bits are used.
// Out: eax = (Win and $7F) shl ((Win shr 7) and 7); the same value is mirrored in edx.
// ecx serves as the shift counter and is saved and restored around that use.
function ConvertVLight(Win: Integer): Integer;
asm
  and eax, $3FF        // keep the 10 packed bits
  push ecx
  mov ecx, eax
  and eax, $7F         // eax = low 7 bits (the value part)
  shr ecx, 7           // cl  = bits 7..9 (the shift count)
  shl eax, cl
  pop ecx
  mov edx, eax         // mirror the result in edx
end;
// AddSVecWeight - Result[i] := SPos[i] + SPosPlus[i] * Step for the first three Singles. | |
// In: eax = @SPos, edx = @SPosPlus, ecx = Step, [ebp + 8] = @Result (hidden pointer). | |
// Step is spilled to [ebp - 4] so that fild can load it from memory. | |
// NOTE(review): [ebp - 4] is assumed to be the slot just written by the leading | |
// "push ecx", i.e. the compiler prologue pushes nothing but ebp - confirm if locals are added. | |
// ebx and ecx are saved and restored. | |
function AddSVecWeight(const SPos, SPosPlus: TSVec; const Step: Integer): TSVec; //math3d: procedure AddSVecWeight(V1, V2: TPSVec; W: Double); | |
asm | |
push ecx | |
push ebx | |
mov ebx, [ebp + 8] | |
mov [ebp - 4], ecx | |
fld dword [edx] | |
fld dword [edx + 4] | |
fld dword [edx + 8] | |
fild dword [ebp - 4] | |
fmul st(3), st | |
fmul st(2), st | |
fmulp | |
fadd dword [eax + 8] | |
fstp dword [ebx + 8] | |
fadd dword [eax + 4] | |
fstp dword [ebx + 4] | |
fadd dword [eax] | |
fstp dword [ebx] | |
pop ebx | |
pop ecx | |
end; | |
// ScaleSVecHDR - in place, v := v / Sqrt(Sqr(v * 0.9) + c) for each component of sv1^. | |
// In: eax = sv1. | |
// SSE path (SupportSSE <> 0): handles four Singles at once, takes c from cSVec1 and the | |
//   reciprocal square root from rsqrtps (an approximation); it returns through an explicit | |
//   ret, so it relies on the routine having no compiler-generated stack frame. | |
// x87 path: handles only the first three Singles, with c = 1, using fsqrt and a divide. | |
procedure ScaleSVecHDR(sv1: TPSVec); | |
const s09: Single = 0.9; | |
asm | |
cmp SupportSSE, 0 | |
jz @@1 | |
movss xmm0, s09 | |
movups xmm1, cSVec1 | |
shufps xmm0, xmm0, 0 | |
movups xmm2, dqword [eax] | |
movaps xmm3, xmm2 | |
mulps xmm2, xmm0 | |
mulps xmm2, xmm2 | |
addps xmm2, xmm1 | |
rsqrtps xmm2, xmm2 | |
mulps xmm3, xmm2 | |
movups dqword [eax], xmm3 | |
ret | |
@@1: | |
// x87 stack keeps 0.9 and 1 for the first two components | |
fld1 | |
fld s09 | |
fld dword [eax] | |
fmul st, st(1) | |
fmul st, st | |
fadd st, st(2) | |
fsqrt | |
fdivr dword [eax] | |
fstp dword [eax] | |
fld dword [eax + 4] | |
fmul st, st(1) | |
fmul st, st | |
fadd st, st(2) | |
fsqrt | |
fdivr dword [eax + 4] | |
fstp dword [eax + 4] | |
// last component: fmulp and faddp consume the two constants, leaving the stack empty | |
fld dword [eax + 8] | |
fmulp | |
fmul st, st | |
faddp | |
fsqrt | |
fdivr dword [eax + 8] | |
fstp dword [eax + 8] | |
end; | |
// ScaleSingleHDR - in place, s := s / Sqrt(Sqr(s * 0.9) + 1).
// In: eax = @s.
// With SupportSSE non-zero the reciprocal square root comes from rsqrtss (an
// approximation); otherwise the x87 path uses fsqrt followed by a divide.
// The x87 path returns through an explicit ret, which is valid because the routine
// has no compiler-generated stack frame.
procedure ScaleSingleHDR(var s: Single);
const s09: Single = 0.9;
s1: Single = 1;
asm
  cmp SupportSSE, 0
  jnz @sse
  // x87 path
  fld dword [eax]
  fmul s09
  fmul st, st              // Sqr(s * 0.9)
  fadd s1
  fsqrt
  fdivr dword [eax]        // s / Sqrt(...)
  fstp dword [eax]
  ret
@sse:
  movss xmm1, dword [eax]  // xmm1 = s
  movss xmm0, xmm1
  mulss xmm0, s09
  mulss xmm0, xmm0         // Sqr(s * 0.9)
  addss xmm0, s1
  rsqrtss xmm0, xmm0       // approx. 1 / Sqrt(...)
  mulss xmm0, xmm1
  movss dword [eax], xmm0
end;
// ScaleSingleHDRsqr - in place, s := Sqrt(q / Sqrt(Sqr(q * 0.9) + 1)) with q = s * s.
// In: eax = @s.
// With SupportSSE non-zero the inner reciprocal square root comes from rsqrtss (an
// approximation); otherwise the x87 path uses fsqrt and a divide.
// The x87 path returns through an explicit ret, which is valid because the routine
// has no compiler-generated stack frame.
procedure ScaleSingleHDRsqr(var s: Single);
const s09: Single = 0.9;
s1: Single = 1;
asm
  cmp SupportSSE, 0
  jnz @sse
  // x87 path
  fld dword [eax]
  fmul st, st              // q
  fld st                   // q, q
  fmul s09
  fmul st, st              // Sqr(q * 0.9), q
  fadd s1
  fsqrt
  fdivp                    // q / Sqrt(...)
  fsqrt
  fstp dword [eax]
  ret
@sse:
  movss xmm1, dword [eax]
  mulss xmm1, xmm1         // xmm1 = q
  movss xmm0, xmm1
  mulss xmm0, s09
  mulss xmm0, xmm0         // Sqr(q * 0.9)
  addss xmm0, s1
  rsqrtss xmm0, xmm0       // approx. 1 / Sqrt(...)
  mulss xmm1, xmm0
  sqrtss xmm1, xmm1
  movss dword [eax], xmm1
end;
// SVec2ColSSE - clamp sv1^ and write three of its components as bytes at pc. | |
// In: eax = sv1 (16 bytes are loaded), edx = pc. | |
// Each lane is limited from above by cSVec1 and from below by zero, then multiplied by | |
// cSVec255 (presumably all-ones and all-255 vectors - confirm at their declaration). | |
// Byte layout written: [edx] <- component 2, [edx + 1] <- component 1, | |
// [edx + 2] <- component 0. The byte at [edx + 3] is not written. | |
// The two fistp word stores overlap on purpose: each one's high byte is overwritten by | |
// the next store, so the order of the three stores must not change. | |
// eax is overwritten by cvtss2si after the vector has been loaded. | |
procedure SVec2ColSSE(sv1: TPSVec; pc: PCardinal); | |
asm // eax edx | |
add esp, -16 | |
movups xmm0, dqword [eax] | |
movups xmm1, cSVec1 | |
movups xmm2, cSVec255 | |
xorps xmm3, xmm3 | |
minps xmm0, xmm1 | |
maxps xmm0, xmm3 | |
mulps xmm0, xmm2 | |
movups [esp], xmm0 | |
cvtss2si eax, xmm0 | |
fld dword [esp + 8] | |
fistp word [edx] | |
fld dword [esp + 4] | |
fistp word [edx + 1] | |
mov [edx + 2], al | |
add esp, 16 | |
end; | |
// LabCubicRootSSE - in place, per lane of the four Singles at sv:
//     v := w + (min(v, c) - c) * smul
// where c = sftc and w approximates the cube root of r = max(v, c). w starts at wstart
// and is refined by three rounds of w := w * (w^3 + 2r) / (2w^3 + r).
// In: eax = sv. Uses edx as the round counter and xmm0..xmm7.
procedure LabCubicRootSSE(sv: TPSVec);
const wstart: array[0..3] of Single = (0.4275, 0.4275, 0.4275, 0.4275);
sftc: array[0..3] of Single = (216/24389, 216/24389, 216/24389, 216/24389);
smul: array[0..3] of Single = (841/108, 841/108, 841/108, 841/108);
asm
  // constants and input
  movups xmm6, sftc
  movups xmm7, smul
  movups xmm1, wstart       // w
  movups xmm0, [eax]        // v
  // r and 2r for the iteration
  movaps xmm4, xmm0
  maxps xmm4, xmm6          // r = max(v, c)
  movaps xmm5, xmm4
  addps xmm5, xmm5          // 2r
  // linear term for the part of v below c
  minps xmm0, xmm6
  subps xmm0, xmm6
  mulps xmm0, xmm7          // (min(v, c) - c) * smul
  mov edx, 3
@refine:
  movaps xmm2, xmm1
  mulps xmm2, xmm2
  mulps xmm2, xmm1          // w^3
  movaps xmm3, xmm2
  addps xmm2, xmm5          // w^3 + 2r
  addps xmm3, xmm3
  addps xmm3, xmm4          // 2w^3 + r
  mulps xmm1, xmm2
  divps xmm1, xmm3
  dec edx
  jnz @refine
  addps xmm0, xmm1
  movups [eax], xmm0
end;
// LabCubicRoot2SSE - in place, per lane of the four Singles at sv:
//     v := w + (min(v, c) - c) * smul
// where c = sftc and w approximates the cube root of r = max(v, c). w starts at wstart
// and is refined by three rounds of w := 4/3 * (w * r)^(1/4) - w / 3. The fourth root
// is formed by two successive rsqrtps, so the result is less precise than a divide-based
// iteration.
// In: eax = sv. Uses edx as the round counter and xmm0..xmm7.
procedure LabCubicRoot2SSE(sv: TPSVec); //rsqrtps less precise!
const wstart: array[0..3] of Single = (0.3661, 0.3661, 0.3661, 0.3661);
sftc: array[0..3] of Single = (216/24389, 216/24389, 216/24389, 216/24389);
smul: array[0..3] of Single = (841/108, 841/108, 841/108, 841/108);
s1d3: array[0..3] of Single = (1/3, 1/3, 1/3, 1/3);
s4d3: array[0..3] of Single = (4/3, 4/3, 4/3, 4/3);
asm
  // constants and input
  movups xmm6, sftc
  movups xmm7, smul
  movups xmm3, s1d3
  movups xmm5, s4d3
  movups xmm1, wstart       // w
  movups xmm0, [eax]        // v
  movaps xmm4, xmm0
  maxps xmm4, xmm6          // r = max(v, c)
  // linear term for the part of v below c
  minps xmm0, xmm6
  subps xmm0, xmm6
  mulps xmm0, xmm7          // (min(v, c) - c) * smul
  mov edx, 3
@refine:
  movaps xmm2, xmm1
  mulps xmm2, xmm3          // w / 3
  mulps xmm1, xmm4          // w * r
  rsqrtps xmm1, xmm1
  rsqrtps xmm1, xmm1        // approx. (w * r)^(1/4)
  mulps xmm1, xmm5          // * 4/3
  subps xmm1, xmm2
  dec edx
  jnz @refine
  addps xmm0, xmm1
  movups [eax], xmm0
end;
// LabPow3SSE - in place, per lane of the four Singles at sv:
//     v := max(v, c)^3 + (min(v, c) - c) * smul,   c = sftc.
// In: eax = sv. Uses xmm0 and xmm4..xmm7.
procedure LabPow3SSE(sv: TPSVec);
const sftc: array[0..3] of Single = (6/29, 6/29, 6/29, 6/29);
smul: array[0..3] of Single = (108/841, 108/841, 108/841, 108/841);
asm
  // constants and input
  movups xmm6, sftc
  movups xmm7, smul
  movups xmm0, [eax]        // v
  // cubic term
  movaps xmm4, xmm0
  maxps xmm4, xmm6          // r = max(v, c)
  movaps xmm5, xmm4
  mulps xmm5, xmm5
  mulps xmm5, xmm4          // r^3
  // linear term for the part of v below c
  minps xmm0, xmm6
  subps xmm0, xmm6
  mulps xmm0, xmm7
  addps xmm0, xmm5
  movups [eax], xmm0
end;
procedure QuickSortInt(count: Integer; var List: array of TSortItem); | |
// QuickSort - recursive in-place sort of List[L..R] in ascending order of the signed | |
// dword at offset 0 of each 8-byte item; the dword at offset 4 travels with its key. | |
// In: eax = L, edx = R, ecx = List. eax, edx and ecx hold their entry values again on | |
//     return, which the recursive calls below rely on; ebx, esi and edi are saved. | |
// The pivot is the key of List[R]. Nothing visible here limits the recursion depth. | |
procedure QuickSort(const L, R: Integer; List: TPSortItem); //L:eax R:edx List:ecx | |
asm | |
push ebx | |
push esi | |
push edi | |
mov ebx, eax //Lpos := L | |
mov esi, edx //Rpos := R | |
dec ebx | |
mov edi, [ecx + edx * 8] //ListR := List[R].iZ; | |
// scan up from the left while the pivot is greater than the key | |
@@1: | |
inc ebx | |
cmp edi, [ecx + ebx * 8] | |
jg @@1 | |
// scan down from the right while the pivot is less than the key | |
@@2: | |
dec esi | |
cmp esi, ebx | |
jle @@4 //break | |
cmp edi, [ecx + esi * 8] | |
jl @@2 | |
// swap both dwords of List[ebx] and List[esi]; eax and edx are borrowed as temporaries | |
push eax | |
push edx | |
mov eax, [ecx + ebx * 8] | |
mov edx, [ecx + esi * 8] | |
mov [ecx + esi * 8], eax | |
mov [ecx + ebx * 8], edx | |
mov eax, [ecx + ebx * 8 + 4] | |
mov edx, [ecx + esi * 8 + 4] | |
mov [ecx + esi * 8 + 4], eax | |
mov [ecx + ebx * 8 + 4], edx | |
pop edx | |
pop eax | |
jmp @@1 | |
// move the pivot item from List[R] to its final place List[ebx] | |
@@4: | |
mov esi, [ecx + ebx * 8] | |
mov [ecx + edx * 8], esi | |
mov [ecx + ebx * 8], edi | |
mov esi, [ecx + ebx * 8 + 4] | |
mov edi, [ecx + edx * 8 + 4] | |
mov [ecx + edx * 8 + 4], esi | |
mov [ecx + ebx * 8 + 4], edi | |
// sort the left part L..pivot-1 when it holds more than one item | |
dec ebx | |
cmp ebx, eax | |
jle @@5 | |
mov esi, edx | |
mov edx, ebx | |
call QuickSort | |
mov edx, esi | |
// sort the right part pivot+1..R when it holds more than one item | |
@@5: | |
add ebx, 2 | |
cmp ebx, edx | |
jge @@6 | |
mov esi, eax | |
mov eax, ebx | |
call QuickSort | |
mov eax, esi | |
@@6: | |
pop edi | |
pop esi | |
pop ebx | |
end; | |
// Converts StepCount (Single, passed on the stack at [ebp + 8]) to a packed integer:
// the value is rounded with fistp, capped at 16383, and if its highest set bit
// has index b > 6 the result is (value shr (b - 6)) or ((b - 6) shl 7).
// Values whose highest set bit is at index <= 6, and zero, are returned unchanged.
// NOTE(review): only the upper bound is clamped; a negative rounded value is not
// handled specially - confirm callers never pass one.
function RMcalcVLight(StepCount: Single): Integer; | |
asm | |
// push ecx reserves a 4-byte scratch slot at [esp] (and saves ecx).
push ecx | |
fld dword [ebp + 8] | |
fistp dword [esp] | |
mov eax, [esp] | |
cmp eax, 16383 | |
jle @1 | |
mov eax, 16383 | |
// bsr sets ZF when eax = 0; ecx = index of the highest set bit otherwise.
@1: bsr ecx, eax | |
jz @2 | |
sub ecx, 6 | |
jle @2 | |
shr eax, cl | |
shl ecx, 7 | |
or eax, ecx | |
@2: pop ecx | |
end; | |
// Stores a roughness value as Single in sRough:
//   q      = (dt2^2 * dsG^ * k + eps) / (N[0]^2 + N[1]^2 + N[2]^2 + eps)
//   sRough = min(max(sqrt(max(q, 0)) - 0.05, 0), 1)
// Registers: eax = N (three doubles), edx = @sRough, ecx = dt2, dsG = [ebp + 8].
// Two paths compute this formula: SSE2 when SupportSSE2 <> 0 (eps = d1em40,
// k = d7), x87 otherwise (eps = d1em100, k = s7).
// NOTE(review): the constants are declared elsewhere; their names suggest
// k = 7, eps = 1e-40 / 1e-100, d005 = 0.05, d1p0 = 1.0 - confirm.
procedure RMCalcRoughness(N: TPVec3D; var sRough: Single; dt2, dsG: PDouble); | |
asm | |
cmp SupportSSE2, 0 | |
jz @@1 | |
// ---- SSE2 path ----
movupd xmm0, [eax] | |
movsd xmm1, [eax + 16] | |
movsd xmm2, [ecx] | |
mulpd xmm0, xmm0 | |
mulsd xmm1, xmm1 | |
mulsd xmm2, xmm2 | |
addsd xmm1, xmm0 | |
// eax is reused for the stack parameter dsG from here on.
mov eax, [ebp + 8] | |
unpckhpd xmm0, xmm0 | |
mulsd xmm2, [eax] | |
addsd xmm1, xmm0 | |
mulsd xmm2, d7 | |
addsd xmm1, d1em40 | |
addsd xmm2, d1em40 | |
xorpd xmm3, xmm3 | |
divsd xmm2, xmm1 | |
maxsd xmm2, xmm3 | |
sqrtsd xmm2, xmm2 | |
subsd xmm2, d005 | |
maxsd xmm2, xmm3 | |
minsd xmm2, d1p0 | |
cvtsd2ss xmm4, xmm2 | |
movss [edx], xmm4 | |
jmp @end | |
// ---- x87 path ----
@@1: | |
fld qword [eax] | |
fmul st, st | |
fld qword [eax + 8] | |
fmul st, st | |
faddp | |
fld qword [eax + 16] | |
fmul st, st | |
faddp | |
fadd d1em100 | |
mov eax, [ebp + 8] | |
fld qword [ecx] | |
fmul st, st | |
fmul qword [eax] | |
fmul s7 | |
fadd d1em100 | |
fdivrp | |
// ftst + shr ah,1 moves condition bit C0 (st < 0) into CF; negative q becomes 0.
ftst | |
fnstsw ax | |
shr ah, 1 | |
jnc @1 | |
fstp st | |
fldz | |
@1: fsqrt | |
fld s005 //0.05, sR' | |
// C0 set means 0.05 < sR'; otherwise the result is 0 and both values are popped.
fcom st(1) | |
fnstsw ax | |
shr ah, 1 | |
jc @up | |
fcompp | |
xor eax, eax | |
mov [edx], eax | |
jmp @end | |
@up: | |
fsubp | |
fld1 | |
// 41H masks C3|C0: both clear means 1 > value, so the value is stored as is;
// otherwise it is replaced by 1.
fcomp st(1) | |
fnstsw ax | |
and ah, 41H | |
jz @up2 | |
fstp st | |
fld1 | |
@up2: | |
fstp dword [edx] | |
@end: | |
end; | |
// Fills C1..C3 of the iteration record (pMCTparas^.pIt3Dext) for pixel (ix, iy).
// Registers: eax = pMCTparas, edx = ix, ecx = iy.
//   MCTCameraOptic = 2 : C[k] := Ystart[k]
//   otherwise          : C[k] := Ystart[k] + ix * Vgrads[k] + iy * Vgrads[3 + k]
// for k = 0..2, with Vgrads and Ystart read as consecutive doubles.
// eax is biased by $78 on entry and every field offset is compensated by "- $78".
// The early exits use a bare "ret", which relies on no stack frame being set up
// for this routine.
procedure RMCalculateStartPos(pMCTparas: PMCTparameter; ix, iy: Integer); | |
asm | |
add eax, $78 | |
cmp dword [eax + TMCTparameter.MCTCameraOptic - $78], 2 | |
jne @@2 | |
mov ecx, dword [eax + TMCTparameter.pIt3Dext - $78] | |
// Loads Ystart[0..2] and pops them in reverse, so C1..C3 receive Ystart[0..2].
fld qword [eax + TMCTparameter.Ystart - $78] | |
fld qword [eax + TMCTparameter.Ystart - $78 + 8] | |
fld qword [eax + TMCTparameter.Ystart - $78 + 16] | |
fstp qword [ecx + TIteration3Dext.C3] | |
fstp qword [ecx + TIteration3Dext.C2] | |
fstp qword [ecx + TIteration3Dext.C1] | |
ret | |
@@2: | |
cmp SupportSSE2, 0 | |
jz @@1 | |
// ---- SSE2 path: [esp] = ix, [esp + 4] = iy, converted together to doubles ----
push ecx | |
push edx | |
cvtpi2pd xmm7, [esp] //xx,yy | |
mov ecx, dword [eax + TMCTparameter.pIt3Dext - $78] //+68 | |
// edx = pMCTparas + $F0; Ystart is addressed through it with "- $78 - $78".
lea edx, eax + $78 //TMCTparameter.Ystart | |
movapd xmm6, xmm7 | |
unpckhpd xmm7, xmm7 //yy,yy | |
unpcklpd xmm6, xmm6 //xx,xx | |
movupd xmm0, [eax + TMCTparameter.Vgrads - $78] | |
movupd xmm2, [eax + TMCTparameter.Vgrads - $60] | |
movupd xmm4, [edx + TMCTparameter.Ystart - $78 - $78] | |
mulpd xmm0, xmm6 | |
mulsd xmm6, [eax + TMCTparameter.Vgrads - $68] | |
mulpd xmm2, xmm7 | |
mulsd xmm7, [eax + TMCTparameter.Vgrads - $50] | |
addpd xmm0, xmm2 | |
addsd xmm6, xmm7 | |
addpd xmm0, xmm4 | |
addsd xmm6, [edx + TMCTparameter.Ystart - $78 - $68] | |
movupd [ecx + TIteration3Dext.C1], xmm0 | |
movsd [ecx + TIteration3Dext.C3], xmm6 | |
pop edx | |
pop ecx | |
ret | |
// ---- x87 path: after the two fild, st(0) = ix and st(1) = iy ----
@@1: | |
push ecx | |
fild dword [esp] | |
push edx | |
fild dword [esp] //xx,yy | |
mov ecx, dword [eax + TMCTparameter.pIt3Dext - $78] //+68 | |
lea edx, eax + $78 //TMCTparameter.Ystart | |
fld qword [eax + TMCTparameter.Vgrads - $78] | |
fmul st, st(1) | |
fld qword [eax + TMCTparameter.Vgrads - $78 + 24] | |
fmul st, st(3) | |
faddp | |
fadd qword [edx + TMCTparameter.Ystart - $78 - $78] | |
fstp qword [ecx + TIteration3Dext.C1] | |
fld qword [eax + TMCTparameter.Vgrads - $78 + 8] | |
fmul st, st(1) | |
fld qword [eax + TMCTparameter.Vgrads - $78 + 32] | |
fmul st, st(3) | |
faddp | |
fadd qword [edx + TMCTparameter.Ystart - $70 - $78] | |
fstp qword [ecx + TIteration3Dext.C2] //xx,yy | |
// The last component consumes ix and iy in place, leaving the FPU stack empty.
fmul qword [eax + TMCTparameter.Vgrads - $78 + 16] | |
fxch | |
fmul qword [eax + TMCTparameter.Vgrads - $78 + 40] | |
faddp | |
fadd qword [edx + TMCTparameter.Ystart - $68 - $78] | |
fstp qword [ecx + TIteration3Dext.C3] | |
pop edx | |
pop ecx | |
end; | |
// Recomputes mVgradsFOV for column ix. Registers: eax = pMCTparas, edx = ix.
//   CAFX := (FOVXoff - ix) * FOVXmul
//   MCTCameraOptic = 1 : mVgradsFOV := (-CAFX, CAFY, mctPlOpticZ), then
//                        NormaliseVectorVar(@mVgradsFOV)
//   MCTCameraOptic = 0 : BuildViewVectorDFOV(eax=@CAFY, edx=@CAFX, ecx=@mVgradsFOV)
//   any other value    : BuildViewVectorDSphereFOV with the same three arguments
// Finally RotateVectorReverse(eax=@mVgradsFOV, edx=@VGrads) is called.
// ebx holds pMCTparas + $1a0; field offsets are compensated with "- $1a0".
// NOTE(review): esi is saved and restored but not used inside this routine.
procedure RMCalculateVgradsFOV(pMCTparas: PMCTparameter; ix: Integer); | |
asm | |
push ebx | |
push esi | |
push edx //to store ix in [esp] and fiload (esp := esp-4) | |
lea ebx, eax + $1a0 | |
fild dword [esp] //ix | |
fsubr dword [ebx + TMCTparameter.FOVXoff - $1a0] | |
fmul dword [ebx + TMCTparameter.FOVXmul - $1a0] | |
// fst (no pop): CAFX stays in st(0) for the optic = 1 branch.
fst qword [ebx + TMCTparameter.CAFX - $1a0] // $1a0 | |
cmp dword [ebx + TMCTparameter.MCTCameraOptic - $1a0], 1 // $1fc | |
je @@3 | |
fstp st | |
lea ecx, [ebx + TMCTparameter.mVgradsFOV - $1a0] | |
lea edx, [ebx + TMCTparameter.CAFX - $1a0] // $1a0 | |
lea eax, [ebx + TMCTparameter.CAFY - $1a0] // $1a8 | |
cmp dword [ebx + TMCTparameter.MCTCameraOptic - $1a0], 0 // $1fc | |
je @@1 | |
call BuildViewVectorDSphereFOV | |
jmp @@2 | |
@@3: | |
fchs | |
fstp qword [ebx + TMCTparameter.mVgradsFOV - $1a0] | |
fld qword [ebx + TMCTparameter.CAFY - $1a0] // $1a8 | |
fstp qword [ebx + TMCTparameter.mVgradsFOV - $1a0 + 8] | |
fld dword [ebx + TMCTparameter.mctPlOpticZ - $1a0] // $204 | |
fstp qword [ebx + TMCTparameter.mVgradsFOV - $1a0 + 16] | |
lea eax, [ebx + TMCTparameter.mVgradsFOV - $1a0] | |
call NormaliseVectorVar | |
jmp @@2 | |
@@1: | |
call BuildViewVectorDFOV | |
@@2: | |
lea edx, [ebx + TMCTparameter.VGrads - $1a0] // $80 | |
lea eax, [ebx + TMCTparameter.mVgradsFOV - $1a0] | |
call RotateVectorReverse | |
pop edx //to Inc(esp, 4) | |
pop esi | |
pop ebx | |
end; | |
// Computes a colour value from the iteration record (ebx = pIt3Dext) according
// to ColorOption and stores it through MinMaxClip15bit into the TsiLight5 record
// at mPsiLight (edi). eax = pMCTparas on entry.
// With fpatan computing arctan(st(1) / st(0)) and "d(o)" meaning
// [ebx + o] - [ebx + o + $20]:
//   1 : ln(Rout / (Rold + 1)) * mctColorMul
//   2 : (arctan(d($20) / d($18)) + pi) * 5200
//   3 : (arctan(d($28) / d($18)) + pi) * 5200
//   4 : (arctan(d($28) / d($20)) + pi) * 5200
//   5 : SIgradient := (2 * arcsin(z / r) + pi) * 5215, where z = [ebx + $28] and
//       r = sqrt(x*x + y*y + z*z + d1em100); the value stored to Otrap is
//       (arctan(x / y) + pi) * 5215, with x = [ebx + $18] and y = [ebx + $20]
//   0 or >= 6 : OTrap * 4096
// All options end at @@up, which writes the Otrap field.
procedure RMdoColor(pMCTparas: PMCTparameter); | |
const | |
cd5200: Single = 5200; | |
cd4096: Single = 4096; | |
cd5215: Single = 5215; | |
asm | |
push ebx | |
push edi | |
push edx //just to get dword [esp] | |
mov edi, [eax + TMCTparameter.mPsiLight] | |
mov ebx, [eax + TMCTparameter.pIt3Dext] | |
movzx edx, byte [eax + TMCTparameter.ColorOption] //coloroption | |
cmp edx, 6 | |
jnb @@COelse | |
// Indirect jump through the address table emitted right below in the code stream.
jmp dword [edx * 4 + @@jmptable] | |
@@jmptable: | |
dd @@COelse, @@CO1, @@CO2, @@CO3, @@CO4, @@CO5 | |
@@CO1: | |
fld qword [ebx + 8] //Rold | |
fld1 | |
faddp | |
fdivr qword [ebx + $70] //Rout | |
// ln(x) = ln2 * log2(x) via fyl2x.
fldln2 | |
fxch | |
fyl2x | |
fmul dword [eax + TMCTparameter.mctColorMul] //mctColorMul | |
jmp @@up | |
nop | |
@@CO2: | |
fld qword [ebx+$20] | |
fsub qword [ebx+$40] | |
jmp @1 | |
@@CO3: | |
fld qword [ebx+$28] | |
fsub qword [ebx+$48] | |
@1: fld qword [ebx+$18] | |
fsub qword [ebx+$38] | |
@2: fpatan | |
fldpi | |
faddp | |
fmul cd5200 | |
jmp @@up | |
@@CO4: | |
fld qword [ebx+$28] | |
fsub qword [ebx+$48] | |
fld qword [ebx+$20] | |
fsub qword [ebx+$40] | |
jmp @2 | |
@@CO5: | |
fld qword [ebx+$20] | |
fld st | |
fmul st, st //yy,y | |
fld qword [ebx+$18] //x,yy,y | |
fld st | |
fmul st, st //xx,x,yy,y | |
fxch st(3) //y,x,yy,xx | |
fpatan | |
fldpi | |
faddp | |
fmul cd5215 //s,yy,xx | |
fxch st(2) //xx,yy,s | |
faddp | |
fadd d1em100 | |
fld qword [ebx+$28] //z,yy+xx,s norm vec[2] for arcsin | |
fld st | |
fmul st, st //zz,z,yy+xx,s | |
faddp st(2), st //z,rr,s | |
fxch //rr,z,s | |
fsqrt //r,z,s | |
fdivp //z/r,s | |
@@s2: | |
fld1 //arcsin(x) = arctan2(x, sqrt(1-x*x)) | |
fld st(1) | |
fmul st(0), st(0) | |
fsubp | |
fsqrt | |
fpatan | |
fadd st, st | |
fldpi | |
faddp | |
fmul cd5215 | |
// First store: SIgradient; "s" is still on the FPU stack for the Otrap store at @@up.
fstp dword [esp] | |
lea edx, [edi + TsiLight5.SIgradient] | |
mov eax, esp | |
call MinMaxClip15bit | |
jmp @@up | |
@@COelse: | |
fld qword [ebx + TIteration3Dext.OTrap] | |
fmul cd4096 | |
@@up: | |
// Spill the Single to the scratch slot and pass its address in eax, target in edx.
fstp dword [esp] | |
lea edx, [edi + TsiLight5.Otrap] | |
mov eax, esp | |
call MinMaxClip15bit | |
pop edx | |
pop edi | |
pop ebx | |
end; | |
// NOTE(review): the routine header for this asm body is not part of this listing.
// From the register use below: eax = pointer to a TMCTparameter record,
// edx = cut-plane number (1-based when > 0).
// cutplane > 0:
//   NN := 8388352 - ZcMul * (Sqrt(mZZ * Zcorr + 1) - 1), clamped below at 0, and
//   the dword at [PSL + 6] := NN shl 8; then a normal N is built from column
//   (cutplane - 1) of Vgrads: N := (V0 * f, V1 * f, -1) with f := -1 / V2, or
//   f := dm1e40 when Abs(V2) < d1em40, and passed to MakeWNormalsFromDVec(PSL, @N).
// cutplane <= 0:
//   dword [PSL] := 0, word [PSL + 4] := $8001, dword [PSL + 6] := $7fff0000.
const CS8388352: Single = 8388352; | |
asm | |
push ebx | |
push esi | |
push edi | |
// 24 bytes of stack: scratch dword at [esp] first, later the three doubles of N.
add esp, -24 | |
mov edi, [eax+TMCTparameter.mPsiLight] //PSL | |
mov esi, edx //cutplane | |
lea ebx, eax + 128 //MCTparas | |
test esi, esi //if cutplane>0 | |
jle @@1 | |
fld1 | |
fld qword [ebx+TMCTparameter.mZZ-128] //+104 mZZ^,1 NN := 8388352 - ZcMul * (Sqrt(mZZ * Zcorr + 1) - 1); | |
fmul qword [ebx+TMCTparameter.Zcorr-128] //$274 | |
fadd st, st(1) | |
fsqrt | |
fsubrp | |
fmul qword [ebx+TMCTparameter.ZcMul-128] //$26c | |
fsubr CS8388352 //NN | |
fistp dword [esp] | |
mov eax, [esp] | |
test eax, eax | |
jns @@3 | |
xor eax, eax | |
@@3: | |
shl eax, 8 // PCardinal(@PSL.RoughZposFine)^ := iTmp shl 8; | |
mov [edi+6], eax | |
dec esi //VGrads: +128 | |
fld qword [ebx+esi*8+TMCTparameter.VGrads+$30-128] // if Abs(Vgrads[2, CutPlane]) < 1e-40 | |
fabs | |
fcomp d1em40 | |
fnstsw ax | |
shr ah, 1 | |
jnc @@4 | |
fld dm1e40 // NN := -1e40 | |
jmp @@5 | |
@@4: | |
fld1 // NN := -1 / Vgrads[2, CutPlane]; | |
fchs | |
fdiv qword [ebx+esi*8+TMCTparameter.VGrads+$30-128] | |
@@5: | |
fld qword [ebx+esi*8+TMCTparameter.VGrads-128] // N[0] := Vgrads[0, CutPlane] * NN; | |
fmul st, st(1) | |
fstp qword [esp] | |
fld qword [ebx+esi*8+TMCTparameter.VGrads+$18-128] // N[1] := Vgrads[1, CutPlane] * NN; | |
fmulp | |
fstp qword [esp+8] | |
fld1 | |
fchs // N[2] := -1; | |
fstp qword [esp+16] | |
mov edx, esp // MakeWNormalsFromDVec(TPLNormals(PSL), @N); | |
mov eax, edi | |
call MakeWNormalsFromDVec | |
jmp @@6 | |
@@1: | |
xor eax, eax | |
// NOTE(review): no operand size is given for this immediate store; it is
// presumably assembled as a dword write - confirm.
mov [edi+6], $7fff0000 | |
mov dword [edi], eax | |
mov word [edi+4], $8001 | |
@@6: | |
add esp, 24 | |
pop edi | |
pop esi | |
pop ebx //} | |
end; | |
// Iterative step refinement along the view ray until the distance estimate DE
// is within s0001 of msDEstop, or the iDEAddSteps budget is used up.
// Registers: eax = pMCTparas, edx = @DE, ecx = @RLastStepWidth.
// Internal roles: edi = @DE, esi = pMCTparas + $38 (offsets compensated with
// "- $38"), ebx = pIt3Dext, ebp = remaining step budget, qword [esp] = step.
// Start: step := RLastStepWidth * sm05. Each round: mZZ += step,
// C1..C3 += mVgradsFOV * step, msDEstop := DEstop * (1 + mZZ * mctDEstopFactor),
// DE := CalcDE(eax = pIt3Dext, edx = pMCTparas), then step := Abs(step) * s055,
// negated when DE < msDEstop.
// NOTE(review): ebp is used as a plain counter here, so no ebp-relative
// addressing is possible inside this routine.
procedure RMdoBinSearch(pMCTparas: PMCTparameter; var DE, RLastStepWidth{, RLastDE}: Double); | |
asm | |
push ebx | |
push esi | |
push edi | |
push ebp | |
add esp, -8 | |
mov edi, edx //@dTmp | |
lea esi, eax+$38 //@MCTParas (was:qTMandCalcThread) | |
mov ebx, [esi+TMCTparameter.pIt3Dext-$38] | |
mov ebp, [esi+TMCTparameter.iDEAddSteps-$38] //+$40 | |
fld qword [ecx] // RLastStepWidth | |
fmul sm05 | |
jmp @@2 | |
@@4: | |
fld qword [esi+TMCTparameter.mZZ-$38] //+$68 | |
fadd qword [esp] | |
fstp qword [esi+TMCTparameter.mZZ-$38] //+$68 | |
// Load the three direction components, scale all by step, add to C1..C3 (popped in reverse).
fld qword [esi+TMCTparameter.mVgradsFOV-$38] | |
fld qword [esi+TMCTparameter.mVgradsFOV-$38 + 8] | |
fld qword [esi+TMCTparameter.mVgradsFOV-$38 + 16] | |
fld qword [esp] | |
fmul st(3), st | |
fmul st(2), st | |
fmulp | |
fadd qword [ebx+TIteration3Dext.C1 + 16] | |
fstp qword [ebx+TIteration3Dext.C1 + 16] | |
fadd qword [ebx+TIteration3Dext.C1 + 8] | |
fstp qword [ebx+TIteration3Dext.C1 + 8] | |
fadd qword [ebx+TIteration3Dext.C1] | |
fstp qword [ebx+TIteration3Dext.C1] | |
fld qword [esi+TMCTparameter.mZZ-$38] //+$68 | |
fmul dword [esi+TMCTparameter.mctDEstopFactor-$38] //+$54 | |
fld1 | |
faddp | |
fmul dword [esi+TMCTparameter.DEstop-$38] //+$60 | |
fstp dword [esi+TMCTparameter.msDEstop-$38] //+$38 msDEstop := DEstop * (1 + mZZ * mctDEstopFactor); | |
dec ebp | |
test ebp, ebp | |
jle @@3 | |
lea edx, esi-$38 | |
mov eax, ebx | |
call esi+TMCTparameter.CalcDE-$38 | |
fstp qword [edi] //dTmp | |
fld qword [edi] | |
fcomp dword [esi+TMCTparameter.msDEstop-$38] //+$38 | |
// Status word is captured now; the x87 ops below do not touch ax, so the
// shr/jnc still tests C0 (DE < msDEstop) from this compare.
fnstsw ax | |
fld qword [esp] | |
fabs | |
fmul s055 | |
shr ah, 1 | |
jnc @@8 | |
fchs | |
@@8: | |
@@2: | |
fstp qword [esp] | |
fld qword [edi] | |
fsub dword [esi+TMCTparameter.msDEstop-$38] //+$38 | |
fabs | |
fcomp s0001 | |
fnstsw ax | |
shr ah, 1 | |
// Continue while Abs(DE - msDEstop) >= s0001.
jnc @@4 | |
@@3: | |
add esp, 8 | |
pop ebp | |
pop edi | |
pop esi | |
pop ebx | |
end; //} | |
procedure CalcZposAndRough(siLight: TPsiLight5; mct: PMCTparameter; const ZZ: Double);
// Packs a depth value and an optional roughness byte into the dword at
// [siLight + 6].
//   z    := max(ZZ, 0)                        (negative ZZ is treated as 0)
//   NN   := 8388352 - Round(ZcMul * (Sqrt(z * Zcorr + 1) - 1)), clamped to
//           0..8388352
//   data := NN shl 8
//   if iSmNormals > 0 then data := data or Round(sRoughness * s255)
// Registers: eax = siLight, edx = mct; ZZ is passed on the stack at [ebp + 8].
// The "asm" keyword was missing in front of the instruction list; it is
// restored here so the body is a valid assembler routine.
asm
  push  ebx
  sub   esp, 4                                  // scratch dword for fistp
  fld1
  test  byte [ebp + 15], 128                    // sign bit of ZZ (top byte of the double)
  jns   @1
  fldz                                          // negative ZZ: clip to 0
  jmp   @2
@1:
  fld   qword [ebp + 8]
@2:
  fmul  qword [edx + TMCTparameter.Zcorr]
  fadd  st(0), st(1)
  fsqrt                                         // historical note: an invalid-operation
  fsubrp                                        // fault was once seen here for ZZ = -642,
                                                // hence the negative clip above
  fmul  qword [edx + TMCTparameter.ZcMul]
  fistp dword [esp]
  mov   ebx, 8388352
  sub   ebx, dword [esp]
  test  ebx, ebx
  jnl   @up1
  xor   ebx, ebx                                // clamp below at 0
@up1:
  cmp   ebx, 8388352
  jle   @up2
  mov   ebx, 8388352                            // clamp above
@up2:
  shl   ebx, 8                                  // low byte is left free for the roughness
  cmp   byte [edx + TMCTparameter.iSmNormals], 0
  jle   @up3
  fld   dword [edx + TMCTparameter.sRoughness]
  fmul  s255
  fistp dword [esp]
  or    ebx, [esp]
@up3:
  mov   [eax + 6], ebx
  add   esp, 4
  pop   ebx
end;
procedure FirstATlevelHiQ(PIA: TPCardinalArray; PsiLight: TPsiLight5; Leng: Integer);
// Fills PIA[0 .. Leng-1] from consecutive 18-byte PsiLight records.
// Per record: if the word at offset 8 is >= $8000 (unsigned) the output is 0,
// otherwise it is (dword at offset 6 and $ffffff00) shr 1.
// Registers: eax = PIA, edx = PsiLight, ecx = Leng.
asm
  push  esi
  dec   ecx
  js    @@done                  // Leng <= 0: nothing to do
  inc   ecx
  lea   edx, [edx + 8]          // edx -> word at record offset 8
@@next:
  xor   esi, esi                // default output
  cmp   word [edx], $8000
  jae   @@store
  mov   esi, [edx - 2]          // dword at record offset 6
  and   esi, $ffffff00
  shr   esi, 1
@@store:
  mov   [eax], esi
  add   eax, 4
  add   edx, 18                 // next record
  dec   ecx
  jnz   @@next
@@done:
  pop   esi
end;
// Smoothing pass over PIA[0 .. ya], written back in place:
//   PIA[i] := (((SA[min(i + Step, ya)] + SA[max(i - Step, 0)]) shr 1) + PIA[i]) shr 1
// Registers: eax = PIA, edx = SA, ecx = ya; Step is read from [ebp + 8].
// Locals live in the 12 bytes reserved below ebp: [ebp - 8] = ya,
// [ebp - 12] = remaining element count. Nothing is done when ya < 0.
procedure SmoothH(PIA, SA: TPCardinalArray; ya, Step: Integer); | |
asm | |
add esp, -12 | |
push ebx | |
push esi | |
push edi | |
mov [ebp-8], ecx | |
mov ebx, edx | |
mov edi, [ebp+8] | |
mov edx, ecx | |
test edx, edx | |
jl @@2 | |
inc edx | |
mov [ebp-12], edx | |
xor esi, esi | |
// esi = i, edi = Step, ebx = SA, eax = address of PIA[i].
@@1: | |
mov edx, esi | |
sub edx, edi | |
test edx, edx | |
jnl @@3 | |
xor edx, edx | |
@@3: | |
mov ecx, edi | |
add ecx, esi | |
cmp ecx, [ebp-8] | |
jle @@4 | |
mov ecx, [ebp-8] | |
@@4: | |
mov ecx, [ebx+ecx*4] | |
add ecx, [ebx+edx*4] | |
shr ecx, 1 | |
add ecx, [eax] | |
shr ecx, 1 | |
mov [eax], ecx | |
inc esi | |
add eax, 4 | |
dec dword [ebp-12] | |
jnz @@1 | |
@@2: | |
pop edi | |
pop esi | |
pop ebx | |
add esp, 12 | |
end; | |
// Same smoothing kernel as a 0..ye loop over SA, but the destination pointer
// advances by the stack value at [ebp + 8] (in bytes) per element:
//   dest^ := (((SA[min(i + s, ye)] + SA[max(i - s, 0)]) shr 1) + dest^) shr 1
// where s is the stack value at [ebp + 12].
// Registers: eax = PIA (destination), edx = SA, ecx = ye.
// NOTE(review): with the register convention the two stack parameters are
// presumably Step = [ebp + 12] and wid = [ebp + 8] - confirm.
// Locals: [ebp - 8] = ye, [ebp - 12] = remaining element count.
procedure SmoothV(PIA, SA: TPCardinalArray; ye, Step, wid: Integer); | |
asm | |
add esp, -12 | |
push ebx | |
push esi | |
push edi | |
mov [ebp-8], ecx | |
mov ebx, edx | |
mov edi, [ebp+12] | |
mov edx, ecx | |
test edx, edx | |
jl @@2 | |
inc edx | |
mov [ebp-12], edx | |
xor esi, esi | |
// esi = i, edi = s, ebx = SA, eax = current destination address.
@@1: | |
mov edx, esi | |
sub edx, edi | |
test edx, edx | |
jnl @@3 | |
xor edx, edx | |
@@3: | |
mov ecx, edi | |
add ecx, esi | |
cmp ecx, [ebp-8] | |
jle @@4 | |
mov ecx, [ebp-8] | |
@@4: | |
mov ecx, [ebx+ecx*4] | |
add ecx, [ebx+edx*4] | |
shr ecx, 1 | |
add ecx, [eax] | |
shr ecx, 1 | |
mov [eax], ecx | |
inc esi | |
add eax, dword [ebp+8] | |
dec dword [ebp-12] | |
jnz @@1 | |
@@2: | |
pop edi | |
pop esi | |
pop ebx | |
add esp, 12 | |
end; | |
procedure MinSI(var SI: SmallInt; var i: Integer);
// If SI < i, SI is raised to i, saturated at $7FFF; otherwise SI is left alone.
// Registers: eax = @SI, edx = @i.
asm
  mov   edx, [edx]              // edx = i
  movsx ecx, word [eax]         // ecx = SI, sign-extended
  cmp   ecx, edx
  jge   @@done                  // SI >= i: keep SI
  cmp   edx, $7FFF
  jl    @@store
  mov   edx, $7FFF              // saturate
@@store:
  mov   word [eax], dx
@@done:
end;
function NotOnlyBackGround4(p: Pointer): Integer;
// Returns $80000000 when bit 31 is set in all four dwords at p, p + 18, p + 36
// and p + 54; returns 0 otherwise. eax = p.
asm
  mov   edx, $80000000          // start with the mask itself
  and   edx, [eax]
  and   edx, [eax + 18]
  and   edx, [eax + 36]
  and   edx, [eax + 54]
  mov   eax, edx
end;
procedure MakeZP4(p: Pointer; var zp: array of Integer);
// zp[k] := (dword at p + 18 * k, with its low byte cleared) shr 1, for k = 0..3.
// Registers: eax = p, edx = @zp[0]; ecx is used as scratch.
// Dropping the low byte with "shr 8" and shifting back by 7 gives the same
// value as masking with $FFFFFF00 and shifting right by 1.
asm
  mov   ecx, [eax]
  shr   ecx, 8
  shl   ecx, 7
  mov   [edx], ecx

  mov   ecx, [eax + 18]
  shr   ecx, 8
  shl   ecx, 7
  mov   [edx + 4], ecx

  mov   ecx, [eax + 36]
  shr   ecx, 8
  shl   ecx, 7
  mov   [edx + 8], ecx

  mov   ecx, [eax + 54]
  shr   ecx, 8
  shl   ecx, 7
  mov   [edx + 12], ecx
end;
procedure TAmbHiQCalcR.Execute; | |
// ... | |
// Inline-asm core (the surrounding Pascal code is elided in this listing).
// For each of 32 directions (iDir = 31 downto 0) it walks StepCount samples
// through the integer map PATL and keeps, per direction, the largest value
//   it := Round(st * sZRT * sit / (sZRT + Abs(st))), clamped to -32768..32767,
//   st := (PATL^[y2 * MWidth + x2] - zp) / Sqrt(dx*dx + dy*dy)
// in the 16-bit array psm[iDir]. rsqrtss and rcpss are approximations.
// Register roles: esi = PATL, edx = psm, edi = random seed (linear
// congruential: seed * $343FD + $269EC3), xmm1 = running (sx, sy),
// xmm4 = ssub, xmm5 = sAbs mask, xmm6 = direction PS[0..1], xmm7 = sstep.
// Positions left of 0 or right of MWidth-1 are mirrored (x := -x resp.
// x := MW2 - x) and the direction is abandoned when the mirrored value fails
// the WLo / WHi test; rows are handled the same way with HLo / HHi / MH2.
asm //~13s with 3 steps | |
push eax | |
push ebx | |
push ecx | |
push edx | |
push esi | |
push edi | |
mov esi, PATL | |
mov edx, psm | |
mov edi, seed | |
mov iDir, 31 | |
xorps xmm2, xmm2 | |
xorps xmm3, xmm3 | |
xorps xmm4, xmm4 | |
movss xmm5, sAbs | |
xorps xmm6, xmm6 | |
xorps xmm7, xmm7 | |
movlps xmm4, ssub //xmm4 = ssub | |
movlps xmm7, sstep //xmm7 = sstep | |
// Per direction: start position = PS * sMinRad - ssub.
@foriDir: mov eax, PS | |
movlps xmm6, [eax] //xmm6 = PS[0,1] | |
movlps xmm2, sMinRad | |
movaps xmm1, xmm6 | |
mulps xmm1, xmm2 | |
subps xmm1, xmm4 //sxy-ssub | |
mov eax, StepCount | |
mov sc, eax | |
// Per sample: advance the seed, jitter the rounded offsets with random bits and iand.
@while: imul edi, $000343FD | |
add edi, $269EC3 | |
mov eax, edi | |
movaps xmm0, xmm1 | |
shr eax, 10 | |
CVTSS2SI ecx, xmm0 //x2 | |
mov ebx, eax | |
and ebx, iand | |
add ecx, ebx | |
shufps xmm0, xmm0, 1 | |
shr eax, 6 | |
CVTSS2SI ebx, xmm0 //y2 | |
and eax, iand | |
add ebx, eax | |
// eax = dx*dx + dy*dy; a zero offset is skipped (no sample at the centre).
push ecx | |
mov eax, ebx | |
imul ecx, ecx | |
imul eax, eax | |
add eax, ecx | |
pop ecx | |
test eax, eax | |
jz @skip | |
CVTSI2SS xmm2, eax | |
add ecx, dword [xy] | |
add ebx, dword [xy + 4] | |
test ecx, ecx //reflection at borders | |
jns @@1 | |
neg ecx | |
cmp ecx, WLo | |
jge @endwhile | |
jmp @@2 | |
@@1: cmp ecx, MWidth | |
jl @@2 | |
sub ecx, MW2 | |
neg ecx | |
cmp ecx, WHi | |
jl @endwhile | |
@@2: test ebx, ebx | |
jns @@3 | |
neg ebx | |
cmp ebx, HLo | |
jge @endwhile | |
jmp @con | |
@@3: cmp ebx, MHeight | |
jl @con | |
sub ebx, MH2 | |
neg ebx | |
cmp ebx, HHi | |
jl @endwhile | |
@con: imul ebx, MWidth | |
add ebx, ecx | |
mov eax, [esi + ebx * 4] //PATL^[y2 * MWidth + x2] | |
sub eax, zp | |
CVTSI2SS xmm0, eax | |
RSQRTSS xmm2, xmm2 | |
mulss xmm0, xmm2 //st | |
movss xmm3, xmm0 | |
andps xmm0, xmm5 | |
mulss xmm3, sit | |
addss xmm0, sZRT | |
mulss xmm3, sZRT | |
rcpss xmm0, xmm0 | |
mulss xmm3, xmm0 | |
minss xmm3, s32767 | |
maxss xmm3, sm32768 | |
CVTSS2SI eax, xmm3 //it := Round(st * sZRT * sit / (sZRT + Abs(st)) ); | |
// Keep the per-direction maximum (signed 16-bit compare).
mov ecx, iDir | |
cmp ax, word [edx + ecx * 2] | |
jle @skip | |
mov word [edx + ecx * 2], ax | |
@skip: movaps xmm3, xmm7 //sstep | |
mulps xmm3, xmm6 //DirXY | |
addps xmm1, xmm3 //sx,sy | |
dec sc | |
jnz @while | |
// Next direction: PS advances by 8 bytes (two singles).
@endwhile: add PS, 8 | |
dec iDir | |
jns @foriDir | |
mov seed, edi | |
pop edi | |
pop esi | |
pop edx | |
pop ecx | |
pop ebx | |
pop eax | |
end | |
// ... | |
procedure TAmbHiQCalcRpano.Execute; | |
// ... | |
// Inline-asm core (the surrounding Pascal code is elided in this listing).
// For each of 32 directions (iDir = 31 downto 0) it walks StepCount samples
// through the integer map PATL and keeps, per direction, the largest value
//   it := Round(st * sZRT * sit / (sZRT + Abs(st))), clamped to -32768..32767,
//   st := (PATL^[y2 * MWidth + x2] - zp) / Sqrt(dx*dx + dy*dy)
// in the 16-bit array psm[iDir]. rsqrtss and rcpss are approximations.
// Register roles: esi = PATL, edx = psm, edi = random seed (linear
// congruential: seed * $343FD + $269EC3), xmm1 = running (sx, sy),
// xmm4 = ssub, xmm5 = sAbs mask, xmm6 = direction PS[0..1], xmm7 = sstep.
// Columns wrap around: x < 0 becomes x + MWidth and x >= MWidth becomes
// x - MWidth, and the direction is abandoned if the result is still out of
// range. Rows are mirrored (y := -y resp. y := MH2 - y) with HLo / HHi tests.
asm //~13s with 3 steps | |
push eax | |
push ebx | |
push ecx | |
push edx | |
push esi | |
push edi | |
mov esi, PATL | |
mov edx, psm | |
mov edi, seed | |
mov iDir, 31 | |
xorps xmm2, xmm2 | |
xorps xmm3, xmm3 | |
xorps xmm4, xmm4 | |
movss xmm5, sAbs | |
xorps xmm6, xmm6 | |
xorps xmm7, xmm7 | |
movlps xmm4, ssub //xmm4 = ssub | |
movlps xmm7, sstep //xmm7 = sstep | |
// Per direction: start position = PS * sMinRad - ssub.
@foriDir: mov eax, PS | |
movlps xmm6, [eax] //xmm6 = PS[0,1] | |
movlps xmm2, sMinRad | |
movaps xmm1, xmm6 | |
mulps xmm1, xmm2 | |
subps xmm1, xmm4 //sxy-ssub | |
mov eax, StepCount | |
mov sc, eax | |
// Per sample: advance the seed, jitter the rounded offsets with random bits and iand.
@while: imul edi, $000343FD | |
add edi, $269EC3 | |
mov eax, edi | |
movaps xmm0, xmm1 | |
shr eax, 10 | |
CVTSS2SI ecx, xmm0 //x2 | |
mov ebx, eax | |
and ebx, iand | |
add ecx, ebx | |
shufps xmm0, xmm0, 1 | |
shr eax, 6 | |
CVTSS2SI ebx, xmm0 //y2 | |
and eax, iand | |
add ebx, eax | |
// eax = dx*dx + dy*dy; a zero offset is skipped (no sample at the centre).
push ecx | |
mov eax, ebx | |
imul ecx, ecx | |
imul eax, eax | |
add eax, ecx | |
pop ecx | |
test eax, eax | |
jz @skip | |
CVTSI2SS xmm2, eax | |
add ecx, dword [xy] | |
add ebx, dword [xy + 4] | |
test ecx, ecx // reflection at borders | |
jns @@1 | |
add ecx, MWidth | |
test ecx, ecx | |
jns @@2 | |
jmp @endwhile | |
@@1: cmp ecx, MWidth | |
jl @@2 | |
sub ecx, MWidth | |
cmp ecx, MWidth | |
jnl @endwhile | |
@@2: test ebx, ebx | |
jns @@3 | |
neg ebx | |
cmp ebx, HLo | |
jge @endwhile | |
jmp @con | |
@@3: cmp ebx, MHeight | |
jl @con | |
sub ebx, MH2 | |
neg ebx | |
cmp ebx, HHi | |
jl @endwhile | |
@con: imul ebx, MWidth | |
add ebx, ecx | |
mov eax, [esi + ebx * 4] //PATL^[y2 * MWidth + x2] | |
sub eax, zp | |
CVTSI2SS xmm0, eax | |
RSQRTSS xmm2, xmm2 | |
mulss xmm0, xmm2 //st | |
movss xmm3, xmm0 | |
andps xmm0, xmm5 | |
mulss xmm3, sit | |
addss xmm0, sZRT | |
mulss xmm3, sZRT | |
rcpss xmm0, xmm0 | |
mulss xmm3, xmm0 | |
minss xmm3, s32767 | |
maxss xmm3, sm32768 | |
CVTSS2SI eax, xmm3 //it := Round(st * sZRT * sit / (sZRT + Abs(st)) ); | |
// Keep the per-direction maximum (signed 16-bit compare).
mov ecx, iDir | |
cmp ax, word [edx + ecx * 2] | |
jle @skip | |
mov word [edx + ecx * 2], ax | |
@skip: movaps xmm3, xmm7 //sstep | |
mulps xmm3, xmm6 //DirXY | |
addps xmm1, xmm3 //sx,sy | |
dec sc | |
jnz @while | |
// Next direction: PS advances by 8 bytes (two singles).
@endwhile: add PS, 8 | |
dec iDir | |
jns @foriDir | |
mov seed, edi | |
pop edi | |
pop esi | |
pop edx | |
pop ecx | |
pop ebx | |
pop eax | |
end | |
// ... | |
procedure TAmbHiQCalcRT0.Execute; | |
// ... | |
// Inline-asm core (the surrounding Pascal code is elided in this listing).
// For each of 32 directions (iDir = 31 downto 0) it walks StepCount samples
// through the integer map PATL and keeps, per direction, the largest value
//   it := Round(st * sit * sZRT / (st * st + sZRT)), clamped to -32768..32767,
//   st := (PATL^[y2 * MWidth + x2] - zp) / Sqrt(dx*dx + dy*dy)
// in the 16-bit array at PSI, indexed by iDir. rsqrtss and rcpss are approximations.
// Register roles: esi = PATL, edx = PSI, edi = random seed (linear
// congruential: seed * $343FD + $269EC3), xmm1 = running (sx, sy),
// xmm4 = ssub, xmm5 = sstep, xmm6 = direction PS[0..1].
// Positions left of 0 or right of MWidth-1 are mirrored (x := -x resp.
// x := MW2 - x) and the direction is abandoned when the mirrored value fails
// the WLo / WHi test; rows are handled the same way with HLo / HHi / MH2.
asm | |
push eax | |
push ebx | |
push ecx | |
push edx | |
push esi | |
push edi | |
mov esi, PATL | |
mov edx, PSI | |
mov edi, seed | |
mov iDir, 31 | |
xorps xmm1, xmm1 | |
xorps xmm2, xmm2 | |
xorps xmm3, xmm3 | |
xorps xmm4, xmm4 | |
xorps xmm5, xmm5 | |
xorps xmm6, xmm6 | |
movlps xmm4, ssub //xmm4 = ssub | |
movlps xmm5, sstep //xmm5 = sstep | |
// Per direction: start position = sMinRad * PS - ssub.
@foriDir: mov eax, PS | |
movlps xmm1, sMinRad // (1.2 at stepw1) | |
movlps xmm6, [eax] //xmm6 = PS[0,1] | |
mulps xmm1, xmm6 | |
subps xmm1, xmm4 //sxy-ssub (-0,5 at stepw1) | |
mov eax, StepCount | |
mov sc, eax | |
// Per sample: advance the seed, jitter the rounded offsets with random bits and iand.
@while: imul edi, $000343FD | |
add edi, $269EC3 | |
mov eax, edi | |
movaps xmm0, xmm1 //sx, sy | |
shr eax, 10 | |
CVTSS2SI ecx, xmm0 | |
mov ebx, eax | |
and ebx, iand | |
add ecx, ebx | |
shufps xmm0, xmm0, 1 | |
shr eax, 6 | |
CVTSS2SI ebx, xmm0 | |
and eax, iand | |
add ebx, eax | |
// eax = dx*dx + dy*dy; a zero offset is skipped (no sample at the centre).
push ecx | |
mov eax, ebx | |
imul ecx, ecx | |
imul eax, eax | |
add eax, ecx | |
pop ecx | |
test eax, eax | |
jz @skip | |
CVTSI2SS xmm2, eax | |
add ecx, dword [xy] | |
add ebx, dword [xy+4] | |
test ecx, ecx // reflection at borders | |
jns @@1 | |
neg ecx | |
cmp ecx, WLo | |
jge @endwhile | |
jmp @@2 | |
@@1: cmp ecx, MWidth | |
jl @@2 | |
sub ecx, MW2 | |
neg ecx | |
cmp ecx, WHi | |
jl @endwhile | |
@@2: test ebx, ebx | |
jns @@3 | |
neg ebx | |
cmp ebx, HLo | |
jge @endwhile | |
jmp @con | |
@@3: cmp ebx, MHeight | |
jl @con | |
sub ebx, MH2 | |
neg ebx | |
cmp ebx, HHi | |
jl @endwhile | |
@con: imul ebx, MWidth | |
add ebx, ecx | |
mov eax, [esi + ebx * 4] //PATL^[y2 * MWidth + x2] | |
sub eax, zp | |
CVTSI2SS xmm0, eax // (CVTPI2PS=sse, 2 int to single) | |
RSQRTSS xmm2, xmm2 | |
mulss xmm0, xmm2 //st := (PATL^[y2 * MWidth + x2] - zp) / Sqrt(st); | |
movss xmm3, xmm0 | |
mulss xmm0, xmm0 | |
mulss xmm3, sit | |
addss xmm0, sZRT | |
mulss xmm3, sZRT | |
rcpss xmm0, xmm0 | |
mulss xmm3, xmm0 | |
minss xmm3, s32767 | |
maxss xmm3, sm32768 | |
CVTSS2SI eax, xmm3 //it := Round(st * sit * sZRT / (st * st + sZRT)); | |
// Keep the per-direction maximum (signed 16-bit compare).
mov ecx, iDir | |
cmp ax, word [edx + ecx * 2] | |
jle @skip | |
mov word [edx + ecx * 2], ax | |
@skip: movaps xmm3, xmm5 //sstep | |
mulps xmm3, xmm6 //DirXY | |
addps xmm1, xmm3 //sx,sy | |
dec sc | |
jnz @while | |
// Next direction: PS advances by 8 bytes (two singles).
@endwhile: add PS, 8 | |
dec iDir | |
jns @foriDir | |
mov seed, edi | |
pop edi | |
pop esi | |
pop edx | |
pop ecx | |
pop ebx | |
pop eax | |
end | |
// ... | |
procedure TAmbHiQCalcRT0pano.Execute; | |
// ... | |
// Inline-asm core (the surrounding Pascal code is elided in this listing).
// For each of 32 directions (iDir = 31 downto 0) it walks StepCount samples
// through the integer map PATL and keeps, per direction, the largest value
//   it := Round(st * sit * sZRT / (st * st + sZRT)), clamped to -32768..32767,
//   st := (PATL^[y2 * MWidth + x2] - zp) / Sqrt(dx*dx + dy*dy)
// in the 16-bit array at PSI, indexed by iDir. rsqrtss and rcpss are approximations.
// Register roles: esi = PATL, edx = PSI, edi = random seed (linear
// congruential: seed * $343FD + $269EC3), xmm1 = running (sx, sy),
// xmm4 = ssub, xmm5 = sstep, xmm6 = direction PS[0..1].
// Columns wrap around: x < 0 becomes x + MWidth and x >= MWidth becomes
// x - MWidth, and the direction is abandoned if the result is still out of
// range. Rows are mirrored (y := -y resp. y := MH2 - y) with HLo / HHi tests.
asm | |
push eax | |
push ebx | |
push ecx | |
push edx | |
push esi | |
push edi | |
mov esi, PATL | |
mov edx, PSI | |
mov edi, seed | |
mov iDir, 31 | |
xorps xmm1, xmm1 | |
xorps xmm2, xmm2 | |
xorps xmm3, xmm3 | |
xorps xmm4, xmm4 | |
xorps xmm5, xmm5 | |
xorps xmm6, xmm6 | |
movlps xmm4, ssub //xmm4 = ssub | |
movlps xmm5, sstep //xmm5 = sstep | |
// Per direction: start position = sMinRad * PS - ssub.
@foriDir: mov eax, PS | |
movlps xmm1, sMinRad // (1.2 at stepw1) | |
movlps xmm6, [eax] //xmm6 = PS[0,1] | |
mulps xmm1, xmm6 | |
subps xmm1, xmm4 //sxy-ssub (-0,5 at stepw1) | |
mov eax, StepCount | |
mov sc, eax | |
// Per sample: advance the seed, jitter the rounded offsets with random bits and iand.
@while: imul edi, $000343FD | |
add edi, $269EC3 | |
mov eax, edi | |
movaps xmm0, xmm1 //sx, sy | |
shr eax, 10 | |
CVTSS2SI ecx, xmm0 | |
mov ebx, eax | |
and ebx, iand | |
add ecx, ebx | |
shufps xmm0, xmm0, 1 | |
shr eax, 6 | |
CVTSS2SI ebx, xmm0 | |
and eax, iand | |
add ebx, eax | |
// eax = dx*dx + dy*dy; a zero offset is skipped (no sample at the centre).
push ecx | |
mov eax, ebx | |
imul ecx, ecx | |
imul eax, eax | |
add eax, ecx | |
pop ecx | |
test eax, eax | |
jz @skip | |
CVTSI2SS xmm2, eax | |
add ecx, dword [xy] | |
add ebx, dword [xy+4] | |
test ecx, ecx // reflection at borders | |
jns @@1 | |
add ecx, MWidth | |
test ecx, ecx | |
jns @@2 | |
jmp @endwhile | |
@@1: cmp ecx, MWidth | |
jl @@2 | |
sub ecx, MWidth | |
cmp ecx, MWidth | |
jnl @endwhile | |
@@2: test ebx, ebx | |
jns @@3 | |
neg ebx | |
cmp ebx, HLo | |
jge @endwhile | |
jmp @con | |
@@3: cmp ebx, MHeight | |
jl @con | |
sub ebx, MH2 | |
neg ebx | |
cmp ebx, HHi | |
jl @endwhile | |
@con: imul ebx, MWidth | |
add ebx, ecx | |
mov eax, [esi + ebx * 4] //PATL^[y2 * MWidth + x2] | |
sub eax, zp | |
CVTSI2SS xmm0, eax // (CVTPI2PS=sse, 2 int to single) | |
RSQRTSS xmm2, xmm2 | |
mulss xmm0, xmm2 //st := (PATL^[y2 * MWidth + x2] - zp) / Sqrt(st); | |
movss xmm3, xmm0 | |
mulss xmm0, xmm0 | |
mulss xmm3, sit | |
addss xmm0, sZRT | |
mulss xmm3, sZRT | |
rcpss xmm0, xmm0 | |
mulss xmm3, xmm0 | |
minss xmm3, s32767 | |
maxss xmm3, sm32768 | |
CVTSS2SI eax, xmm3 //it := Round(st * sit * sZRT / (st * st + sZRT)); | |
// Keep the per-direction maximum (signed 16-bit compare).
mov ecx, iDir | |
cmp ax, word [edx + ecx * 2] | |
jle @skip | |
mov word [edx + ecx * 2], ax | |
@skip: movaps xmm3, xmm5 //sstep | |
mulps xmm3, xmm6 //DirXY | |
addps xmm1, xmm3 //sx,sy | |
dec sc | |
jnz @while | |
// Next direction: PS advances by 8 bytes (two singles).
@endwhile: add PS, 8 | |
dec iDir | |
jns @foriDir | |
mov seed, edi | |
pop edi | |
pop esi | |
pop edx | |
pop ecx | |
pop ebx | |
pop eax | |
end | |
// ... | |
// Tests the point vd^ (three doubles, eax = vd) against the global VolumeLightMap.
//   d      := Single(vd - LightPos)
//   v      := RotMatrix row0 * d.x + row1 * d.y + row2 * d.z   (4 packed singles)
//   ix, iy := Round(clamp(v[0..1] * StretchSide1 + HSizeS, 0, CSizeS))
//   Result := -1 (True) when v[2] < CubeSides^[iy * CubeSize + ix], else 0
// NOTE(review): CubeSides is dereferenced as a pointer to an array of singles;
// if it is an array of pointers only the first entry is used here - confirm.
function VolLightMapPosSSE(vd: TPVec3D): LongBool; | |
asm | |
push esi | |
push edx //to get esp buf | |
lea esi, VolumeLightMap | |
// Each difference is computed on the x87 and spilled as Single through [esp].
fld qword [eax] | |
fsub qword [esi + TVolumetricLightMap.LightPos] | |
fstp dword [esp] | |
fld qword [eax + 8] | |
movss xmm0, [esp] | |
fsub qword [esi + TVolumetricLightMap.LightPos + 8] | |
fstp dword [esp] | |
fld qword [eax + 16] | |
movss xmm1, [esp] | |
fsub qword [esi + TVolumetricLightMap.LightPos + 16] | |
fstp dword [esp] | |
shufps xmm0, xmm0, 0 | |
movss xmm2, [esp] | |
shufps xmm1, xmm1, 0 | |
shufps xmm2, xmm2, 0 | |
movups xmm4, [esi + TVolumetricLightMap.RotMatrix] | |
movups xmm5, [esi + TVolumetricLightMap.RotMatrix + 16] | |
movups xmm6, [esi + TVolumetricLightMap.RotMatrix + 32] | |
mulps xmm4, xmm0 | |
mulps xmm5, xmm1 | |
mulps xmm6, xmm2 | |
addps xmm4, xmm5 | |
addps xmm4, xmm6 | |
xorps xmm2, xmm2 | |
// xmm5 low lane = v[2], taken before v[0..1] are scaled.
movhlps xmm5, xmm4 | |
movss xmm1, [esi + TVolumetricLightMap.StretchSide1] | |
movss xmm3, [esi + TVolumetricLightMap.HSizeS] | |
movss xmm0, [esi + TVolumetricLightMap.CSizeS] | |
shufps xmm1, xmm1, 0 | |
shufps xmm3, xmm3, 0 | |
shufps xmm0, xmm0, 0 | |
mulps xmm4, xmm1 | |
addps xmm4, xmm3 | |
maxps xmm4, xmm2 | |
minps xmm4, xmm0 | |
cvtss2si eax, xmm4 | |
shufps xmm4, xmm4, 1 | |
cvtss2si edx, xmm4 | |
imul edx, dword [esi + TVolumetricLightMap.CubeSize] | |
mov esi, [esi + TVolumetricLightMap.CubeSides] | |
add edx, eax | |
xor eax, eax | |
// comiss sets CF when xmm5 < memory operand (also when unordered).
comiss xmm5, [esi + edx * 4] | |
jnc @e | |
mov eax, -1 | |
@e: pop edx | |
pop esi | |
end; | |
// Cube-map style lookup for the direction vd^ (packed singles, eax = vd).
// The component with the largest absolute value selects a pair of faces:
// x -> 0/1, y -> 2/3, z -> 4/5, where the odd index is used when that
// component is negative. The other two components are scaled by
// SizeFactor / dominant component, rounded, offset by HalfSize, and used as
// (column, row) into CubeSides[face]; the Single found there is returned in st(0).
// cAbsSVec is used as an and-mask to obtain absolute values.
// NOTE(review): labels @3, @4 and @5 are never jumped to; a zero dominant
// component leads to a division by zero - confirm callers exclude null vectors.
function GetVolLightMapVecSSE(vd: TPSVec): Single; | |
asm | |
push esi | |
push ebx | |
xorps xmm4, xmm4 | |
lea esi, VolumeLightMap | |
movups xmm5, [eax] | |
movups xmm7, cAbsSVec | |
// xmm0 = x, xmm1 = y, xmm2 = z (low lanes); xmm5/xmm6/xmm7 = their absolute values.
movaps xmm0, xmm5 | |
movaps xmm1, xmm5 | |
movhlps xmm2, xmm5 | |
shufps xmm1, xmm1, 1 | |
andps xmm5, xmm7 | |
movaps xmm6, xmm5 | |
movhlps xmm7, xmm5 | |
shufps xmm6, xmm6, 1 | |
movss xmm3, [esi + TVolumetricLightMap.SizeFactor] | |
ucomiss xmm5, xmm6 | |
jc @1 | |
ucomiss xmm5, xmm7 | |
jc @2 | |
// x dominant: face 0, or 1 when x < 0 (CF from ucomiss is added via adc).
xor edx, edx | |
ucomiss xmm0, xmm4 | |
adc edx, 0 | |
@3: divss xmm3, xmm0 | |
mulss xmm1, xmm3 | |
mulss xmm2, xmm3 | |
cvtss2si eax, xmm1 | |
cvtss2si ebx, xmm2 | |
jmp @e | |
// z dominant: face 4, or 5 when z < 0.
@2: mov edx, 4 | |
ucomiss xmm2, xmm4 | |
adc edx, 0 | |
@4: divss xmm3, xmm2 | |
mulss xmm0, xmm3 | |
mulss xmm1, xmm3 | |
cvtss2si eax, xmm0 | |
cvtss2si ebx, xmm1 | |
jmp @e | |
@1: ucomiss xmm6, xmm7 | |
jc @2 | |
// y dominant: face 2, or 3 when y < 0.
mov edx, 2 | |
ucomiss xmm1, xmm4 | |
adc edx, 0 | |
@5: divss xmm3, xmm1 | |
mulss xmm0, xmm3 | |
mulss xmm2, xmm3 | |
cvtss2si eax, xmm0 | |
cvtss2si ebx, xmm2 | |
@e: add ebx, [esi + TVolumetricLightMap.HalfSize] | |
add eax, [esi + TVolumetricLightMap.HalfSize] | |
imul ebx, dword [esi + TVolumetricLightMap.CubeSize] | |
mov esi, [esi + edx * 4 + TVolumetricLightMap.CubeSides] | |
add eax, ebx | |
fld dword [esi + eax * 4] | |
pop ebx | |
pop esi | |
end; | |
function TCalcAmbShadowDEThreadGeneral.GetRand: Double;
// Advances the object's seed with seed := seed * $343FD + $269EC3 and returns
// bits 8..30 of the new seed (a 23-bit integer) multiplied by dm in st(0).
// eax = Self; edx is scratch.
const dm: Double = 1 / $7FFFFF;
asm
  imul  edx, [eax + seed], $343FD
  add   edx, $269EC3
  mov   [eax + seed], edx
  and   edx, $7FFFFF00          // keep bits 8..30 ...
  shr   edx, 8                  // ... and move them down to 0..22
  push  edx                     // spill to the stack so fild can read it
  fild  dword [esp]
  fmul  dm
  pop   edx
end;
function TCalcAmbShadowDEThreadGeneral2.GetRand: Double;
// Advances the object's seed with seed := seed * $343FD + $269EC3 and returns
// bits 8..30 of the new seed (a 23-bit integer) multiplied by dm in st(0).
// eax = Self; edx is scratch.
const dm: Double = 1 / $7FFFFF;
asm
  imul  edx, [eax + seed], $343FD
  add   edx, $269EC3
  mov   [eax + seed], edx
  and   edx, $7FFFFF00          // keep bits 8..30 ...
  shr   edx, 8                  // ... and move them down to 0..22
  push  edx                     // spill to the stack so fild can read it
  fild  dword [esp]
  fmul  dm
  pop   edx
end;
function RdTsc: int64;
// Returns the processor time-stamp counter; the instruction leaves it in
// edx:eax, which is also where the Int64 result is returned.
asm
  rdtsc                         // same opcode as the byte pair $0F, $31
end;
function Clamp255(i: Integer): Integer;
// Limits i to at most 255 (signed compare). Values below 255, including
// negative ones, are returned unchanged. eax = i on entry and result on exit.
asm
  cmp   eax, 255
  jg    @clip
  ret                           // already within the upper bound
@clip:
  mov   eax, 255
end;
// Writes four cubic interpolation weights for the parameter t into sv
// (eax = @sv; t is read from the stack at [ebp + 8]). As computed below:
//   sv[0] =     -t^3 + 3*t^2 - 2*t
//   sv[1] =  3*t^3 - 6*t^2 - 3*t + 6
//   sv[2] = -3*t^3 + 3*t^2 + 6*t
//   sv[3] =      t^3             - t
// The four weights add up to 6 for every t, matching the "6 times bigger" note.
// The FPU stack is empty again on exit.
procedure MakeCubicWeightsFromT(const t: Single; var sv: TSVec); //all weights 6 times bigger! | |
const s3: Single = 3; | |
s6: Single = 6; | |
asm | |
fld dword [ebp + 8] | |
fld st | |
fmul st, st //t*t,t | |
fld st | |
fmul st, st(2) //t³,t²,t | |
fld s3 | |
fmul st(2), st //3, t³=sv[3], 3*t²=sv[2], t | |
fld st(2) //sv[2], 3, sv[3], sv[2], t | |
fsub st, st(2) //sv[2]-sv[3], 3, sv[3], sv[2], t | |
fsub st, st(4) //sv[2]-sv[3]-t, 3, sv[3], sv[2], t | |
fsub st, st(4) //sv[2]-sv[3]-2*t, 3, sv[3], sv[2], t | |
fstp dword [eax] //3, sv[3], sv[2], t | |
fld st(1) //sv[3], 3, sv[3], sv[2], t | |
fmul st, st(1) //3*sv[3], 3, sv[3], sv[2], t | |
fsub st, st(3) //3*sv[3]-sv[2], 3, sv[3], sv[2], t | |
fsub st, st(3) //3*sv[3]-2*sv[2], 3, sv[3], sv[2], t | |
// Subtract 3*t and add 6 to finish sv[1].
fld st(4) | |
fmul st, st(2) | |
fsubp | |
fadd s6 | |
fstp dword [eax + 4] | |
fmul st, st(1) | |
fsubp st(2), st //t³,3*t²-3*t³,t | |
fsub st, st(2) | |
fstp dword [eax + 12] //3*t²-3*t³,t | |
// sv[2] = 6*t + (3*t^2 - 3*t^3).
fxch | |
fmul s6 | |
faddp | |
fstp dword [eax + 8] | |
end; | |
function GetCosTabVal(const Tnr: Integer; const DotP, Rough: Single): Single; | |
// ... | |
// Inline-asm core (the Pascal code that prepares ip and w is elided here).
// Reads four consecutive singles from DiffCosTabNsmall at index Tnr * 128 + ip
// and four more $800 bytes further on, weights both groups with the four
// singles in w and sums each group horizontally:
//   A := sum(tab[i] * w[i]),  B := sum(tab[i + $200] * w[i])   (i = 0..3)
//   Result := A + (B - A) * Rough
asm | |
mov edx, Tnr | |
shl edx, 7 | |
add edx, ip | |
lea eax, DiffCosTabNsmall + edx * 4 | |
movups xmm2, w | |
movups xmm0, [eax] | |
movups xmm1, [eax + $800] | |
mulps xmm0, xmm2 | |
mulps xmm1, xmm2 | |
// Interleave the two products so that one add sequence reduces both sums:
// afterwards xmm3 lane 0 = A and lane 1 = B.
movaps xmm3, xmm0 | |
unpcklps xmm3, xmm1 | |
unpckhps xmm0, xmm1 | |
addps xmm3, xmm0 | |
movhlps xmm0, xmm3 | |
addps xmm3, xmm0 | |
movaps xmm2, xmm3 | |
shufps xmm2, xmm2, 1 | |
subss xmm2, xmm3 | |
mulss xmm2, Rough | |
addss xmm2, xmm3 | |
movss Result, xmm2 | |
end | |
// ... | |
function GetCosTabValSqr(const Tnr: Integer; const DotP, Rough: Single): Single; | |
// ... | |
// Inline-asm core (the Pascal code that prepares ip and w is elided here).
// Reads four consecutive singles from DiffCosTabNsmall at index Tnr * 128 + ip
// and four more $800 bytes further on, weights both groups with the four
// singles in w, sums each group horizontally and squares both sums:
//   A := sqr(sum(tab[i] * w[i])),  B := sqr(sum(tab[i + $200] * w[i]))   (i = 0..3)
//   Result := A + (B - A) * Rough
asm | |
mov edx, Tnr | |
shl edx, 7 | |
add edx, ip | |
lea eax, DiffCosTabNsmall + edx * 4 | |
movups xmm2, w | |
movups xmm0, [eax] | |
movups xmm1, [eax + $800] | |
mulps xmm0, xmm2 | |
mulps xmm1, xmm2 | |
// Interleave the two products so that one add sequence reduces both sums:
// afterwards xmm3 lane 0 and lane 1 hold the two sums, squared by mulps.
movaps xmm3, xmm0 | |
unpcklps xmm3, xmm1 | |
unpckhps xmm0, xmm1 | |
addps xmm3, xmm0 | |
movhlps xmm0, xmm3 | |
addps xmm3, xmm0 | |
mulps xmm3, xmm3 | |
movaps xmm2, xmm3 | |
shufps xmm2, xmm2, 1 | |
subss xmm2, xmm3 | |
mulss xmm2, Rough | |
addss xmm2, xmm3 | |
movss Result, xmm2 | |
end | |
// ... | |
function TMCCalcThread.GetRand: Double;
// Advances the object's seed with seed := seed * $343FD + $269EC3 and returns
// the low 31 bits of the new seed multiplied by dSeedMul in st(0).
// eax = Self; edx is scratch.
asm
  imul  edx, [eax + seed], $343FD
  add   edx, $269EC3
  mov   [eax + seed], edx
  and   edx, $7FFFFFFF          // non-negative 31-bit value
  sub   esp, 4                  // temporary dword for fild
  mov   [esp], edx
  fild  dword [esp]
  fmul  dSeedMul
  add   esp, 4
end;
// Generates a point on the unit sphere from two parameters u and v:
//   r      := 2 * Sqrt(v * (1 - v))
//   Result := (Cos(u * PiM2) * r, Sin(u * PiM2) * r, 1 - 2 * v, 0)
// eax = Self, edx = address of the TSVec result.
// When bDoDOF = 0: v = HaltonDiscY and u = HaltonDiscX. Otherwise both are
// taken from two GetRand calls (first call -> v, second -> u); edx is saved
// around those calls because GetRand uses it as scratch.
function TMCCalcThread.GenSphereSVecOm: TSVec; //fullsphere | |
asm | |
cmp dword [eax + TMCCalcThread.bDoDOF], 0 | |
jnz @@1 | |
fld dword [eax + TMCCalcThread.HaltonDiscY] | |
fld dword [eax + TMCCalcThread.HaltonDiscX] | |
jmp @@2 | |
@@1: | |
push edx | |
call GetRand | |
call GetRand | |
pop edx | |
@@2: | |
fmul PiM2 | |
fsincos //cos,sin,v | |
fld1 | |
fsub st, st(3) | |
fmul st, st(3) | |
fsqrt | |
fadd st, st //r,cos,sin,v | |
fmul st(2), st | |
fmulp //c',s',v | |
fstp dword [edx] | |
fstp dword [edx + 4] | |
// Remaining st(0) = v: third component is 1 - 2*v.
fadd st, st | |
fld1 | |
fsubrp | |
fstp dword [edx + 8] | |
xor eax, eax | |
mov [edx + 12], eax //} | |
end; | |
function ByteSwap(const a: integer): integer;
// Reverses the order of the four bytes of a (eax in, eax out).
asm
  xchg  al, ah                  // swap bytes 0 and 1
  rol   eax, 16                 // exchange the two 16-bit halves
  xchg  al, ah                  // swap the former bytes 2 and 3
end;
function ByteSwap16(inp:word): word;
// Exchanges the two bytes of inp; the upper half of eax is returned as zero.
asm
  xchg  al, ah                  // swap low and high byte of the word
  movzx eax, ax                 // clear bits 16..31
end;
function TPngObject.RGB2Quad(RGB: pRGBPixel): TRGBQuad;
// Builds the 4-byte result from the three bytes at RGB^: bytes 0..2 are copied
// in place and byte 3 is zero. Exactly three source bytes are read (one word
// and one byte). eax = Self (unused), edx = RGB, result in eax.
asm
  movzx eax, byte [edx + 2]     // third source byte
  shl   eax, 16
  movzx ecx, word [edx]         // first two source bytes
  or    eax, ecx
end;
// Returns the dword stored at [EBP + 4].
// NOTE(review): this routine has no parameters or locals, so presumably no
// frame is set up for it and EBP still belongs to the caller; [EBP + 4] would
// then be the return address of the calling routine, provided that routine
// has a standard EBP frame - confirm.
function ReturnAddr: Pointer; | |
asm | |
MOV EAX,[EBP+4] // sysutils.pas says [EBP-4], but this works ! | |
end; | |
function CompareMem(P1, P2: Pointer; Length: Integer): Boolean; assembler;
// Returns True (eax = 1) when the Length bytes at P1 and P2 are equal,
// False (eax = 0) otherwise. A Length of 0 yields True.
// Registers: eax = P1, edx = P2, ecx = Length. Whole dwords are compared
// first, then the remaining 0..3 bytes.
asm
  PUSH  ESI
  PUSH  EDI
  MOV   ESI, P1
  MOV   EDI, P2
  XOR   EAX, EAX                // result defaults to False
  MOV   EDX, ECX
  AND   EDX, 3                  // EDX = trailing byte count
  SHR   ECX, 2                  // ECX = dword count; ZF = 1 when it is zero
  REPE  CMPSD
  JNE   @@done
  MOV   ECX, EDX
  REPE  CMPSB                   // with ECX = 0 the ZF = 1 from above is kept
@@done:
  SETE  AL                      // ZF reflects the last comparison performed
  POP   EDI
  POP   ESI
end;
// NOTE(review): the routine header for this fragment is not part of this listing.
// Converts Count dwords at Colors in place, from the last element down to the
// first: each value $xxCCBBAA becomes $00AABBCC (bytes 0 and 2 exchanged,
// byte 3 cleared). When wProcessorLevel = 3 a variant without BSWAP is used.
GetSystemInfo(SysInfo); | |
asm | |
MOV EDX, Colors | |
MOV ECX, Count | |
DEC ECX | |
JS @@END | |
LEA EAX, SysInfo | |
CMP [EAX].TSystemInfo.wProcessorLevel, 3 | |
JE @@386 | |
// BSWAP reverses all four bytes; SHR 8 then drops the former low... i.e. the
// original top byte and leaves $00AABBCC.
@@1: MOV EAX, [EDX+ECX*4] | |
BSWAP EAX | |
SHR EAX,8 | |
MOV [EDX+ECX*4],EAX | |
DEC ECX | |
JNS @@1 | |
JMP @@END | |
// Same result assembled byte by byte in EBX.
@@386: | |
PUSH EBX | |
@@2: XOR EBX,EBX | |
MOV EAX, [EDX+ECX*4] | |
MOV BH, AL | |
MOV BL, AH | |
SHR EAX,16 | |
SHL EBX,8 | |
MOV BL, AL | |
MOV [EDX+ECX*4],EBX | |
DEC ECX | |
JNS @@2 | |
POP EBX | |
@@END: | |
end; | |
function Scan(Buf: PAnsiChar; Value: Byte; Count: integer): boolean; assembler;
// Returns True when the byte Value occurs in the first Count bytes at Buf.
// Registers: eax = Buf, dl = Value, ecx = Count.
// An empty range (Count <= 0) now returns False explicitly. Previously
// REPNE SCASB ran zero iterations for Count = 0 and left ZF untouched, so the
// result depended on whatever flags the caller happened to leave behind.
asm
  PUSH  EDI
  MOV   EDI, Buf
  MOV   ECX, Count
  MOV   AL, Value
  TEST  ECX, ECX
  JLE   @@NotFound              // nothing to scan
  REPNE SCASB                   // stops with ZF = 1 on the first match
  JE    @@Found
@@NotFound:
  MOV   EAX, False
  JMP   @@Done
@@Found:
  MOV   EAX, True
@@Done:
  POP   EDI
end;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment