Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save valera-rozuvan/28a0ec1dd706a66cf1f08138939d0db1 to your computer and use it in GitHub Desktop.
Save valera-rozuvan/28a0ec1dd706a66cf1f08138939d0db1 to your computer and use it in GitHub Desktop.
All assembly code from thargor6/mb3d project
// Searches the bytes Where[Start .. BSize-1] for the low byte of What.
// Returns the offset of the first match relative to Where (not to Start),
// or -1 when nothing is found or the range is empty.
// Only the low byte of What is compared (scasb uses AL).
function FastLocateByte(const Where; Start, BSize: Integer; What: Word): Integer; assembler; pascal;
asm
push edi
mov ecx, [bsize]
sub ecx, [start] // ecx = number of bytes to scan
jle @notfound // No data to search (also rejects Start > BSize, which would wrap to a huge count)
mov edi, [where]
add edi, [start]
mov ax, [what]
@search:
repne scasb
je @found
@notfound:
mov eax, -1
jmp @end
@found:
mov eax, edi // edi points one past the matching byte
dec eax
sub eax, [where]
@end:
pop edi
end;
// Searches the bytes where[start .. bsize-1] for the two-byte sequence
// (low byte of what, high byte of what). Returns the offset of the first
// byte of the match relative to where, or -1 when not found.
// A match must lie completely inside the range: a first-byte hit on the
// very last byte is rejected instead of reading one byte past the range.
function FastLocate2Bytes(const where; start, bsize: integer; what: word):integer; assembler; pascal; far;
asm
push edi
mov ecx, [bsize]
sub ecx, [start] // ecx = number of bytes to scan
jle @notfound // No data to search (also rejects start > bsize)
mov edi, [where]
add edi, [start]
mov ax, [what] // al = first byte, ah = second byte
@search:
repne scasb
jne @notfound
test ecx, ecx // first byte matched on the last byte of the range?
jz @notfound // then there is no second byte to compare
cmp [edi], ah
jne @search // ecx > 0 here, so the rescan sets fresh flags
mov eax, edi
dec eax
sub eax, [where]
jmp @end
@notfound:
mov eax, -1
@end:
pop edi
end;
// Searches BSize consecutive 32-bit values starting at Where for What.
// BSize is an element count (it is used directly as the scasd repeat count).
// Returns the zero-based element index of the first match, or -1 when
// not found or when BSize <= 0.
// Register convention: eax = @Where, edx = BSize, ecx = What.
function FastLocateDWord(var Where; BSize: Integer; What: LongInt): Integer; assembler; register;
asm
push edi
mov edi, eax // edi = scan pointer
mov eax, ecx // eax = value to find
mov ecx, edx // ecx = element count
mov edx, edi // edx = base address, kept for the index computation
test ecx, ecx
jle @notfound // with a zero count repne scasd would leave stale flags for je
@search:
repne scasd
je @found
@notfound:
mov eax, -1
jmp @end
@found:
mov eax, edi // edi points one element past the match
sub eax, edx
shr eax, 2
dec eax
@end:
pop edi
end;
// Writes sizeof zero bytes starting at dest.
// Register convention: eax = @dest, edx = sizeof.
// The bulk is cleared in dwords; bits 1 and 0 of the size select the
// trailing word and byte stores.
procedure ZeroMem( var dest; sizeof: integer ); assembler; register;
asm
push edi // edi must be preserved for the caller
mov edi, eax // edi = write pointer
mov ecx, edx
xor eax, eax // fill value 0
shr ecx, 2 // ecx = number of whole dwords
rep stosd
test dl, 2 // two-byte remainder?
jz @tailbyte
stosw
@tailbyte:
test dl, 1 // one-byte remainder?
jz @done
stosb
@done:
pop edi
end;
// Stores Value into Count consecutive 32-bit slots starting at Dest.
// Register convention: eax = @Dest, edx = Count, ecx = Value.
// Count is used unchecked as the rep stosd repeat count.
procedure FillDWord(var Dest; Count: Integer; Value: Cardinal); assembler; register;
asm
push edi // protect edi
mov edi, eax // edi=@dest
mov eax, ecx // eax=Value
mov ecx, edx // ecx = repeat count
rep stosd
pop edi
end;
// Fills sizeof bytes at dest with the byte value fill.
// Register convention: eax = @dest, edx = sizeof, cl = fill.
// The fill byte is replicated into all four bytes of eax, the bulk is
// written in dwords, then bits 1 and 0 of the size select the trailing
// word and byte stores.
procedure FastFillChar( var dest; sizeof: integer; fill: byte ); assembler; register;
asm
push edi { protect edi }
mov edi, eax { edi=@dest }
mov ch, cl { cx = fill:fill }
mov ax, cx { low word of eax = fill:fill }
bswap eax { move that pair into the high word }
mov ax, cx { eax = fill in all four bytes }
mov ecx, edx
shr ecx, 2 { ecx = number of whole dwords }
rep stosd
mov ecx, edx
bt ecx, 1 { two-byte remainder? }
jnc @stobyte
stosw
@stobyte:
bt ecx, 0 { one-byte remainder? }
jnc @ende
stosb
@ende:
pop edi
end;
// Returns A with its two bytes exchanged (16-bit byte-order swap).
// Register convention: A arrives in ax and the result is returned in ax.
function GetSwap2(A: Word): Word; assembler; register;
asm
xchg al, ah // swap low and high byte in place
end;
// Reverses the byte order of the 32-bit value A in place.
// Register convention: eax = @A.
procedure Swap4(var A: Cardinal); assembler; register;
asm
mov ecx, [eax]
bswap ecx // reverse the four bytes
mov [eax], ecx
end;
// Returns A with its four bytes in reversed order.
// Register convention: A arrives in eax and the result is returned in eax.
function GetSwap4(A: Cardinal): Cardinal; assembler; register;
asm
bswap eax
end;
// Exchanges the 32-bit values stored at A and B.
// Register convention: eax = @A, edx = @B.
// Both values are read before either location is written; A is written
// first, then B.
procedure SwapDWords(var A,B); assembler; register;
asm
push dword ptr [eax] // park the old A on the stack
mov ecx, [edx] // ecx = old B
mov [eax], ecx // A := old B
pop dword ptr [edx] // B := old A
end;
// Returns $FFFFFFFF when any bit of the first three 32-bit components of
// sv^ is set, otherwise 0. The test is on raw bits, so a component
// holding -0.0 counts as non-zero. The fourth component is ignored.
function NotZeroSVec(sv: TPSVec): LongBool; //eax 0, $FFFFFFFF
asm
mov ecx, [eax + 8]
or ecx, [eax + 4]
or ecx, [eax] // ecx = bitwise OR of the three components
neg ecx // carry is set exactly when ecx was non-zero
sbb eax, eax // eax = 0 - carry = 0 or $FFFFFFFF
end;
// Replaces every negative component of the four-single vector sv1^ with 0
// (component-wise max with zero), in place. Uses SSE unconditionally;
// no SupportSSE check is made here.
procedure Clamp0SvecSSE(sv1: TPSVec);
asm
movups xmm0, [eax]
xorps xmm1, xmm1 // xmm1 = (0,0,0,0)
maxps xmm0, xmm1
movups [eax], xmm0
end;
// Exchanges the contents of two vectors of three doubles (V1^ <-> V2^).
// All six values are loaded onto the FPU stack first, then stored back
// crosswise in reverse load order.
procedure FlipVecs(V1, V2: TPVec3D);
asm
fld qword [eax]
fld qword [eax + 8]
fld qword [eax + 16]
fld qword [edx]
fld qword [edx + 8]
fld qword [edx + 16] // stack: b2,b1,b0,a2,a1,a0
fstp qword [eax + 16]
fstp qword [eax + 8]
fstp qword [eax]
fstp qword [edx + 16]
fstp qword [edx + 8]
fstp qword [edx]
end;
// Returns the weighted sum sv[0]*s03 + sv[1]*s059 + sv[2]*s011, replaced
// by 0 when the sum is negative. s03, s059 and s011 are constants defined
// outside this block (presumably luma weights - verify at their definition).
function YofSVec(sv: TPSVec): Single;
asm // Result := sv[0] * s03 + sv[1] * s059 + sv[2] * s011;
fld dword [eax]
fmul s03
fld dword [eax + 4]
fmul s059
faddp
fld dword [eax + 8]
fmul s011
faddp
ftst // compare the sum with 0
fnstsw ax
shr ah, 1 // carry = C0, set when the sum is below 0
jnc @@1
fstp st // drop the negative sum
fldz // and return 0 instead
@@1:
end;
// Returns the largest of the first three components of sv^.
// With SupportSSE set, the maximum is computed with maxss and moved to the
// FPU result register through a 4-byte stack slot; otherwise an x87
// compare chain is used.
function MaxOfSVec(sv: TPSVec): Single;
asm
cmp SupportSSE, 0
jz @@1
push edx // reserves a 4-byte scratch slot on the stack
movss xmm0, [eax]
maxss xmm0, [eax + 4]
maxss xmm0, [eax + 8]
movss [esp], xmm0
fld dword [esp] // the Single result is returned in ST(0)
pop edx
ret
@@1:
mov edx, eax // eax is overwritten by fnstsw below
fld dword [eax]
fcom dword [edx + 4]
fnstsw ax
and ah, 41H // C3 or C0 set: ST(0) is not greater than the operand
jz @@up1
fstp st
fld dword [edx + 4]
@@up1:
fcom dword [edx + 8]
fnstsw ax
and ah, 41H
jz @@up2
fstp st
fld dword [edx + 8]
@@up2:
end;
// Compares the 7 bytes at [eax] and [edx] (a dword, a word and a byte).
// Returns $FFFFFFFF when all seven bytes are equal, otherwise 0.
// NOTE(review): the code treats d1 and d2 as addresses in eax/edx, i.e. it
// relies on Double7B being passed by reference - confirm against the type.
function D7Bequal(d1, d2: Double7B): LongBool;
asm
push ecx
mov ecx, [eax]
cmp ecx, [edx] // bytes 0..3
jne @@1
mov cx, [eax + 4]
cmp cx, word [edx + 4] // bytes 4..5
jne @@1
mov cl, [eax + 6]
cmp cl, byte [edx + 6] // byte 6
jne @@1
mov eax, $FFFFFFFF
jmp @@2
@@1:
xor eax, eax
@@2:
pop ecx
end;
// Expands a 7-byte value into a Double: the seven source bytes become
// bytes 1..7 of the double and byte 0 (the lowest mantissa byte) is zero.
// The double is assembled in an 8-byte stack temporary and returned in ST(0).
function D7BtoDouble(const D7B: Double7B): Double;
asm
add esp, -8 // 8-byte temporary
xor edx, edx
mov [esp], edx // clears byte 0 (bytes 1..3 are overwritten next)
mov edx, [eax]
mov [esp + 1], edx // source bytes 0..3 -> temp bytes 1..4
mov edx, [eax + 3]
mov [esp + 4], edx // source bytes 3..6 -> temp bytes 4..7
fld qword [esp]
add esp, 8
end;
// Packs a Double into 7 bytes by dropping its lowest byte: bytes 1..7 of D
// are written to bytes 0..6 of the result.
// D is read from the stack at [ebp + 8]; eax holds the result address.
function DoubleToD7B(const D: Double): Double7B;
asm
mov edx, [ebp + 9] // bytes 1..4 of D
mov [eax], edx
mov edx, [ebp + 12] // bytes 4..7 of D
mov [eax + 3], edx // overlapping store completes result bytes 3..6
end;
// Scales the three-double vector PDVec^ by d32767 / sqrt(x²+y²+z²+d1em100)
// and stores the rounded components as three 16-bit integers at PsiLight^
// (x at +0, y at +2, z at +4). d1em100 and d32767 are constants defined
// outside this block (presumably 1e-100 and 32767 - verify).
procedure MakeWNormalsFromDVec(PsiLight: TPLNormals; PDVec: TPVec3D);
asm
fld qword [edx]
fld st
fmul st, st //x²,x
fld qword [edx + 8]
fld st
fmul st, st //y²,y,x²,x
faddp st(2), st //y,x²+y²,x
fld qword [edx + 16]
fld st
fmul st, st //z²,z,y,x²+y²,x
faddp st(3), st //z,y,x²+y²+z²,x
fxch st(2) //x²+y²+z²,y,z,x
fadd d1em100 // small bias before the square root and division
fsqrt
fdivr d32767 //d32767/length,y,z,x
fmul st(3), st
fmul st(2), st
fmulp //y',z',x'
fistp word [eax + 2]
fistp word [eax + 4]
fistp word [eax]
end;
// Raises base to a power by repeated squaring. A negative base yields 0.
// Otherwise expo is shifted right once, then base is squared once per
// further shift until the shifted value reaches zero. For expo = 2^k
// (k >= 1) this gives base^expo; note that expo = 0 or 1 still squares once.
// base is read from [ebp + 8]; expo arrives in eax.
function FastIntPow(const base: Single; const expo: Integer): Single; //powers with expo in 2^x x in[1..much] for spec painting, if ipol, expo could be float!
asm
fld dword [ebp + 8]
mov edx, eax // keep expo; eax is overwritten by fnstsw
ftst
fnstsw ax
shr ah, 1 // carry = C0, set when base is below 0
jnc @@1
fstp st
fldz
jmp @@end
@@1:
shr edx, 1
@@2:
fmul st, st // square the running value
shr edx, 1
jnz @@2
@@end:
end;
// Returns d, or 0 when d is negative. d is read from [ebp + 8].
function Clamp0D(const d: Double): Double;
asm
fld qword [ebp + 8]
ftst
fnstsw ax
shr ah, 1 // carry = C0, set when d is below 0
jnc @@end
fstp st
fldz
@@end:
end;
// Returns sv limited to the range [0, 1]: 0 when sv is negative, 1 when
// sv is greater than 1, otherwise sv. sv is read from [ebp + 8].
function Clamp01S(const sv: Single): Single;
asm
fld dword [ebp + 8]
ftst
fnstsw ax
shr ah, 1 // carry = C0, set when sv is below 0
jnc @@1
fstp st
fldz
jmp @@end
@@1:
fld1
fcomp st(1) // compare 1 with sv, then pop the 1
fnstsw ax
shr ah, 1 // carry set when 1 < sv
jnc @@end
fstp st
fld1
@@end:
end; //ret 4
// Returns dv limited to the range [0, 1]: 0 when dv is negative, 1 when
// dv is greater than 1, otherwise dv. dv is read from [ebp + 8].
function Clamp01D(const dv: Double): Double;
asm
fld qword [ebp + 8]
ftst
fnstsw ax
shr ah, 1 // carry = C0, set when dv is below 0
jnc @@1
fstp st
fldz
jmp @@end
@@1:
fld1
fcomp st(1) // compare 1 with dv, then pop the 1
fnstsw ax
shr ah, 1 // carry set when 1 < dv
jnc @@end
fstp st
fld1
@@end:
end;
// Computes four single-precision coefficients from xs and stores them at
// the result address in eax:
//   Result[3] = d1d6 * xs^3
//   Result[0] = d1d6 + s05 * xs * (xs - 1) - Result[3]
//   Result[2] = xs + Result[0] - 2 * Result[3]
//   Result[1] = 1 - Result[0] - Result[2] - Result[3]
// so the four coefficients sum to 1. d1d6 and s05 are constants defined
// outside this block (presumably 1/6 and 0.5 - verify). xs is at [ebp + 8].
function MakeSplineCoeff(const xs: Double): TSVec;
asm
fld d1d6
fld qword [ebp + 8]
fld st
fmul st, st
fmul st, st(1)
fmul st, st(2)
fst dword [eax + 12] //Result[3],xs,d1d6
fld1
fsub st, st(2) //1-xs,Result[3],xs,d1d6
fmul st, st(2)
fmul s05
fsubp st(3), st //Result[3],xs,d1d6 + 0.5 * xs * (xs - 1.0)
fsub st(2), st //Result[3],xs,Result[0]
fxch //xs,Result[3],Result[0]
fadd st, st(2)
fsub st, st(1)
fsub st, st(1) //Result[2],Result[3],Result[0]
fst dword [eax + 8]
fld1
fsubrp //1-Result[2],Result[3],Result[0]
fsubrp //1-Result[2]-Result[3],Result[0]
fsub st, st(1) //1-Result[2]-Result[3]-Result[0],Result[0]
fstp dword [eax + 4]
fstp dword [eax]
end;
// Result[i] := sv1[i] + sv2[i] * w2 for i = 0..2; Result[3] := 0.
// eax = @sv1, edx = @sv2, ecx = @Result, w2 at [ebp + 8].
function Add2SVecsWeight2(const sv1, sv2: TSVec; const w2: Single): TSVec;
asm
fld dword [edx]
fld dword [edx + 4]
fld dword [edx + 8]
fld dword [ebp + 8] //w2,s22,s21,s20
fmul st(3), st
fmul st(2), st
fmulp //s22*w2,s21*w2,s20*w2
fadd dword [eax + 8]
fstp dword [ecx + 8]
fadd dword [eax + 4]
fstp dword [ecx + 4]
fadd dword [eax]
fstp dword [ecx]
xor eax, eax
mov [ecx + 12], eax // fourth component is cleared
end;
// Result[i] := sv2[i] + (sv1[i] - sv2[i]) * w1 for all four components.
// eax = @sv1, edx = @sv2, ecx = @Result, w1 at [ebp + 8].
// The SSE path leaves through its own epilogue (pop ebp / ret 4), which
// mirrors the frame the [ebp + 8] access relies on. The FPU path uses all
// eight x87 registers.
function LinInterpolate2SVecs(const sv1, sv2: TSVec; const w1: Single): TSVec;
asm
cmp SupportSSE, 0
jz @@1
movss xmm2, [ebp + 8]
movups xmm0, [eax]
movups xmm1, [edx]
shufps xmm2, xmm2, 0 // broadcast w1 to all four lanes
subps xmm0, xmm1
mulps xmm0, xmm2
addps xmm0, xmm1
movups [ecx], xmm0
pop ebp
ret 4
@@1:
fld dword [edx]
fld dword [edx + 4]
fld dword [edx + 8]
fld dword [edx + 12]
fld dword [ebp + 8]
fld dword [eax]
fld dword [eax + 4]
fld dword [eax + 8] //s12,s11,s10,w1,s23,s22,s21,s20
fsub st, st(5)
fmul st, st(3)
faddp st(5), st //s11,s10,w1,s23,result2,s21,s20
fsub st, st(5)
fmul st, st(2)
faddp st(5), st //s10,w1,s23,result2,result1,s20
fsub st, st(5)
fmul st, st(1)
faddp st(5), st //w1,s23,result2,result1,result0
fld dword [eax + 12]
fsub st, st(2) //..,w1,s23,result2,result1,result0
fmulp //..*w1,s23,result2,result1,result0
faddp
fstp dword [ecx + 12]
fstp dword [ecx + 8]
fstp dword [ecx + 4]
fstp dword [ecx]
end;
// Result[i] := sv1[i] * w1 + sv2[i] * w2 for i = 0..2; Result[3] := 0.
// eax = @sv1, edx = @sv2, ecx = @Result, w1 at [ebp + 12], w2 at [ebp + 8].
// In the SSE path shufps with $C0 copies each weight into lanes 0..2 and
// leaves lane 3 at zero, so the fourth component also comes out as 0.
// The SSE path leaves through its own epilogue (pop ebp / ret 8).
function Add2SVecsWeight(const sv1, sv2: TSVec; const w1, w2: Single): TSVec;
asm
cmp SupportSSE, 0
jz @@1
movss xmm2, [ebp + 12]
movss xmm3, [ebp + 8]
movups xmm0, [eax]
movups xmm1, [edx]
shufps xmm2, xmm2, $C0
shufps xmm3, xmm3, $C0
mulps xmm0, xmm2
mulps xmm1, xmm3
addps xmm0, xmm1
movups [ecx], xmm0
pop ebp
ret 8
@@1:
fld dword [edx]
fld dword [edx + 4]
fld dword [edx + 8]
fld dword [ebp + 8]
fmul st(3), st
fmul st(2), st
fmulp
fld dword [eax]
fld dword [eax + 4]
fld dword [eax + 8]
fld dword [ebp + 12]
fmul st(3), st
fmul st(2), st
fmulp //s12,s11,s10,s22,s21,s20
xor eax, eax
faddp st(3), st //s11,s10,result2,s21,s20
faddp st(3), st
faddp st(3), st //result2,result1,result0
fstp dword [ecx + 8]
fstp dword [ecx + 4]
fstp dword [ecx]
mov [ecx + 12], eax
end;
// Sets all four 32-bit components of sv to zero. eax = @sv.
procedure ClearSVec(var sv: TSVec);
asm
xor ecx, ecx // ecx = 0 (bit pattern of 0.0)
mov [eax + 12], ecx
mov [eax + 8], ecx
mov [eax + 4], ecx
mov [eax], ecx
end;
// Sets the three doubles of dv to 0.0. eax = @dv.
procedure ClearDVec(var dv: TVec3D);
asm
fldz
fst qword [eax]
fst qword [eax + 8]
fstp qword [eax + 16] // last store also pops the zero
end;
// In place: v[3] := 0, then v[i] := sqrt(max(v[i], 0)) for i = 0..2.
// The SSE path processes all four lanes (lane 3 was just cleared, so it
// stays 0); the FPU path handles the three components one by one.
procedure mClampSqrtSVecV(v: TPSVec);
asm
xor edx, edx
mov [eax + 12], edx // clear the fourth component first
cmp SupportSSE, 0
jz @@1
movups xmm0, [eax]
xorps xmm1, xmm1
maxps xmm0, xmm1 // negatives -> 0
sqrtps xmm0, xmm0
movups [eax], xmm0
ret
@@1:
mov edx, eax // eax is overwritten by fnstsw below
fld dword [edx]
ftst
fnstsw ax
shr ah, 1 // carry = C0, set when the value is below 0
jnc @@2
fstp st
fldz
jmp @@21
@@2:
fsqrt
@@21:
fstp dword [edx]
fld dword [edx + 4]
ftst
fnstsw ax
shr ah, 1
jnc @@3
fstp st
fldz
jmp @@31
@@3:
fsqrt
@@31:
fstp dword [edx + 4]
fld dword [edx + 8]
ftst
fnstsw ax
shr ah, 1
jnc @@4
fstp st
fldz
jmp @@41
@@4:
fsqrt
@@41:
fstp dword [edx + 8]
end;
// In place: v[3] := 0, then v[i] := sqr(max(v[i], 0)) for i = 0..2.
// The SSE path processes all four lanes (lane 3 was just cleared, so it
// stays 0); the FPU path handles the three components one by one.
procedure mClampSqrSVecV(v: TPSVec);
asm
xor edx, edx
mov [eax + 12], edx // clear the fourth component first
cmp SupportSSE, 0
jz @@1
movups xmm0, [eax]
xorps xmm1, xmm1
maxps xmm0, xmm1 // negatives -> 0
mulps xmm0, xmm0
movups [eax], xmm0
ret
@@1:
mov edx, eax // eax is overwritten by fnstsw below
fld dword [edx]
ftst
fnstsw ax
shr ah, 1 // carry = C0, set when the value is below 0
jnc @@2
fstp st
fldz
@@2:
fmul st, st
fstp dword [edx]
fld dword [edx + 4]
ftst
fnstsw ax
shr ah, 1
jnc @@3
fstp st
fldz
@@3:
fmul st, st
fstp dword [edx + 4]
fld dword [edx + 8]
ftst
fnstsw ax
shr ah, 1
jnc @@4
fstp st
fldz
@@4:
fmul st, st
fstp dword [edx + 8]
end;
// Returns x / (y - x * y + x). Despite the name no power function is
// evaluated. x is at [ebp + 12], y at [ebp + 8]. No check is made for a
// zero denominator.
function FastPow(const x, y: Single): Single; //used by vis light 3
asm // Result := x / (y - x * y + x);
fld dword [ebp+12]
fld st //x,x
fmul dword [ebp+8] //x*y,x
fsubr dword [ebp+8] //y-x*y,x
fadd st, st(1) //y-x*y+x,x
fdivp
end;
// Reads three signed 16-bit integers at PsiLight (+0, +2, +4), multiplies
// each by d3 (3.0518509476e-5, roughly 1/32767) and stores them as singles
// in Result[0..2]; Result[3] := 0. eax = PsiLight, edx = @Result.
function MakeSVecFromNormalsD(PsiLight: Pointer): TSVec;
const d3: Double = 3.0518509476e-5;
asm
fild word [eax]
fild word [eax + 2]
fild word [eax + 4]
fld d3
fmul st(3), st
fmul st(2), st
fmulp
xor eax, eax
fstp dword [edx + 8]
fstp dword [edx + 4]
fstp dword [edx]
mov [edx + 12], eax
end;
// Result[i] := min(max(V1[i], smin), smax) for all four components.
// smin is at [ebp + 12], smax at [ebp + 8], eax = @V1, edx = @Result.
// Uses SSE unconditionally; no SupportSSE check is made here.
function MinMaxSVecSSE(const smin, smax: Single; const V1: TSVec): TSVec;
asm
movss xmm1, [ebp + 12]
movss xmm2, [ebp + 8]
movups xmm0, [eax]
shufps xmm1, xmm1, 0 // broadcast smin
shufps xmm2, xmm2, 0 // broadcast smax
maxps xmm0, xmm1
minps xmm0, xmm2
movups [edx], xmm0
end;
// Result[i] := sqrt(V1[i]) for i = 0..2. eax = @V1, edx = @Result.
// NOTE(review): the two paths differ for the fourth component - the SSE
// path stores sqrt(V1[3]), the FPU path stores 0.
function mSqrtSVec(const V1: TSVec): TSVec;
asm
cmp SupportSSE, 0
jz @@1
movups xmm0, [eax];
sqrtps xmm0, xmm0;
movups [edx], xmm0;
ret
@@1:
fld dword [eax]
fld dword [eax + 4]
fld dword [eax + 8]
fsqrt
xor eax, eax
fstp dword [edx + 8]
fsqrt
fstp dword [edx + 4]
fsqrt
fstp dword [edx]
mov [edx + 12], eax
end;
// Returns the Euclidean length of the three-double vector V. eax = @V.
function LengthOfVec(const V: TVec3D): Double;
asm // Result := Sqrt(Sqr(V[0]) + Sqr(V[1]) + Sqr(V[2]));
fld qword [eax]
fmul st, st
fld qword [eax+8]
fmul st, st
faddp
fld qword [eax+16]
fmul st, st
faddp
fsqrt
end;
// Returns the squared length Sqr(V[0]) + Sqr(V[1]) + Sqr(V[2]) of the
// three-double vector V. eax = @V.
function SqrLengthOfVec(const V: TVec3D): Double;
asm
fld qword [eax]
fmul st, st
fld qword [eax+8]
fmul st, st
faddp
fld qword [eax+16]
fmul st, st
faddp
end;
// Returns the squared length of the first three single components of V.
// The fourth component is not read. eax = @V.
function SqrLengthOfSVec(const V: TSVec): Single;
asm //eax st Result := Sqr(V[0]) + Sqr(V[1]) + Sqr(V[2]);
fld dword [eax]
fmul st, st
fld dword [eax + 4]
fmul st, st
faddp
fld dword [eax + 8]
fmul st, st
faddp
end;
// Result := V^ / sqrt(v0² + v1² + v2² + d1em100). eax = V, edx = @Result.
// d1em100 is a constant defined outside this block (presumably 1e-100,
// keeping the division finite for a zero vector - verify).
// v2 is re-read from memory for the final multiply to keep the FPU stack
// depth at four.
function NormaliseVector(V: TPVec3D): TVec3D;
asm //max 4 st slots useable because of calling formula
fld qword [eax]
fld st //v0,v0
fmul st, st //v0²,v0
fld qword [eax + 8]
fld st //v1,v1,vo²,vo
fmul st, st //v1²,v1,v0²,v0
faddp st(2), st //v1,v0²+v1²,v0
fld qword [eax + 16]
fmul st, st //v2²,v1,v0²+v1²,v0
fadd d1em100
faddp st(2), st //v1,v0²+v1²+v2²,v0
fxch //v0²+v1²+v2²,v1,v0
fsqrt //r,v1,v0
fld1 //1,r,v1,v0
fdivrp //1/r,v1,v0
fmul st(2), st
fmul st(1), st
fmul qword [eax + 16] //v2',v1',v0'
fstp qword [edx + 16]
fstp qword [edx + 8]
fstp qword [edx] //
end;
// In place: V := V / sqrt(v0² + v1² + v2² + d1em100). eax = @V.
// d1em100 is a constant defined outside this block (presumably 1e-100 - verify).
procedure NormaliseVectorVar(var V: TVec3D);
asm
fld qword [eax]
fld st //v0,v0
fmul st, st //v0²,v0
fld qword [eax + 8]
fld st
fmul st, st //v1²,v1,v0²,v0
faddp st(2), st //v1,v0²+v1²,v0
fld qword [eax + 16]
fld st //v2,v2,v1,v0²+v1²,v0
fmul st, st //v2²,v2,v1,v0²+v1²,v0
fadd d1em100
faddp st(3), st //v2,v1,v0²+v1²+v2²,v0
fxch st(2) //v0²+v1²+v2²,v1,v2,v0
fsqrt
fld1
fdivrp //1/r,v1,v2,v0
fmul st(3), st
fmul st(2), st
fmulp //v1',v2',v0'
fstp qword [eax + 8]
fstp qword [eax + 16]
fstp qword [eax] //}
end;
// In place: V[i] := V[i] / sqrt(v0² + v1² + v2² + s1em30) for i = 0..2.
// The fourth component is not touched. eax = @V.
// s1em30 is a constant defined outside this block (presumably 1e-30 - verify).
procedure NormaliseSVectorVar(var V: TSVec);
asm
fld dword [eax]
fld st //v0,v0
fmul st, st //v0²,v0
fld dword [eax + 4]
fld st
fmul st, st //v1²,v1,v0²,v0
faddp st(2), st //v1,v0²+v1²,v0
fld dword [eax + 8]
fld st //v2,v2,v1,v0²+v1²,v0
fmul st, st //v2²,v2,v1,v0²+v1²,v0
fadd s1em30
faddp st(3), st //v2,v1,v0²+v1²+v2²,v0
fxch st(2) //v0²+v1²+v2²,v1,v2,v0
fsqrt
fld1
fdivrp //1/r,v1,v2,v0
fmul st(3), st
fmul st(2), st
fmulp //v1',v2',v0'
fstp dword [eax + 4]
fstp dword [eax + 8]
fstp dword [eax]
end;
// Result := V * n / sqrt(v0² + v1² + v2² + d1em100), i.e. V rescaled to
// length n. n is at [ebp + 8], eax = @V, edx = @Result.
// d1em100 is a constant defined outside this block (presumably 1e-100 - verify).
function NormaliseVectorTo(const n: Double; const V: TVec3D): TVec3D; overload;
asm
fld qword [eax]
fld st //v0,v0
fmul st, st //v0²,v0
fld qword [eax + 8]
fld st
fmul st, st //v1²,v1,v0²,v0
faddp st(2), st //v1,v0²+v1²,v0
fld qword [eax + 16]
fld st //v2,v2,v1,v0²+v1²,v0
fmul st, st //v2²,v2,v1,v0²+v1²,v0
fadd d1em100
faddp st(3), st //v2,v1,v0²+v1²+v2²,v0
fxch st(2) //v0²+v1²+v2²,v1,v2,v0
fsqrt
fld qword [ebp + 8]
fdivrp //n/r,v1,v2,v0
fmul st(3), st
fmul st(2), st
fmulp //v1',v2',v0'
fstp qword [edx + 8]
fstp qword [edx + 16]
fstp qword [edx]
end;
// In place: V^ := V^ * n / sqrt(v0² + v1² + v2² + d1em100), i.e. V^
// rescaled to length n. n is at [ebp + 8], eax = V.
// d1em100 is a constant defined outside this block (presumably 1e-100 - verify).
procedure NormaliseVectorTo(const n: Double; V: TPVec3D); overload;
asm
fld qword [eax]
fld st //v0,v0
fmul st, st //v0²,v0
fld qword [eax + 8]
fld st
fmul st, st //v1²,v1,v0²,v0
faddp st(2), st //v1,v0²+v1²,v0
fld qword [eax + 16]
fld st //v2,v2,v1,v0²+v1²,v0
fmul st, st //v2²,v2,v1,v0²+v1²,v0
fadd d1em100
faddp st(3), st //v2,v1,v0²+v1²+v2²,v0
fxch st(2) //v0²+v1²+v2²,v1,v2,v0
fsqrt
fld qword [ebp + 8]
fdivrp //n/r,v1,v2,v0
fmul st(3), st
fmul st(2), st
fmulp //v1',v2',v0'
fstp qword [eax + 8]
fstp qword [eax + 16]
fstp qword [eax]
end;
// Result[i] := V[i] / sqrt(v0² + v1² + v2² + d1em100) for i = 0..2.
// eax = @V, edx = @Result. Pure x87 code despite the trailing "in SSE" note.
// NOTE(review): Result[3] is never written by this routine.
// d1em100 is a constant defined outside this block (presumably 1e-100 - verify).
function NormaliseSVector(const V: TSVec): TSVec; //..in SSE
asm
fld dword [eax]
fld st //v0,v0
fmul st, st //v0²,v0
fld dword [eax + 4]
fld st
fmul st, st //v1²,v1,v0²,v0
faddp st(2), st //v1,v0²+v1²,v0
fld dword [eax + 8]
fld st //v2,v2,v1,v0²+v1²,v0
fmul st, st //v2²,v2,v1,v0²+v1²,v0
fadd d1em100
faddp st(3), st //v2,v1,v0²+v1²+v2²,v0
fxch st(2) //v0²+v1²+v2²,v1,v2,v0
fsqrt
fld1
fdivrp //1/r,v1,v2,v0
fmul st(3), st
fmul st(2), st
fmulp //v1',v2',v0'
fstp dword [edx + 4]
fstp dword [edx + 8]
fstp dword [edx]
end;
// Rescales the first three components of sv to length 32767 and stores
// them, rounded, as three 16-bit integers at pn (+0, +2, +4).
// eax = @sv, edx = pn. d1em100 is a constant defined outside this block
// (presumably 1e-100 - verify).
procedure SVecToNormals(const sv: TSVec; pn: Pointer);
const d32767: Double = 32767;
asm
fld dword [eax]
fld st //v0,v0
fmul st, st //v0²,v0
fld dword [eax + 4]
fld st
fmul st, st //v1²,v1,v0²,v0
faddp st(2), st //v1,v0²+v1²,v0
fld dword [eax + 8]
fld st //v2,v2,v1,v0²+v1²,v0
fmul st, st //v2²,v2,v1,v0²+v1²,v0
fadd d1em100
faddp st(3), st //v2,v1,v0²+v1²+v2²,v0
fxch st(2) //v0²+v1²+v2²,v1,v2,v0
fsqrt
fld d32767
fdivrp //32767/r,v1,v2,v0
fmul st(3), st
fmul st(2), st
fmulp //v1',v2',v0'
fistp word [edx + 2]
fistp word [edx + 4]
fistp word [edx + 0]
end;
// In place: V[i] := M[i,0]*V[0] + M[i,1]*V[1] + M[i,2]*V[2] for i = 0..2,
// reading M as nine consecutive doubles in row order (element [r,c] at
// byte offset 24*r + 8*c). eax = V, edx = M.
// All reads of V happen before the final three stores.
procedure RotateVector(V: TPVec3D; M: TPMatrix3); //is like reversed S version
asm
fld qword [edx]
fld qword [edx + 24]
fld qword [edx + 48]
fld qword [eax]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0) //m20*v0,m10*v0,m00*v0
fld qword [edx + 8]
fld qword [edx + 32]
fld qword [edx + 56]
fld qword [eax + 8]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0) // partial sums with the v1 terms added
fld qword [edx + 16]
fld qword [edx + 40]
fld qword [edx + 64]
fld qword [eax + 16]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
fstp qword [eax + 16]
fstp qword [eax + 8]
fstp qword [eax]
end;
// In place: V[i] := M[0,i]*V[0] + M[1,i]*V[1] + M[2,i]*V[2] for i = 0..2
// (multiplication by the transposed matrix), reading M as nine consecutive
// doubles in row order. eax = V, edx = M.
procedure RotateVectorReverse(V: TPVec3D; M: TPMatrix3);
asm
fld qword [edx]
fld qword [edx + 8]
fld qword [edx + 16]
fld qword [eax]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0) //m02*v0,m01*v0,m00*v0
fld qword [edx + 24]
fld qword [edx + 32]
fld qword [edx + 40]
fld qword [eax + 8]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
fld qword [edx + 48]
fld qword [edx + 56]
fld qword [edx + 64]
fld qword [eax + 16]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
fstp qword [eax + 16]
fstp qword [eax + 8]
fstp qword [eax]
end;
// In place, for a single-precision vector and a double-precision matrix:
// V[i] := M[0,i]*V[0] + M[1,i]*V[1] + M[2,i]*V[2] for i = 0..2, reading M
// as nine consecutive doubles in row order. The fourth component of V is
// not touched. eax = V, edx = M.
procedure RotateSVector(V: TPSVec; M: TPMatrix3);
asm
fld qword [edx]
fld qword [edx + 8]
fld qword [edx + 16]
fld dword [eax]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0) //m02*v0,m01*v0,m00*v0
fld qword [edx + 24]
fld qword [edx + 32]
fld qword [edx + 40]
fld dword [eax + 4]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
fld qword [edx + 48]
fld qword [edx + 56]
fld qword [edx + 64]
fld dword [eax + 8]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
fstp dword [eax + 8]
fstp dword [eax + 4]
fstp dword [eax]
end;
// In place, for a single-precision vector and a double-precision matrix:
// V[i] := M[i,0]*V[0] + M[i,1]*V[1] + M[i,2]*V[2] for i = 0..2, reading M
// as nine consecutive doubles in row order. The fourth component of V is
// not touched. eax = V, edx = M.
procedure RotateSVectorReverse(V: TPSVec; M: TPMatrix3);
asm
fld qword [edx]
fld qword [edx + 24]
fld qword [edx + 48]
fld dword [eax]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0) //m20*v0,m10*v0,m00*v0
fld qword [edx + 8]
fld qword [edx + 32]
fld qword [edx + 56]
fld dword [eax + 4]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
fld qword [edx + 16]
fld qword [edx + 40]
fld qword [edx + 64]
fld dword [eax + 8]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
fstp dword [eax + 8]
fstp dword [eax + 4]
fstp dword [eax]
end;
// In place: V[j] := M[0,j]*V[0] + M[1,j]*V[1] + M[2,j]*V[2], where M is
// read as three rows of singles spaced 16 bytes apart. eax = V, edx = M.
// The SSE path computes and stores all four lanes (so V[3] is overwritten
// from the fourth element of each row); the FPU path writes only V[0..2].
procedure RotateSVectorS(V: TPSVec; M: TPSMatrix3); //in calcpixelcol
asm // eax edx
cmp SupportSSE, 0
jz @@1
movss xmm0, [eax]
movss xmm1, [eax + 4]
movss xmm2, [eax + 8]
shufps xmm0, xmm0, 0 // broadcast v0
shufps xmm1, xmm1, 0 // broadcast v1
shufps xmm2, xmm2, 0 // broadcast v2
movups xmm4, [edx]
movups xmm5, [edx + 16]
movups xmm6, [edx + 32]
mulps xmm4, xmm0 //m0*v0
mulps xmm5, xmm1 //m1*v1
mulps xmm6, xmm2 //m2*v2
addps xmm4, xmm5
addps xmm4, xmm6
movups [eax], xmm4
ret
@@1:
fld dword [edx] //M[0,0]
fld dword [edx + 4]
fld dword [edx + 8] //M[0,2],M[0,1],M[0,0]
fld dword [eax] //V[0],M[0,2],M[0,1],M[0,0]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0) //M[0,2]*V[0],M[0,1]*V[0],M[0,0]*V[0]
fld dword [edx + 16]
fld dword [edx + 20]
fld dword [edx + 24]
fld dword [eax + 4] //v[1],M[1,2],M[1,1],M[1,0], M[0,2]*V[0],M[0,1]*V[0],M[0,0]*V[0]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0) //M[1,2]*V[1],M[1,1]*V[1],M[1,0]*V[1], M[0,2]*V[0],M[0,1]*V[0],M[0,0]*V[0]
faddp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0) //M[1,2]*V[1]+M[0,2]*V[0], M[1,1]*V[1]+M[0,1]*V[0], M[1,0]*V[1]+M[0,0]*V[0]
fld dword [edx + 32]
fld dword [edx + 36]
fld dword [edx + 40]
fld dword [eax + 8]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
fstp dword [eax + 8] //v2=m02*v0+m12*v1+m22*v2
fstp dword [eax + 4] //v1=m01*v0+m11*v1+m21*v2
fstp dword [eax] //v0=m00*v0+m10*v1+m20*v2
end;
// In place, for a double-precision vector and a single-precision matrix:
// V[j] := M[0,j]*V[0] + M[1,j]*V[1] + M[2,j]*V[2], where M is read as
// three rows of singles spaced 16 bytes apart. eax = V, edx = M.
procedure RotateVectorS(V: TPVec3D; M: TPSMatrix3);
asm
fld dword [edx] //M[0,0]
fld dword [edx + 4]
fld dword [edx + 8] //M[0,2],M[0,1],M[0,0]
fld qword [eax] //V[0],M[0,2],M[0,1],M[0,0]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0) //M[0,2]*V[0],M[0,1]*V[0],M[0,0]*V[0]
fld dword [edx + 16]
fld dword [edx + 20]
fld dword [edx + 24]
fld qword [eax + 8] //v[1],M[1,2],M[1,1],M[1,0], M[0,2]*V[0],M[0,1]*V[0],M[0,0]*V[0]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0) //M[1,2]*V[1],M[1,1]*V[1],M[1,0]*V[1], M[0,2]*V[0],M[0,1]*V[0],M[0,0]*V[0]
faddp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0) //M[1,2]*V[1]+M[0,2]*V[0], M[1,1]*V[1]+M[0,1]*V[0], M[1,0]*V[1]+M[0,0]*V[0]
fld dword [edx + 32]
fld dword [edx + 36]
fld dword [edx + 40]
fld qword [eax + 16]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
fstp qword [eax + 16] //v2=m02*v0+m12*v1+m22*v2
fstp qword [eax + 8] //v1=m01*v0+m11*v1+m21*v2
fstp qword [eax] //v0=m00*v0+m10*v1+m20*v2
end;
// In place, for a double-precision vector and a single-precision matrix:
// V[i] := M[i,0]*V[0] + M[i,1]*V[1] + M[i,2]*V[2], where M is read as
// three rows of singles spaced 16 bytes apart. eax = V, edx = M.
procedure RotateVectorReverseS(V: TPVec3D; M: TPSMatrix3);
asm
fld dword [edx]
fld dword [edx + 16]
fld dword [edx + 32] //M[2,0], M[1,0], M[0,0]
fld qword [eax]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0)
fld dword [edx + 4]
fld dword [edx + 20]
fld dword [edx + 36]
fld qword [eax + 8]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
fld dword [edx + 8]
fld dword [edx + 24]
fld dword [edx + 40]
fld qword [eax + 16]
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
fstp qword [eax + 16]
fstp qword [eax + 8]
fstp qword [eax]
end;
// In place, single precision for both vector and matrix:
// V[i] := M[i,0]*V[0] + M[i,1]*V[1] + M[i,2]*V[2], where M is read as
// three rows of singles spaced 16 bytes apart. The fourth component of V
// is not touched. eax = V, edx = M.
procedure RotateSVectorReverseS(V: TPSVec; M: TPSMatrix3);
asm
fld dword [edx]
fld dword [edx + 16]
fld dword [edx + 32] //M[2,0], M[1,0], M[0,0]
fld dword [eax] //V[0]
fmul st(1), st(0) //V[0], V[0]*M[2,0], M[1,0], M[0,0]
fmul st(2), st(0) //V[0], V[0]*M[2,0], V[0]*M[1,0], M[0,0]
fmulp st(3), st(0) //V[0]*M[2,0], V[0]*M[1,0], V[0]*M[0,0]
fld dword [edx + 4]
fld dword [edx + 20]
fld dword [edx + 36]
fld dword [eax + 4]
fmul st(1), st(0) //+v[1]*M[x,1]
fmul st(2), st(0)
fmulp st(3), st(0) //V[1]*M[2,1], V[1]*M[1,1], V[1]*M[0,1], V[0]*M[2,0], V[0]*M[1,0], V[0]*M[0,0]
faddp st(3), st(0) //v0*m20+v1*m21
faddp st(3), st(0)
faddp st(3), st(0)
fld dword [edx + 8]
fld dword [edx + 24]
fld dword [edx + 40]
fld dword [eax + 8]
fmul st(1), st(0) //+v[2]*M[x,2]
fmul st(2), st(0)
fmulp st(3), st(0) //v0*m20+v1*m21+v2*m22
faddp st(3), st(0)
faddp st(3), st(0)
faddp st(3), st(0)
fstp dword [eax + 8] //v0*m20+v1*m21+v2*m22
fstp dword [eax + 4] //v0*m10+v1*m11+v2*m12
fstp dword [eax] //v0*m00+v1*m01+v2*m02
end;
// Result[i] := V1[i] + V2[i] for i = 0..2; Result[3] := 0.
// eax = @V1, edx = @V2, ecx = @Result.
function AddSVectors(const V1, V2: TSVec): TSVec; overload;
asm
fld dword [eax]
fadd dword [edx]
fstp dword [ecx]
fld dword [eax + 4]
fadd dword [edx + 4]
fstp dword [ecx + 4]
fld dword [eax + 8]
fadd dword [edx + 8]
fstp dword [ecx + 8]
xor eax, eax
mov [ecx + 12], eax
end;
// In place: V1[i] := V1[i] + V2[i] for i = 0..2. The fourth component of
// V1^ is not touched. eax = V1, edx = @V2.
procedure AddSVectors(V1: TPSVec; const V2: TSVec); overload;
asm
fld dword [eax]
fadd dword [edx]
fstp dword [eax]
fld dword [eax + 4]
fadd dword [edx + 4]
fstp dword [eax + 4]
fld dword [eax + 8]
fadd dword [edx + 8]
fstp dword [eax + 8]
end;
// Result[i] := clamp(1 - sv[i] * s1d255, 0, 1) for i = 0..2; Result[3] := 0.
// eax = @sv, edx = @Result. s1d255 is a constant defined outside this
// block (presumably 1/255 - verify).
// NOTE(review): the SupportSSE2 test is a no-op - both outcomes continue
// at @@1 and only the x87 code exists.
function MakeSVecMultiplierFromDynFogCol(sv: TSVec): TSVec; //not used
asm
cmp SupportSSE2, 0
jz @@1
@@1:
fld dword [eax]
fld dword [eax + 4]
fld dword [eax + 8]
fld s1d255
fmul st(3), st(0)
fmul st(2), st(0)
fmulp //vs2,vs1,vs0 (each scaled by s1d255)
fld1
fsubr st(3), st(0)
fsubr st(2), st(0)
fsubr st(1), st(0) //1,1-vs2,1-vs1,1-vs0
fxch st(3) //vs0,vs2,vs1,1
ftst
fnstsw ax
shr ah, 1 // carry = C0, set when the value is below 0
jnc @up1
fstp st(0)
fldz
@up1:
fcom st(3) // compare with the 1 kept at the stack bottom
fnstsw ax
shr ah, 1 // carry set when the value is below 1
jc @skip1
fstp st(0)
fld1
@skip1:
fstp dword [edx]
ftst
fnstsw ax
shr ah, 1
jnc @up2
fstp st(0)
fldz
@up2:
fcom st(2)
fnstsw ax
shr ah, 1
jc @skip2
fstp st(0)
fld1
@skip2:
fstp dword [edx + 8]
ftst
fnstsw ax
shr ah, 1
jnc @up3
fstp st(0)
fldz
@up3:
fcom st(1)
fnstsw ax
shr ah, 1
jc @skip3
fstp st(0)
fld1
@skip3:
fstp dword [edx + 4]
fstp st(0) // drop the remaining 1
xor eax, eax
mov dword [edx + 12], eax
end;
// In place: V1[i] := V1[i] + V2[i] * W. eax = V1, edx = V2, W at [ebp + 8].
// The SSE path updates all four components and leaves through its own
// epilogue (pop ebp / ret 4); the FPU path updates components 0..2 only.
procedure AddSVecWeightS(V1, V2: TPSVec; const W: Single); overload;
asm
cmp SupportSSE, 0
jz @@1
movss xmm2, [ebp + 8]
movups xmm0, [edx]
shufps xmm2, xmm2, 0 // broadcast W
movups xmm1, [eax]
mulps xmm0, xmm2
addps xmm0, xmm1
movups [eax], xmm0
pop ebp
ret 4
@@1:
fld dword [edx]
fld dword [edx + 4]
fld dword [edx + 8]
fld dword [ebp + 8]
fmul st(3), st
fmul st(2), st
fmulp
fadd dword [eax + 8]
fstp dword [eax + 8]
fadd dword [eax + 4]
fstp dword [eax + 4]
fadd dword [eax]
fstp dword [eax]
end; //ret 4
// In place: V1[i] := V1[i] + V2[i] * W. eax = @V1, edx = @V2, W at [ebp + 8].
// The SSE path updates all four components and leaves through its own
// epilogue (pop ebp / ret 4); the FPU path updates components 0..2 only.
procedure AddSVecWeightS(var V1: TSVec; const V2: TSVec; const W: Single); overload;
asm
cmp SupportSSE, 0
jz @@1
movss xmm2, [ebp + 8]
movups xmm0, [edx]
shufps xmm2, xmm2, 0 // broadcast W
movups xmm1, [eax]
mulps xmm0, xmm2
addps xmm0, xmm1
movups [eax], xmm0
pop ebp
ret 4
@@1:
fld dword [edx]
fld dword [edx + 4]
fld dword [edx + 8]
fld dword [ebp + 8]
fmul st(3), st
fmul st(2), st
fmulp
fadd dword [eax + 8]
fstp dword [eax + 8]
fadd dword [eax + 4]
fstp dword [eax + 4]
fadd dword [eax]
fstp dword [eax]
end;
// Returns V1[0]*V2[0] + V1[1]*V2[1] + V1[2]*V2[2]. The fourth components
// are not read. eax = @V1, edx = @V2.
function DotOfSVectors(const V1, V2: TSVec): Single;
asm
fld dword [eax]
fmul dword [edx]
fld dword [eax + 4]
fmul dword [edx + 4]
faddp
fld dword [eax + 8]
fmul dword [edx + 8]
faddp
end;
// Result[i] := V1[i] - V2[i] for i = 0..2, computed from doubles and
// stored as singles; Result[3] := 0. eax = @V1, edx = @V2, ecx = @Result.
function SubtractVectors2s(const V1, V2: TVec3D): TSVec;
asm
fld qword [eax]
fsub qword [edx]
fstp dword [ecx]
fld qword [eax + 8]
fsub qword [edx + 8]
fstp dword [ecx + 4]
fld qword [eax + 16]
fsub qword [edx + 16]
fstp dword [ecx + 8]
xor eax, eax
mov [ecx + 12], eax
end;
// Result[i] := V1[i] - V2[i] for three doubles.
// eax = @V1, edx = @V2, ecx = @Result.
function SubtractVectors(const V1, V2: TVec3D): TVec3D; overload;
asm
fld qword [eax]
fsub qword [edx]
fstp qword [ecx]
fld qword [eax + 8]
fsub qword [edx + 8]
fstp qword [ecx + 8]
fld qword [eax + 16]
fsub qword [edx + 16]
fstp qword [ecx + 16]
end;
// Result[i] := V1^[i] - V2[i] for three doubles.
// Register use: eax = V1, edx = @V2, ecx = @Result.
function SubtractVectors(V1: TPVec3D; const V2: TVec3D): TVec3D; overload;
asm
fld qword [eax]
fsub qword [edx]
fstp qword [ecx]
fld qword [eax + 8]
fsub qword [edx + 8]
fstp qword [ecx + 8]
fld qword [eax + 16]
fsub qword [edx + 16]
fstp qword [ecx + 16]
end;
// Result[i] := V1[i] - V2^[i] for three doubles.
// eax = @V1, edx = V2, ecx = @Result.
function SubtractVectors(const V1: TVec3D; V2: TPVec3D): TVec3D; overload;
asm
fld qword [eax]
fsub qword [edx]
fstp qword [ecx]
fld qword [eax + 8]
fsub qword [edx + 8]
fstp qword [ecx + 8]
fld qword [eax + 16]
fsub qword [edx + 16]
fstp qword [ecx + 16]
end;
// Result[i] := V1^[i] - V2[i] for i = 0..2; Result[3] := 0.
// eax = V1, edx = @V2, ecx = @Result.
function SubtractSVectors(V1: TPSVec; const V2: TSVec): TSVec;
asm
fld dword [eax]
fsub dword [edx]
fstp dword [ecx]
fld dword [eax + 4]
fsub dword [edx + 4]
fstp dword [ecx + 4]
fld dword [eax + 8]
fsub dword [edx + 8]
fstp dword [ecx + 8]
xor eax, eax
mov [ecx + 12], eax
end;
// Result[i] := V1[i] + s for i = 0..2; Result[3] := 0.
// eax = @V1, edx = @Result; s is read from the stack at [esp + 8]
// (nothing is pushed inside this routine before the access).
function AddSVecS(const V1: TSVec; const s: Single): TSVec;
asm
fld dword [eax]
fld dword [eax + 4]
fld dword [eax + 8]
fld dword [esp + 8] //s,v2,v1,v0
fadd st(3), st
fadd st(2), st
faddp
fstp dword [edx + 8]
fstp dword [edx + 4]
fstp dword [edx]
xor eax, eax
mov [edx + 12], eax
end;
// In place: V1[i] := V1[i] * s for i = 0..2. The fourth component is not
// touched. eax = V1; s is read from the stack at [esp + 8].
procedure ScaleSVectorV(V1: TPSVec; const s: Single);
asm
fld dword [eax]
fld dword [eax + 4]
fld dword [eax + 8]
fld dword [esp + 8] //s,v2,v1,v0
fmul st(3), st
fmul st(2), st
fmulp
fstp dword [eax + 8]
fstp dword [eax + 4]
fstp dword [eax]
end;
// Component-wise product: Result[i] := V1[i] * V2[i] for i = 0..2;
// Result[3] := 0. eax = @V1, edx = @V2, ecx = @Result.
// All of V1 is loaded before the first store.
function MultiplySVectors(const V1, V2: TSVec): TSVec;
asm
fld dword [eax + 8]
fld dword [eax + 4]
fld dword [eax] //v0,v1,v2
fmul dword [edx]
fstp dword [ecx]
fmul dword [edx + 4]
fstp dword [ecx + 4]
fmul dword [edx + 8]
fstp dword [ecx + 8]
xor eax, eax
mov [ecx + 12], eax
end;
// In place, component-wise: V1[i] := V1[i] * V2[i] for i = 0..2. The
// fourth component is not touched. eax = V1, edx = V2.
procedure MultiplySVectorsV(V1, V2: TPSVec); overload;
asm
fld dword [eax + 8]
fld dword [eax + 4]
fld dword [eax] //v0,v1,v2
fmul dword [edx]
fstp dword [eax]
fmul dword [edx + 4]
fstp dword [eax + 4]
fmul dword [edx + 8]
fstp dword [eax + 8]
end;
// In place, component-wise: V1[i] := V1[i] * V2[i] for i = 0..2. The
// fourth component is not touched. eax = V1, edx = @V2.
procedure MultiplySVectorsV(V1: TPSVec; const V2: TSVec); overload;
asm
fld dword [eax + 8]
fld dword [eax + 4]
fld dword [eax] //v0,v1,v2
fmul dword [edx]
fstp dword [eax]
fmul dword [edx + 4]
fstp dword [eax + 4]
fmul dword [edx + 8]
fstp dword [eax + 8]
end;
// Result[i] := V1[i] * s for i = 0..2. eax = @V1, edx = @Result; s is
// read from the stack at [esp + 8].
// NOTE(review): Result[3] is never written by this routine.
function ScaleSVector(const V1: TSVec; const s: Single): TSVec;
asm
fld dword [eax]
fld dword [eax + 4]
fld dword [eax + 8]
fld dword [esp + 8] //s,v2,v1,v0
fmul st(3), st
fmul st(2), st
fmulp
fstp dword [edx + 8]
fstp dword [edx + 4]
fstp dword [edx]
end;
// Result[i] := V1[i] * s for all four components.
// eax = @V1, edx = @Result; s is read from the stack at [esp + 8]
// (nothing is pushed inside this routine before the access).
// Both code paths write to Result and leave through the common exit at
// @done, so the routine's normal epilogue cleans up the stack.
function ScaleSVector4(const V1: TSVec; const s: Single): TSVec;
asm
cmp SupportSSE, 0
jz @1
movss xmm1, [esp + 8]
movups xmm0, [eax]
shufps xmm1, xmm1, 0 // broadcast s to all four lanes
mulps xmm0, xmm1
movups [edx], xmm0 // store into Result, never into the const input V1
jmp @done // no hand-written ret: a bare "ret 4" would skip restoring ebp
@1: fld dword [eax]
fld dword [eax + 4]
fld dword [eax + 8]
fld dword [eax + 12]
fld dword [esp + 8] //s,v3,v2,v1,v0
fmul st(4), st
fmul st(3), st
fmul st(2), st
fmulp
fstp dword [edx + 12]
fstp dword [edx + 8]
fstp dword [edx + 4]
fstp dword [edx]
@done:
end;
// Result[i] := V1^[i] * d for i = 0..2 (stored as singles); Result[3] := 0.
// eax = V1, edx = @Result; the double d is read from the stack at [esp + 8].
function ScaleSVectorD(V1: TPSVec; const d: Double): TSVec;
asm
fld dword [eax]
fld dword [eax + 4]
fld dword [eax + 8]
fld qword [esp + 8] //d,v2,v1,v0
fmul st(3), st
fmul st(2), st
fmulp
fstp dword [edx + 8]
fstp dword [edx + 4]
fstp dword [edx]
xor eax, eax
mov [edx + 12], eax
end;
// Builds the vector (-sin(ya), sin(xa), cos(xa)*cos(ya)), normalises it to
// unit length and stores it as three doubles at v^.
// eax = @xa, edx = @ya, ecx = v. xa and ya are only read.
procedure BuildViewVectorDFOV(var xa, ya: Double; v: TPVec3D);
asm // -sinY, sinX, cosX*cosY ...pano: sinX*cosY, sinY, -cosX*cosY
fld qword [eax]
fsincos //cosX,sinX
fld qword [edx]
fsincos //cosY,sinY,cosX,sinX
fmulp st(2), st(0) //sinY,cosX*cosY,sinX
fchs //-sinY,cosX*cosY,sinX
fld st(0) //normalize
fmul st(0), st(1)
fld st(2)
fmul st(0), st(3)
faddp
fld st(3)
fmul st(0), st(4)
faddp //sum of squares,-sinY,cosX*cosY,sinX
fsqrt
fld1
fdivrp //1/length,-sinY,cosX*cosY,sinX
fmul st(3), st(0)
fmul st(2), st(0)
fmulp
fstp qword [ecx] //cosX*cosY,sinX
fstp qword [ecx + 16] //sinX
fstp qword [ecx + 8]
end;
// Stores (-sin(ya)*cos(xa), sin(xa), cos(xa)*cos(ya)) as three doubles at
// v^. No normalisation step is performed.
// eax = @xa, edx = @ya, ecx = v. xa and ya are only read.
procedure BuildViewVectorDSphereFOV(var xa, ya: Double; v: TPVec3D);
asm //x<->y
fld qword [edx]
fsincos //cosY,sinY
fld qword [eax]
fsincos //cosX,sinX,cosY,sinY
fmul st(2), st(0) //cosX,sinX,cosX*cosY,sinY // pano: sinX*cosY, sinY, cosX*cosY
fmulp st(3), st(0) //sinX,cosX*cosY,sinY*cosX
fstp qword [ecx + 8] //cosX*cosY,sinY*cosX
fstp qword [ecx + 16]
fchs
fstp qword [ecx]
end;
// Stores (-sin(ya)*cos(xa), sin(xa), cos(xa)*cos(ya), 0) as four singles
// at v^. No normalisation step is performed.
// eax = @xa, edx = @ya, ecx = v. xa and ya are only read.
procedure BuildViewVectorSphereFOV(var xa, ya: Double; v: TPSVec);
asm
fld qword [edx]
fsincos //cos(ya),sin(ya)
fld qword [eax]
fsincos //cos(xa),sin(xa),cos(ya),sin(ya)
fmul st(2), st(0) //cos(xa),sin(xa),cos(xa)*cos(ya),sin(ya) // pano: sinX*cosY, sinY, cosX*cosY
fmulp st(3), st(0) //sin(xa),cos(xa)*cos(ya),sin(ya)*cos(xa)
fstp dword [ecx + 4] //cos(xa)*cos(ya),sin(ya)*cos(xa)
fstp dword [ecx + 8]
fchs
fstp dword [ecx]
fldz
fstp dword [ecx + 12]
end;
// Builds the vector (-sin(ya), sin(xa), cos(xa)*cos(ya)), normalises it to
// unit length and stores it as singles at v^; v[3] := 0.
// eax = @xa, edx = @ya, ecx = v. xa and ya are only read.
procedure BuildViewVectorFOV(var xa, ya: Double; v: TPSVec);
asm // -sinY, sinX, cosX*cosY
fld qword [eax]
fsincos //cosX,sinX
fld qword [edx]
fsincos //cosY,sinY,cosX,sinX
fmulp st(2), st(0) //sinY,cosX*cosY,sinX
fchs //x,z,y
fld st(0) //normalize
fmul st(0), st(1)
fld st(2)
fmul st(0), st(3)
faddp
fld st(3)
fmul st(0), st(4)
faddp //sum of squares,x,z,y
fsqrt
fld1
fdivrp //1/length,x,z,y
fmul st(1), st(0)
fmul st(2), st(0)
fmulp st(3), st(0)
fstp dword [ecx] //cosX*cosY,sinX
fstp dword [ecx + 8] //sinX
fstp dword [ecx + 4]
fldz
fstp dword [ecx + 12]
end;
// Negates the first three single components of V1^ in place by toggling
// bit 31 (the IEEE sign bit) of each. The fourth component is not touched.
procedure SVectorChangeSign(V1: TPSVec);
asm
xor dword ptr [eax], $80000000
xor dword ptr [eax + 4], $80000000
xor dword ptr [eax + 8], $80000000
end;
// In place: V1[i] := V1[i] + V2[i] * W for three doubles.
// eax = V1, edx = V2, W at [ebp + 8].
// The SSE2 path handles components 0..1 packed and component 2 scalar, then
// leaves through its own epilogue (pop ebp / ret 8).
procedure mAddVecWeight(V1, V2: TPVec3D; const W: Double);
asm
cmp SupportSSE2, 0
jz @@1
movlpd xmm1, [ebp + 8]
movupd xmm2, [edx]
unpcklpd xmm1, xmm1 // xmm1 = (W, W)
movupd xmm0, [eax]
mulpd xmm2, xmm1
mulsd xmm1, [edx + 16]
addpd xmm0, xmm2
addsd xmm1, [eax + 16]
movupd [eax], xmm0
movsd [eax + 16], xmm1
pop ebp
ret 8
@@1:
fld qword [edx]
fld qword [edx + 8]
fld qword [edx + 16]
fld qword [ebp + 8]
fmul st(3), st
fmul st(2), st
fmulp
fadd qword [eax + 16]
fstp qword [eax + 16]
fadd qword [eax + 8]
fstp qword [eax + 8]
fadd qword [eax]
fstp qword [eax]
end;
// V1[i] := V2[i] + V3[i] * W for three doubles.
// eax = V1 (destination), edx = V2, ecx = V3, W at [ebp + 8].
// The SSE2 path leaves through its own epilogue (pop ebp / ret 8).
procedure mCopyAddVecWeight(V1, V2, V3: TPVec3D; const W: Double);
asm //dest,src,add weight
cmp SupportSSE2, 0
jz @@1
movlpd xmm1, [ebp + 8]
movupd xmm2, [ecx]
unpcklpd xmm1, xmm1 // xmm1 = (W, W)
movupd xmm0, [edx]
mulpd xmm2, xmm1
mulsd xmm1, [ecx + 16]
addpd xmm0, xmm2
addsd xmm1, [edx + 16]
movupd [eax], xmm0
movsd [eax + 16], xmm1
pop ebp
ret 8
@@1:
fld qword [ecx]
fld qword [ecx + 8]
fld qword [ecx + 16]
fld qword [ebp + 8]
fmul st(3), st(0)
fmul st(2), st(0)
fmulp
fadd qword [edx + 16]
fstp qword [eax + 16]
fadd qword [edx + 8]
fstp qword [eax + 8]
fadd qword [edx]
fstp qword [eax]
end;
// Copies three doubles from Vs^ to Vd^ through the FPU stack.
// eax = Vd, edx = Vs. All three values are loaded before the first store.
procedure mCopyVec(Vd, Vs: TPVec3D);
asm
fld qword [edx + 16]
fld qword [edx + 8]
fld qword [edx]
fstp qword [eax]
fstp qword [eax + 8]
fstp qword [eax + 16]
end;
// Copies three doubles (24 bytes) from V2^ to V1^ using SSE2 moves.
// eax = V1 (destination), edx = V2 (source). No SupportSSE2 check is made here.
procedure CopyVecSSE2(V1, V2: TPVec3D); //not used
asm
movupd xmm0, [edx]
movlpd xmm1, [edx + 16]
movupd [eax], xmm0
movlpd [eax + 16], xmm1
end;
// Copies four doubles (32 bytes) from V2^ to V1^ using SSE2 moves.
// eax = V1 (destination), edx = V2 (source). No SupportSSE2 check is made here.
procedure CopyVec4SSE2(V1, V2: TPVec4D);
asm
movupd xmm0, [edx]
movupd xmm1, [edx + 16]
movupd [eax], xmm0
movupd [eax + 16], xmm1
end;
// In place: V1[i] := V1[i] + (V2[i] - V3[i]) * W for three doubles.
// eax = V1, edx = V2, ecx = V3, W at [ebp + 8].
// Uses SSE2 unconditionally (no SupportSSE2 check here) and returns
// through the routine's normal end.
procedure AddSubVecWeightSSE2(V1, V2, V3: TPVec3D; const W: Double);
asm
movlpd xmm7, [ebp + 8]
movhpd xmm7, [ebp + 8] // xmm7 = (W, W)
movupd xmm2, [ecx]
movupd xmm0, [edx]
movlpd xmm1, [edx + 16]
subpd xmm0, xmm2
subsd xmm1, [ecx + 16]
movupd xmm4, [eax]
mulpd xmm0, xmm7
mulsd xmm1, xmm7
addpd xmm0, xmm4
addsd xmm1, [eax + 16]
movupd [eax], xmm0
movlpd [eax + 16], xmm1
end;
// Returns the larger of the two singles at [ebp + 8] and [ebp + 12].
// When the first compares below the second, the second is returned;
// otherwise the first.
function MaxCS(s1, s2: Single): Single;
asm
fld dword [ebp + 8]
fcomp dword [ebp + 12]
fnstsw ax
shr ah, 1 // carry = C0, set when [ebp + 8] < [ebp + 12]
jc @S2isSmallerThanS1
fld dword [ebp + 8]
jmp @end
@S2isSmallerThanS1:
fld dword [ebp + 12]
@end:
end;
function Max0S(s: Single): Single;
// Returns s when it is not below zero, otherwise 0.0 (result in st(0)).
asm
  fld dword [ebp + 8]           // st0 = s
  ftst                          // compare st0 with 0.0
  fnstsw ax
  sahf                          // CF := C0 (s < 0 or unordered)
  jae @@keep
  fstp st                       // discard s
  fldz
@@keep:
end;
function MinCS(const s1, s2: Single): Single;
// Returns the smaller of s1 and s2 in st(0).
// Stack layout: s1 at [ebp + 12], s2 at [ebp + 8].
asm
  fld dword [ebp + 8]           // st0 = s2
  fcomp dword [ebp + 12]        // compare s2 with s1, pop
  fnstsw ax
  shr ah, 1                     // CF := C0 (s2 < s1 or unordered)
  jnc @takeS1
  fld dword [ebp + 8]           // s2 < s1 -> result is s2
  jmp @done
@takeS1:
  fld dword [ebp + 12]
@done:
end;
procedure MinMaxSvar(const smin, smax: Single; var s: Single);
// Clamps s in place to the range [smin, smax].
// eax = @s, smin at [ebp + 12], smax at [ebp + 8].
asm
  cmp SupportSSE, 0
  je @@x87Path
  // ---- SSE path ----
  movss xmm0, [eax]
  maxss xmm0, [ebp + 12]        // raise to smin
  minss xmm0, [ebp + 8]         // lower to smax
  movss [eax], xmm0
  pop ebp                       // hand-written epilogue, drops both Single arguments
  ret 8
@@x87Path:
  fld dword [eax]               // st0 = s
  mov edx, eax                  // keep @s; ax is overwritten by fnstsw
  fcom dword [ebp + 12]
  fnstsw ax
  shr ah, 1                     // CF := C0 (s < smin)
  jc @belowMin
  fcom dword [ebp + 8]
  fnstsw ax
  shr ah, 1                     // CF := C0 (s < smax)
  jc @store
  fstp st(0)                    // s >= smax: replace with smax
  fld dword [ebp + 8]
  jmp @store
@belowMin:
  fstp st(0)                    // replace with smin
  fld dword [ebp + 12]
@store:
  fstp dword [edx]
end;
function MinMaxCS(const smin, s, smax: Single): Single;
// Returns s clamped to [smin, smax] in st(0).
// Stack layout: smin at [ebp + 16], s at [ebp + 12], smax at [ebp + 8].
asm
  fld dword [ebp + 12]          // st0 = s
  fcom dword [ebp + 16]
  fnstsw ax
  shr ah, 1                     // CF := C0 (s < smin)
  jc @useMin
  fcom dword [ebp + 8]
  fnstsw ax
  shr ah, 1                     // CF := C0 (s < smax)
  jc @done
  fstp st(0)                    // s >= smax
  fld dword [ebp + 8]
  jmp @done
@useMin:
  fstp st(0)
  fld dword [ebp + 16]
@done:
end;
function Min0MaxCS(const s, smax: Single): Single;
// Returns s clamped to [0, smax] in st(0).
// Stack layout: s at [ebp + 12], smax at [ebp + 8].
asm
  fld dword [ebp + 12]          // st0 = s
  ftst
  fnstsw ax
  shr ah, 1                     // CF := C0 (s < 0)
  jc @useZero
  fcom dword [ebp + 8]
  fnstsw ax
  shr ah, 1                     // CF := C0 (s < smax)
  jc @done
  fstp st(0)                    // s >= smax
  fld dword [ebp + 8]
  jmp @done
@useZero:
  fstp st(0)
  fldz
@done:
end;
procedure MaxCDvar(var ds, ddest: Double);
// ddest := ds unless ds compares below ddest (or the compare is unordered).
// eax = @ds, edx = @ddest; no stack frame is involved.
asm
  fld qword [eax]               // st0 = ds
  fcom qword [edx]
  fnstsw ax
  shr ah, 1                     // CF := C0 (ds < ddest or unordered)
  jnc @@store
  fstp st                       // keep ddest, drop ds
  ret
@@store:
  fstp qword [edx]
end;
procedure Clamp1Svar(var s: Single);
// Sets s to 1.0 when 1.0 compares below s (or unordered); otherwise s is untouched.
// eax = @s.
asm
  mov edx, eax                  // keep @s; ax is overwritten by fnstsw
  fld1
  fcom dword [edx]              // compare 1.0 with s
  fnstsw ax
  shr ah, 1                     // CF := C0 (1.0 < s or unordered)
  jc @@clamp
  fstp st                       // s <= 1.0: drop the constant
  ret
@@clamp:
  fstp dword [edx]
end;
function Min0MaxCD(const d, dmax: Double): Double;
// Returns d clamped to [0, dmax] in st(0).
// Stack layout: d at [ebp + 16], dmax at [ebp + 8].
asm
  fld qword [ebp + 16]          // st0 = d
  ftst
  fnstsw ax
  shr ah, 1                     // CF := C0 (d < 0)
  jc @useZero
  fcom qword [ebp + 8]
  fnstsw ax
  shr ah, 1                     // CF := C0 (d < dmax)
  jc @done
  fstp st(0)                    // d >= dmax
  fld qword [ebp + 8]
  jmp @done
@useZero:
  fstp st(0)
  fldz
@done:
end;
function MinCD(const s1, s2: Double): Double;
// Returns the smaller of s1 and s2 in st(0).
// Stack layout: s1 at [ebp + 16], s2 at [ebp + 8].
asm
  fld qword [ebp + 8]           // st0 = s2
  fcomp qword [ebp + 16]        // compare s2 with s1, pop
  fnstsw ax
  shr ah, 1                     // CF := C0 (s2 < s1 or unordered)
  jnc @takeS1
  fld qword [ebp + 8]
  jmp @done
@takeS1:
  fld qword [ebp + 16]
@done:
end;
function MaxCD(const s1, s2: Double): Double;
// Returns the larger of s1 and s2 in st(0).
// Stack layout: s1 at [ebp + 16], s2 at [ebp + 8].
asm
  fld qword [ebp + 8]           // st0 = s2
  fcomp qword [ebp + 16]        // compare s2 with s1, pop
  fnstsw ax
  shr ah, 1                     // CF := C0 (s2 < s1 or unordered)
  jnc @takeS2
  fld qword [ebp + 16]
  jmp @done
@takeS2:
  fld qword [ebp + 8]
@done:
end;
function MaxAbsCD(const s1, s2: Double): Double;
// Returns whichever argument has the larger magnitude; the returned value
// keeps its original sign. s1 at [ebp + 16], s2 at [ebp + 8].
asm
  fld qword [ebp + 16]
  fabs                          // |s1|
  fld qword [ebp + 8]
  fabs                          // st0 = |s2|, st1 = |s1|
  fcompp                        // compare |s2| with |s1|, pop both
  fnstsw ax
  shr ah, 1                     // CF := C0 (|s2| < |s1| or unordered)
  jnc @takeS2
  fld qword [ebp + 16]
  jmp @done
@takeS2:
  fld qword [ebp + 8]
@done:
end;
function MinAbsCD(const s1, s2: Double): Double;
// Returns whichever argument has the smaller magnitude; the returned value
// keeps its original sign. s1 at [ebp + 16], s2 at [ebp + 8].
asm
  fld qword [ebp + 16]
  fabs                          // |s1|
  fld qword [ebp + 8]
  fabs                          // st0 = |s2|, st1 = |s1|
  fcompp                        // compare |s2| with |s1|, pop both
  fnstsw ax
  shr ah, 1                     // CF := C0 (|s2| < |s1| or unordered)
  jnc @takeS1
  fld qword [ebp + 8]
  jmp @done
@takeS1:
  fld qword [ebp + 16]
@done:
end;
procedure SinCosD(const a: Double; var Sin, Cos: Double);
// Computes sine and cosine of a with a single FSINCOS.
// eax = @Sin, edx = @Cos, a on the stack.
asm
  fld qword ptr [ebp + 8]       // st0 = a
  fsincos                       // st0 = cos(a), st1 = sin(a)
  fstp qword ptr [edx]          // Cos
  fstp qword ptr [eax]          // Sin
end;
procedure SinCosS(const a: Double; var Sin, Cos: Single);
// Computes sine and cosine of a with a single FSINCOS and stores both as Single.
// eax = @Sin, edx = @Cos, a on the stack.
asm
  fld qword ptr [ebp + 8]       // st0 = a
  fsincos                       // st0 = cos(a), st1 = sin(a)
  fstp dword ptr [edx]          // Cos
  fstp dword ptr [eax]          // Sin
end;
function FracSingle(const s: Single): Single;
// Returns s - trunc(s): the fractional part, carrying the sign of s.
// The FPU control word is switched to round-toward-zero for FRNDINT and then
// restored.
asm
  sub esp, 4                            // [esp] = saved CW, [esp + 2] = modified CW
  fnstcw word ptr [esp]
  fnstcw word ptr [esp + 2]
  or word ptr [esp + 2], $0F00          // rounding = truncate, precision = extended
  fld dword ptr [ebp + 8]               // st0 = s
  fld st(0)                             // st0 = s, st1 = s
  fldcw word ptr [esp + 2]
  frndint                               // st0 = trunc(s)
  fldcw word ptr [esp]                  // restore caller's control word
  add esp, 4
  fsubp                                 // st0 = s - trunc(s)
end;
function MonitorComponent(Component: TComponent): Boolean;
// ...
// Fragment: reads the dword at [ebp+4] and stores it in Addr.
// NOTE(review): with a standard ebp stack frame [ebp+4] is this routine's
// return address (i.e. the call site) - confirm a frame is generated here.
asm
mov eax,[ebp+4]
mov Addr,eax
end;
// ...
constructor TMonitorObject.Create;
// ...
// Fragment: reads the dword at [ebp+4] and stores it in Addr.
// NOTE(review): with a standard ebp stack frame [ebp+4] is this routine's
// return address (i.e. the call site) - confirm a frame is generated here.
asm
mov eax,[ebp+4]
mov Addr,eax
end;
// ...
procedure GetMem(var P; Size: Integer);
// ...
// Fragment: reads the dword at [ebp+4] and stores it in Addr.
// NOTE(review): with a standard ebp stack frame [ebp+4] is this routine's
// return address (i.e. the call site) - confirm a frame is generated here.
asm
mov eax,[ebp+4]
mov Addr,eax
end;
// ...
procedure BuildATlevels(MWidth, MHeight: Integer);
// ...
// Fragment: horizontal pass. For each group of four 16-bit words it stores
//   avg(avg(src[-iStep2], src[+iStep2]), src[0])
// where offsets are in bytes relative to PATL2 and avg is PAVGW (rounded
// unsigned average). Reads iStep2, MWidth, PATL2, PATL; updates x2, PATL, PATL2.
// All general registers used are saved and restored; mm0 is clobbered
// (the MMX state is cleared by a separate EMMS fragment).
// NOTE(review): the loop is count-down with dec/jnz, so a group count of 0
// ((MWidth - iStep2) shr 2 = 0) would wrap - presumably excluded by the caller.
asm
push eax
push ebx
push ecx
push esi
push edi
mov ebx, iStep2
mov ecx, MWidth
mov esi, PATL2
sub ecx, ebx
mov edi, PATL
shr ecx, 2 // ecx = (MWidth - iStep2) div 4 = number of 4-word groups
sub esi, ebx // esi = PATL2 - iStep2 (left neighbour)
mov eax, ecx
sub edi, esi // edi = PATL - esi, so [edi + esi] addresses the destination
shl eax, 2
add x2, eax // x2 advances by 4 * group count
@ll: movq mm0, [esi] // calculate 4 words at once
pavgw mm0, [esi + ebx * 2] // average left and right neighbours
pavgw mm0, [esi + ebx] // then average with the centre words
movq [edi + esi], mm0
add esi, 8
dec ecx
jnz @ll
add edi, esi // edi = destination pointer after the last store
mov PATL, edi
add esi, ebx // back from "left neighbour" to centre position
mov PATL2, esi
pop edi
pop esi
pop ecx
pop ebx
pop eax
end;
// ...
// Fragment: vertical pass over one column group of four 16-bit words.
// ebx = MWidth2step (byte distance to the neighbouring rows used), edx = 2 * MWidth
// (byte step between consecutive rows), [edi + esi] addresses the destination.
// Three phases: the first iStep rows use W4tmp in place of the upper neighbour,
// the middle MHeight - iStep2 rows use both neighbours, the last iStep rows use
// W4tmp2 in place of the lower neighbour.
// Each dec/jns loop executes its body at least once, even for a count of 0.
asm
push eax
push ebx
push ecx
push edx
push esi
push edi
movq mm1, W4tmp
mov ebx, MWidth2step
mov esi, PATL2
mov edi, PATL
mov ecx, iStep
mov edx, MWidth
sub esi, ebx // esi = upper neighbour position
add edx, edx // row stride in bytes (2 bytes per word)
sub edi, esi // [edi + esi] = destination
dec ecx
@l1: movq mm0, [esi + ebx * 2] // lower neighbour
pavgw mm0, mm1 // upper neighbour replaced by W4tmp
pavgw mm0, [esi + ebx] // average with centre
movq [edi + esi], mm0
add esi, edx
dec ecx
jns @l1
mov ecx, MHeight
sub ecx, iStep2
dec ecx
js @u2 // no middle rows
@l2: movq mm0, [esi]
pavgw mm0, [esi + ebx * 2]
pavgw mm0, [esi + ebx]
movq [edi + esi], mm0
add esi, edx
dec ecx
jns @l2
@u2:
movq mm1, W4tmp2
mov ecx, iStep
dec ecx
@l3: movq mm0, [esi] // upper neighbour
pavgw mm0, mm1 // lower neighbour replaced by W4tmp2
pavgw mm0, [esi + ebx]
movq [edi + esi], mm0
add esi, edx
dec ecx
jns @l3
pop edi
pop esi
pop edx
pop ecx
pop ebx
pop eax
end;
// ...
// Fragment: leave MMX state so that x87 code can run again.
asm
emms
end;
// ...
procedure TAmbShadowCalc.Execute;
// ...
// Fragment: STMXCSR stores the current SSE control/status register (MXCSR) into x.
asm // stmxcsr i
stmxcsr x
end; // if i<>$1f80 then i:=0; //=8064
// ...
function BuildATlevels(PsiLight, MWidth, MHeight: Integer; PATlevel: TPATlevel; var CorrMul: Single; var Zsub: Integer): Integer;
// ...
// Fragment: leave MMX state so that x87 code can run again.
asm
emms
end;
// ...
// Fragment: horizontal pass. For each group of four 16-bit words it stores
//   avg(avg(src[-iStep2], src[+iStep2]), src[0])
// where offsets are in bytes relative to PATL2 and avg is PAVGW (rounded
// unsigned average). Reads iStep2, MWidth, PATL2, PATL; updates x2, PATL, PATL2.
// NOTE(review): dec/jnz loop - a group count of 0 would wrap; presumably
// excluded by the caller.
asm
push eax
push ebx
push ecx
push esi
push edi
mov ebx, iStep2
mov ecx, MWidth
mov esi, PATL2
sub ecx, ebx
mov edi, PATL
shr ecx, 2 // ecx = (MWidth - iStep2) div 4 = number of 4-word groups
sub esi, ebx // esi = PATL2 - iStep2 (left neighbour)
mov eax, ecx
sub edi, esi // [edi + esi] addresses the destination
shl eax, 2
add x2, eax // x2 advances by 4 * group count
@ll: movq mm0, [esi] // calculate 4 words at once
pavgw mm0, [esi + ebx * 2] // average left and right neighbours
pavgw mm0, [esi + ebx] // then average with the centre words
movq [edi + esi], mm0
add esi, 8
dec ecx
jnz @ll
add edi, esi // destination pointer after the last store
mov PATL, edi
add esi, ebx // back from "left neighbour" to centre position
mov PATL2, esi
pop edi
pop esi
pop ecx
pop ebx
pop eax
end;
// ...
// Fragment: vertical pass over one column group of four 16-bit words.
// ebx = MWidth2step (byte distance to the neighbouring rows used), edx = 2 * MWidth
// (byte step between consecutive rows), [edi + esi] addresses the destination.
// First iStep rows: upper neighbour replaced by W4tmp; middle rows: both
// neighbours; last iStep rows: lower neighbour replaced by W4tmp2.
// Each dec/jns loop executes its body at least once, even for a count of 0.
asm
push eax
push ebx
push ecx
push edx
push esi
push edi
movq mm1, W4tmp
mov ebx, MWidth2step
mov esi, PATL2
mov edi, PATL
mov ecx, iStep
mov edx, MWidth
sub esi, ebx // esi = upper neighbour position
add edx, edx // row stride in bytes (2 bytes per word)
sub edi, esi // [edi + esi] = destination
dec ecx
@l1: movq mm0, [esi + ebx * 2] // lower neighbour
pavgw mm0, mm1 // upper neighbour replaced by W4tmp
pavgw mm0, [esi + ebx] // average with centre
movq [edi + esi], mm0
add esi, edx
dec ecx
jns @l1
mov ecx, MHeight
sub ecx, iStep2
dec ecx
js @u2 // no middle rows
@l2: movq mm0, [esi]
pavgw mm0, [esi + ebx * 2]
pavgw mm0, [esi + ebx]
movq [edi + esi], mm0
add esi, edx
dec ecx
jns @l2
@u2:
movq mm1, W4tmp2
mov ecx, iStep
dec ecx
@l3: movq mm0, [esi] // upper neighbour
pavgw mm0, mm1 // lower neighbour replaced by W4tmp2
pavgw mm0, [esi + ebx]
movq [edi + esi], mm0
add esi, edx
dec ecx
jns @l3
pop edi
pop esi
pop edx
pop ecx
pop ebx
pop eax
end;
// ...
// Fragment: leave MMX state so that x87 code can run again.
asm
emms
end;
// ...
procedure TAmbShadowCalc.Execute;
// ...
asm
stmxcsr x // stores (reads) MXCSR into x; it does not change the SSE rounding mode
end; // if i<>$1f80 then i:=0; //=8064 }
// ...
// Fragment: updates a run of 16-byte entries of AngMaxArr4 with a per-lane maximum.
//   v[k]  = min((word(PATL[k]) - zp4[k]) * rsqrt(RadS), sZRTLev)   for k = 0..3
//   iC    = cvtss2si(RM * rsqrt(RadS))   (rounded according to MXCSR)
//   first = (iAngC - iC shr 1) and 31
//   AngMaxArr4[first .. first + iC] := max(AngMaxArr4[..], v)
// RSQRTSS is an approximation, not an exact 1/sqrt.
// NOTE(review): the start index is masked to 0..31 but the loop does not wrap,
// so entries beyond index 31 can be written - presumably the array has slack.
asm
push eax
push ebx
push ecx
push esi
push edi
movss xmm7, RM
mov esi, PATL
lea edi, zp4
cvtsi2ss xmm4, RadS
movss xmm5, sZRTLev //sZRT
rsqrtss xmm4, xmm4 // xmm4 ~ 1 / sqrt(RadS)
movzx eax, word [esi]
movzx ebx, word [esi + 2]
mulss xmm7, xmm4
sub eax, [edi]
sub ebx, [edi + 4]
cvtss2si ecx, xmm7 //iC
shufps xmm4, xmm4, 0 //R1d
cvtsi2ss xmm0, eax
cvtsi2ss xmm1, ebx
movzx eax, word [esi + 4]
movzx ebx, word [esi + 6]
sub eax, [edi + 8]
sub ebx, [edi + 12]
cvtsi2ss xmm2, eax
cvtsi2ss xmm3, ebx
shufps xmm0, xmm1, 0 // xmm0 = d0, d0, d1, d1
shufps xmm2, xmm3, 0 // xmm2 = d2, d2, d3, d3
shufps xmm5, xmm5, 0 // broadcast sZRTLev
shufps xmm0, xmm2, $88 // xmm0 = d0, d1, d2, d3
mov eax, iAngC
mov ebx, ecx
shr ebx, 1
sub eax, ebx
and eax, 31
add eax, eax
mulps xmm0, xmm4
lea esi, [AngMaxArr4 + eax * 8] // 16 bytes per entry
minps xmm0, xmm5
@ll: movups xmm1, [esi]
maxps xmm1, xmm0
movups [esi], xmm1
add esi, 16
dec ecx
jns @ll // runs iC + 1 times (at least once)
pop edi
pop esi
pop ecx
pop ebx
pop eax
end
// ...
// Fragment: single-entry variant.
//   v[k] = min((word(PATL[k]) - zp4[k]) * rsqrt(RadS), sZRTLev)   for k = 0..3
//   AngMaxArr4[iAngC] := max(AngMaxArr4[iAngC], v)   (16-byte entries)
// RSQRTPS is an approximation, not an exact 1/sqrt.
asm
push eax
push ebx
push esi
push edi
mov esi, PATL
lea edi, zp4
cvtsi2ss xmm4, RadS
movzx eax, word [esi]
movzx ebx, word [esi + 2]
sub eax, [edi]
sub ebx, [edi + 4]
shufps xmm4, xmm4, 0 // broadcast RadS
movss xmm5, sZRTLev
cvtsi2ss xmm0, eax
cvtsi2ss xmm1, ebx
rsqrtps xmm4, xmm4 //only 4..6 clocks, not slower than scalar
movzx eax, word [esi + 4]
movzx ebx, word [esi + 6]
sub eax, [edi + 8]
sub ebx, [edi + 12]
cvtsi2ss xmm2, eax
cvtsi2ss xmm3, ebx
shufps xmm5, xmm5, 0 // broadcast sZRTLev
shufps xmm0, xmm1, 0 // xmm0 = d0, d0, d1, d1
shufps xmm2, xmm3, 0 // xmm2 = d2, d2, d3, d3
mov eax, iAngC
shufps xmm0, xmm2, $88 // xmm0 = d0, d1, d2, d3
add eax, eax // eax * 8 below then gives iAngC * 16
mulps xmm0, xmm4
movups xmm1, dqword [AngMaxArr4 + eax * 8]
minps xmm0, xmm5
maxps xmm1, xmm0
movups dqword [AngMaxArr4 + eax * 8], xmm1
pop edi
pop esi
pop ebx
pop eax
end
// ...
function BuildATlevelsT0(PsiLight, MWidth, MHeight: Integer; PATlevel: TPATlevel; sZRT: Single): Integer;
// ...
// Fragment: thresholded horizontal pass, four 16-bit words per iteration.
// With c = centre words at PATL2, l/r = words iStep2 bytes left/right:
//   lim = c + iTh4
//   out = avg(avg(min(l, lim), min(r, lim)), c)
// PMINSW only works on signed words, so sub32k is subtracted before and added
// back after the min to emulate an unsigned comparison.
// Loop runs while xa + 4 <= iwids; updates xa, PATL and PATL2.
// NOTE(review): `paddw mm4, iTh4` is a wrapping add, so c + iTh4 can overflow
// 16 bits - presumably excluded by the value range; verify.
asm
push eax
push ebx
push ecx
push edx
mov ecx, xa
mov edx, PATL2
mov eax, iStep2
add ecx, 4
mov ebx, PATL
sub edx, eax // edx = left neighbour position
@@1: cmp ecx, iwids
jg @@3
movq mm4, [edx + eax] // centre words
movq mm1, [edx] //it1
paddw mm4, iTh4 //PATL2^ + iTh
psubw mm4, sub32k
movq mm2, [edx + eax * 2] //it2
psubw mm1, sub32k
psubw mm2, sub32k
pminsw mm1, mm4 //only signed word, therefore first sub, afterwards add
pminsw mm2, mm4
paddw mm1, sub32k
paddw mm2, sub32k
pavgw mm1, mm2 //Average unsigned words
pavgw mm1, [edx + eax]
movq [ebx], mm1
add ebx, 8
add edx, 8
add ecx, 4
jmp @@1
@@3: sub ecx, 4 // undo the look-ahead added before the loop
add edx, eax // back from left neighbour to centre position
mov xa, ecx
mov PATL, ebx
mov PATL2, edx
pop edx
pop ecx
pop ebx
pop eax
end; // xa
// ...
// Fragment: leave MMX state so that x87 code can run again.
asm
emms
end;
// ...
procedure TAmbShadowCalcT0.Execute;
// ...
// Fragment: STMXCSR stores the current SSE control/status register (MXCSR) into x.
asm
stmxcsr x
end; // if i<>$1f80 then i:=0; //=8064 }
// ...
// Fragment: updates a run of 16-byte entries of AngMaxArr4 with a per-lane maximum.
//   v[k]  = min((word(PATL[k]) - zp4[k]) * rsqrt(RadS), sZRTLev)   for k = 0..3
//   t     = v * rcp(sZRTLev)
//   w     = v - t*t*t * v                 (i.e. v * (1 - t^3))
//   iC    = cvtss2si(RM * rsqrt(RadS))    (rounded according to MXCSR)
//   first = (iAngC - iC shr 1) and 31
//   AngMaxArr4[first .. first + iC] := max(AngMaxArr4[..], w)
// RSQRTSS and RCPPS are approximations.
// NOTE(review): the start index is masked to 0..31 but the loop does not wrap,
// so entries beyond index 31 can be written - presumably the array has slack.
asm
push eax
push ebx
push ecx
push esi
push edi
movss xmm7, RM
mov esi, PATL
lea edi, zp4
cvtsi2ss xmm4, RadS
movss xmm5, sZRTLev
rsqrtss xmm4, xmm4 // xmm4 ~ 1 / sqrt(RadS)
movzx eax, word [esi]
movzx ebx, word [esi + 2]
mulss xmm7, xmm4
sub eax, [edi]
sub ebx, [edi + 4]
cvtss2si ecx, xmm7 //iC
shufps xmm4, xmm4, 0 //R1d
cvtsi2ss xmm0, eax
cvtsi2ss xmm1, ebx
movzx eax, word [esi + 4]
movzx ebx, word [esi + 6]
sub eax, [edi + 8]
sub ebx, [edi + 12]
cvtsi2ss xmm2, eax
cvtsi2ss xmm3, ebx
shufps xmm0, xmm1, 0 // xmm0 = d0, d0, d1, d1
shufps xmm2, xmm3, 0 // xmm2 = d2, d2, d3, d3
shufps xmm5, xmm5, 0 // broadcast sZRTLev
shufps xmm0, xmm2, $88 // xmm0 = d0, d1, d2, d3
mov eax, iAngC
mov ebx, ecx
shr ebx, 1
sub eax, ebx
and eax, 31
add eax, eax
mulps xmm0, xmm4
minps xmm0, xmm5 // xmm0 = v
rcpps xmm2, xmm5 //approx 1/x
lea esi, [AngMaxArr4 + eax * 8] // 16 bytes per entry
mulps xmm2, xmm0 // t = v / sZRTLev
movaps xmm4, xmm2
mulps xmm2, xmm2 // t^2
mulps xmm2, xmm4 // t^3
mulps xmm2, xmm0 // t^3 * v
subps xmm0, xmm2 // w = v - t^3 * v
@ll: movups xmm1, [esi]
maxps xmm1, xmm0
movups [esi], xmm1
add esi, 16
dec ecx
jns @ll // runs iC + 1 times (at least once)
pop edi
pop esi
pop ecx
pop ebx
pop eax
end
// ...
// Fragment: single-entry variant.
//   v[k] = min((word(PATL[k]) - zp4[k]) * rsqrt(RadS), sZRTLev)   for k = 0..3
//   t    = v * rcp(sZRTLev);  w = v - t*t*t * v
//   AngMaxArr4[iAngC] := max(AngMaxArr4[iAngC], w)   (16-byte entries)
// RSQRTPS and RCPPS are approximations. The label @up is not referenced
// within this fragment.
asm
push eax
push ebx
push esi
push edi
mov esi, PATL
lea edi, zp4
cvtsi2ss xmm4, RadS
movzx eax, word [esi]
movzx ebx, word [esi + 2]
sub eax, [edi]
sub ebx, [edi + 4]
shufps xmm4, xmm4, 0 // broadcast RadS
movss xmm5, sZRTLev
cvtsi2ss xmm0, eax
cvtsi2ss xmm1, ebx
rsqrtps xmm4, xmm4 //only 4..6 clocks, not slower than scalar
movzx eax, word [esi + 4]
movzx ebx, word [esi + 6]
sub eax, [edi + 8]
sub ebx, [edi + 12]
cvtsi2ss xmm2, eax
cvtsi2ss xmm3, ebx
shufps xmm5, xmm5, 0 // broadcast sZRTLev
shufps xmm0, xmm1, 0 // xmm0 = d0, d0, d1, d1
shufps xmm2, xmm3, 0 // xmm2 = d2, d2, d3, d3
mov eax, iAngC
shufps xmm0, xmm2, $88 // xmm0 = d0, d1, d2, d3
add eax, eax // eax * 8 below then gives iAngC * 16
mulps xmm0, xmm4
minps xmm0, xmm5 // xmm0 = v
movups xmm1, dqword [AngMaxArr4 + eax * 8]
rcpps xmm2, xmm5
mulps xmm2, xmm0 // t = v / sZRTLev
movaps xmm4, xmm2
mulps xmm2, xmm2 // t^2
mulps xmm2, xmm4 // t^3
mulps xmm2, xmm0 // t^3 * v
subps xmm0, xmm2 // w = v - t^3 * v
@up: maxps xmm1, xmm0
movups dqword [AngMaxArr4 + eax * 8], xmm1
pop edi
pop esi
pop ebx
pop eax
end
// ...
function ColToSVecFlipRBc(c: Cardinal): TSVec;
// Splits a packed colour into three Singles in the range 0..255:
//   Result[0] := bits 16..23, Result[1] := bits 8..15, Result[2] := bits 0..7.
// eax = c, edx = @Result. The Single at offset 12 is not written.
asm
  mov ecx, eax
  shr ecx, 16
  and ecx, $FF
  push ecx                      // reserve the 4-byte scratch slot and fill it
  fild dword [esp]
  fstp dword [edx]
  movzx ecx, ah                 // bits 8..15
  mov [esp], ecx
  fild dword [esp]
  fstp dword [edx + 4]
  and eax, $FF                  // bits 0..7
  mov [esp], eax
  fild dword [esp]
  fstp dword [edx + 8]
  pop edx                       // release the scratch slot
end;
function ColAToSVecFlipRBc(c: Cardinal): TSVec;
// Splits a packed colour into four Singles in the range 0..255:
//   Result[3] := bits 24..31, Result[0] := bits 16..23,
//   Result[1] := bits 8..15,  Result[2] := bits 0..7.
// eax = c, edx = @Result.
asm
  mov ecx, eax
  shr ecx, 24
  push ecx                      // reserve the 4-byte scratch slot and fill it
  fild dword [esp]
  fstp dword [edx + 12]
  mov ecx, eax
  shr ecx, 16
  and ecx, $FF
  mov [esp], ecx
  fild dword [esp]
  fstp dword [edx]
  movzx ecx, ah                 // bits 8..15
  mov [esp], ecx
  fild dword [esp]
  fstp dword [edx + 4]
  and eax, $FF                  // bits 0..7
  mov [esp], eax
  fild dword [esp]
  fstp dword [edx + 8]
  pop edx                       // release the scratch slot
end;
function SVecToColNoScale(sv: TSVec): Cardinal;
// Clamps the vector to 0..255 via mMinMaxSVec and packs its first three
// components into the low three bytes of the result (component 0 -> bits 0..7,
// component 1 -> bits 8..15, component 2 -> bits 16..23).
// eax is passed through unchanged to mMinMaxSVec as its vector argument.
asm
add esp, -16 // 16-byte buffer for the clamped vector
push 0 // smin = 0.0
push $437f0000 // smax = 255.0 (IEEE single bit pattern)
lea edx, [esp + 8] // edx = result buffer for mMinMaxSVec
call [mMinMaxSVec] // callee removes the two pushed Singles
fld dword [esp]
fistp word [esp] // the word stores deliberately overlap: each later store
fld dword [esp + 4]
fistp word [esp + 1] // overwrites the (zero) high byte of the previous one
fld dword [esp + 8]
fistp word [esp + 2]
mov eax, [esp] // bytes 0..2 = components, byte 3 = high byte of the last word
add esp, 16
end;
function SVecToColNoScaleFlipXZ(var sv: TSVec): Cardinal;
// Clamps the vector to 0..255 via mMinMaxSVec and packs its first three
// components into the low three bytes of the result in reversed order
// (component 2 -> bits 0..7, component 1 -> bits 8..15, component 0 -> bits 16..23).
asm
add esp, -16 // 16-byte buffer for the clamped vector
push 0 // smin = 0.0
push $437f0000 // smax = 255.0 (IEEE single bit pattern)
lea edx, [esp + 8] //2x pushed, +8 is esp..esp+16 for svec
call [mMinMaxSVec] //mMinMaxSVec(const smin, smax: Single; const V1: TSVec): TSVec; ret8
fld dword [esp + 8] // ebp+12, ebp+8, eax edx
fistp word [esp + 8] // packed bytes are assembled at esp+8..esp+11; the word
fld dword [esp + 4]
fistp word [esp + 9] // stores deliberately overlap (high byte is 0 after clamping)
fld dword [esp]
fistp word [esp + 10]
mov eax, [esp + 8]
add esp, 16
end;
procedure MinMaxClip15bit(var s: Single; var w: Word);
// w := s clamped to 0..32767 and converted to an integer.
// eax = @s, edx = @w. s itself is not modified.
const s32767: Single = 32767;
asm
  cmp SupportSSE, 0
  je @@x87Path
  // ---- SSE path ----
  xorps xmm1, xmm1              // 0.0
  movss xmm0, [eax]
  minss xmm0, s32767
  maxss xmm0, xmm1
  cvtss2si eax, xmm0
  mov word [edx], ax
  ret
@@x87Path:
  fld dword [eax]               // st0 = s
  ftst
  fnstsw ax
  and ah, 41H                   // C3 or C0 set: s <= 0 (or unordered)
  jz @positive
  fstp st(0)
  mov word [edx], 0
  jmp @done
@positive:
  fcom s32767
  fnstsw ax
  shr ah, 1                     // CF := C0 (s < 32767)
  jc @inRange
  fstp st(0)
  mov word [edx], 32767
  jmp @done
@inRange:
  fistp word [edx]
@done:
end;
function CPUID_Supported: Boolean;
// Returns True when bit 21 (ID) of EFLAGS can be toggled, which indicates
// that the CPUID instruction is available.
asm
  pushfd
  pop eax                       // eax = current EFLAGS
  mov edx, eax                  // edx = reference copy
  xor eax, $200000              // flip the ID bit
  push eax
  popfd                         // try to write it back
  pushfd
  pop eax                       // re-read EFLAGS
  xor eax, edx                  // non-zero when the bit actually changed
  setne al
end;
function GetCPUID(AInfoRequired: Integer): TRegisters;
// Executes CPUID for leaf AInfoRequired (eax) with sub-leaf 0 and stores the
// four resulting registers in Result (pointer passed in edx).
// ebx and esi are preserved as required by the register calling convention.
asm
  push ebx
  push esi
  mov esi, edx                  // cpuid overwrites edx, keep @Result in esi
  xor ecx, ecx                  // sub-leaf selector: several leaves read ECX as
                                // input, so it must not be left uninitialised
  cpuid
  mov TRegisters[esi].RegEAX, eax
  mov TRegisters[esi].RegEBX, ebx
  mov TRegisters[esi].RegECX, ecx
  mov TRegisters[esi].RegEDX, edx
  pop esi
  pop ebx
end;
procedure FastMove(const Source; var Dest; count: Integer);
// Memory move using 8-byte x87 integer loads/stores (fild/fistp qword).
// eax = @Source, edx = @Dest, ecx = count.
//   count <= 0 or Source = Dest : nothing is done
//   1..8 bytes                  : dispatched through @@JumpTable to @@M01..@@M08
//   9..32 bytes                 : all data is loaded before anything is stored
//   > 32 bytes                  : forward or backward 8-byte loop, chosen so that
//                                 overlapping ranges are copied correctly
// The routine has no stack frame; every path ends in its own `ret` or falls
// through to the end of the asm block.
asm
cmp eax, edx
je @@Exit
cmp ecx, 32
ja @@LargeMove //Count > 32 or Count < 0
sub ecx, 8 // ecx = count - 8  (-8..24)
jg @@SmallMove
@@TinyMove: //0..8 Byte Move
jmp dword [@@JumpTable + 32 + ecx * 4] // index = count (ecx is -8..0 here)
@@SmallMove: //9..32 Byte Move
fild qword [eax + ecx] // last 8 bytes
fild qword [eax] // first 8 bytes
cmp ecx, 8
jle @@Small16
fild qword [eax + 8]
cmp ecx, 16
jle @@Small24
fild qword [eax + 16]
fistp qword [edx + 16]
@@Small24:
fistp qword [edx + 8]
@@Small16:
fistp qword [edx]
fistp qword [edx + ecx]
@@Exit:
ret
nop //4-Byte Align JumpTable
nop
@@JumpTable:
dd @@Exit, @@M01, @@M02, @@M03, @@M04, @@M05, @@M06, @@M07, @@M08
@@LargeForwardMove:
push edx // keep original Dest for the final "first 8 bytes" store
fild qword [eax] // first 8 source bytes
lea eax, [eax + ecx - 8] // eax = address of last 8 source bytes
lea ecx, [ecx + edx - 8] // ecx = address of last 8 destination bytes
fild qword [eax] //fp stack check error
push ecx
neg ecx
and edx, -8 // align destination down to 8 bytes
lea ecx, [ecx + edx + 8] // negative byte offset, counts up towards 0
pop edx // edx = address of last 8 destination bytes
@FwdLoop:
fild qword [eax + ecx]
fistp qword [edx + ecx]
add ecx, 8
jl @FwdLoop
fistp qword [edx] // last 8 bytes
pop edx
fistp qword [edx] // first 8 bytes
ret
@@LargeMove:
jng @@LargeDone // Count < 0
cmp eax, edx
ja @@LargeForwardMove // Source above Dest: forward copy is safe
sub edx, ecx
cmp eax, edx
lea edx, [edx + ecx] // restore edx without touching the flags
jna @@LargeForwardMove // Source + count <= Dest: no overlap
sub ecx, 8
push ecx // count - 8, offset of the last 8 bytes
fild qword [eax + ecx] // last 8 source bytes
fild qword [eax] // first 8 source bytes
add ecx, edx
and ecx, -8 // align the destination address down to 8 bytes
sub ecx, edx
@BwdLoop:
fild qword [eax + ecx]
fistp qword [edx + ecx]
sub ecx, 8
jg @BwdLoop
pop ecx
fistp qword [edx] // first 8 bytes
fistp qword [edx + ecx] // last 8 bytes
@@LargeDone:
ret
@@M01:
movzx ecx, [eax]
mov [edx], cl
ret
@@M02:
movzx ecx, word [eax]
mov [edx], cx
ret
@@M03:
mov cx, [eax]
mov al, [eax + 2]
mov [edx], cx
mov [edx + 2], al
ret
@@M04:
mov ecx, [eax]
mov [edx], ecx
ret
@@M05:
mov ecx, [eax]
mov al, [eax + 4]
mov [edx], ecx
mov [edx + 4], al
ret
@@M06:
mov ecx, [eax]
mov ax, [eax + 4]
mov [edx], ecx
mov [edx + 4], ax
ret
@@M07:
mov ecx, [eax]
mov eax, [eax + 3] // overlapping dword covers bytes 3..6
mov [edx], ecx
mov [edx + 3], eax
ret
@@M08:
fild qword [eax]
fistp qword [edx]
end;
procedure fill0bytes(const p: Pointer; const anz: Integer; const useSSE: Boolean);
// ...
// Fragment: writes x4 blocks of 16 zero bytes starting at p1 and leaves p1
// pointing just past the last block written.
// MOVAPS faults on addresses that are not 16-byte aligned.
// NOTE(review): the sub/jnz loop assumes x4 >= 1 (a count of 0 would wrap) and
// that p1 is 16-byte aligned - presumably ensured by the elided Pascal code.
asm
push eax
push ecx
mov ecx, x4
mov eax, p1
xorps xmm0, xmm0
@loop: movaps [eax], xmm0
add eax, 16
sub ecx, 1
jnz @loop
mov p1, eax
pop ecx
pop eax
end;
// ...
procedure doFFT(const d: Double);
// ...
// Fragment: SSE2 butterfly loops over separate real/imaginary double arrays.
// ebx = pFFTreal, ecx = pFFTimag; twiddle factors come from pFFTcos / pFFTsin
// indexed by tabnr, and the sine factor is multiplied by d.
// Loop structure (the existing comments give the Pascal equivalents):
//   @loo0: repeat with l doubled each time while l <= fl2; fl3 is halved per pass
//   @loo1: m = 0 .. l-1, tabnr advances by fl3
//   @loo2: i = m, m + 2l, ... while i < fftlength; j = i + l
// Reads and updates the variables l, tabnr and fl3.
asm
push edx
push ecx
push ebx
push eax
push esi
push edi
mov ebx, pFFTreal
mov ecx, pFFTimag
movsd xmm7, d
shufpd xmm7, xmm7, 0 // xmm7 = [d, d]
mov eax, l
@loo0: shl eax, 1 // while l<=fl2
mov edi, eax // was: ischritt (step = 2 * l), eax
xor eax, eax // eax=m
mov tabnr, eax
@loo1: mov edx, tabnr // for m:=0 to l-1
mov esi, pFFTcos
movlpd xmm3, [esi + edx * 8]
mov esi, pFFTsin
movlpd xmm4, [esi + edx * 8]
shufpd xmm3, xmm3, 0 // xmm3 = [wichreal, wichreal]
shufpd xmm4, xmm4, 0 // xmm4 = [wichimag, wichimag]
mulpd xmm4, xmm7 // xorpd xmm4, [sign]
mov edx, eax // edx=i=m
@loo2: mov esi, edx
add esi, l // j=i+l
movlpd xmm0, [ebx + esi * 8] // hi lo
movhpd xmm0, [ecx + esi * 8] // xmm0 = [imag, real]
movapd xmm1, xmm0
shufpd xmm1, xmm1, 1 // xmm1 = [real, imag] (,1=swap)
mulpd xmm0, xmm3 // xmm0 = [imag*wichreal, real*wichreal]
mulpd xmm1, xmm4 // xmm1 = [real*wichimag, imag*wichimag]
movapd xmm2, xmm0
addpd xmm0, xmm1 // xmm0 = [i*wr+r*wi, r*wr+i*wi]
subpd xmm2, xmm1 // xmm2 = [i*wr-r*wi, r*wr-i*wi]
shufpd xmm2, xmm0, 2 // xmm2 = [i*wr+r*wi, r*wr-i*wi]?
// tmpimag tmpreal
movlpd xmm0, [ebx + edx * 8]
movhpd xmm0, [ecx + edx * 8] // xmm0 = [imag_i, real_i]
movapd xmm1, xmm0
subpd xmm0, xmm2 // element j := element i - tmp
addpd xmm1, xmm2 // element i := element i + tmp
movlpd [ebx + esi * 8], xmm0
movhpd [ecx + esi * 8], xmm0
movlpd [ebx + edx * 8], xmm1
movhpd [ecx + edx * 8], xmm1
add edx, edi
cmp edx, fftlength
jl @loo2
mov esi, fl3
add tabnr, esi
add eax, 1
cmp eax, l // for m:=0 to l-1
jl @loo1
shr fl3, 1
mov eax, edi // ischritt (step)
mov l, eax
cmp eax, fl2 // while l<=fl2
jle @loo0
pop edi
pop esi
pop eax
pop ebx
pop ecx
pop edx
end;
// ...
procedure FirstATlevelCAO(PIA: TPCardinalArray; PsiLight: TPsiLight5; Leng: Integer);
// Fills PIA[0 .. Leng-1] from consecutive 18-byte PsiLight records.
// For each record: if the word at offset 8 is >= $8000 the output is 0,
// otherwise it is (dword at offset 6 with its low byte cleared) shr 1.
// eax = PIA, edx = PsiLight, ecx = Leng.
asm
  push esi
  dec ecx
  js @@done                     // Leng <= 0: nothing to do
  inc ecx
  add edx, 8                    // edx -> word at offset 8 of the first record
@@next:
  xor esi, esi                  // default output value
  cmp word [edx], $8000
  jnb @@store
  mov esi, [edx-2]
  and esi, $ffffff00
  shr esi, 1
@@store:
  mov [eax], esi
  add edx, 18                   // next record
  add eax, 4
  dec ecx
  jnz @@next
@@done:
  pop esi
end;
procedure SmoothH(PIA, SA: TPCardinalArray; ya, Step: Integer);
// For i = 0 .. ya:
//   lo = max(i - Step, 0);  hi = min(i + Step, ya)
//   PIA[i] := (((SA[hi] + SA[lo]) shr 1) + PIA[i]) shr 1
// eax = PIA, edx = SA, ecx = ya, Step at [ebp+8]. Does nothing when ya < 0.
// Locals: [ebp-8] = ya, [ebp-12] = remaining iteration count.
asm
add esp, -12
push ebx
push esi
push edi
mov [ebp-8], ecx
mov ebx, edx // ebx = SA
mov edi, [ebp+8] // edi = Step
mov edx, ecx
test edx, edx
jl @@2 // ya < 0
inc edx
mov [ebp-12], edx // ya + 1 iterations
xor esi, esi // esi = i
@@1:
mov edx, esi
sub edx, edi
test edx, edx
jnl @@3
xor edx, edx // clamp lower index to 0
@@3:
mov ecx, edi
add ecx, esi
cmp ecx, [ebp-8]
jle @@4
mov ecx, [ebp-8] // clamp upper index to ya
@@4:
mov ecx, [ebx+ecx*4]
add ecx, [ebx+edx*4]
shr ecx, 1
add ecx, [eax]
shr ecx, 1
mov [eax], ecx
inc esi
add eax, 4 // next PIA element
dec dword [ebp-12]
jnz @@1
@@2:
pop edi
pop esi
pop ebx
add esp, 12
end;
procedure SmoothV(PIA, SA: TPCardinalArray; ye, Step, wid: Integer);
// For i = 0 .. ye, with p starting at PIA and advancing by wid bytes per step:
//   lo = max(i - Step, 0);  hi = min(i + Step, ye)
//   p^ := (((SA[hi] + SA[lo]) shr 1) + p^) shr 1
// eax = PIA, edx = SA, ecx = ye, Step at [ebp+12], wid at [ebp+8].
// Does nothing when ye < 0.
// Locals: [ebp-8] = ye, [ebp-12] = remaining iteration count.
asm
add esp, -12
push ebx
push esi
push edi
mov [ebp-8], ecx
mov ebx, edx // ebx = SA
mov edi, [ebp+12] // edi = Step
mov edx, ecx
test edx, edx
jl @@2 // ye < 0
inc edx
mov [ebp-12], edx // ye + 1 iterations
xor esi, esi // esi = i
@@1:
mov edx, esi
sub edx, edi
test edx, edx
jnl @@3
xor edx, edx // clamp lower index to 0
@@3:
mov ecx, edi
add ecx, esi
cmp ecx, [ebp-8]
jle @@4
mov ecx, [ebp-8] // clamp upper index to ye
@@4:
mov ecx, [ebx+ecx*4]
add ecx, [ebx+edx*4]
shr ecx, 1
add ecx, [eax]
shr ecx, 1
mov [eax], ecx
inc esi
add eax, dword [ebp+8] // advance destination by wid bytes
dec dword [ebp-12]
jnz @@1
@@2:
pop edi
pop esi
pop ebx
add esp, 12
end;
procedure MinSI(var SI: SmallInt; var i: Integer);
// When SI < i, SI is raised to i, saturated at $7FFF; otherwise SI is untouched.
// eax = @SI, edx = @i. i is only read.
asm
  movsx ecx, word [eax]         // ecx = SI, sign-extended
  cmp ecx, [edx]
  jge @@done                    // SI >= i
  cmp dword [edx], $7FFF
  jge @@saturate
  mov edx, [edx]
  mov word [eax], dx            // SI := i
  ret
@@saturate:
  mov word [eax], $7FFF
@@done:
end;
function NotOnlyBackGround4(p: Pointer): Integer;
// Returns $80000000 when bit 31 is set in all four dwords at p+0, p+18, p+36
// and p+54, otherwise 0. eax = p.
asm
  mov edx, [eax + 54]
  and edx, [eax + 36]
  and edx, [eax + 18]
  and edx, [eax]
  and edx, $80000000            // keep only the common top bit
  mov eax, edx
end;
procedure MakeZP4(p: Pointer; var zp: array of Integer);
// zp[k] := (dword at p + 18*k with its low byte cleared) shr 1, for k = 0..3.
// eax = p, edx = @zp[0]; the open-array bound passed in ecx is ignored and
// ecx is reused as scratch.
asm
  mov ecx, [eax + 54]
  and ecx, $FFFFFF00
  shr ecx, 1
  mov [edx + 12], ecx
  mov ecx, [eax + 36]
  and ecx, $FFFFFF00
  shr ecx, 1
  mov [edx + 8], ecx
  mov ecx, [eax + 18]
  and ecx, $FFFFFF00
  shr ecx, 1
  mov [edx + 4], ecx
  mov ecx, [eax]
  and ecx, $FFFFFF00
  shr ecx, 1
  mov [edx], ecx
end;
procedure isMemberQuat(PIteration3D: TPIteration3D);
// ...
// Fragment: SSE2 iteration loop for a 4-component (quaternion-style) formula.
// esi = PIteration3D, edi = pointer loaded from [esi + 48], ecx = iteration count.
// Start: X1,X2 = doubles at [esi], [esi+8]; X3 = [esi+16]; X4 = 0 (movsd load
// clears the upper half). The same three doubles are added back as C1..C3 in
// every iteration.
// The loop ends when ecx >= dword [esi + 68] or when the squared magnitude
// Rout is no longer below the double at [edi + 160].
// Results: Rout -> qword [esi + 56], iteration count -> dword [esi + 64];
// the squared magnitude before the last step is kept in Rold.
// The label @u is not referenced within this fragment.
asm
push esi
push edi
push ecx
mov esi, PIteration3D
xor ecx, ecx
mov edi, [esi + 48]
@u: movupd xmm0, [esi] // C1, C2 = X1, X2
movsd xmm1, [esi + 16] // C3, 0 = X3, X4
movapd xmm2, xmm0
movapd xmm3, xmm1
mulpd xmm2, xmm0 // X1*X1, X2*X2
mulpd xmm3, xmm1 // X3*X3, X4*X4
movapd xmm4, xmm2
addpd xmm4, xmm3 // X1*X1 + X3*X3, X2*X2 + X4*X4
pshufd xmm5, xmm4, $4E // X2*X2 + X4*X4, X1*X1 + X3*X3
addsd xmm4, xmm5 // Rout
@a: addsd xmm3, xmm5 // X3*X3 + X2*X2 + X4*X4
movlpd Rold, xmm4
pshufd xmm7, xmm0, $4E // X2, X1
subsd xmm2, xmm3 // X1*X1 - X2*X2 - X3*X3 - X4*X4
movapd xmm5, xmm0 // X1, X2
mulsd xmm7, xmm0 // X2*X1
pshufd xmm6, xmm1, $4E // X4, X3
addsd xmm2, [esi]
movapd xmm3, xmm6 // X4, X3
movapd xmm0, xmm2 // X1 = X1*X1 - X2*X2 - X3*X3 - X4*X4 + C1;
mulpd xmm3, xmm5 // X4*X1, X3*X2
mulsd xmm6, xmm1 // X4*X3
mulpd xmm5, xmm1 // X1*X3, X2*X4
addsd xmm7, xmm6 // X2*X1 + X4*X3
pshufd xmm1, xmm5, $4E // X2*X4, X1*X3
addsd xmm7, xmm7 // 2 * (X2*X1 + X4*X3)
addsd xmm7, [esi + 8] // X2 = 2 * (X2*X1 + X3*X4) + C2
subsd xmm5, xmm1 // X2*X4*O1 + X1*X3 sub
shufpd xmm0, xmm7, 0 // X1, X2
addsd xmm5, xmm5 // 2 * (X2*X4*O1 + X1*X3)
pshufd xmm6, xmm3, $4E // X3*X2, X4*X1
addsd xmm5, [esi + 16] // X3 = 2 * (X2*X4*O1 + X1*X3) + C3
addsd xmm6, xmm3 // X3*X2 + X4*X1
movsd xmm1, xmm5
addsd xmm6, xmm6 // X4 = 2 * (X4*X1 + X3*X2)
shufpd xmm1, xmm6, 0 // X3, X4
movapd xmm2, xmm0
movapd xmm3, xmm1
mulpd xmm2, xmm0 // X1*X1, X2*X2
mulpd xmm3, xmm1 // X3*X3, X4*X4
movapd xmm4, xmm2
addpd xmm4, xmm3 // X1*X1 + X3*X3, X2*X2 + X4*X4
pshufd xmm5, xmm4, $4E // swap halves: X2*X2 + X4*X4, X1*X1 + X3*X3
addsd xmm4, xmm5 // Rout
inc ecx
cmp ecx, [esi + 68] // iteration limit reached?
jge @c
ucomisd xmm4, [edi + 160] //>8?
jb @a // still below the bailout value: iterate again
@c: movlpd [esi + 56], xmm4 // Rout = double
mov [esi + 64], ecx // ItResultI
pop ecx
pop edi
pop esi
end
// ...
procedure UpdateScaledImage(StartYh, EndYh: Integer);
// ...
// Fragment: 2x2 box downscale of one row.
// Source pixels are 4 bytes wide; the two source rows are mFSIoffset bytes
// apart. For each of wid output pixels the first three bytes are the rounded
// average ((a + b + c + d + 2) shr 2) of the matching bytes of a 2x2 block.
// The fourth byte of each output pixel is not written.
// NOTE(review): dec/jnz loop assumes wid >= 1.
asm
push eax
push ebx
push ecx
push edx
push edi
push esi
mov ecx, wid
mov esi, PB1
mov edi, PBh
mov ebx, mFSIoffset
@ll: movzx eax, byte ptr [esi] // channel 0
movzx edx, byte ptr [esi + 4]
add eax, edx
movzx edx, byte ptr [esi + ebx]
add eax, edx
movzx edx, byte ptr [esi + ebx + 4]
lea eax, [eax + edx + 2] // + 2 for rounding
shr eax, 2
mov [edi], al
movzx eax, byte ptr [esi + 1] // channel 1
movzx edx, byte ptr [esi + 5]
add eax, edx
movzx edx, byte ptr [esi + ebx + 1]
add eax, edx
movzx edx, byte ptr [esi + ebx + 5]
lea eax, [eax + edx + 2]
shr eax, 2
mov [edi + 1], al
movzx eax, byte ptr [esi + 2] // channel 2
movzx edx, byte ptr [esi + 6]
add eax, edx
movzx edx, byte ptr [esi + ebx + 2]
add eax, edx
movzx edx, byte ptr [esi + ebx + 6]
lea eax, [eax + edx + 2]
shr eax, 2
mov [edi + 2], al
add esi, 8 // two source pixels
add edi, 4 // one destination pixel
dec ecx
jnz @ll
pop esi
pop edi
pop edx
pop ecx
pop ebx
pop eax
end
// ...
// Fragment: 3x3 box downscale of one row.
// Source pixels are 4 bytes wide; the three source rows are mFSIoffset bytes
// apart. For each of wid output pixels and each of the first three bytes, the
// nine matching source bytes are summed, 4 is added for rounding and the sum
// is divided by b.
// NOTE(review): edx is not cleared before `div b`, so this is only correct if
// b is a byte-sized operand (AX / b -> AL); presumably b = 9 - confirm its
// declaration in the elided Pascal code.
// NOTE(review): dec/jnz loop assumes wid >= 1.
asm
push eax
push ebx
push ecx
push edx
push edi
push esi
mov ecx, wid
mov esi, PB1
mov edi, PBh
mov ebx, mFSIoffset
@ll: movzx eax, byte ptr [esi] // channel 0
movzx edx, byte ptr [esi + 4]
add eax, edx
movzx edx, byte ptr [esi + 8]
add eax, edx
movzx edx, byte ptr [esi + ebx]
add eax, edx
movzx edx, byte ptr [esi + ebx + 4]
add eax, edx
movzx edx, byte ptr [esi + ebx + 8]
add eax, edx
movzx edx, byte ptr [esi + ebx * 2]
add eax, edx
movzx edx, byte ptr [esi + ebx * 2 + 4]
add eax, edx
movzx edx, byte ptr [esi + ebx * 2 + 8]
lea eax, [eax + edx + 4] // + 4 for rounding
div b
mov [edi], al
movzx eax, byte ptr [esi + 1] // channel 1
movzx edx, byte ptr [esi + 5]
add eax, edx
movzx edx, byte ptr [esi + 9]
add eax, edx
movzx edx, byte ptr [esi + ebx + 1]
add eax, edx
movzx edx, byte ptr [esi + ebx + 5]
add eax, edx
movzx edx, byte ptr [esi + ebx + 9]
add eax, edx
movzx edx, byte ptr [esi + ebx * 2 + 1]
add eax, edx
movzx edx, byte ptr [esi + ebx * 2 + 5]
add eax, edx
movzx edx, byte ptr [esi + ebx * 2 + 9]
lea eax, [eax + edx + 4]
div b
mov [edi + 1], al
movzx eax, byte ptr [esi + 2] // channel 2
movzx edx, byte ptr [esi + 6]
add eax, edx
movzx edx, byte ptr [esi + 10]
add eax, edx
movzx edx, byte ptr [esi + ebx + 2]
add eax, edx
movzx edx, byte ptr [esi + ebx + 6]
add eax, edx
movzx edx, byte ptr [esi + ebx + 10]
add eax, edx
movzx edx, byte ptr [esi + ebx * 2 + 2]
add eax, edx
movzx edx, byte ptr [esi + ebx * 2 + 6]
add eax, edx
movzx edx, byte ptr [esi + ebx * 2 + 10]
lea eax, [eax + edx + 4]
div b
mov [edi + 2], al
add esi, 12 // three source pixels
add edi, 4 // one destination pixel
dec ecx
jnz @ll
pop esi
pop edi
pop edx
pop ecx
pop ebx
pop eax
end
// ...
// Fragment: first stage of the general ImageScale x ImageScale downscale.
// For each source row y2 = ImageScale-1 down to 0 (rows mFSIoffset bytes apart,
// starting at PB1) and each of wid output pixels, the first three bytes of
// ImageScale horizontally adjacent 4-byte pixels are summed and the three
// sums are written as dwords to the buffer at PC1 (12 bytes per output pixel).
// itmp = (ImageScale - 1) * 4 - 1 rewinds esi to the next byte of the first
// pixel in the group.
// NOTE(review): the inner dec/jnz loops assume ImageScale >= 2 (ebx >= 1) and
// the outer loop assumes wid >= 1; `mul y2` overwrites edx.
asm //sum rows to buf
push eax
push ebx
push ecx
push edx
push edi
push esi
mov ebx, ImageScale
dec ebx
mov y2, ebx
mov edi, PC1
lea eax, ebx * 4 - 1
mov itmp, eax
@@0: mov ecx, wid
mov w2, ecx
mov esi, PB1
mov eax, mFSIoffset
mul y2
add esi, eax // esi = start of source row y2
@ll: mov ecx, ebx
movzx eax, byte ptr [esi] // channel 0 of the first pixel
@@1: add esi, 4
movzx edx, byte ptr [esi]
add eax, edx
dec ecx
jnz @@1
mov [edi], eax
sub esi, itmp // back to channel 1 of the first pixel
mov ecx, ebx
movzx eax, byte ptr [esi]
@@2: add esi, 4
movzx edx, byte ptr [esi]
add eax, edx
dec ecx
jnz @@2
mov [edi + 4], eax
sub esi, itmp // back to channel 2 of the first pixel
mov ecx, ebx
movzx eax, byte ptr [esi]
@@3: add esi, 4
movzx edx, byte ptr [esi]
add eax, edx
dec ecx
jnz @@3
mov [edi + 8], eax
add edi, 12
add esi, 2 // from channel 2 of the last pixel to the next pixel group
dec w2
jnz @ll
dec y2
jns @@0
pop esi
pop edi
pop edx
pop ecx
pop ebx
pop eax
end;
// ...
// Fragment: second stage of the general downscale.
// The buffer at PC1 holds ImageScale rows of wid * 3 dword sums (row stride
// ebx = wid * 12 bytes). For each output pixel and channel the ImageScale
// row sums are added, a is added, the total is divided by b and the low byte
// of the quotient is stored at PBh (4 bytes per output pixel, fourth byte not
// written).
// NOTE(review): edx holds a pointer when `div b` executes, so this is only
// correct if b is a byte-sized operand (AX / b -> AL); that in turn requires
// the total to fit in 16 bits - confirm against the elided Pascal code.
// NOTE(review): the inner dec/jnz loops assume ImageScale >= 2 and the outer
// loop assumes wid >= 1.
asm //sum columns
push eax
push ebx
push ecx
push edx
push edi
push esi
mov eax, ImageScale
dec eax
mov y2, eax
mov edx, PC1
mov ebx, wid
mov w2, ebx
shl ebx, 2
lea ebx, ebx * 2 + ebx // ebx = wid * 12 = bytes per buffer row
mov edi, PBh
@ll: mov ecx, y2
mov esi, edx
mov eax, [esi] // channel 0
@@1: add esi, ebx
add eax, [esi]
dec ecx
jnz @@1
add eax, a
div b
mov [edi], al
add edx, 4
mov ecx, y2
mov esi, edx
mov eax, [esi] // channel 1
@@2: add esi, ebx
add eax, [esi]
dec ecx
jnz @@2
add eax, a
div b
mov [edi + 1], al
add edx, 4
mov ecx, y2
mov esi, edx
mov eax, [esi] // channel 2
@@3: add esi, ebx
add eax, [esi]
dec ecx
jnz @@3
add eax, a
div b
mov [edi + 2], al
add edx, 4
add edi, 4
dec w2
jnz @ll
pop esi
pop edi
pop edx
pop ecx
pop ebx
pop eax
end;
// ...
function ColToSVecFlipRBc4(c: T4Cardinal): T4SVec;
// Converts four packed colours into four 16-byte vectors of Singles.
// eax = @c (four pointers, each to a 4-byte colour), edx = @Result.
// For colour k (k = 0..3), with Result entry k at offset 16 * k:
//   entry[0] := byte 2, entry[1] := byte 1, entry[2] := byte 0, entry[3] := 0
// Each channel is read with a byte-sized load, so nothing beyond the four
// colour bytes is touched (a dword load at pointer+2 or pointer+1 followed by
// a mask would read up to 2 bytes past the colour).
asm
  push ebx
  push esi
  push edi
  add esp, -16                  // scratch: four integers for fild
  // ---- byte 2 of each colour -> component 0 ----
  mov ebx, [eax]
  mov ecx, [eax + 4]
  mov esi, [eax + 8]
  mov edi, [eax + 12]
  movzx ebx, byte ptr [ebx + 2]
  movzx ecx, byte ptr [ecx + 2]
  movzx esi, byte ptr [esi + 2]
  movzx edi, byte ptr [edi + 2]
  mov [esp], ebx
  mov [esp + 4], ecx
  mov [esp + 8], esi
  mov [esp + 12], edi
  fild dword [esp]
  fild dword [esp + 4]
  fild dword [esp + 8]
  fild dword [esp + 12]
  fstp dword [edx + 48]         // stored in reverse (stack) order
  fstp dword [edx + 32]
  fstp dword [edx + 16]
  fstp dword [edx]
  // ---- byte 1 of each colour -> component 1 ----
  mov ebx, [eax]
  mov ecx, [eax + 4]
  mov esi, [eax + 8]
  mov edi, [eax + 12]
  movzx ebx, byte ptr [ebx + 1]
  movzx ecx, byte ptr [ecx + 1]
  movzx esi, byte ptr [esi + 1]
  movzx edi, byte ptr [edi + 1]
  mov [esp], ebx
  mov [esp + 4], ecx
  mov [esp + 8], esi
  mov [esp + 12], edi
  fild dword [esp]
  fild dword [esp + 4]
  fild dword [esp + 8]
  fild dword [esp + 12]
  fstp dword [edx + 52]
  fstp dword [edx + 36]
  fstp dword [edx + 20]
  fstp dword [edx + 4]
  // ---- byte 0 of each colour -> component 2, component 3 := 0 ----
  mov ebx, [eax]
  mov ecx, [eax + 4]
  mov esi, [eax + 8]
  mov edi, [eax + 12]
  movzx ebx, byte ptr [ebx]
  movzx ecx, byte ptr [ecx]
  movzx esi, byte ptr [esi]
  movzx edi, byte ptr [edi]
  xor eax, eax
  mov [esp], ebx
  mov [esp + 4], ecx
  mov [esp + 8], esi
  mov [esp + 12], edi
  fild dword [esp]
  fild dword [esp + 4]
  fild dword [esp + 8]
  fild dword [esp + 12]
  mov [edx + 12], eax
  mov [edx + 28], eax
  mov [edx + 44], eax
  mov [edx + 60], eax
  fstp dword [edx + 56]
  fstp dword [edx + 40]
  fstp dword [edx + 24]
  fstp dword [edx + 8]
  add esp, 16
  pop edi
  pop esi
  pop ebx
end;
procedure ColToSVecSSE2(c: T4Cardinal; sv, svout: TPSVec);
// Weighted sum of four packed 8-bit colours.
// eax = @c (four pointers, each to a 4-byte colour), edx = sv (four Single
// weights), ecx = svout.
// With w[k] = sv[k] * s1d255 and b0/b1/b2 the bytes 0/1/2 of colour k:
//   svout[0] := sum(b2[k] * w[k])
//   svout[1] := sum(b1[k] * w[k])
//   svout[2] := sum(b0[k] * w[k])
// svout[3] is also overwritten (it receives the same value as svout[2]).
// NOTE(review): sva1 is used as the per-dword byte mask - presumably $FF in
// each dword; confirm its definition.
asm //CVTDQ2PS: sse2 - 4 ints to 4 singles
MOVDQU xmm5, [edx] //PSRLDQ: sse2 - xmm1, imm8 Shift xmm1 right by imm8 while shifting in 0s.
add esp, -16 // scratch: the four colours gathered contiguously
mov edx, [eax]
mov edx, [edx]
mov [esp], edx
mov edx, [eax + 4]
mov edx, [edx]
mov [esp + 4], edx
mov edx, [eax + 8]
mov eax, [eax + 12]
mov edx, [edx]
mov eax, [eax]
mov [esp + 8], edx
mov [esp + 12], eax
movss xmm0, s1d255
MOVDQU xmm1, [esp] //[eax] 4 cardinal colors
MOVDQU xmm4, sva1
MOVDQA xmm2, xmm1 //todo: use input pointers, load vals before
MOVDQA xmm3, xmm1
PSRLDQ xmm2, 1 //green
PSRLDQ xmm3, 2 //blue
shufps xmm0, xmm0, 0 // broadcast s1d255
andps xmm1, xmm4 //red or $FF000000FF000000FF000000FF
andps xmm2, xmm4
andps xmm3, xmm4
mulps xmm5, xmm0 // weights scaled by s1d255
CVTDQ2PS xmm1, xmm1
CVTDQ2PS xmm2, xmm2
CVTDQ2PS xmm3, xmm3
mulps xmm1, xmm5
mulps xmm2, xmm5
mulps xmm3, xmm5
// horizontal sums of xmm1, xmm2 and xmm3:
MOVLHPS xmm4, xmm1 //HADDD L1,.. (H,L)
movhlps xmm4, xmm3 //L1,H3
shufps xmm3, xmm1, $E4 //H1,L3
MOVHLPS xmm0, xmm2 //..,H2
addps xmm4, xmm3 //11,33
addps xmm0, xmm2 //..,22
pshufd xmm5, xmm4, $B1 //can't copy 1 dw to more than 1 dest!
pshufd xmm2, xmm0, $B1
addps xmm5, xmm4 //3,1
addss xmm2, xmm0 //.,2
movups [ecx], xmm5 // r,.,b
movss [ecx + 4], xmm2 // .,g,.
add esp, 16
end;
procedure ColToSVecSqrSSE2(c: T4Cardinal; sv, svout: TPSVec); //svout := sumof([0..3] cardinal colors * sv[0..3])
// Weighted sum of the squared channels of four packed 8-bit colours.
// eax = @c (four pointers, each to a 4-byte colour), edx = sv (four Single
// weights), ecx = svout.
// With w[k] = sv[k] * scmul (scmul = 1 / 65025 = 1 / 255^2):
//   svout[0] := sum(b2[k]^2 * w[k])
//   svout[1] := sum(b1[k]^2 * w[k])
//   svout[2] := sum(b0[k]^2 * w[k])
// svout[3] is also overwritten (it receives the same value as svout[2]).
// NOTE(review): sva1 is used as the per-dword byte mask - presumably $FF in
// each dword; confirm its definition.
const scmul: Single = 1 / 65025;
asm // eax edx ecx
MOVDQU xmm5, [edx]
add esp, -16 // scratch: the four colours gathered contiguously
mov edx, [eax]
mov edx, [edx]
mov [esp], edx
mov edx, [eax + 4]
mov edx, [edx]
mov [esp + 4], edx
mov edx, [eax + 8]
mov eax, [eax + 12]
mov edx, [edx]
mov eax, [eax]
mov [esp + 8], edx
mov [esp + 12], eax
movss xmm0, scmul
MOVDQU xmm1, [esp]
MOVDQU xmm4, sva1
MOVDQA xmm2, xmm1
MOVDQA xmm3, xmm1
PSRLDQ xmm2, 1 // byte 1 of every colour into the low byte of its dword
PSRLDQ xmm3, 2 // byte 2 of every colour into the low byte of its dword
shufps xmm0, xmm0, 0 // broadcast scmul
andps xmm1, xmm4
andps xmm2, xmm4
andps xmm3, xmm4
mulps xmm5, xmm0 // weights scaled by scmul
CVTDQ2PS xmm1, xmm1
CVTDQ2PS xmm2, xmm2
CVTDQ2PS xmm3, xmm3
mulps xmm1, xmm1 // square each channel
mulps xmm2, xmm2
mulps xmm3, xmm3
mulps xmm1, xmm5
mulps xmm2, xmm5
mulps xmm3, xmm5
// horizontal sums of xmm1, xmm2 and xmm3:
MOVLHPS xmm4, xmm1 //HADDD L1,.. (H,L)
movhlps xmm4, xmm3 //L1,H3
shufps xmm3, xmm1, $E4 //H1,L3
MOVHLPS xmm0, xmm2 //..,H2
addps xmm4, xmm3 //11,33
addps xmm0, xmm2 //..,22
pshufd xmm5, xmm4, $B1 //can't copy 1 dw to more than 1 dest!
pshufd xmm2, xmm0, $B1
addps xmm5, xmm4 //3,1
addss xmm2, xmm0 //.,2
movups [ecx], xmm5 // r,.,b
movss [ecx + 4], xmm2 // .,g,.
add esp, 16
end;
procedure ColToSVecSqrSSE2_16(c: T4Cardinal; sv, svout: TPSVec); //svout := sumof([0..3] cardinal colors * sv[0..3])
// Weighted sum of the squared channels of four colours with 16-bit channels.
// eax = @c (four pointers, each to at least 8 readable bytes holding three
// 16-bit channels), edx = sv (four Single weights), ecx = svout.
// With w[k] = sv[k] * csmul and ch0/ch1/ch2 the words at byte offsets 0/2/4:
//   svout[0] := sum(ch2[k]^2 * w[k])
//   svout[1] := sum(ch1[k]^2 * w[k])
//   svout[2] := sum(ch0[k]^2 * w[k])
// svout[3] is also overwritten (it receives the same value as svout[2]).
// NOTE(review): sva16 is used as the per-dword word mask - presumably $FFFF in
// each dword; confirm its definition.
const csmul: Single = 1 {255.0} / (65535.0 * 65535.0);
asm
  MOVDQU xmm5, [edx]
  add esp, -16                  // scratch: four dwords gathered contiguously
  // gather the first dword (ch0, ch1) of every colour
  mov edx, [eax]
  mov edx, [edx]
  mov [esp], edx
  mov edx, [eax + 4]
  mov edx, [edx]
  mov [esp + 4], edx
  mov edx, [eax + 8]
  mov edx, [edx]
  mov [esp + 8], edx
  mov edx, [eax + 12]
  mov edx, [edx]
  mov [esp + 12], edx
  movss xmm0, csmul
  MOVDQU xmm1, [esp]
  MOVDQU xmm4, sva16
  // gather the second dword (ch2) of every colour
  mov edx, [eax]
  mov edx, [edx + 4]
  mov [esp], edx
  mov edx, [eax + 4]
  mov edx, [edx + 4]
  mov [esp + 4], edx
  mov edx, [eax + 8]
  mov eax, [eax + 12]
  mov edx, [edx + 4]
  mov eax, [eax + 4]
  mov [esp + 8], edx
  mov [esp + 12], eax
  MOVDQA xmm2, xmm1             // single copy; an earlier duplicate of this move was dead
  MOVDQU xmm3, [esp]
  PSRLDQ xmm2, 2                // ch1 into the low word of every dword
  shufps xmm0, xmm0, 0          // broadcast csmul
  andps xmm1, xmm4
  andps xmm2, xmm4
  andps xmm3, xmm4
  mulps xmm5, xmm0              // weights scaled by csmul
  CVTDQ2PS xmm1, xmm1
  CVTDQ2PS xmm2, xmm2
  CVTDQ2PS xmm3, xmm3
  mulps xmm1, xmm1              // square each channel
  mulps xmm2, xmm2
  mulps xmm3, xmm3
  mulps xmm1, xmm5
  mulps xmm2, xmm5
  mulps xmm3, xmm5
  // horizontal sums of xmm1, xmm2 and xmm3
  MOVLHPS xmm4, xmm1
  movhlps xmm4, xmm3
  shufps xmm3, xmm1, $E4
  MOVHLPS xmm0, xmm2
  addps xmm4, xmm3
  addps xmm0, xmm2
  pshufd xmm5, xmm4, $B1
  pshufd xmm2, xmm0, $B1
  addps xmm5, xmm4
  addss xmm2, xmm0
  movups [ecx], xmm5 // r,.,b
  movss [ecx + 4], xmm2 // .,g,.
  add esp, 16
end;
procedure ColToSVecSSE2_16(c: T4Cardinal; sv, svout: TPSVec); //svout := sumof([0..3] cardinal colors * sv[0..3])
// Weighted sum of four colours with 16-bit channels.
// eax = @c (four pointers, each to at least 8 readable bytes holding three
// 16-bit channels), edx = sv (four Single weights), ecx = svout.
// With w[k] = sv[k] * csmul and ch0/ch1/ch2 the words at byte offsets 0/2/4:
//   svout[0] := sum(ch2[k] * w[k])
//   svout[1] := sum(ch1[k] * w[k])
//   svout[2] := sum(ch0[k] * w[k])
// svout[3] is also overwritten (it receives the same value as svout[2]).
// NOTE(review): sva16 is used as the per-dword word mask - presumably $FFFF in
// each dword; confirm its definition.
const csmul: Single = 1 {255.0} / 65535.0;
asm
  MOVDQU xmm5, [edx]
  add esp, -16                  // scratch: four dwords gathered contiguously
  // gather the first dword (ch0, ch1) of every colour
  mov edx, [eax]
  mov edx, [edx]
  mov [esp], edx
  mov edx, [eax + 4]
  mov edx, [edx]
  mov [esp + 4], edx
  mov edx, [eax + 8]
  mov edx, [edx]
  mov [esp + 8], edx
  mov edx, [eax + 12]
  mov edx, [edx]
  mov [esp + 12], edx
  movss xmm0, csmul
  MOVDQU xmm1, [esp]
  MOVDQU xmm4, sva16
  // gather the second dword (ch2) of every colour
  mov edx, [eax]
  mov edx, [edx + 4]
  mov [esp], edx
  mov edx, [eax + 4]
  mov edx, [edx + 4]
  mov [esp + 4], edx
  mov edx, [eax + 8]
  mov eax, [eax + 12]
  mov edx, [edx + 4]
  mov eax, [eax + 4]
  mov [esp + 8], edx
  mov [esp + 12], eax
  MOVDQA xmm2, xmm1             // single copy; an earlier duplicate of this move was dead
  MOVDQU xmm3, [esp]
  PSRLDQ xmm2, 2                // ch1 into the low word of every dword
  shufps xmm0, xmm0, 0          // broadcast csmul
  andps xmm1, xmm4
  andps xmm2, xmm4
  andps xmm3, xmm4
  mulps xmm5, xmm0              // weights scaled by csmul
  CVTDQ2PS xmm1, xmm1
  CVTDQ2PS xmm2, xmm2
  CVTDQ2PS xmm3, xmm3
  mulps xmm1, xmm5
  mulps xmm2, xmm5
  mulps xmm3, xmm5
  // horizontal sums of xmm1, xmm2 and xmm3
  MOVLHPS xmm4, xmm1
  movhlps xmm4, xmm3
  shufps xmm3, xmm1, $E4
  MOVHLPS xmm0, xmm2
  addps xmm4, xmm3
  addps xmm0, xmm2
  pshufd xmm5, xmm4, $B1
  pshufd xmm2, xmm0, $B1
  addps xmm5, xmm4
  addss xmm2, xmm0
  movups [ecx], xmm5 // r,.,b
  movss [ecx + 4], xmm2 // .,g,.
  add esp, 16
end;
// ColToSVecFlipRBc4sqr: converts four 8-bit-per-channel colours to four float
// vectors, squaring each channel and scaling it by s1d255 (defined elsewhere).
// In:  eax = c, used as an array of four pointers to the colour values.
//      edx = address of the T4SVec result (16 bytes per colour).
// Out: for colour i: [edx + 16*i + 0] = byte at +2, [+4] = byte at +1,
//      [+8] = byte at +0 (each squared and scaled), [+12] = 0.
// ebx, esi, edi are preserved; eax and ecx are clobbered; the x87 stack is
// left balanced (five registers are used at peak).
// Each channel is fetched with a byte-wide movzx so that no bytes beyond the
// channel actually used are read (the previous dword load + "and $FF" read up
// to 3 bytes past it).
function ColToSVecFlipRBc4sqr(c: T4Cardinal): T4SVec;
asm
push ebx
push esi
push edi
add esp, -16 // scratch: four dwords for fild
// ---- channel at byte offset 2 -> result offset +0 ----
mov ebx, [eax] // the four colour pointers
mov ecx, [eax + 4]
mov esi, [eax + 8]
mov edi, [eax + 12]
movzx ebx, byte [ebx + 2] // dereference, zero-extended
movzx ecx, byte [ecx + 2]
movzx esi, byte [esi + 2]
movzx edi, byte [edi + 2]
fld s1d255 // scale factor, stays on the x87 stack until the final fmulp
mov [esp], ebx
mov [esp + 4], ecx
mov [esp + 8], esi
mov [esp + 12], edi
fild dword [esp]
fild dword [esp + 4]
fild dword [esp + 8]
fild dword [esp + 12] // st: c3, c2, c1, c0, scale
fmul st, st(0) // square
fmul st, st(4) // * scale
fstp dword [edx + 48]
fmul st, st(0)
fmul st, st(3)
fstp dword [edx + 32]
fmul st, st(0)
fmul st, st(2)
fstp dword [edx + 16]
fmul st, st(0)
fmul st, st(1)
fstp dword [edx] // st: scale
// ---- channel at byte offset 1 -> result offset +4 ----
mov ebx, [eax]
mov ecx, [eax + 4]
mov esi, [eax + 8]
mov edi, [eax + 12]
movzx ebx, byte [ebx + 1] // dereference, zero-extended
movzx ecx, byte [ecx + 1]
movzx esi, byte [esi + 1]
movzx edi, byte [edi + 1]
mov [esp], ebx
mov [esp + 4], ecx
mov [esp + 8], esi
mov [esp + 12], edi
fild dword [esp]
fild dword [esp + 4]
fild dword [esp + 8]
fild dword [esp + 12]
fmul st, st(0)
fmul st, st(4)
fstp dword [edx + 52]
fmul st, st(0)
fmul st, st(3)
fstp dword [edx + 36]
fmul st, st(0)
fmul st, st(2)
fstp dword [edx + 20]
fmul st, st(0)
fmul st, st(1)
fstp dword [edx + 4]
// ---- channel at byte offset 0 -> result offset +8; fourth component := 0 ----
mov ebx, [eax]
mov ecx, [eax + 4]
mov esi, [eax + 8]
mov edi, [eax + 12]
movzx ebx, byte [ebx] // dereference, zero-extended
movzx ecx, byte [ecx]
movzx esi, byte [esi]
movzx edi, byte [edi]
xor eax, eax // eax = 0, written to the fourth component of every vector
mov [esp], ebx
mov [esp + 4], ecx
mov [esp + 8], esi
mov [esp + 12], edi
fild dword [esp]
fild dword [esp + 4]
fild dword [esp + 8]
fild dword [esp + 12]
mov [edx + 12], eax
fmul st, st(0)
mov [edx + 28], eax
mov [edx + 44], eax
fmul st, st(4)
mov [edx + 60], eax
fstp dword [edx + 56]
fmul st, st(0)
fmul st, st(3)
fstp dword [edx + 40]
fmul st, st(0)
fmul st, st(2)
fstp dword [edx + 24]
fmul st, st(0)
fmulp // * scale, and pops the scale factor
fstp dword [edx + 8] // x87 stack is empty again
add esp, 16
pop edi
pop esi
pop ebx
end;
// ColToSVecFlipRBc4sqr16: 16-bit-per-channel variant of the squared colour
// conversion. Each channel is squared and multiplied by cdmul.
// In:  eax = c, used as an array of four pointers to the colour values.
//      edx = address of the T4SVec result (16 bytes per colour).
// Out: for colour i: [edx + 16*i + 0] = word at +4, [+4] = word at +2,
//      [+8] = word at +0 (each squared and scaled), [+12] = 0.
// ebx, esi, edi are preserved; eax and ecx are clobbered; the x87 stack is
// left balanced (five registers are used at peak).
// Channels are fetched with a word-wide movzx so that nothing beyond the
// 16-bit channel is read (the previous dword load + "and $FFFF" read two
// bytes past it). The value is still stored as a dword for fild, because a
// 16-bit fild would treat values >= $8000 as negative.
function ColToSVecFlipRBc4sqr16(c: T4Cardinal): T4SVec;
const cdmul: Double = 255.0 / (65535.0 * 65535.0);
asm
push ebx
push esi
push edi
add esp, -16 // scratch: four dwords for fild
// ---- channel at byte offset 4 -> result offset +0 ----
mov ebx, [eax] //pointers
mov ecx, [eax + 4]
mov esi, [eax + 8]
mov edi, [eax + 12]
movzx ebx, word [ebx + 4] // dereference, zero-extended
movzx ecx, word [ecx + 4]
movzx esi, word [esi + 4]
movzx edi, word [edi + 4]
fld cdmul // scale factor, stays on the x87 stack until the final fmulp
mov [esp], ebx
mov [esp + 4], ecx
mov [esp + 8], esi
mov [esp + 12], edi
fild dword [esp]
fild dword [esp + 4]
fild dword [esp + 8]
fild dword [esp + 12] // st: c3, c2, c1, c0, scale
fmul st, st(0) // square
fmul st, st(4) // * scale
fstp dword [edx + 48]
fmul st, st(0)
fmul st, st(3)
fstp dword [edx + 32]
fmul st, st(0)
fmul st, st(2)
fstp dword [edx + 16]
fmul st, st(0)
fmul st, st(1)
fstp dword [edx] // st: scale
// ---- channel at byte offset 2 -> result offset +4 ----
mov ebx, [eax]
mov ecx, [eax + 4]
mov esi, [eax + 8]
mov edi, [eax + 12]
movzx ebx, word [ebx + 2] // dereference, zero-extended
movzx ecx, word [ecx + 2]
movzx esi, word [esi + 2]
movzx edi, word [edi + 2]
mov [esp], ebx
mov [esp + 4], ecx
mov [esp + 8], esi
mov [esp + 12], edi
fild dword [esp] //loads signed integer, therefore 16 bit direct iload would fail
fild dword [esp + 4]
fild dword [esp + 8]
fild dword [esp + 12]
fmul st, st(0)
fmul st, st(4)
fstp dword [edx + 52]
fmul st, st(0)
fmul st, st(3)
fstp dword [edx + 36]
fmul st, st(0)
fmul st, st(2)
fstp dword [edx + 20]
fmul st, st(0)
fmul st, st(1)
fstp dword [edx + 4]
// ---- channel at byte offset 0 -> result offset +8; fourth component := 0 ----
mov ebx, [eax]
mov ecx, [eax + 4]
mov esi, [eax + 8]
mov edi, [eax + 12]
movzx ebx, word [ebx] // dereference, zero-extended
movzx ecx, word [ecx]
movzx esi, word [esi]
movzx edi, word [edi]
xor eax, eax // eax = 0, written to the fourth component of every vector
mov [esp], ebx
mov [esp + 4], ecx
mov [esp + 8], esi
mov [esp + 12], edi
fild dword [esp]
fild dword [esp + 4]
fild dword [esp + 8]
fild dword [esp + 12]
mov [edx + 12], eax
mov [edx + 28], eax
fmul st, st(0)
mov [edx + 44], eax
fmul st, st(4)
mov [edx + 60], eax
fstp dword [edx + 56]
fmul st, st(0)
fmul st, st(3)
fstp dword [edx + 40]
fmul st, st(0)
fmul st, st(2)
fstp dword [edx + 24]
fmul st, st(0)
fmulp // * scale, and pops the scale factor
fstp dword [edx + 8] // x87 stack is empty again
add esp, 16
pop edi
pop esi
pop ebx
end;
// ColToSVecFlipRBc416: 16-bit-per-channel colour conversion without squaring.
// Each channel is multiplied by d1d256 (defined elsewhere).
// In:  eax = c, used as an array of four pointers to the colour values.
//      edx = address of the T4SVec result (16 bytes per colour).
// Out: for colour i: [edx + 16*i + 0] = word at +4, [+4] = word at +2,
//      [+8] = word at +0 (each scaled), [+12] = 0.
// ebx, esi, edi are preserved; eax and ecx are clobbered; the x87 stack is
// left balanced (five registers are used at peak).
// Channels are fetched with a word-wide movzx so that nothing beyond the
// 16-bit channel is read (the previous dword load + "and $FFFF" read two
// bytes past it).
function ColToSVecFlipRBc416(c: T4Cardinal): T4SVec;
asm
push ebx
push esi
push edi
add esp, -16 // scratch: four dwords for fild
// ---- channel at byte offset 4 -> result offset +0 ----
mov ebx, [eax] //pointers
mov ecx, [eax + 4]
mov esi, [eax + 8]
mov edi, [eax + 12]
movzx ebx, word [ebx + 4] // dereference, zero-extended
movzx ecx, word [ecx + 4]
movzx esi, word [esi + 4]
movzx edi, word [edi + 4]
fld d1d256 // scale factor, stays on the x87 stack until the final fmulp
mov [esp], ebx
mov [esp + 4], ecx
mov [esp + 8], esi
mov [esp + 12], edi
fild dword [esp]
fild dword [esp + 4]
fild dword [esp + 8]
fild dword [esp + 12] // st: c3, c2, c1, c0, scale
fmul st, st(4)
fstp dword [edx + 48]
fmul st, st(3)
fstp dword [edx + 32]
fmul st, st(2)
fstp dword [edx + 16]
fmul st, st(1)
fstp dword [edx] // st: scale
// ---- channel at byte offset 2 -> result offset +4 ----
mov ebx, [eax]
mov ecx, [eax + 4]
mov esi, [eax + 8]
mov edi, [eax + 12]
movzx ebx, word [ebx + 2] // dereference, zero-extended
movzx ecx, word [ecx + 2]
movzx esi, word [esi + 2]
movzx edi, word [edi + 2]
mov [esp], ebx
mov [esp + 4], ecx
mov [esp + 8], esi
mov [esp + 12], edi
fild dword [esp] //loads signed integer, therefore 16 bit direct iload would fail
fild dword [esp + 4]
fild dword [esp + 8]
fild dword [esp + 12]
fmul st, st(4)
fstp dword [edx + 52]
fmul st, st(3)
fstp dword [edx + 36]
fmul st, st(2)
fstp dword [edx + 20]
fmul st, st(1)
fstp dword [edx + 4]
// ---- channel at byte offset 0 -> result offset +8; fourth component := 0 ----
mov ebx, [eax]
mov ecx, [eax + 4]
mov esi, [eax + 8]
mov edi, [eax + 12]
movzx ebx, word [ebx] // dereference, zero-extended
movzx ecx, word [ecx]
movzx esi, word [esi]
movzx edi, word [edi]
xor eax, eax // eax = 0, written to the fourth component of every vector
mov [esp], ebx
mov [esp + 4], ecx
mov [esp + 8], esi
mov [esp + 12], edi
fild dword [esp]
fild dword [esp + 4]
fild dword [esp + 8]
fild dword [esp + 12]
fmul st, st(4)
mov [edx + 12], eax
mov [edx + 28], eax
mov [edx + 44], eax
mov [edx + 60], eax
fstp dword [edx + 56]
fmul st, st(3)
fstp dword [edx + 40]
fmul st, st(2)
fstp dword [edx + 24]
fmulp // * scale, and pops the scale factor
fstp dword [edx + 8] // x87 stack is empty again
add esp, 16
pop edi
pop esi
pop ebx
end;
// HybridCustomIFStest: parameterless routine that works on memory relative to
// esi and edi, both of which must be set up by the caller (the layout is not
// declared here; the offsets below are hard-coded).
//
// Part 1 (SSE2): [esi-32] := dot(xyz at [esi-120..-104], n at [edi-32..-16]) - [edi-40].
//   When [edi-68] is zero the value is additionally AND-ed with the 16 bytes at
//   [edi] (presumably an abs mask - TODO confirm; andpd needs [edi] 16-byte aligned).
// Part 2 (x87, only when [edi-52] <> 0): builds a vector vo orthogonal to
//   n = (nx,ny,nz) and r = n x vo, stores dot(vo,xyz)*[edi-48] at [esp] and
//   dot(r,xyz)*[edi-48] at [esp+8], calls the routine at [esi+268] with
//   eax = ecx = esp, then writes ([esp + ([edi-56] and 3)*8] + [edi-60]) * [edi-64]
//   to [esi+128].
//
// Clobbers eax, edx, xmm0, xmm1; ecx is saved/restored around part 2.
// Part 2 uses all eight x87 registers at peak, so the x87 stack must be empty
// on entry; it is balanced again on exit.
//
// Fix: in the |nx| > s011 branch the final multiply targeted st(2) (the zero
// slot) instead of st(3) (the 1/sqrt slot), leaving vo[2] = 1/sqrt(rr) instead
// of -nx/sqrt(rr). That contradicted the stack layout documented on that line
// and the layout produced by the other branch.
procedure HybridCustomIFStest;
asm
movupd xmm0, [esi - 120] //x,y
movsd xmm1, [esi - 104] //z
mulpd xmm0, [edi - 32]
mulsd xmm1, [edi - 16]
addsd xmm1, xmm0
unpckhpd xmm0, xmm0
addsd xmm1, xmm0
subsd xmm1, [edi - 40]
cmp [edi - 68], 0
jne @up
andpd xmm1, [edi]
@up: movsd [esi - 32], xmm1 //Rout: Double; //+56
mov edx, [edi - 52]
test edx, edx
jz @out
push ecx //otrap coloring
add esp, -32 // scratch for the two map coordinates (and whatever the callee returns)
fld qword [edi - 16]
fld qword [edi - 24]
fld qword [edi - 32] //nx,ny,nz
fld st //makeorthovecs
fabs
fcomp s011
fnstsw ax
and ah, 41H // C0 or C3 set: |nx| <= s011 (or unordered)
jnz @@1
fld st(2) // |nx| > s011: vo := (nz, 0, -nx) / Sqrt(nx*nx + nz*nz)
fmul st, st
fld st(1)
fmul st, st
faddp
fsqrt
fld1
fdivrp //1/Sqrt(rr)
fldz
fld st(4)
fmul st, st(2) // nz/Sqrt(rr) = vo[0]
fld st(3)
fchs // -nx
fmulp st(3), st //vo[0],0,vo[2],nx,ny,nz (scales the 1/Sqrt(rr) slot, not the zero)
jmp @@2
@@1:
fld st(2) // |nx| <= s011: vo := (0, -nz, ny) / Sqrt(ny*ny + nz*nz)
fmul st, st
fld st(2)
fmul st, st
faddp
fsqrt
fld1
fdivrp //1/Sqrt(rr)
fld st(3)
fchs
fmul st, st(1)
fld st(3)
fmulp st(2), st //0,vo[1],vo[2],nx,ny,nz
fldz
@@2:
fld st // first map coordinate: dot(vo, xyz) * [edi-48]
fmul qword [esi - 120] //x
fld st(2)
fmul qword [esi - 112] //y
faddp
fld st(3)
fmul qword [esi - 104] //z
faddp
fmul qword [edi - 48]
fstp qword [esp]
fld st(5) // second map coordinate: dot(n x vo, xyz) * [edi-48]
fmul st, st(2)
fld st(5) // x87 stack is completely full (8 entries) after this load
fmul st, st(4)
fsubrp //r0,vo[0],vo[1],vo[2],nx,ny,nz
fxch
fmul st(6), st //vo[0],r0,vo[1],vo[2],nx,ny,nz*vo[0]
fxch st(4)
fmul st(3), st //nx,r0,vo[1],vo[2]*nx,vo[0],ny,nz*vo[0]
fmulp st(2), st //r0, vo[1]*nx, vo[2]*nx, vo[0], ny, nz*vo[0]
fxch st(4) //ny, vo[1]*nx, vo[2]*nx, vo[0], r0, nz*vo[0]
fmulp st(3), st //vo[1]*nx, vo[2]*nx, vo[0]*ny, r0, nz*vo[0]
fsubrp st(2), st //vo[2]*nx, vo[1]*nx-vo[0]*ny=r2, r0, nz*vo[0]
fsubp st(3), st //r2, r0, nz*vo[0] - vo[2]*nx = r1
fmul qword [esi - 104] //z
fxch
fmul qword [esi - 120] //x
faddp
fxch
fmul qword [esi - 112] //y
faddp
fmul qword [edi - 48]
fstp qword [esp + 8] // x87 stack is empty again
mov eax, esp
mov ecx, esp
call [esi + 268] //+356 - 88 = 268
mov ecx, [edi - 56]
and ecx, 3 // selects one of the four qwords in the scratch area
fld qword [esp + ecx * 8] //col of map
fadd dword [edi - 60]
fmul dword [edi - 64]
fstp qword [esi + 128]
add esp, 32
pop ecx
@out:
end;
// ipow2: squares the complex number (x, y) in place on the x87 FPU.
// In:  eax = @x, edx = @y.
// Out: y^ := 2*x*y is stored first, then x^ := x*x - y*y.
// Both inputs are loaded before anything is stored; the x87 stack is balanced.
procedure ipow2(var x, y: Double); //x:=x*x-y*y y:=2xy
asm
fld qword [edx] // y
fld qword [eax] // x, y
fld st(1) // y, x, y
fmul st(0), st(1) // y*x, x, y
fadd st(0), st(0) // 2xy, x, y
fstp qword [edx] // y^ := 2xy; stack: x, y
fmul st(0), st(0) // x*x, y
fxch st(1) // y, x*x
fmul st(0), st(0) // y*y, x*x
fsubp st(1), st(0) // x*x - y*y
fstp qword [eax] // x^ := x*x - y*y
end;
// ComplexSqr: squares a complex number in place on the x87 FPU.
// In:  eax = @xy; the code reads the real part as a Double at [eax] and the
//      imaginary part as a Double at [eax + 8].
// Out: [eax + 8] := 2*x*y is stored first, then [eax] := x*x - y*y.
// Both parts are loaded before anything is stored; the x87 stack is balanced.
procedure ComplexSqr(var xy: TComplex); //x:=x*x-y*y y:=2xy
asm
fld qword [eax + 8] // y
fld qword [eax] // x, y
fld st(1) // y, x, y
fmul st(0), st(1) // y*x, x, y
fadd st(0), st(0) // 2xy, x, y
fstp qword [eax + 8] // imaginary part := 2xy; stack: x, y
fmul st(0), st(0) // x*x, y
fxch st(1) // y, x*x
fmul st(0), st(0) // y*y, x*x
fsubp st(1), st(0) // x*x - y*y
fstp qword [eax] // real part := x*x - y*y
end;
// doInterpolHybridSSE2: iteration loop that applies two hybrid formulas per step
// and blends their results.
// In: eax = PIteration3D. All six general registers pushed on entry are restored
// on exit. edi = PIteration3D and esi = PIteration3D + 256 for the whole routine;
// record fields are addressed through hard-coded offsets (field names are given
// in the trailing comments).
// Stack: 72 bytes of locals; [esp] holds the address of a 16-byte aligned 32-byte
// buffer inside that area which keeps a second 4-double vector.
// Per step: fHybrid[0] is applied to the current vector, fHybrid[1] to the vector
// as it was before the step; the two results are mixed with the weights s1,s2
// (nHybrid[0..1] read as singles, kept as doubles in xmm7) and x,y,z are rescaled
// so that the length becomes XX = |v1|*s1 + |v2|*s2. Rout := XX*XX.
// The loop ends when ItResultI >= maxIt or Rout is not below RStop.
// NOTE(review): xmm7 must survive the two indirect calls - confirm the formulas
// do not clobber it.
procedure doInterpolHybridSSE2(PIteration3D: TPIteration3D); // new ext version
asm
push eax
push ebx
push ecx
push edx
push esi
push edi //x = edi-32 y = edi-24 .. Rold = edi - 48, Rstop = edi - 40, (i = edi + 212 = btmp = esi - 44)
add esp, -72
mov edi, eax //was: Rold = esp, Rstop = esp + 8, aligned16: esp + 16, X1 = a16 X2 = a16+8.. Y1 = a16+32 ..
lea esi, eax + 256
mov eax, esp
add eax, 35 // round up past the pointer slot at [esp] to the next 16-byte boundary
and eax, $FFFFFFF0
mov [esp], eax // aligned 16 Ybuf aligned16: esp, X1 = a16.. = Y1 = (aligned)
cvtps2pd xmm7, [edi + 76] //nHybrid[0] +76 weights in double for s1,s2 (lo,hi part)
movupd xmm0, [edi]
movsd xmm1, [edi + 16] // loads z and zeroes the upper half, so w is stored as 0 below
movupd [edi - 32], xmm0 //xyz=C
movupd [edi - 16], xmm1
cmp dword [esi - 104], 0 //DoJulia:+152
jz @sjup
movupd xmm2, [esi + 64]
movsd xmm3, [esi + 80]
movupd [edi + 24], xmm2 //J=Ju
movsd [edi + 40], xmm3
jmp @skipIfJulia
@sjup:
movupd [edi + 24], xmm0 //J=C
movsd [edi + 40], xmm1
@skipIfJulia:
mulpd xmm0, xmm0 // initial Rout := x*x + y*y + z*z
mulsd xmm1, xmm1
CVTSS2SD xmm5, [edi + 72] //RStop in double
addsd xmm1, xmm0
unpckhpd xmm0, xmm0
movsd [edi - 40], xmm5
addsd xmm1, xmm0
xor ebx, ebx
movsd [esi - 64], xmm1 //OTrap=Rout
movsd [edi + 56], xmm1 //Rout
mov [esi - 48], ebx //bFirstIt := 0; +208
mov [edi + 64], ebx //ItresultI :=0 +64
@Repeat:
movsd xmm2, [edi + 56]
mov ebx, [edi + 100] //fHPVar[0] +100
mov eax, [esp] // eax = aligned buffer
mov [edi + 48], ebx //PVars: +48
movsd [edi - 48], xmm2 //Rold := Rout
movupd xmm0, [edi - 32] //Y:=xyz
movupd xmm1, [edi - 16]
movapd [eax], xmm0
movapd [eax + 16], xmm1
lea eax, edi - 32 // x
lea edx, edi - 24 // y
lea ecx, edi - 16 // z
lea ebx, edi - 8 // w
push ebx // stack arguments: @w, PIteration3D (the callee is expected to remove them, [esp] is re-read right after)
push edi
call [edi + 124] //fHybrid[0] of ThybridIteration2
mov eax, [esp]
movupd xmm0, [edi - 32] // mCopyVec4(@x1, @x);
movupd xmm1, [edi - 16] // mCopyVec4(@x, @Y1);
movapd xmm2, [eax]
movapd xmm3, [eax + 16]
movapd [eax], xmm0 // buffer := result of formula 0
movapd [eax + 16], xmm1
movupd [edi - 32], xmm2 //xyz=Y1
movupd [edi - 16], xmm3
mov ebx, [edi + 104] //fHPVar[1]
mov [edi + 48], ebx //PVars: +48
lea eax, edi - 32 // x
lea edx, edi - 24 // y
lea ecx, edi - 16 // z
lea ebx, edi - 8 // w
push ebx
push edi
call [edi + 128] //fHybrid[1] of ThybridIteration2
mov eax, [esp]
movupd xmm0, [edi - 32] //x,y was: y1
movapd xmm2, [eax] //x[0,1]
movupd xmm1, [edi - 16] //z,w
movapd xmm3, [eax + 16] //x[2,3]
movapd xmm5, xmm0 //x,y
movapd xmm6, xmm2 //x[0,1]
mulpd xmm0, xmm0 //x²,y²
mulpd xmm2, xmm2 //x[0]²,x[1]²
mulsd xmm1, xmm1 //z²,w
mulsd xmm3, xmm3 //x[2]²
addsd xmm1, xmm0 //z²+x²
addsd xmm3, xmm2 //x[2]²+x[0]²
unpckhpd xmm0, xmm0 //y²
unpckhpd xmm2, xmm2 //x[1]²
addsd xmm1, xmm0 //x²+y²+z²
addsd xmm3, xmm2 //x[0]²+x[1]²+x[2]²
unpcklpd xmm3, xmm1 //x[0]²+x[1]²+x[2]²,x²+y²+z²
sqrtpd xmm0, xmm3 //xx,yy
mulpd xmm0, xmm7 //xx*s1,yy*s2
pshufd xmm2, xmm0, $4E
addsd xmm0, xmm2 //XX = xx*s1+yy*s2
pshufd xmm3, xmm7, $4E //wy
movsd xmm2, xmm7 //wx
unpcklpd xmm3, xmm3 //s2,s2
unpcklpd xmm2, xmm2 //s1,s1
movupd xmm1, [edi - 16] //z,w
mulpd xmm5, xmm3 //x,y *s2
mulpd xmm6, xmm2 //x[0,1] *s1
mulpd xmm3, xmm1 //z,w *s2
mulpd xmm2, [eax + 16] //x[2,3] *s1
addpd xmm5, xmm6 //x,y
addpd xmm3, xmm2 //z,w
movapd xmm4, xmm5 //x,y
movsd xmm2, xmm3 //z
mulpd xmm4, xmm4 //x²,y²
mulsd xmm2, xmm2 //z² 4D: mulpd
addsd xmm2, xmm4 //z²+x² 4D: addpd ...
unpckhpd xmm4, xmm4 //y²
addsd xmm4, xmm2 //x²+y²+z²
addsd xmm4, d1em40
sqrtsd xmm4, xmm4
movsd xmm2, xmm0 //XX
divsd xmm2, xmm4 //YY := XX / Sqrt(x * x + y * y + z * z + 1e-40);
unpcklpd xmm2, xmm2 //YY,YY
mulpd xmm5, xmm2
mulsd xmm3, xmm2 // only z is rescaled; the blended w in the upper half is kept as is
movupd [edi - 32], xmm5 //x,y
movupd [edi - 16], xmm3 //z,w
mulsd xmm0, xmm0
movsd [edi + 56], xmm0 //Rout := XX * XX;
movsd xmm1, xmm0
inc dword [edi + 64] //Inc(ItResultI)
minsd xmm0, [esi - 64]
movsd [esi - 64], xmm0 //OTrap := Min(Rout, OTrap);
mov eax, [edi + 64]
cmp eax, [edi + 68] //maxIt: +68
jnl @out
comisd xmm1, [edi - 40] //RStop
jc @Repeat // carry set: Rout below RStop, keep iterating
@out:
cmp byte [esi - 108], 0 //CalcSIT: +148
jz @NoCalcSITout
mov eax, edi
xor edx, edx // n := 0
call CalcSmoothIterations //(PIt3D: TPIteration3D; n: Integer);
@NoCalcSITout:
add esp, 72
pop edi
pop esi
pop edx
pop ecx
pop ebx
pop eax
end;
// doInterpolHybridDESSE2: same blended two-formula iteration loop as
// doInterpolHybridSSE2, plus a distance estimate returned in st(0).
// In: eax = PIteration3D. All six general registers pushed on entry are restored
// on exit. edi = PIteration3D and esi = PIteration3D + 256 for the whole routine;
// record fields are addressed through hard-coded offsets (field names are given
// in the trailing comments).
// Stack: 72 bytes of locals; [esp] holds the address of a 16-byte aligned 32-byte
// buffer inside that area which keeps a second 4-double vector.
// w ([edi-8]) is initialised to Rout when (DEoption and $18) = 16, otherwise to 1.
// After the loop the result is selected by (DEoption and 7):
//   4:    Abs(z) * Ln(Abs(z)) / w
//   7:    Sqrt(Rout/RStop) * Power(PDouble(PVar - 16)^, -ItResultI)
//   else: Sqrt(Rout) / Abs(w)
// The result stays on the x87 stack while CalcSmoothIterations is called.
// NOTE(review): xmm7 must survive the two indirect calls - confirm the formulas
// do not clobber it.
function doInterpolHybridDESSE2(PIteration3D: TPIteration3D): Double; // new ext version
asm
push eax
push ebx
push ecx
push edx
push esi
push edi //x = edi-32 y = edi-24 .. Rold = edi - 48, Rstop = edi - 40, i = edi + 212 = btmp = esi - 44
add esp, -72
mov edi, eax //was: Rold = esp, Rstop = esp + 8, aligned16: esp + 16, X1 = a16 X2 = a16+8.. Y1 = a16+32 ..
lea esi, eax + 256
mov eax, esp
add eax, 35 // round up past the pointer slot at [esp] to the next 16-byte boundary
and eax, $FFFFFFF0
mov [esp], eax // aligned 16 Ybuf aligned16: esp, X1 = a16.. = Y1 = (aligned)
cvtps2pd xmm7, [edi + 76] //nHybrid[0] +76 weights in double for s1,s2 (lo,hi part)
movupd xmm0, [edi]
movsd xmm1, [edi + 16] // loads z and zeroes the upper half, so w is stored as 0 below
movupd [edi - 32], xmm0 //xyz=C
movupd [edi - 16], xmm1
cmp dword [esi - 104], 0 //DoJulia:+152
jz @sjup
movupd xmm2, [esi + 64]
movsd xmm3, [esi + 80]
movupd [edi + 24], xmm2 //J=Ju
movsd [edi + 40], xmm3
jmp @skipIfJulia
@sjup:
movupd [edi + 24], xmm0 //J=C
movsd [edi + 40], xmm1
@skipIfJulia:
mulpd xmm0, xmm0 // initial Rout := x*x + y*y + z*z
mulsd xmm1, xmm1
CVTSS2SD xmm5, [edi + 72] //RStop in double
addsd xmm1, xmm0
unpckhpd xmm0, xmm0
movsd [edi - 40], xmm5
addsd xmm1, xmm0
xor ebx, ebx
movsd [esi - 64], xmm1 //OTrap=Rout
movsd [edi + 56], xmm1 //Rout
mov [esi - 48], ebx //bFirstIt := 0; +208
mov [edi + 64], ebx //ItresultI :=0 +64
mov eax, [esi - 96] //DEoption +160
and eax, $18
sub eax, 16
jnz @UU1
fld qword [edi + 56]
jmp @UU2
@UU1:
fld1
@UU2:
fstp qword [edi - 8] // if (DEoption and $18) = 16 then w := Rout else w := 1;
@Repeat:
movsd xmm2, [edi + 56]
mov ebx, [edi + 100] //fHPVar[0] +100
mov eax, [esp] // eax = aligned buffer
mov [edi + 48], ebx //PVars: +48
movsd [edi - 48], xmm2 //Rold := Rout
movupd xmm0, [edi - 32] //Y:=xyz
movupd xmm1, [edi - 16]
movapd [eax], xmm0
movapd [eax + 16], xmm1
lea eax, edi - 32 // x
lea edx, edi - 24 // y
lea ecx, edi - 16 // z
lea ebx, edi - 8 // w
push ebx // stack arguments: @w, PIteration3D (the callee is expected to remove them, [esp] is re-read right after)
push edi
call [edi + 124] //fHybrid[0] of ThybridIteration2
mov eax, [esp]
movupd xmm0, [edi - 32] // mCopyVec4(@x1, @x);
movupd xmm1, [edi - 16] // mCopyVec4(@x, @Y1);
movapd xmm2, [eax]
movapd xmm3, [eax + 16]
movapd [eax], xmm0 // buffer := result of formula 0
movapd [eax + 16], xmm1
movupd [edi - 32], xmm2 //xyz=Y1
movupd [edi - 16], xmm3
mov ebx, [edi + 104] //fHPVar[1]
mov [edi + 48], ebx //PVars: +48
lea eax, edi - 32 // x
lea edx, edi - 24 // y
lea ecx, edi - 16 // z
lea ebx, edi - 8 // w
push ebx
push edi
call [edi + 128] //fHybrid[1] of ThybridIteration2
mov eax, [esp]
movupd xmm0, [edi - 32] //x,y was: y1
movapd xmm2, [eax] //x[0,1]
movupd xmm1, [edi - 16] //z,w
movapd xmm3, [eax + 16] //x[2,3]
movapd xmm5, xmm0 //x,y
movapd xmm6, xmm2 //x[0,1]
mulpd xmm0, xmm0 //x²,y²
mulpd xmm2, xmm2 //x[0]²,x[1]²
mulsd xmm1, xmm1 //z²,w
mulsd xmm3, xmm3 //x[2]²
addsd xmm1, xmm0 //z²+x²
addsd xmm3, xmm2 //x[2]²+x[0]²
unpckhpd xmm0, xmm0 //y²
unpckhpd xmm2, xmm2 //x[1]²
addsd xmm1, xmm0 //x²+y²+z²
addsd xmm3, xmm2 //x[0]²+x[1]²+x[2]²
unpcklpd xmm3, xmm1 //x[0]²+x[1]²+x[2]²,x²+y²+z²
sqrtpd xmm0, xmm3 //xx,yy
mulpd xmm0, xmm7 //xx*s1,yy*s2
pshufd xmm2, xmm0, $4E
addsd xmm0, xmm2 //XX = xx*s1+yy*s2
pshufd xmm3, xmm7, $4E //wy
movsd xmm2, xmm7 //wx
unpcklpd xmm3, xmm3 //s2,s2
unpcklpd xmm2, xmm2 //s1,s1
movupd xmm1, [edi - 16] //z,w
mulpd xmm5, xmm3 //x,y *s2
mulpd xmm6, xmm2 //x[0,1] *s1
mulpd xmm3, xmm1 //z,w *s2
mulpd xmm2, [eax + 16] //x[2,3] *s1
addpd xmm5, xmm6 //x,y
addpd xmm3, xmm2 //z,w
movapd xmm4, xmm5 //x,y
movsd xmm2, xmm3 //z
mulpd xmm4, xmm4 //x²,y²
mulsd xmm2, xmm2 //z² 4D: mulpd
addsd xmm2, xmm4 //z²+x² 4D: addpd ...
unpckhpd xmm4, xmm4 //y²
addsd xmm4, xmm2 //x²+y²+z²
addsd xmm4, d1em40
sqrtsd xmm4, xmm4
movsd xmm2, xmm0 //XX
divsd xmm2, xmm4 //YY := XX / Sqrt(x * x + y * y + z * z + 1e-40);
unpcklpd xmm2, xmm2 //YY,YY
mulpd xmm5, xmm2
mulsd xmm3, xmm2 // only z is rescaled; the blended w in the upper half is kept as is
movupd [edi - 32], xmm5 //x,y
movupd [edi - 16], xmm3 //z,w
mulsd xmm0, xmm0
movsd [edi + 56], xmm0 //Rout := XX * XX;
movsd xmm1, xmm0
inc dword [edi + 64] //Inc(ItResultI)
minsd xmm0, [esi - 64]
movsd [esi - 64], xmm0 //OTrap := Min(Rout, OTrap);
mov eax, [edi + 64]
cmp eax, [edi + 68] //maxIt: +68
jnl @out
comisd xmm1, [edi - 40] //RStop
jc @Repeat // carry set: Rout below RStop, keep iterating
@out:
mov eax, [esi - 96] //DEoption +160
and eax, 7
sub eax, 4
jnz @UU3 //Result := Abs(z) * Ln(Abs(z)) / w;
fld qword [edi - 16]
fabs
fldln2
fld st(1)
fyl2x // ln2 * log2(|z|) = Ln(|z|)
fmulp
fdiv qword [edi - 8] //Result
jmp @UU6
@UU3:
sub eax, 3 // / intPower faster?
jnz @UU4 //Result := Sqrt(Rout/RStop) * Power(PDouble(Integer(PVar) - 16)^, -ItResultI);
mov eax, [edi + 48]
fild dword [edi + 64] //ItResultI
fchs //-ItresultI
fld qword [eax - 16] //(Pvar-16)^ (= scale or something)
fldln2 //power function base,expo -> st, st(1)
fxch
fyl2x // Ln(base)
fxch
fmulp // expo * Ln(base)
fldl2e
fmulp // converted to a base-2 exponent
fld st(0)
frndint
fsub st(1), st(0) // split into integer and fractional part
fxch
f2xm1
fld1
faddp // 2^fraction
fscale // * 2^integer
fstp st(1) //end of power function
fld qword [edi + 56]
fdiv dword [edi + 72] //rout/rstop,pow
fsqrt
fmulp
jmp @UU6
@UU4: // else Result := Sqrt(Rout) / Abs(w);
fld qword [edi + 56]
fsqrt
fld qword [edi - 8]
fabs
fdivp
@UU6:
cmp byte [esi - 108], 0 //CalcSIT: +148
jz @NoCalcSITout
mov eax, edi
xor edx, edx // n := 0
call CalcSmoothIterations //(PIt3D: TPIteration3D; n: Integer);
@NoCalcSITout:
add esp, 72
pop edi
pop esi
pop edx
pop ecx
pop ebx
pop eax
end;
// doHybridIFS3D: iteration loop for hybrid IFS formulas that tracks the smallest
// distance estimate; that minimum is returned in st(0).
// In: eax = PIteration3D. All six general registers pushed on entry are restored
// on exit. esi = PIteration3D + 88 inside the loop; the formulas are called with
// ebx = formula index n, ecx = remaining repeat count, edi = fHPVar[n] and esi as
// set up here (no explicit arguments are passed).
// Per iteration: DE := Rout / VaryScale. When DE is below the current minimum
// (kept in the OTrap slot, initialised with d65535) the iteration number is
// remembered in bTmp, Dfree1 is copied to Dfree2 and the minimum is updated; the
// loop then stops early if DE is below the value at [esi-128] (RStopD), unless
// the sign test on the saved bIsInsideRender flag skips that check.
// On exit: SmoothItD := bTmp (as single), ItResultI := bTmp, OTrap := Dfree2.
// NOTE(review): unlike doHybridIFS3DnoVecIni, bTmp ([esi+124]) is not cleared
// before the loop, and the flag test uses "js" where the sibling uses "jne" -
// confirm both differences are intended.
function doHybridIFS3D(PIteration3D: TPIteration3D): Double;
asm
push eax
push ebx
push ecx
push edx
push esi
push edi //x = esi-128 y = esi-120 .. btmp = esi+116 (eax+212)
lea esi, eax + 88
movupd xmm0, [eax]
movsd xmm1, [eax + 16]
movupd [eax - 32], xmm0 //X=Cx
movsd [eax - 16], xmm1
lea edx, esi + 128
cmp dword [esi + 64], 0 //DoJulia:+152
jz @sjup
movupd xmm0, [edx + 104] //J=Ju +320 -88=+232 -128=104
movsd xmm1, [edx + 120]
@sjup:
movupd [eax + 24], xmm0 //J=C
movsd [eax + 40], xmm1
@skipIfJulia:
xor ebx, ebx //n:=0
mov eax, [esi + 296]
mov [esi + 120], ebx //bFirstIt := 0; +208
mov [esi - 24], ebx //ItresultI:=0 +64
mov [esi - 36], eax //bIsInsideRender tmp in SmothIts
movzx ebx, word [esi + 102] // n := start index (word at +190)
fldz
fld d65535 //minDE ini
fld1
fstp qword [esi + 112] //VaryScale: //+200 absScale, must be changed in formulas
fstp qword [esi + TIteration3Dext.OTrap - 144] // 104 OTrap: Double; //+192 min of AbsScale
fstp qword [edx + TIteration3Dext.Dfree1 - 144 - 128] //+248 +56
mov edi, [esi + ebx * 4 + 12] //fHPVar[0] +100
mov ecx, [esi + ebx * 4 - 12] //i:=nHybrid[0] +76
and ecx, $7FFFFFFF // strip the flag in bit 31
@Repeat:
cmp ecx, 0
jnle @up2 // repeats left for the current formula
@While:
inc ebx // advance to the next formula, wrapping back after wEndTo
cmp bx, word [esi + 62] //5 wEndTo: Word; //+150
jle @up3
movzx ebx, word [esi + 100] //n := iRepeatFrom //+188
@up3:
mov ecx, [esi + ebx * 4 - 12] //i := nHybrid[n]; +76
and ecx, $7FFFFFFF
jle @While // skip formulas whose count is zero
mov edi, [esi + ebx * 4 + 12] //fHPVar:array[0..5] of Pointer; //+100
@up2:
call [esi + ebx * 4 + 36] //fHybrid[0..5] of ThybridIteration2; //+124
dec ecx //Dec(i)
cmp [esi + ebx * 4 - 12], 0
jl @Repeat // nHybrid[n] negative: go on without counting an iteration
movsd xmm0, [esi - 32] //DEout relative; Rout: Double; //+56
inc dword [esi - 24] //Inc(ItResultI) //+64
divsd xmm0, [esi + 112] //abs Scale VaryScale: Double; //+200
mov eax, [esi - 24]
ucomisd xmm0, [esi + 104] // memorize the smallest DE for itresult
jnc @skip // not below the current minimum
lea edx, esi + 104
mov [esi + 124], eax // bTmp: Integer; //+212
fld qword [edx + TIteration3Dext.Dfree1 - 144-104] //+128
movsd [edx], xmm0 //result DE output
fstp qword [edx + TIteration3Dext.Dfree2 - 144-104] //+136
cmp dword [esi - 36], 0 //was: +384 -88=296 bIsInsideRender
js @skip //if outside, compare if DE is lower than minDE
ucomisd xmm0, [esi - 128] //compare with RstopD, that contains the DEstop condition. Stop if nearer.
jc @out
@skip:
cmp eax, [esi - 20] //maxIt: +68
jl @Repeat
@out:
fild dword [esi + 124]
mov eax, [esi + 124] //it on minDE
fstp dword [esi - 36] //SmoothItD: Single; //+52
mov [esi - 24], eax //ItResultI
add esi, 104
fld qword [esi] //MinDE in OTrap
fld qword [esi + 32] //Dfree2
fstp qword [esi] //OTrap
pop edi // st(0) still holds MinDE, the function result
pop esi
pop edx
pop ecx
pop ebx
pop eax
end;
// doHybridIFS3DnoVecIni: same minimum-distance IFS loop as doHybridIFS3D, but the
// position and Julia vectors are not initialised here - they are used as found.
// In: eax = PIteration3D. All six general registers pushed on entry are restored
// on exit. esi = PIteration3D + 88 inside the loop; the formulas are called with
// ebx = formula index n, ecx = remaining repeat count, edi = fHPVar[n] and esi as
// set up here (no explicit arguments are passed).
// Per iteration: DE := Rout / VaryScale. When DE is below the current minimum
// (kept in the OTrap slot, initialised with d65535) the iteration number is
// remembered in bTmp, Dfree1 is copied to Dfree2 and the minimum is updated; the
// loop then stops early if DE is below the value at [esi-128] (RStopD), but only
// when the saved bIsInsideRender flag is zero.
// On exit: SmoothItD := bTmp (as single), ItResultI := bTmp, OTrap := Dfree2 and
// st(0) = the minimum DE (function result).
function doHybridIFS3DnoVecIni(PIteration3D: TPIteration3D): Double; //to use behind common fractals, use the new vec for it
asm
push eax
push ebx
push ecx
push edx
push esi
push edi //x = esi-128 y = esi-120 .. btmp = esi+116 (eax+212)
lea esi, eax + 88 //
xor ebx, ebx //n:=0
mov eax, [esi + 296]
mov [esi + 120], ebx //bFirstIt := 0; +208
mov [esi + 124], ebx // bTmp := 0 (iteration number of the minimum DE)
mov [esi - 24], ebx //ItresultI:=0 +64
mov [esi - 36], eax //bIsInsideRender tmp in SmothIts
movzx ebx, word [esi + 102] //n := iStartFrom
fldz
fld d65535 //minDE ini
fld1
fstp qword [esi + 112] //VaryScale: //+200 absScale, must be changed in formulas
fstp qword [esi + TIteration3Dext.OTrap - 144] // 104 OTrap: Double; //+192 min of AbsScale
fstp qword [esi + TIteration3Dext.Dfree1 - 144] //+248 +56
mov edi, [esi + ebx * 4 + 12] //fHPVar[0] +100
mov ecx, [esi + ebx * 4 - 12] //i:=nHybrid[n] +76
and ecx, $7FFFFFFF // strip the flag in bit 31
@Repeat:
cmp ecx, 0
jnle @up2 // repeats left for the current formula
@While:
inc ebx // advance to the next formula, wrapping back after wEndTo
cmp bx, word [esi + 62] //5 wEndTo: Word; //+150
jle @up3
movzx ebx, word [esi + 100] //n := iRepeatFrom //+188
@up3:
mov ecx, [esi + ebx * 4 - 12] //i := nHybrid[n]; +76
and ecx, $7FFFFFFF
jle @While // skip formulas whose count is zero
mov edi, [esi + ebx * 4 + 12] //fHPVar:array[0..5] of Pointer; //+100
@up2:
call [esi + ebx * 4 + 36] //fHybrid[0..5] of ThybridIteration2; //+124
dec ecx //Dec(i)
cmp [esi + ebx * 4 - 12], 0
jl @Repeat // nHybrid[n] negative: go on without counting an iteration
movsd xmm0, [esi - 32] //DEout relative; Rout: Double; //+56
inc dword [esi - 24] //Inc(ItResultI) //+64
divsd xmm0, [esi + 112] //abs Scale VaryScale: Double; //+200
mov eax, [esi - 24]
ucomisd xmm0, [esi + 104] // memorize the smallest DE for itresult
jnc @skip // not below the current minimum
lea edx, esi + 104
mov [esi + 124], eax // bTmp: Integer; //+212
fld qword [edx + TIteration3Dext.Dfree1 - 144-104] //+128 otrap color option
movsd [edx], xmm0 //result DE output
fstp qword [edx + TIteration3Dext.Dfree2 - 144-104] //+136
cmp dword [esi - 36], 0 //was: +384 -88=296 bIsInsideRender
jne @skip //if outside, compare if DE is lower than minDE
ucomisd xmm0, [esi - 128] //compare with RstopD, that contains the DEstop condition. Stop if nearer.
jc @out
@skip:
cmp eax, [esi - 20] //maxIt: +68
jl @Repeat
@out:
fild dword [esi + 124]
mov eax, [esi + 124] //it on minDE
fstp dword [esi - 36] //SmoothItD: Single; //+52
mov [esi - 24], eax //ItResultI
add esi, 104
fld qword [esi] //MinDE in OTrap
fld qword [esi + 32] //Dfree2
fstp qword [esi] //OTrap
pop edi // st(0) still holds MinDE, the function result
pop esi
pop edx
pop ecx
pop ebx
pop eax
end;
// CalcSmoothIterations: writes a fractional ("smooth") iteration count to
// PIt3D.SmoothItD (single precision).
// In: eax = PIt3D, edx = n (index into fHln). eax is advanced by $34 and left
// that way on return; all field accesses compensate with "- $34".
// Three cases, selected by the upper dwords of Rout and Rold compared with
// $3FF00000 (the upper dword of 1.0):
//   Rout test not greater (signed compare):
//       SmoothItD := ItResultI
//   Rold upper dword below $3FF00000 (unsigned compare):
//       SmoothItD := ItResultI + LNRStop - Ln(0.5*Ln(Rout)) * fHln[n]
//   otherwise, with d = Ln(0.5*Ln(Rout)):
//       SmoothItD := ItResultI + (LNRStop - d) / (d - Ln(0.5*Ln(Rold)) + d1em100)
// The first two cases leave through an explicit "ret", so the routine relies on
// having no compiler-generated stack frame. The x87 stack is balanced on every
// path (at most three registers are used).
procedure CalcSmoothIterations(PIt3D: TPIteration3D; n: Integer);
asm
add eax, $34
cmp dword [eax + TIteration3D.Rout + 4 - $34], $3FF00000 //Rout <= 1? [Rout+4] //+$3c cmp with $3FF0.. does not work always!!!
jg @@1
fild dword [eax + TIteration3D.ItResultI - $34] //+$40
fstp dword [eax + TIteration3D.SmoothItD - $34] //+$34
ret
@@1:
fld qword [eax + TIteration3D.Rout - $34] //+$38 Rout
cmp dword [eax + TIteration3Dext.Rold - 56 + 4 - $34], $3FF00000 //Rold <= 1? -$2c
jnb @@2
fldln2
fxch //Rout,ln
fyl2x // ln2 * log2(Rout) = Ln(Rout)
fmul s05 //ln(Rout)*0.5
fldln2
fxch
fyl2x // Ln(0.5 * Ln(Rout))
fmul dword [eax + edx * 4 + TIteration3D.fHln - $34] // PIt3D.fHln[n] +$00a4
fild dword [eax + TIteration3D.ItResultI - $34] //+$40
fadd dword [eax + TIteration3D.LNRStop - $34] //+$009c
fsubrp // (ItResultI + LNRStop) - Ln(0.5*Ln(Rout)) * fHln[n]
fstp dword [eax + TIteration3D.SmoothItD - $34] //+$34
ret
@@2:
fldln2
fxch
fyl2x //ln(Rout)
fmul s05
fldln2
fxch
fyl2x //d
fldln2 //ln2,d
fld qword [eax + TIteration3Dext.Rold - 56 - $34] //Rold,ln2,d
fyl2x
fmul s05
fldln2
fxch
fyl2x // Ln(0.5 * Ln(Rold)), d
fsubr st, st(1) //d - Ln(0.5 * Ln(PIt3D.Rold)), d
fld dword [eax + TIteration3D.LNRStop - $34] //+$009c
fsubrp st(2), st //d - Ln(0.5 * Ln(PIt3D.Rold)), PIt3D.LNRStop - d
fadd d1em100 //test
fdivp //div0 sometimes
fiadd dword [eax + TIteration3D.ItResultI - $34] //+$40
fstp dword [eax + TIteration3D.SmoothItD - $34] //+$34
end;
// doHybrid4DSSE2: 4D hybrid iteration loop (x, y, z and w all enter the bailout norm).
// In: eax = PIteration3D. All six general registers pushed on entry are restored
// on exit. edi = PIteration3D and esi = PIteration3D + 256 for the whole routine;
// record fields are addressed through hard-coded offsets (field names are given
// in the trailing comments).
// Setup: Rotate4Dex produces the start vector at [edi-32..edi-1]; J is set to
// either the Julia constant or the start vector (fourth component at [edi-56]);
// Rout and OTrap start as the squared length of the start vector.
// Loop: formula n is called while its repeat counter ([esi-44]) is positive,
// otherwise the next formula with a non-zero count is selected (wrapping from
// wEndTo back to iRepeatFrom). After each call Rout := x²+y²+z²+w² and
// OTrap := Min(Rout, OTrap); the loop ends when ItResultI >= maxIt or Rout is not
// below RStop. A negative nHybrid[n] loops back without updating Rout or
// counting an iteration.
// NOTE(review): xmm7 is stored as Rold at @Repeat, so after the "jl @Repeat"
// shortcut it holds whatever the formula left there - confirm this is intended.
procedure doHybrid4DSSE2(PIteration3D: TPIteration3D); //new ext version
asm
push eax
push ebx
push ecx
push edx
push esi
push edi //x = edi-32 y = edi-24 .. Rold = edi - 48, Rstop = edi - 40, i = edi + 212 = btmp = esi - 44
mov edi, eax
lea esi, eax + 256
lea edx, edi -32
mov ecx, esi
call Rotate4Dex //(@C1, @x, SMatrix4); C1=It3D=eax
movupd xmm6, [edi - 32]
movupd xmm7, [edi - 16]
cmp dword [esi - 104], 0 //DoJulia:+152
jz @sjup
movupd xmm2, [esi + 64]
movupd xmm3, [esi + 80]
movupd [edi + 24], xmm2 //J=Ju
movlpd [edi + 40], xmm3
movhpd [edi - 56], xmm3 // fourth Julia component
jmp @skipIfJulia
@sjup:
movupd [edi + 24], xmm6 //J=C
movlpd [edi + 40], xmm7
movhpd [edi - 56], xmm7 //J4 = edi - 56
@skipIfJulia:
mulpd xmm6, xmm6 // initial Rout := x²+y²+z²+w²
mulpd xmm7, xmm7
CVTSS2SD xmm5, [edi + 72] //RStop in double
addpd xmm7, xmm6
pshufd xmm6, xmm7, $4E
movsd [edi - 40], xmm5
addsd xmm7, xmm6 //xmm7=Rout
movsd [esi - 64], xmm7 //OTrap=Rout
movsd [edi + 56], xmm7 //Rout
xor ebx, ebx //n:=0
mov [esi - 48], ebx //bFirstIt := 0; +208
mov [edi + 64], ebx //ItresultI:=0 +64
movzx ebx, word [esi - 66] //n:=iStartFrom
mov eax, [edi + ebx * 4 + 100] //fHPVar[0] +100
mov [edi + 48], eax //PVars: +48
mov eax, [edi + ebx * 4 + 76] //i:=nHybrid[0] +76
and eax, $7FFFFFFF // strip the flag in bit 31
mov [esi - 44], eax //i(=It3D.btmp)
@Repeat:
movsd [edi - 48], xmm7 //Rold := Rout
cmp dword [esi - 44], 0
jnle @up2 // repeats left for the current formula
@While:
inc ebx // advance to the next formula, wrapping back after wEndTo
cmp bx, word [esi - 106] //5 wEndTo: Word; //+150
jle @up3
movzx ebx, word [esi - 68] //n := iRepeatFrom
@up3:
mov eax, [edi + ebx * 4 + 76] //i := nHybrid[n]; +76
and eax, $7FFFFFFF
jle @While // skip formulas whose count is zero
mov [esi - 44], eax
mov eax, [edi + ebx * 4 + 100] //fHPVar:array[0..5] of Pointer;
mov [edi + 48], eax //PVars: +48
@up2:
lea eax, edi - 8 //was: esp + 24 w
push eax // stack arguments: @w, PIteration3D
push edi
lea edx, edi - 24 //was: esp + 16 y
lea ecx, edi - 16 //was: esp + 24 z
add eax, -24 // x
call [edi + ebx * 4 + 124] //fHybrid[0..5] of ThybridIteration2; //+124
dec [esi - 44] //Dec(i) write at addr... false dIFS??
cmp [edi + ebx * 4 + 76], 0 //nHybrid[fnr]
jl @Repeat //SkipMaxItTest
movupd xmm6, [edi - 32]
movupd xmm7, [edi - 16]
mulpd xmm6, xmm6
mulpd xmm7, xmm7
addpd xmm7, xmm6
pshufd xmm6, xmm7, $4E
addsd xmm7, xmm6 //xmm7=Rout
movsd xmm5, xmm7
minsd xmm5, qword [esi - 64]
movsd [edi + 56], xmm7 //Rout
movsd [esi - 64], xmm5 //OTrap
inc dword [edi + 64] //Inc(ItResultI)
mov eax, [edi + 64]
cmp eax, [edi + 68] //maxIt: +68
jnl @out
comisd xmm7, [edi - 40] //RStop
jc @Repeat // carry set: Rout below RStop, keep iterating
@out:
cmp byte [esi - 108], 0 //CalcSIT: +148
jz @NoCalcSITout
mov eax, edi
mov edx, ebx // n = index of the formula used last
call CalcSmoothIterations //(PIt3D: TPIteration3D; n: Integer);
@NoCalcSITout:
pop edi
pop esi
pop edx
pop ecx
pop ebx
pop eax
end;
// doHybridSSE2: 3D hybrid iteration loop (bailout norm uses x, y and z).
// In: eax = PIteration3D. All six general registers pushed on entry are restored
// on exit. edi = PIteration3D and esi = PIteration3D + 256 for the whole routine;
// record fields are addressed through hard-coded offsets (field names are given
// in the trailing comments).
// Setup: the vector at [edi-32..] is set to C ([edi..edi+23]) with w = 0; J is
// set to either the Julia constant or C; Rout and OTrap start as |C|².
// Loop: formula n is called while its repeat counter ([esi-44]) is positive,
// otherwise the next formula with a non-zero count is selected (wrapping from
// wEndTo back to iRepeatFrom). After each call Rout := x²+y²+z² and
// OTrap := Min(Rout, OTrap); the loop ends when ItResultI >= maxIt or Rout is not
// below RStop. A negative nHybrid[n] loops back without updating Rout or
// counting an iteration.
// NOTE(review): xmm7 is stored as Rold at @Repeat, so after the "jl @Repeat"
// shortcut it holds whatever the formula left there - confirm this is intended.
procedure doHybridSSE2(PIteration3D: TPIteration3D); //new ext version
asm
push eax
push ebx
push ecx
push edx
push esi
push edi //x = edi-32 y = edi-24 .. Rold = edi - 48, Rstop = edi - 40, (i = edi + 212 = btmp = esi - 44)
mov edi, eax // = [edi - 32]
lea esi, eax + 256
movupd xmm6, [edi] //Iteration3D by calcMissed not aligned16?!
movsd xmm7, [edi + 16] // loads z and zeroes the upper half, so w is stored as 0 below
movupd [edi - 32], xmm6 //X=C
movupd [edi - 16], xmm7
cmp dword [esi - 104], 0 //DoJulia:+152
jz @sjup
movupd xmm2, [esi + 64]
movsd xmm3, [esi + 80]
movupd [edi + 24], xmm2 //J=Ju
movsd [edi + 40], xmm3
jmp @skipIfJulia
@sjup:
movupd [edi + 24], xmm6 //J=C
movsd [edi + 40], xmm7
@skipIfJulia:
mulpd xmm6, xmm6 // initial Rout := x²+y²+z²
mulsd xmm7, xmm7
CVTSS2SD xmm5, [edi + 72] //RStop in double
addsd xmm7, xmm6
shufpd xmm6, xmm6, 1
movsd [edi - 40], xmm5
addsd xmm7, xmm6 //xmm7=Rout
movsd [esi - 64], xmm7 //OTrap=Rout
movsd [edi + 56], xmm7 //Rout
xor ebx, ebx
mov [esi - 48], ebx //bFirstIt := 0; +208
mov [edi + 64], ebx //ItresultI:=0 +64
movzx ebx, word [esi - 66] //n := iStartFrom
mov eax, [edi + ebx * 4 + 100] //fHPVar[0] +100
mov [edi + 48], eax //PVars: +48
mov eax, [edi + ebx * 4 + 76] //i:=nHybrid[0] +76
and eax, $7FFFFFFF // strip the flag in bit 31
mov [esi - 44], eax //btmp
@Repeat:
movsd [edi - 48], xmm7 //Rold := Rout
cmp dword [esi - 44], 0
jnle @up2 // repeats left for the current formula
@While:
inc ebx // advance to the next formula, wrapping back after wEndTo
cmp bx, word [esi - 106] //5 wEndTo: Word; //+150
jle @up3
movzx ebx, word [esi - 68] //n := iRepeatFrom
@up3:
mov eax, [edi + ebx * 4 + 76] //i := nHybrid[n]; +76
and eax, $7FFFFFFF
jle @While // skip formulas whose count is zero
mov [esi - 44], eax //was btmp, now own var
mov eax, [edi + ebx * 4 + 100] //fHPVar:array[0..5] of Pointer;
mov [edi + 48], eax //PVars: +48
@up2:
lea eax, edi - 8 // w
push eax // stack arguments: @w, PIteration3D
push edi
lea edx, edi - 24 // y
lea ecx, edi - 16 // z
add eax, -24 // x
call [edi + ebx * 4 + 124] //fHybrid[0..5] of ThybridIteration2; //+124 fp overflow: it3dex.z > 1eXXX !
dec [esi - 44] //Dec(i)
cmp [edi + ebx * 4 + 76], 0
jl @Repeat //SkipMaxItTest
movupd xmm6, [edi - 32]
movupd xmm7, [edi - 16]
mulpd xmm6, xmm6
mulsd xmm7, xmm7 //4D: mulpd
addsd xmm7, xmm6 //4D: addpd
shufpd xmm6, xmm6, 1 //4D: pshufd xmm6, xmm7, $4E
addsd xmm7, xmm6 //xmm7=Rout
movsd xmm5, xmm7
minsd xmm5, qword [esi - 64]
movsd [edi + 56], xmm7 //Rout
movsd [esi - 64], xmm5 //OTrap
inc dword [edi + 64] //Inc(ItResultI)
mov eax, [edi + 64]
cmp eax, [edi + 68] //maxIt: +68
jnl @out
comisd xmm7, [edi - 40] //RStop
jc @Repeat // carry set: Rout below RStop, keep iterating
@out:
cmp byte [esi - 108], 0 //CalcSIT: +148
jz @NoCalcSITout
mov eax, edi
mov edx, ebx // n = index of the formula used last
call CalcSmoothIterations //(PIt3D: TPIteration3D; n: Integer);
@NoCalcSITout:
pop edi
pop esi
pop edx
pop ecx
pop ebx
pop eax
end;
// doHybridDESSE2: 3D hybrid iteration loop with a distance estimate returned in st(0).
// In: eax = PIteration3D. All six general registers pushed on entry are restored
// on exit. edi = PIteration3D and esi = PIteration3D + 256 for the whole routine;
// record fields are addressed through hard-coded offsets (field names are given
// in the trailing comments).
// Setup, selected by (DEoption and $38):
//   16:   w := Rout
//   32:   Deriv1 := 1; Deriv2 := 0; Deriv3 := 0; w := 0
//   else: w := 1
// Loop: formula n is called while its repeat counter ([esi-44]) is positive,
// otherwise the next formula with a non-zero count is selected (wrapping from
// wEndTo back to iRepeatFrom). After each call Rout := x²+y²+z² and
// OTrap := Min(Rout, OTrap); the loop ends when ItResultI >= maxIt or Rout is not
// below RStop.
// Result:
//   (DEoption and $38) = 32: Sqrt(Rout) * 0.5 * Ln(Rout) / Deriv1
//   (DEoption and 7) = 4:    Abs(z) * Ln(Abs(z)) / w
//   (DEoption and 7) = 7:    Sqrt(Rout/RStop) * Power(PDouble(PVar - 16)^, -ItResultI)
//   else:                    Sqrt(Rout) / Abs(w)
// The result stays on the x87 stack while CalcSmoothIterations is called.
//
// Fixes:
//  * the exit test masked DEoption with decimal 38 instead of $38, so it did not
//    select the same mode as the initialisation above (e.g. DEoption = 40 took
//    the Deriv1 branch although Deriv1 had never been initialised);
//  * the load of PDouble(PVar - 16)^ now states its operand size explicitly
//    (qword), as the matching load in doInterpolHybridDESSE2 does.
function doHybridDESSE2(PIteration3D: TPIteration3D): Double; //result in st(0) new ext version
asm
push eax
push ebx
push ecx
push edx
push esi
push edi //x = edi-32 y = edi-24 .. Rold = edi - 48, Rstop = edi - 40, (i = edi + 212 = btmp = esi - 44)
mov edi, eax
lea esi, eax + 256
movupd xmm6, [edi] //Iteration3D by calcMissed not aligned16?!
movsd xmm7, [edi + 16] // loads z and zeroes the upper half, so w is stored as 0 below
movupd [edi - 32], xmm6 //X=C
movupd [edi - 16], xmm7
cmp dword [esi - 104], 0 //DoJulia:+152
jz @sjup
movupd xmm2, [esi + 64]
movsd xmm3, [esi + 80]
movupd [edi + 24], xmm2 //J=Ju
movsd [edi + 40], xmm3
jmp @skipIfJulia
@sjup:
movupd [edi + 24], xmm6 //J=C
movsd [edi + 40], xmm7
@skipIfJulia:
mulpd xmm6, xmm6 // initial Rout := x²+y²+z²
mulsd xmm7, xmm7
CVTSS2SD xmm5, [edi + 72] //RStop in double
addsd xmm7, xmm6
shufpd xmm6, xmm6, 1
movsd [edi - 40], xmm5
addsd xmm7, xmm6 //xmm7=Rout
movsd [esi - 64], xmm7 //OTrap=Rout
movsd [edi + 56], xmm7 //Rout
movsd [edi - 48], xmm7 //Rold := Rout
xor ebx, ebx //n:=0
mov [edi + 208], ebx //mov [esi - 48], ebx //bFirstIt := 0; +208
mov [edi + 64], ebx //ItresultI:=0 +64
movzx ebx, word [esi - 66] //n := iStartFrom
mov eax, [edi + ebx * 4 + 100] //fHPVar[n] +100
mov [edi + 48], eax //PVars: +48
mov eax, [edi + ebx * 4 + 76] //i:=nHybrid[n] +76
and eax, $7FFFFFFF // strip the flag in bit 31
mov [esi - 44], eax
mov eax, [esi - 96] //DEoption +160
and eax, $38 // case (DEoption and $38) of
sub eax, 16
jnz @UU1
fld qword [edi + 56] // 16: w := Rout;
jmp @UU2
@UU1:
sub eax, 16
jnz @UU
fld1
fstp qword [esi - 24] // deriv1
fldz // 32: begin Deriv1 := 1; Deriv2 := 0; Deriv3 := 0; end;
fst qword [esi - 16]
fst qword [esi - 8]
jmp @UU2
@UU:
fld1 // else w := 1;
@UU2:
fstp qword [edi - 8] //w := Rout,1,0
@Repeat:
movsd [edi - 48], xmm7 //Rold := Rout
cmp dword [esi - 44], 0
jnle @up2 // repeats left for the current formula
@While:
inc ebx // advance to the next formula, wrapping back after wEndTo
cmp bx, word [esi - 106] //5 wEndTo: Word; //+150
jle @up3
movzx ebx, word [esi - 68] //n := iRepeatFrom
@up3:
mov eax, [edi + ebx * 4 + 76] //i := nHybrid[n]; +76
and eax, $7FFFFFFF
jle @While // skip formulas whose count is zero
mov [esi - 44], eax
mov eax, [edi + ebx * 4 + 100] //fHPVar:array[0..5] of Pointer;
mov [edi + 48], eax //PVars: +48
@up2:
lea eax, edi - 8 //was: esp + 24 w
push eax // stack arguments: @w, PIteration3D
push edi
lea edx, edi - 24 //was: esp + 16 y
lea ecx, edi - 16 //was: esp + 24 z
add eax, -24 // x
call [edi + ebx * 4 + 124] //fHybrid[0..5] of ThybridIteration2; //+124 error in called function sometimes!!!
dec [esi - 44] //Dec(i) //Write off...??? bug in call... of mandbox or menger??! abox as testhybrid! esi has changed?
cmp [edi + ebx * 4 + 76], 0
jl @Repeat //SkipMaxItTest
movupd xmm6, [edi - 32]
movupd xmm7, [edi - 16]
mulpd xmm6, xmm6
mulsd xmm7, xmm7
addsd xmm7, xmm6
shufpd xmm6, xmm6, 1
addsd xmm7, xmm6 //xmm7=Rout
movsd xmm5, xmm7
minsd xmm5, qword [esi - 64]
movsd [edi + 56], xmm7 //Rout
movsd [esi - 64], xmm5 //OTrap
inc dword [edi + 64] //Inc(ItResultI)
mov eax, [edi + 64]
cmp eax, [edi + 68] //maxIt: +68
jnl @out
comisd xmm7, [edi - 40] //RStop
jc @Repeat // carry set: Rout below RStop, keep iterating
@out:
mov eax, [esi - 96] //DEoption +160 if (DEoption and $38) = 32 then
and eax, $38 // same hexadecimal mask as in the initialisation (was decimal 38)
sub eax, 32
jnz @JU1
fld qword [edi + 56] //rout Result := Sqrt(Rout) * 0.5 * Ln(Rout) / RoutDeriv
fldln2
fld st(1) //rout,ln2,rout
fyl2x //ln(rout),rout
fxch
fsqrt
fmulp
fmul cs05
fdiv qword [esi - 24] //Deriv1
jmp @UU6
@JU1:
mov eax, [esi - 96] //DEoption +160
and eax, 7
sub eax, 4
jnz @UU3 //Result := Abs(X3) * Ln(Abs(X3)) / X4;
fld qword [edi - 16] //X3
fabs
fldln2
fld st(1) //absX3,ln2,absX3
fyl2x //ln(absX3),absX3
fmulp
fdiv qword [edi - 8] //Result
jmp @UU6
@UU3:
sub eax, 3
jnz @UU4 //Result := Sqrt(Rout/RStop) * Power(PDouble(Integer(PVar) - 16)^, -ItResultI);
mov eax, [edi + 48]
fild dword [edi + 64] //ItResultI
fchs //-ItresultI
fld qword [eax - 16] //(Pvar-16)^ (= scale or something), explicitly loaded as a Double
fldln2 //power function x,pow
fxch
fyl2x // Ln(base)
fxch
fmulp // expo * Ln(base)
fldl2e
fmulp // converted to a base-2 exponent
fld st(0)
frndint
fsub st(1), st(0) // split into integer and fractional part
fxch
f2xm1
fld1
faddp // 2^fraction
fscale // * 2^integer
fstp st(1) //end of power function
fld qword [edi + 56]
fdiv dword [edi + 72] //rout/rstop,pow
fsqrt
fmulp
jmp @UU6
@UU4: // else Result := Sqrt(Rout) / Abs(X4);
fld qword [edi + 56]
fsqrt
fld qword [edi - 8]
fabs
fdivp
@UU6:
cmp byte [esi - 108], 0 //CalcSIT: +148
jz @NoCalcSITout
mov eax, edi
mov edx, ebx // n = index of the formula used last
call CalcSmoothIterations //(PIt3D: TPIteration3D; n: Integer);
@NoCalcSITout:
pop edi
pop esi
pop edx
pop ecx
pop ebx
pop eax
end;
procedure HybridItTricorn(var x, y, z, w: Double; PIteration3D: TPIteration3D);
// One x87 iteration step that updates x, y and z in place; w is never accessed.
// Registers as used below: eax = @x, edx = @y, ecx = @z. PIteration3D is fetched
// from [ebp + 8] into esi (assumes the compiler-generated ebp frame - TODO confirm)
// and the pointer stored at [esi + 48] into edi.
// With x, y, z denoting the values on entry:
//   z := x*z*[edi - 16] + [esi + 40]*[edi - 24]
//   x := x*x - y*y - z*z + [esi + 24]
//   y := 2*x*y + [esi + 32]
// All inputs are loaded onto the FPU stack before the first store, so the store
// order (z, x, y) does not feed new values into later results.
// esi and edi are saved/restored; the FPU stack is empty again on exit.
asm
push esi
push edi
fld qword [edx] // y
fld st(0) // y, y
fmul st(0), st(1) // y*y, y
fld qword [eax] // x, y*y, y
mov esi, [ebp + 8] // PIteration3D
fld st(0) // x, x, y*y, y
fmul st(0), st(1) // x*x, x, y*y, y
fld qword [ecx] // z, x*x, x, y*y, y
fld st(0) // z, z, x*x, x, y*y, y
mov edi, [esi + 48]
fmul st(0), st(1) // z*z, z, x*x, x, y*y, y
faddp st(4), st(0) // z, x*x, x, y*y+z*z, y
fmul st(0), st(2) // z*x, x*x, x, y*y+z*z, y
fmul qword [edi - 16] // z*x*[edi - 16]
fld qword [esi + 40]
fmul qword [edi - 24] // [esi + 40]*[edi - 24], z*x*[edi - 16], ...
faddp
fstp qword [ecx] // new z stored; stack: x*x, x, y*y+z*z, y
fsubrp st(2), st(0) // x, x*x-y*y-z*z, y
fmulp st(2), st(0) // x*x-y*y-z*z, y*x
fadd qword [esi + 24]
fstp qword [eax] // new x stored; stack: y*x
fadd st(0), st(0) // 2*y*x
fadd qword [esi + 32]
fstp qword [edx] // new y stored; stack empty
pop edi
pop esi
end;
procedure HybridQuatSSE2(var x, y, z, w: Double; PIteration3D: TPIteration3D);
// SSE2 iteration step on the four components x, y, z, w.
// Registers as used below: eax = @x, edx = @y, ecx = @z; esi = PIteration3D
// (from [ebp + 8]), edi = pointer at [esi + 48] (PVars).
// The routine reads 16 bytes at each of @x, @y and @z and treats them as the pairs
// (x,y), (y,z) and (z,w); it writes 16 bytes back at @x and @z. It therefore
// assumes x, y, z, w are four consecutive doubles in memory - TODO confirm at callers.
// With the values on entry:
//   x := x*x - y*y - z*z - w*w + [esi + 24]
//   y := 2*(x*y + w*z)         + [esi + 32]
//   z := 2*(x*z + y*w*[edi - 16]) + [esi + 40]
//   w := 2*(y*z + x*w) + [edi - 24] + [esi - 56]
// Uses xmm0..xmm6 as scratch; esi and edi are saved/restored.
asm
push esi
push edi
mov esi, [ebp + 8]
mov edi, [esi + 48] //PVars
movupd xmm0, [eax] //x,y
movupd xmm1, [ecx] //z,w
movapd xmm6, xmm0 //x,y
movapd xmm5, xmm1 //z,w
movapd xmm3, xmm1 //z,w
xorpd xmm4, xmm4 //0,0
mulpd xmm6, xmm6 //xx,yy
mulpd xmm5, xmm5 //zz,ww
movupd xmm2, [edx] //y,z
subsd xmm4, xmm5 //-zz
shufpd xmm3, xmm0, 1 //w,x
shufpd xmm4, xmm5, 2 //-zz, ww
mulpd xmm2, xmm0 //yx, zy
addpd xmm4, xmm6 //xx-zz, yy+ww
mulpd xmm0, xmm1 //xz, yw
mulpd xmm3, xmm1 //wz, xw
pshufd xmm6, xmm0, $4E //yw, xz
pshufd xmm1, xmm4, $4E //yy+ww, xx-zz
mulsd xmm6, [edi - 16] //ywMul, xz
addpd xmm2, xmm3 //yx+wz, zy+xw -> y, w
addsd xmm6, xmm0 //ywMul + xz -> z
subpd xmm4, xmm1 //xx-zz-yy-ww -> x (low half is the one used)
addpd xmm2, xmm2 //y,w
addsd xmm6, xmm6 //z
shufpd xmm2, xmm2, 1 //w, y
movupd xmm5, [esi + 24] //J1,J2
addsd xmm2, [edi - 24] //w + [edi - 24]
addsd xmm2, [esi - 56] //+J4
shufpd xmm6, xmm2, 0 //z, w
shufpd xmm4, xmm2, 2 //x, y
addsd xmm6, [esi + 40] //+J3
addpd xmm4, xmm5 //+J1,2
movupd [eax], xmm4 //store x,y
movupd [ecx], xmm6 //store z,w
pop edi
pop esi
end;
procedure HybridItIntPow2(var x, y, z, w: Double; PIteration3D: TPIteration3D); //sine bulb
// x87 power-2 iteration step on x, y, z; w is never accessed.
// Registers as used below: eax = @x, edx = @y, ecx = @z; esi = PIteration3D
// (from [ebp + 8]), edi = pointer at [esi + 48].
// With the values on entry and a = x*x + y*y, A = (a - z*z)/a:
//   z := 2*z*Sqrt(a)*[edi - 16] + [esi + 40]
//   x := A*(x*x - y*y)          + [esi + 24]
//   y := 2*x*y*A                + [esi + 32]
// Note: the division by a has no epsilon guard in this routine.
// esi and edi are saved/restored; the FPU stack is empty again on exit.
asm
push esi
push edi
fld qword [ecx]
fld qword [edx]
fld qword [eax] //x,y,z
mov esi, [ebp + 8] //PIteration3D
fld st(1) //y,x,y,z
fmul st(0), st(2) // y*y,x,y,z
fld st(1) // x,y*y,x,y,z
fmul st(0), st(2) // x*x, y*y,x,y,z
fld st(0) // x*x, x*x, y*y,x,y,z
fadd st(0), st(2) // xx+yy, xx, yy,x,y,z
fld st(0) // xx+yy, xx+yy, xx, yy,x,y,z
fsqrt
mov edi, [esi + 48]
fmul qword [edi - 16] //*dOption1=Zmul
fmul st(0), st(6) //*z
fadd st(0), st(0) //*2
fadd qword [esi + 40] //+cz only for test
fstp qword [ecx] //xx+yy, xx, yy,x,y,z
fld st(5) //z, xx+yy, xx, yy,x,y,z
fmulp st(6), st(0) //xx+yy, xx, yy,x,y,z*z
fld st(0) //xx+yy, xx+yy, xx, yy,x,y,z*z
fsubrp st(6), st(0) //xx+yy, xx, yy,x,y, a - z*z
fdivp st(5), st(0) //xx, yy,x,y, (a - z*z) / a = A
fsubrp //xx-yy,x,y, A
fmul st(0), st(3) //A(xx-yy),x,y, A
fadd qword [esi + 24]
fstp qword [eax] //x,y, A
fmulp
fmulp //x*y*A
fadd st(0), st(0) //*2
fadd qword [esi + 32] //+ cy only for test
fstp qword [edx]
pop edi
pop esi //SineP2
end;
procedure HybridItIntPow2SSE2(var x, y, z, w: Double; PIteration3D: TPIteration3D);
// SSE2 power-2 iteration step on x, y, z; w is never accessed.
// Registers as used below: eax = @x, edx = @y, ecx = @z; esi = PIteration3D
// (from [ebp + 8]), ebx = pointer at [esi + 48] (Pvars).
// x, y and z are each loaded and stored through their own pointer, so no
// memory adjacency is required here.
// With S1 = x*x, S2 = y*y, S3 = z*z, XT = ((S1+S2) - S3)/(S1+S2):
//   z := 2*z*[ebx - 16]*Sqrt(S1+S2) + [esi + 40]
//   x := (S1 - S2)*XT               + [esi + 24]
//   y := 2*y*x*XT                   + [esi + 32]
// Note: the division by S1+S2 has no epsilon guard.
// Uses xmm0..xmm6 as scratch; esi and ebx are saved/restored.
asm
push esi
push ebx
mov esi, [ebp + 8]
movlpd xmm0, [eax] // x
movhpd xmm0, [edx] // x, y
movlpd xmm1, [ecx] // z
movapd xmm2, xmm0
mov ebx, [esi + 48] //Pvars
movsd xmm3, xmm1
mulpd xmm2, xmm2 // S1, S2
mulsd xmm3, xmm3 // S3
pshufd xmm5, xmm2, $4E // S2, S1
movapd xmm4, xmm5
addpd xmm5, xmm2 // S1+S2
subsd xmm2, xmm4 // S1-S2
movapd xmm6, xmm5
mulsd xmm1, [ebx - 16] // z*dZmul
sqrtsd xmm4, xmm6 // Sqrt(S2+S1)
addsd xmm1, xmm1 // z*dZmul*2
subsd xmm6, xmm3 // (S1+S2)-S3
mulsd xmm1, xmm4 // z*dZmul*2*Sqrt(S2+S1)
movsd xmm3, [edx] // y
addsd xmm1, [esi + 40] // z*dZmul*2*Sqrt(S2+S1)+J3 = z
divsd xmm6, xmm5 // (XT-S3)/XT = XT
addsd xmm3, xmm3 // y*2
movsd [ecx], xmm1 // z
mulsd xmm3, xmm0 // y*2*x
mulsd xmm2, xmm6 // (S1-S2)*XT
mulsd xmm3, xmm6 // y*2*x*XT
addsd xmm2, [esi + 24] // (S1-S2)*XT+J1 = x
addsd xmm3, [esi + 32] // y*2*x*XT+J2 = y
movsd [eax], xmm2 // x
movsd [edx], xmm3 // y
pop ebx
pop esi
end;
procedure HybridFloatPow(var x, y, z, w: Double; PIteration3D: TPIteration3D);
// x87 iteration step with a non-integer power, done in polar form; w is never accessed.
// Registers as used below: eax = @x, edx = @y, ecx = @z; esi = PIteration3D
// (from [ebp + 8]), edi = pointer at [esi + 48].
// With pow = [edi - 16]:
//   theta := pow * atan2(y, x)
//   phi   := pow * atan2(z, Sqrt(x*x + y*y))
//   r     := [esi + 56] raised to (pow * [edi - 8])   (inline exp/log sequence)
//   x := r*Cos(theta)*Cos(phi)       + [esi + 24]
//   y := r*Sin(theta)*Cos(phi)       + [esi + 32]
//   z := r*Sin(phi)*[edi - 24]       + [esi + 40]
// The power is computed as 2^(e) with e = pow'*ln(base)*log2(e), split into integer
// and fractional parts for f2xm1/fscale.
// NOTE(review): f2xm1 is only specified for arguments in [-1, +1]; frndint under the
// default rounding mode keeps the fraction within [-0.5, 0.5] - confirm the FPU
// control word is not changed elsewhere.
// esi and edi are saved/restored; the FPU stack is empty again on exit.
asm
push esi
push edi
mov esi, [ebp + 8] //PIteration3D
mov edi, [esi + 48]
fld qword [edi - 16] //pow
fld qword [edx] //y, pow
fld qword [eax] //x, y, pow
fld st(1) //y, x, y, pow
fld st(1) //x, y, x, y, pow
fpatan //theta, x, y, pow
fmul st, st(3)
fsincos //Costheta, Sintheta, x, y, pow
fld qword [ecx] //z,Costheta, Sintheta,x,y,pow
fxch st(3) //x,Costheta, Sintheta,z,y,pow
fmul st, st
fxch st(4) //y,Costheta, Sintheta,z,xx,pow
fmul st, st
faddp st(4), st //Costheta,Sintheta,z,xx+yy,pow
fxch st(2) //z,Sintheta,Costheta,xx+yy,pow
fxch //Sintheta,z,Costheta,xx+yy,pow
fxch st(3) //xx+yy,z,Costheta,Sintheta,pow
fsqrt
fpatan //phi,Costheta, Sintheta,pow
fmul st, st(3)
fsincos //Cosphi,Sinphi,Costheta,Sintheta,pow
fxch st(4) //pow,Sinphi,Costheta,Sintheta,Cosphi
fmul qword [edi - 8] //*0.5 because of Rout=sqr(R)
fld qword [esi + 56] //SqrRadius, pow*0.5,Sinphi,Costheta,Sintheta,Cosphi
fldln2 //power function x,pow
fxch
fyl2x //ln(x), pow', ...
fxch
fmulp //pow'*ln(x)
fldl2e
fmulp //exponent to base 2
fld st(0)
frndint //integer part on top, full exponent below
fsub st(1), st(0) //st(1) = fractional part
fxch
f2xm1 //2^frac - 1
fld1
faddp //2^frac
fscale //* 2^int
fstp st(1) //NewRadius,Sinphi,Costheta,Sintheta,Cosphi
fxch st(2) //Costheta,Sinphi,NewRadius,Sintheta,Cosphi
fmul st, st(4)
fmul st, st(2)
fadd qword [esi + 24]
fstp qword [eax] //Sinphi,NewRadius,Sintheta,Cosphi
fxch st(3) //Cosphi,NewRadius,Sintheta,Sinphi
fmulp st(2), st //NewRadius,Sintheta*Cosphi,Sinphi
fmul st(1), st
fmulp st(2), st //Sintheta*Cosphi*r,Sinphi*r
fadd qword [esi + 32]
fstp qword [edx]
fmul qword [edi - 24]
fadd qword [esi + 40]
fstp qword [ecx]
pop edi
pop esi
end;
procedure HybridItIntPow3(var x, y, z, w: Double; PIteration3D: TPIteration3D);
// x87 power-3 iteration step on x, y, z; w is never accessed.
// Registers as used below: eax = @x, edx = @y, ecx = @z; esi = PIteration3D
// (from [ebp + 8]), edi = pointer at [esi + 48] (PVars).
// [edi + 120] is used as the constant 3 and [edi + 24] as a tiny epsilon added to
// the divisor (values per the inline comments in this file, not visible here).
// With sx = x*x, sy = y*y, sz = z*z, R = sx + sy, A = 1 - 3*sz/(R + eps):
//   x := A*x*(sx - 3*sy)              + [esi + 24]
//   z := [esi + 40] - z*(sz - 3*R)*[edi - 16]
//   y := A*y*(3*sx - sy)              + [esi + 32]
// x is stored first, but y and z only use squares already on the FPU stack plus
// their own still-unmodified memory operand.
// esi and edi are saved/restored; the FPU stack is empty again on exit.
asm
push esi
push edi
mov esi, [ebp + 8] //PIteration3D
fld qword [edx]
fmul st, st // y*y
fld qword [eax] // x, y*y
mov edi, [esi + 48] // PVars
fmul st, st // x*x, y*y
fld st(0) // x*x, x*x, y*y
fadd st(0), st(2) // x*x+y*y = R, x*x = sx, y*y = sy
fld qword [ecx]
fmul st, st // sz, R, sx, sy
fld qword [edi + 120] // 3, sz, R, sx, sy
fld st(1) // sz, 3, sz, R, sx, sy
fmul st(0), st(1) // 3*sz, 3, sz, R, sx, sy
fld st(3) // R, 3*sz, 3, sz, R, sx, sy
fadd qword [edi + 24] // R + epsilon
fdivp // 3*sz/R, 3, sz, R, sx, sy
fld1
fsubrp // A = 1 - 3*sz/R, 3, sz, R, sx, sy
fld st(1) // 3, A, 3, sz, R, sx, sy
fmul st(0), st(6) // 3*sy, ..
fsubr st(0), st(5) // sx-3*sy, ..
fmul st(0), st(1) // A*(sx-3*sy), A, 3, sz, R, sx, sy
fmul qword [eax]
fadd qword [esi + 24]
fstp qword [eax] // A, 3, sz, R, sx, sy
fxch st(4) // sx, 3, sz, R, A, sy
fmul st(0), st(1) // 3*sx, 3, sz, R, A, sy
fsubrp st(5), st(0) // 3, sz, R, A, 3*sx-sy was: sy-3*sx!
fmulp st(2), st(0) // sz, 3*R, A, 3*sx-sy
fsubrp // sz-3*R, A, 3*sx-sy
fmul qword [ecx]
fmul qword [edi - 16] //*dZmul
fsubr qword [esi + 40] // J3 - z*(sz-3*R)*dZmul
fstp qword [ecx] // A, 3*sx-sy
fmulp // A*(3*sx-sy)
fmul qword [edx]
fadd qword [esi + 32]
fstp qword [edx]
pop edi
pop esi
end;
procedure HybridItIntPow4(var x, y, z, w: Double; PIteration3D: TPIteration3D);
// x87 power-4 iteration step on x, y, z; w is never accessed.
// Registers as used below: eax = @x, edx = @y, ecx = @z; esi = PIteration3D
// (from [ebp + 8]), edi = pointer at [esi + 48] (PVars).
// Constants read from the PVars block (values per the inline comments):
// [edi + 144] = 6, [edi + 128] = 4, [edi + 24] = tiny epsilon for the divisor.
// With sx = x*x, sy = y*y, sz = z*z, R = sx + sy,
// A = 1 + sz*(sz - 6*R)/(R*R + eps):
//   x := A*(sx*(sx - 6*sy) + sy*sy)           + [esi + 24]
//   z := 4*z*Sqrt(R)*(R - sz)*[edi - 16]      + [esi + 40]
//   y := 4*x*y*A*(sx - sy)                    + [esi + 32]
// The x used for y is the entry value (it was loaded onto the FPU stack before
// the store to [eax]).
// esi and edi are saved/restored; the FPU stack is empty again on exit.
asm
push esi
push edi
mov esi, [ebp + 8] //PIteration3D
fld qword [edx]
fmul st, st // y*y
fld qword [eax] // x, y*y
mov edi, [esi + 48] // PVars
fld st
fmul st, st // x*x, x, y*y
fld st(0) // x*x, x*x, x, y*y
fadd st(0), st(3) // x*x+y*y = R, sx, x, sy
fld qword [ecx]
fmul st, st // sz, R, sx, x, sy
fld qword [edi + 144] // 6, sz, R, sx, x, sy
fmul st, st(2) // 6*R, sz, R, sx, x, sy
fsubr st, st(1) // sz - 6*R, sz, R, sx, x, sy
fmul st(0), st(1) // sz * (sz - 6 * R), sz, R, sx, x, sy
fld st(2)
fmul st, st // R*R, sz * (sz - 6 * R), sz, R, sx, x, sy
fadd qword [edi + 24] // 24-112 +1e-40
fdivp // sz * (sz - 6 * R) / R*R, sz, R, sx, x, sy
fld1
faddp // A, sz, R, sx, x, sy
fld st(5) // sy, A, sz, R, sx, x, sy
fmul qword [edi + 144] // 6*sy, A, sz, R, sx, x, sy
fsubr st, st(4) // sx-6*sy, A, sz, R, sx, x, sy
fmul st(0), st(4) // sx*(sx-6*sy), A, sz, R, sx, x, sy
fld st(6)
fmul st, st
faddp // sy*sy + sx*(sx-6*sy), A, sz, R, sx, x, sy
fmul st, st(1)
fadd qword [esi + 24]
fstp qword [eax] // A, sz, R, sx, x, sy
fxch st(2) // R, sz, A, sx, x, sy
fsubr st(1), st // R, R-sz, A, sx, x, sy
fsqrt
fmulp // sqrt(R)*(R-sz), A, sx, x, sy
fmul qword [ecx]
fmul qword [edi + 128] //*4
fmul qword [edi - 16] //*dZmul
fadd qword [esi + 40]
fstp qword [ecx] // A, sx, x, sy
fxch // sx, A, x, sy y := 4 * x * y * A * (sx - sy) + J2;
fsubrp st(3), st // A, x, sx-sy
fmulp // A*x, sx-sy
fmulp
fmul qword [edi + 128] //*4
fmul qword [edx] //*y
fadd qword [esi + 32]
fstp qword [edx]
pop edi
pop esi
end;
procedure HybridIntP5(var x, y, z, w: Double; PIteration3D: TPIteration3D);
// x87 power-5 iteration step on x, y, z; w is never accessed.
// Registers as used below: eax = @x, edx = @y, ecx = @z; esi = PIteration3D
// (from [ebp + 8]), edi = pointer at [esi + 48] (PVars).
// Constants read from the PVars block (values per the inline comments):
// [edi + 136] = 5, [edi + 168] = 10, [edi + 24] = tiny epsilon for the divisor.
// With sx = x*x, sy = y*y, sz = z*z, R = sx + sy,
// A = 1 + 5*(sz*sz - 2*sz*R)/(R*R + eps):
//   y := A*y*(5*sx*sx - sy*(10*sx - sy))           + [esi + 32]
//   z := z*(sz*(sz - 10*R) + 5*R*R)*[edi - 16]     + [esi + 40]
//   x := A*x*(sx*(sx - 10*sy) + 5*sy*sy)           + [esi + 24]
// Each of x, y, z is multiplied in from memory before its own slot is overwritten.
// esi and edi are saved/restored; the FPU stack is empty again on exit.
asm
push esi
push edi
mov esi, [ebp + 8] //PIteration3D
fld qword [edx]
fmul st, st // y*y
fld qword [eax] // x, y*y
mov edi, [esi + 48] // PVars
fmul st, st // x*x, y*y
fld st // x*x, x*x, y*y
fadd st, st(2) // x*x+y*y = R, sx, sy
fld qword [ecx]
fmul st, st // sz, R, sx, sy
fld qword [edi + 136] // 5, sz, R, sx, sy
fld st // 5, 5, sz, R, sx, sy
fld st(2)
fmul st, st(4)
fadd st, st // sz*R*2, 5, 5, sz, R, sx, sy
fld st(3)
fmul st, st
fsubrp // sz*sz - sz*R*2, 5, 5, sz, R, sx, sy
fmulp // (sz*sz - sz*R*2) * 5, 5, sz, R, sx, sy
fld st(3)
fmul st, st // R*R, (sz*sz - sz*R*2) * 5, 5, sz, R, sx, sy
fadd qword [edi + 24] // 24-112 +1e-40
fdivp // (sz*sz - sz*R*2) * 5 / R*R, 5, sz, R, sx, sy
fld1
faddp // A, 5, sz, R, sx, sy
fld st(4) // sx, A, 5, sz, R, sx, sy
fmul qword [edi + 168] // 10*sx, A, 5, sz, R, sx, sy
fsub st, st(6) // 10*sx - sy, A, 5, sz, R, sx, sy
fmul st, st(6) // sy*(10*sx - sy), A, 5, sz, R, sx, sy
fld st(5)
fmul st, st
fmul st, st(3) // 5*sx*sx, sy*(10*sx - sy), A, 5, sz, R, sx, sy
fsubrp // 5*sx*sx - sy*(10*sx - sy), A, 5, sz, R, sx, sy
fmul st, st(1)
fmul qword [edx]
fadd qword [esi + 32]
fstp qword [edx] // A, 5, sz, R, sx, sy
fld st(3)
fmul st, st(2)
fadd st, st // 10*R, A, 5, sz, R, sx, sy
fsubr st, st(3) // sz-10*R, A, 5, sz, R, sx, sy
fmulp st(3), st // A, 5, sz*(sz-10*R), R, sx, sy
fxch st(3) // R, 5, sz*(sz-10*R), A, sx, sy
fmul st, st // R*R
fmul st, st(1) // 5*R*R
faddp st(2), st // 5, sz*(sz-10*R)+5*R*R, A, sx, sy
fld st(4)
fmul st, st
fmul st, st(1) // 5*sy*sy, 5, sz*(sz-10*R)+5*R*R, A, sx, sy
fxch st(5) // sy, 5, sz*(sz-10*R)+5*R*R, A, sx, 5*sy*sy
fmulp
fadd st, st // 10*sy, sz*(sz-10*R)+5*R*R, A, sx, 5*sy*sy
fsubr st, st(3) // sx-10*sy, sz*(sz-10*R)+5*R*R, A, sx, 5*sy*sy
fmulp st(3), st // sz*(sz-10*R)+5*R*R, A, sx*(sx-10*sy), 5*sy*sy
fmul qword [ecx]
fmul qword [edi - 16] //*dZmul
fadd qword [esi + 40]
fstp qword [ecx] // A, sx*(sx-10*sy), 5*sy*sy
fmul qword [eax]
fxch // sx*(sx-10*sy), A*x, 5*sy*sy
faddp st(2), st // A*x, sx*(sx-10*sy)+5*sy*sy
fmulp
fadd qword [esi + 24]
fstp qword [eax]
pop edi
pop esi
end;
procedure HybridIntP6(var x, y, z, w: Double; PIteration3D: TPIteration3D);
// x87 power-6 iteration step on x, y, z; w is never accessed.
// Registers as used below: eax = @x, edx = @y, ecx = @z; esi = PIteration3D
// (from [ebp + 8]), edi = pointer at [esi + 48] (PVars) plus 112 - every
// [edi + N-112] operand below therefore addresses PVars + N.
// Constants read (values per the inline comments): PVars+120 = 3, PVars+168 = 10,
// PVars+176 = 15, PVars+24 = tiny epsilon for the divisor, PVars-16 = dZmul.
// Intended results according to the inline annotations (not independently
// re-derived for this header), with sx = x*x, sy = y*y, sz = z*z, R = sx + sy,
// A = 1 - sz*(15*R*R + sz*(sz - 15*R))/(R*R*R + eps):
//   y := 2*A*x*y*(3*sy*sy + sx*(3*sx - 10*sy))             + [esi + 32]
//   z := 2*dZmul*z*Sqrt(R)*(sz*(3*sz - 10*R) + 3*R*R)      + [esi + 40]
//   x := A*(sx*sx*(sx - 15*sy) + sy*sy*(15*sx - sy))       + [esi + 24]
// Store order is y, z, x; x and y are read from memory before y is overwritten.
// esi and edi are saved/restored.
asm
push esi
push edi
mov esi, [ebp + 8] //PIteration3D
fld qword [edx]
fmul st, st // y*y
fld qword [eax] // x, y*y
mov edi, [esi + 48] // PVars
fmul st, st // x*x, y*y
fld st // x*x, x*x, y*y
fadd st, st(2) // x*x+y*y = R, sx, sy
fld qword [ecx]
add edi, 112 // rebase: edi = PVars + 112
fmul st, st // sz, R, sx, sy
fld qword [edi + 176-112] // 15, sz, R, sx, sy
fld st // 15, 15, sz, R, sx, sy
fmul st, st(3) // 15*R,
fsubr st, st(2) // sz-R*15, 15, sz, R, sx, sy
fmul st, st(2)
fld st(3)
fmul st, st // R*R, sz*(sz-R*15), 15, sz, R, sx, sy
fxch
fld st(1) // R*R, sz*(sz-R*15), R*R, 15, sz, R, sx, sy
fmulp st(3), st // sz*(sz-R*15), R*R, 15*R*R, sz, R, sx, sy
faddp st(2), st // R*R, 15*R*R+sz*(sz-R*15), sz, R, sx, sy
fxch // 15*R*R+sz*(sz-R*15), R*R, sz, R, sx, sy
fmul st, st(2) // sz*(15*R*R+sz*(sz-R*15)), R*R, sz, R, sx, sy
fld st(1)
fmul st, st(4) // R*R*R, sz*(15*R*R+sz*(sz-R*15)), R*R, sz, R, sx, sy
fadd qword [edi + 24-112] // 24-112 +1e-40
fdivp // sz*(15*R*R+sz*(sz-R*15)) / R*R*R, R*R, sz, R, sx, sy
fld1
fsubrp // 1 - sz*(15*R*R+sz*(sz-R*15)) / R*R*R, R*R, sz, R, sx, sy
fld st(5) // sy, A, R*R, sz, R, sx, sy
fmul qword [edi + 168-112] // 10*sy, A, R*R, sz, R, sx, sy
fld st(5)
fmul qword [edi + 120-112] // 3*sx, 10*sy, A, R*R, sz, R, sx, sy
fsubrp // 3*sx-10*sy, A, R*R, sz, R, sx, sy
fmul st, st(5) // sx*(3*sx-10*sy), A, R*R, sz, R, sx, sy
fld st(6) // sy,
fmul st, st
fmul qword [edi + 120-112] // 3*sy*sy, sx*(3*sx-10*sy), A, R*R, sz, R, sx, sy
faddp // 3*sy*sy+sx*(3*sx-10*sy), A, R*R, sz, R, sx, sy
fmul st, st(1)
fmul qword [edx]
fmul qword [eax] // z := PDouble(Integer(PVar) - 16)^*2*z*Sqrt(R)*(sz*(3*sz - 10*R) + 3*R*R) + J3;
fadd st, st // x := A*(S1*S1*(S1 - 15*S2) + S2*S2*(15*S1 - S2)) + J1;
fadd qword [esi + 32]
fstp qword [edx] // A, R*R, sz, R, sx, sy
fld st(3)
fmul qword [edi + 168-112]
fld st(3) // sz, 10*R, A, R*R, sz, R, sx, sy
fmul qword [edi + 120-112]
fsubrp // 3*sz-10*R, A, R*R, sz, R, sx, sy
fmulp st(3), st // A, R*R, sz*(3*sz-10*R), R, sx, sy
fxch
fmul qword [edi + 120-112] // 3*R*R, A, sz*(3*sz-10*R), R, sx, sy
faddp st(2), st // A, sz*(3*sz-10*R)+3*R*R, R, sx, sy
fld qword [edi + 176-112] // 15
fld st // 15, 15, A, sz*(3*sz-10*R)+3*R*R, R, sx, sy
fmul st, st(5)
fsub st, st(6)
fmul st, st(6)
fmul st, st(6) // S2*S2*(15*S1-S2), 15, A, sz*(3*sz-10*R)+3*R*R, R, sx, sy
fxch
fmulp st(6), st // S2*S2*(15*S1-S2), A, sz*(3*sz-10*R)+3*R*R, R, sx, 15*sy
fxch st(5) // 15*sy, A, sz*(3*sz-10*R)+3*R*R, R, sx, S2*S2*(15*S1-S2)
fsubr st, st(4) // sx-15*sy, A, sz*(3*sz-10*R)+3*R*R, R, sx, S2*S2*(15*S1-S2)
fmul st, st(4)
fmulp st(4), st // A, sz*(3*sz-10*R)+3*R*R, R, sx*sx*(sx-15*sy), S2*S2*(15*S1-S2)
fxch st(4) // S2*S2*(15*S1-S2), sz*(3*sz-10*R)+3*R*R, R, sx*sx*(sx-15*sy), A
faddp st(3), st // sz*(3*sz-10*R)+3*R*R, R, S2*S2*(15*S1-S2)+sx*sx*(sx-15*sy), A
fxch
fsqrt
fmulp // (sz*(3*sz-10*R)+3*R*R)*sqrt(R), S2*S2*(15*S1-S2)+sx*sx*(sx-15*sy), A
fmul qword [ecx]
fmul qword [edi - 16-112] //*dZmul
fadd st, st
fadd qword [esi + 40]
fstp qword [ecx] // S2*S2*(15*S1-S2)+sx*sx*(sx-15*sy), A
fmulp
fadd qword [esi + 24]
fstp qword [eax]
pop edi
pop esi
end;
procedure HybridIntP7(var x, y, z, w: Double; PIteration3D: TPIteration3D);
// x87 power-7 iteration step on x, y, z; w is never accessed.
// Registers as used below: eax = @x, edx = @y, ecx = @z; esi = PIteration3D
// (from [ebp + 8]), edi = pointer at [esi + 48] (PVars) plus 112 - every
// [edi + N-112] operand below therefore addresses PVars + N.
// Constants read (values per the inline comments): PVars+120 = 3, PVars+136 = 5,
// PVars+152 = 7, PVars+184 = 21, PVars+200 = 35, PVars+24 = tiny epsilon for the
// divisor, PVars-16 = dZmul.
// Intended results according to the inline annotations (not independently
// re-derived for this header), with sx = x*x, sy = y*y, sz = z*z, R = sx + sy,
// A = 1 - 7*sz*(3*R*R + sz*(sz - 5*R))/(R*R*R + eps):
//   y := A*y*(sx*(21*sy*sy + sx*(7*sx - 35*sy)) - sy*sy*sy)             + [esi + 32]
//   z := [esi + 40] - dZmul*z*(sz*sz*sz - 7*R*(sz*(3*sz - 5*R) + R*R))
//   x := A*x*(sx*(sx*(sx - 21*sy) + 35*sy*sy) - 7*sy*sy*sy)             + [esi + 24]
// Store order is y, z, x; x is multiplied in from memory before [eax] is overwritten.
// esi and edi are saved/restored.
asm
push esi
push edi
mov esi, [ebp + 8] //PIteration3D
fld qword [edx]
fmul st, st // y*y
fld qword [eax] // x, y*y
mov edi, [esi + 48] // PVars
fmul st, st // x*x, y*y
fld st // x*x, x*x, y*y
fadd st, st(2) // x*x+y*y = R, sx, sy
fld qword [ecx]
add edi, 112 // rebase: edi = PVars + 112
fmul st, st // sz, R, sx, sy
fld st(1) // R, sz, R, sx, sy
fmul qword [edi + 136-112] // 5R,
fsubr st, st(1) // sz-5R, sz, R, sx, sy
fmul st, st(1) // sz(sz-5R), sz, R, sx, sy
fld st(2)
fmul st, st // R*R, sz(sz-5R), sz, R, sx, sy
fxch // sz(sz-5R), R*R, sz, R, sx, sy
fld st(1) // R*R, sz(sz-5R), R*R, sz, R, sx, sy
fmul qword [edi + 120-112] // 3*R*R, sz(sz-5R), R*R, sz, R, sx, sy
faddp // 3RR+sz(sz-5R), R*R, sz, R, sx, sy
fmul st, st(2) // sz(3RR+sz(sz-5R)), R*R, sz, R, sx, sy
fmul qword [edi + 152-112]
fld st(1)
fmul st, st(4) // R*R*R, 7sz(3RR+sz(sz-5R)), R*R, sz, R, sx, sy
fadd qword [edi + 24-112] // 24-112 +1e-40
fdivp // 7sz(3RR+sz(sz-5R))/RRR, R*R, sz, R, sx, sy
fld1
fsubrp // A, R*R, sz, R, sx, sy
fld st(5) // sy, A, R*R, sz, R, sx, sy
fmul qword [edi + 200-112] // 35*sy, A, R*R, sz, R, sx, sy
fld st(5)
fmul qword [edi + 152-112] // 7*sx, 35*sy, A, R*R, sz, R, sx, sy
fsubrp // 7*sx-35*sy, A, R*R, sz, R, sx, sy
fmul st, st(5) // sx*(7*sx-35*sy), A, R*R, sz, R, sx, sy
fld st(6) // sy,
fmul st, st
fmul qword [edi + 184-112] // 21*sy*sy, sx*(7*sx-35*sy), A, R*R, sz, R, sx, sy
faddp // 21sysy+sx(7sx-35sy), A, R*R, sz, R, sx, sy
fmul st, st(5)
fld st(6)
fmul st, st
fmul st, st(7) // sysysy, sx(21sysy+sx(7sx-35sy)), A, R*R, sz, R, sx, sy
fsubp // sx(21sysy+sx(7sx-35sy))-sysysy, A, R*R, sz, R, sx, sy
fmul st, st(1)
fmul qword [edx]
fadd qword [esi + 32]
fstp qword [edx] // A, R*R, sz, R, sx, sy
fmul qword [eax] // z := J3 - PDouble(Integer(PVar) - 16)^*z*(sz*sz*sz - 7*R*(sz*(3*sz - 5*R) + R*R));
// x := A*x*(sx*(sx*(sx - 21*sy) + 35*sy*sy) - 7*sy*sy*sy) + J1;
fld st(3)
fmul qword [edi + 136-112] // 5R, A*x, R*R, sz, R, sx, sy
fld st(3) // sz, 5R, A*x, R*R, sz, R, sx, sy
fmul qword [edi + 120-112]
fsubrp // 3sz-5R, A*x, R*R, sz, R, sx, sy
fmul st, st(3) // sz(3sz-5R), A*x, R*R, sz, R, sx, sy
faddp st(2), st // A*x, RR+sz(3sz-5R), sz, R, sx, sy
fxch
fmul qword [edi + 152-112] // 7(RR+sz(3sz-5R)), A*x, sz, R, sx, sy
fmulp st(3), st // A*x, sz, 7R(sz(3sz-5R)+RR), sx, sy
fld st(1)
fmul st, st
fmulp st(2), st // A*x, szszsz, 7R(RR+sz(3sz-5R)), sx, sy
fxch // szszsz, A*x, 7R(RR+sz(3sz-5R)), sx, sy
fsubrp st(2), st // A*x, szszsz-7R(RR+sz(3sz-5R)), sx, sy
fld st(3) // sy, A*x, szszsz-7R(RR+sz(3sz-5R)), sx, sy
fmul qword [edi + 184-112] // 21sy, A*x, szszsz-7R(RR+sz(3sz-5R)), sx, sy
fsubr st, st(3) // sx-21sy, A*x, szszsz-7R(RR+sz(3sz-5R)), sx, sy
fmul st, st(3) // sx(sx-21sy)
fld st(4) // sy, sx(sx-21sy), A*x, szszsz-7R(RR+sz(3sz-5R)), sx, sy
fmul st, st // sysy, sx(sx-21sy), A*x, szszsz-7R(RR+sz(3sz-5R)), sx, sy
fmul qword [edi + 200-112] // 35sysy, sx(sx-21sy), A*x, szszsz-7R(RR+sz(3sz-5R)), sx, sy
faddp // 35sysy+sx(sx-21sy), A*x, szszsz-7R(RR+sz(3sz-5R)), sx, sy
fmulp st(3), st // A*x, szszsz-7R(RR+sz(3sz-5R)), sx(35sysy+sx(sx-21sy)), sy
fxch st(3) // sy, szszsz-7R(RR+sz(3sz-5R)), sx(35sysy+sx(sx-21sy)), A*x
fld st
fmul st, st
fmulp
fmul qword [edi + 152-112] // 7sysysy, szszsz-7R(RR+sz(3sz-5R)), sx(35sysy+sx(sx-21sy)), A*x
fsubp st(2), st // szszsz-7R(RR+sz(3sz-5R)), sx(35sysy+sx(sx-21sy))-7sysysy, A*x
fmul qword [ecx]
fmul qword [edi - 16-112] //*dZmul
fsubr qword [esi + 40]
fstp qword [ecx] // sx(35sysy+sx(sx-21sy))-7sysysy, A*x
fmulp
fadd qword [esi + 24]
fstp qword [eax]
pop edi
pop esi
end;
procedure HybridIntP8(var x, y, z, w: Double; PIteration3D: TPIteration3D); //P8 white's formula
// x87 power-8 iteration step on x, y, z; w is never accessed.
// Registers as used below: eax = @x, edx = @y, ecx = @z; esi = PIteration3D
// (from [ebp + 8]), edi = pointer at [esi + 48] (PVars) plus 88, so [edi + N]
// addresses PVars + N + 88.
// Operands read relative to the rebased edi (values per the inline comments):
//   [edi + 56] = 6, [edi + 64] = 7, [edi + 72] = 8, [edi + 104] = 28,
//   [edi + 120] = 70, [edi - 64] = tiny epsilon for the divisor,
//   [edi - 56] = 1, [edi - 104] = dZmul (PVars - 16).
// Intended results according to the inline annotations (not independently
// re-derived for this header), with xx = x*x, yy = y*y, zz = z*z, r = xx + yy,
// a = 1 + (zz^2*(70*r*r + zz^2) - 28*zz*r*(zz^2 + r*r))/(r^4 + eps):
//   z := [esi + 40] - 8*dZmul*z*Sqrt(r)*(zz - r)*(zz^2 - 6*r*zz + r*r)
//   y := 8*a*x*y*(yy^2*(7*xx - yy) + xx^2*(xx - 7*yy))             + [esi + 32]
//   x := a*(xx^4 + yy^2*(70*xx^2 + yy^2) - 28*xx*yy*(yy^2 + xx^2)) + [esi + 24]
// Store order is z, y, x; x and y are multiplied in from memory before [edx]
// is overwritten.
// esi and edi are saved/restored.
asm
push esi
push edi
mov esi, [ebp + 8] //PIteration3D
fld qword [eax] //x
mov edi, [esi + 48] //PVars
fmul st(0), st(0) //xx
fld qword [edx] //y
add edi, 88 // rebase: edi = PVars + 88
fmul st(0), st(0) //yy,xx
fld qword [ecx] //z,yy,xx
fmul st(0), st(0) //zz,yy,xx
fld st(2) //xx,zz,yy,xx
fadd st(0), st(2) //xx+yy=r,zz,yy,xx
fld st(0) //r,r,zz,yy,xx
fmul st(0), st(1) //rr,r,zz,yy,xx
fld st(2)
fmul st(0), st(0) //zzzz(S3*S3),rr,r,zz,yy,xx
fld st(2) //r,zzzz(S3*S3),rr,r,zz,yy,xx z calculation
fmul st(0), st(4) //r*zz
fmul qword [edi + 56] //6*r*zz,zzzz(S3*S3),rr,r,zz,yy,xx
fsubr st(0), st(1) //zzzz-6rzz,zzzz,rr,r,zz,yy,xx
fadd st(0), st(2) //zzzz-6rzz+rr,zzzz,rr,r,zz,yy,xx
fld st(4) //zz,zzzz-6rzz+rr,zzzz,rr,r,zz,yy,xx
fsub st(0), st(4) //zz-r,zzzz-6rzz+rr,zzzz,rr,r,zz,yy,xx
fmulp //(zz-r)*(zzzz-6rzz+rr),zzzz,rr,r,zz,yy,xx
fld st(3) //r,(zz-r)*(zzzz-6rzz+rr),zzzz,rr,r,zz,yy,xx
fsqrt
fmulp //sqrt(r)*(zz-r)*(zzzz-6rzz+rr),zzzz,rr,r,zz,yy,xx
fmul qword [ecx] //*z
fmul qword [edi + 72] //*8
fmul qword [edi - 104] //*dZmul
fchs
fadd qword [esi + 40] //+J3
fstp qword [ecx] //zzzz,rr,r,zz,yy,xx
fld st(0) //zzzz,zzzz,rr,r,zz,yy,xx a calculation
fadd st(0), st(2) //zzzz+rr,zzzz,rr,r,zz,yy,xx
fmulp st(3), st(0) //zzzz,rr,r*(zzzz+rr),zz,yy,xx
fld st(1) //rr,zzzz,rr,r*(zzzz+rr),zz,yy,xx
fmul qword [edi + 120] //rr*70,zzzz,rr,r*(zzzz+rr),zz,yy,xx
fadd st(0), st(1)
fmulp //(rr*70+zzzz)*zzzz,rr,r*(zzzz+rr),zz,yy,xx
fxch st(2) //r*(zzzz+rr),rr,(rr*70+zzzz)*zzzz,zz,yy,xx
fmulp st(3), st(0) //rr,(rr*70+zzzz)*zzzz,zz*r*(zzzz+rr),yy,xx
fxch st(2) //zz*r*(zzzz+rr),(rr*70+zzzz)*zzzz,rr,yy,xx
fmul qword [edi + 104] //28*zz*r*(zzzz+rr),(rr*70+zzzz)*zzzz,rr,yy,xx
fsubp //(rr*70+zzzz)*zzzz-28*zz*r*(zzzz+rr),rr,yy,xx
fxch st(1)
fmul st(0), st(0) //rrrr,(rr*70+zzzz)*zzzz-28*zz*r*(zzzz+rr),yy,xx
fadd qword [edi - 64] // 24-88 +1e-40
fdivp //(zzzz*(rr*70+zzzz)-28*zz*r*(zzzz+rr))/rrrr,yy,xx
fadd qword [edi - 56] //a,yy,xx +1
fld st(1) //yy,a,yy,xx y calculation
fmul qword [edi + 64] //7*yy,a,yy,xx + 152-128=24
fld st(3) //xx,7*yy,a,yy,xx
fmul qword [edi + 64] //7*xx,7*yy,a,yy,xx
fsub st(0), st(3) //7*xx-yy,7*yy,a,yy,xx
fld st(4) //xx,7*xx-yy,7*yy,a,yy,xx
fsubr st(2), st(0) //xx,7*xx-yy,xx-7*yy,a,yy,xx
fmul st(0), st(0) //xxxx,7*xx-yy,xx-7*yy,a,yy,xx
fmul st(2), st(0) //xxxx,7xx-yy,xxxx(xx-7yy),a,yy,xx
fld st(4) //yy,xxxx,7xx-yy,xxxx(xx-7yy),a,yy,xx
fmul st(0), st(0) //yyyy,xxxx,7xx-yy,xxxx(xx-7yy),a,yy,xx
fmul st(2), st(0) //yyyy,xxxx,yyyy(7xx-yy),xxxx(xx-7yy),a,yy,xx
fxch st(2) //yyyy(7xx-yy),xxxx,yyyy,xxxx(xx-7yy),a,yy,xx
faddp st(3), st(0) //xxxx,yyyy,yyyy(7xx-yy)+xxxx(xx-7yy),a,yy,xx
fxch st(2) //yyyy(7xx-yy)+xxxx(xx-7yy),yyyy,xxxx,a,yy,xx
fmul qword [edi + 72] //*8
fmul qword [eax] //*x
fmul qword [edx] //*y
fmul st(0), st(3) //*a
fadd qword [esi + 32] //+J2
fstp qword [edx] //yyyy,xxxx,a,yy,xx
fld st(1) //xxxx,yyyy,xxxx,a,yy,xx
fmul qword [edi + 120] //70xxxx,yyyy,xxxx,a,yy,xx
fadd st(0), st(1) //70xxxx+yyyy,yyyy,xxxx,a,yy,xx
fmul st(0), st(1) //yyyy(70xxxx+yyyy),yyyy,xxxx,a,yy,xx
fxch st(1) //yyyy,yyyy(70xxxx+yyyy),xxxx,a,yy,xx
fadd st(0), st(2) //yyyy+xxxx,yyyy(70xxxx+yyyy),xxxx,a,yy,xx
fmulp st(4), st(0) //yyyy(70xxxx+yyyy),xxxx,a,yy(yyyy+xxxx),xx
fxch st(4) //xx,xxxx,a,yy(yyyy+xxxx),yyyy(70xxxx+yyyy)
fmulp st(3), st(0) //xxxx,a,xxyy(yyyy+xxxx),yyyy(70xxxx+yyyy)
fmul st(0), st(0) //xxxx*xxxx,a,xxyy(yyyy+xxxx),yyyy(70xxxx+yyyy)
faddp st(3), st(0) //a,xxyy(yyyy+xxxx),xxxx*xxxx+yyyy(70xxxx+yyyy)
fxch st(1) //xxyy(yyyy+xxxx),a,xxxx*xxxx+yyyy(70xxxx+yyyy)
fmul qword [edi + 104] //*28
fsubp st(2), st(0) //a,xxxx*xxxx+yyyy(70xxxx+yyyy)-28xxyy(yyyy+xxxx)
fmulp
fadd qword [esi + 24] //+J1
fstp qword [eax]
pop edi
pop esi
end;
procedure HybridCubeSSE2(var x, y, z, w: Double; PIteration3D: TPIteration3D); // is used in alt hybrid without DE on w
// SSE2 box-fold / radius-scale step on x, y, z; w is never accessed.
// Registers as used below: eax = @x, edx = @y, ecx = @z; esi = PIteration3D
// (from [ebp + 8]), ebx = pointer at [esi + 48].
// Loads 16 bytes at @x as the pair (x,y) (so y is assumed to follow x in memory -
// TODO confirm), but stores x and y separately through eax and edx.
// Steps:
//   1. Fold each component: v := 2*min(max(v, lo), hi) - v, with lo = [ebx - 64]
//      and hi = [ebx - 48] (16-byte operands of maxpd/minpd; these memory operands
//      must be 16-byte aligned for the packed forms).
//   2. r2 := x*x + y*y + z*z of the folded values.
//   3. m := [ebx - 24]          if r2 <  [ebx - 32]
//      m := [ebx - 16] / r2     if r2 <  [ebx + 32]   (inline comment: compare with 1)
//      m := [ebx - 16]          otherwise
//   4. x := x*m + [esi + 24], y := y*m + [esi + 32], z := z*m + [esi + 40].
// Uses xmm0..xmm6 as scratch; esi and ebx are saved/restored.
asm
push esi
push ebx
mov esi, [ebp + 8] //PIteration3D
mov ebx, [esi + 48]
movupd xmm2, [eax] //[x,y]
movsd xmm4, [ecx] //[z]
movapd xmm0, xmm2
maxpd xmm0, [ebx - 64] //const:-1,-1,1,1
maxsd xmm4, [ebx - 64]
minpd xmm0, [ebx - 48]
minsd xmm4, [ebx - 48]
addpd xmm0, xmm0 // 2*clamp(x,y)
addsd xmm4, xmm4 // 2*clamp(z)
subpd xmm0, xmm2 // folded x, y
subsd xmm4, [ecx] // folded z
movapd xmm1, xmm0 //x, y
movsd xmm5, xmm4
mulpd xmm1, xmm1 //x*x, y*y
mulsd xmm5, xmm5 //z*z
pshufd xmm6, xmm1, $4E //y*y, x*x copies and swaps hi<>lo
addsd xmm1, xmm5 //x*x + z*z
addsd xmm1, xmm6 // w = sqr(r)
ucomisd xmm1, [ebx - 32] //<dOption2 was:dOpt3
jnb @u1
movsd xmm3, [ebx - 24] //dOption1
jmp @u3
@u1:ucomisd xmm1, [ebx + 32] //<1 ? ucomisd slow?
movsd xmm3, [ebx - 16] //dPow = scale //Was:dOpt2 (movsd leaves the flags from ucomisd intact)
jnb @u3
divsd xmm3, xmm1
@u3:shufpd xmm3, xmm3, 0 // broadcast multiplier to both lanes
movupd xmm5, [esi + 24] //[J1,J2]
mulpd xmm0, xmm3
mulsd xmm4, xmm3
addpd xmm0, xmm5
addsd xmm4, [esi + 40] //J3
movlpd [eax], xmm0
movhpd [edx], xmm0
movsd [ecx], xmm4
pop ebx
pop esi
end;
procedure HybridCube(var x, y, z, w: Double; PIteration3D: TPIteration3D);
// x87 box-fold / radius-scale step on x, y, z; w is never accessed.
// Registers as used below: eax = @x (copied to ebx because fnstsw overwrites ax),
// edx = @y, ecx = @z; esi first holds the pointer at [PIteration3D + 48], and is
// reloaded with PIteration3D itself before the final adds.
// Steps, with fold = [esi - 40]:
//   1. v := Abs(v + fold) - Abs(v - fold) - v for v = x, y, z.
//   2. r := x*x + y*y + z*z of the folded values.
//   3. mul := [esi - 24]        if r < [esi - 32]
//      mul := [esi - 16]        if r > 1
//      mul := [esi - 16] / r    otherwise
//   4. z := z*mul + [PIteration3D + 40], y := y*mul + [+32], x := x*mul + [+24].
// The comparisons use fcom + fnstsw ax + shr ah,1, which moves the C0 condition
// bit ("st(0) < operand") into the carry flag.
// esi and ebx are saved/restored, eax is restored to @x; the FPU stack is empty
// again on exit.
asm
push esi //Amazing box x87 with options fold fold x2
push ebx
mov esi, [ebp + 8] //PIteration3D
mov esi, [esi + 48] //was:PAligned16
mov ebx, eax
fld qword [esi - 40] //fold
fld qword [eax] //x,fold
fld st(0) //x,x,fold folding with x = abs(x+fold) - abs(x-fold) - x
fsub st(0), st(2)
fabs
fadd st(0), st(1) //abs(x-fold)+x,x,fold
fxch //x,abs(x-fold)+x,fold
fadd st(0), st(2)
fabs
fsubrp //abs(x+fold)-(abs(x-fold)+x),fold
fld qword [edx] //y,x,fold
fld st(0)
fsub st(0), st(3)
fabs
fadd st(0), st(1)
fxch
fadd st(0), st(3)
fabs
fsubrp
fld qword [ecx] //z,y,x,fold
fld st(0)
fsub st(0), st(4)
fabs
fadd st(0), st(1)
fxch
fadd st(0), st(4)
fabs
fsubrp //z,y,x,fold
fld st(0) //7
fmul st(0), st(1)
fld st(2) //8
fmul st(0), st(3)
faddp //7
fld st(3) //8
fmul st(0), st(4)
faddp //r,z,y,x,fold
fcom qword [esi - 32]
fnstsw ax
shr ah, 1 // CF := C0 (set when r < [esi - 32])
jnc @@7
fstp st(0)
fld qword [esi - 24]
jmp @@9
@@7: //r,z,y,x,fold
fld1
fcom st(1)
fnstsw ax
shr ah, 1 // CF := C0 (set when 1 < r)
jc @@8
fstp st(0)
fdivr qword [esi - 16] // [esi - 16] / r
jmp @@9
@@8:
fcompp // drop both 1 and r
fld qword [esi - 16]
@@9:
fmul st(3), st(0) //mul,zr,yr,xr,fold
fmul st(2), st(0)
fmulp //zr,yr,xr,fold
mov esi, [ebp + 8]
fadd qword [esi + 40]
fstp qword [ecx]
fadd qword [esi + 32]
fstp qword [edx]
fadd qword [esi + 24]
fstp qword [ebx]
fstp st(0) // discard fold
mov eax, ebx
pop ebx
pop esi
end;
procedure HybridCubeDE(var x, y, z, w: Double; PIteration3D: TPIteration3D);
// x87 box-fold / radius-scale step on x, y, z that also scales the double stored
// at @z + 8 by the same multiplier.
// Registers as used below: eax = @x (copied to ebx because fnstsw overwrites ax),
// edx = @y, ecx = @z; esi first holds the pointer at [PIteration3D + 48], and is
// reloaded with PIteration3D itself before the final adds.
// The value labelled w is accessed as [ecx + 8], not through the w parameter, so
// the routine assumes w is stored directly after z - TODO confirm at callers.
// Steps, with fold = [esi - 40] (both fold and -fold are kept on the FPU stack):
//   1. v := Abs(v + fold) - Abs(v - fold) - v for v = x, y, z.
//   2. r := x*x + y*y + z*z of the folded values.
//   3. mul := [esi - 24]        if r < [esi - 32]
//      mul := [esi - 16]        if r > 1
//      mul := [esi - 16] / r    otherwise
//   4. [ecx + 8] := [ecx + 8]*mul
//   5. z := z*mul + [PIteration3D + 40], y := y*mul + [+32], x := x*mul + [+24].
// esi and ebx are saved/restored, eax is restored to @x; the FPU stack is empty
// again on exit.
asm
push esi //Amazing box without adding c x87 with option fold
push ebx
mov esi, [ebp + 8] //PIteration3D
mov ebx, eax
mov esi, [esi + 48] //was:PAligned16
fld qword [esi - 40] //fold
fld st(0)
fchs //-fold,fold
fld qword [ebx] //x,-fold,fold
fld st(0) //x,x,-fold,fold folding with x = abs(x+fold) - abs(x-fold) - x
fadd st(0), st(2) // x + (-fold)
fabs
fadd st(0), st(1)
fxch //x,abs(x-fold)+x,-fold,fold
fadd st(0), st(3)
fabs
fsubrp //abs(x+fold)-(abs(x-fold)+x),-fold,fold
fld qword [edx] //y,x,-fold,fold
fld st(0)
fadd st(0), st(3)
fabs
fadd st(0), st(1)
fxch
fadd st(0), st(4)
fabs
fsubrp
fld qword [ecx] //z,y,x,-fold,fold
fld st(0)
fadd st(0), st(4)
fabs
fadd st(0), st(1)
fxch
fadd st(0), st(5)
fabs
fsubrp
fld st(0) //7
fmul st(0), st(1)
fld st(2) //8
fmul st(0), st(3)
faddp //7
fld st(3) //8
fmul st(0), st(4)
faddp //r,z,y,x,-fold,fold
fcom qword [esi - 32]
fnstsw ax
shr ah, 1 // CF := C0 (set when r < [esi - 32])
jnc @@7
fstp st(0)
fld qword [esi - 24]
jmp @@9
@@7: //r,z,y,x,-fold,fold
fld1
fcom st(1)
fnstsw ax
shr ah, 1 // CF := C0 (set when 1 < r)
jc @@8
fstp st(0)
fdivr qword [esi - 16] // [esi - 16] / r
jmp @@9
@@8:
fcompp // drop both 1 and r
fld qword [esi - 16]
@@9:
fld qword [ecx + 8] //w,mul,zr,yr,xr,-fold,fold
fmul st(0), st(1)
fstp qword [ecx + 8]
fmul st(3), st(0) //mul,zr,yr,xr,-fold,fold
fmul st(2), st(0)
fmulp //zr,yr,xr,-fold,fold
mov esi, [ebp + 8]
fadd qword [esi + 40]
fstp qword [ecx]
fadd qword [esi + 32]
fstp qword [edx]
fadd qword [esi + 24]
fstp qword [ebx]
fcompp // discard -fold and fold
mov eax, ebx
pop ebx
pop esi
end;
procedure HybridCubeSSE2DE(var x, y, z, w: Double; PIteration3D: TPIteration3D);
// SSE2 box-fold / radius-scale step on x, y, z that also scales the double stored
// at @z + 8 by the same multiplier.
// Registers as used below: eax = @x, ecx = @z; edx (@y) is never used. esi =
// PIteration3D (from [ebp + 8]), ebx = pointer at [esi + 48].
// The routine reads and writes 16 bytes at @x as the pair (x,y) and 16 bytes at @z
// as the pair (z,w), so it assumes x,y and z,w are adjacent in memory - TODO
// confirm at callers.
// Steps:
//   1. Fold each of x, y, z: v := 2*min(max(v, lo), hi) - v, with lo = [ebx - 64]
//      and hi = [ebx - 48] (memory operands of maxpd/minpd must be 16-byte aligned).
//   2. r2 := x*x + y*y + z*z of the folded values.
//   3. m := [ebx - 24]          if r2 <  [ebx - 32]
//      m := [ebx - 16] / r2     if r2 <  [ebx + 32]   (inline comment: compare with 1)
//      m := [ebx - 16]          otherwise
//   4. x := x*m + [esi + 24], y := y*m + [esi + 32], z := z*m + [esi + 40],
//      [ecx + 8] := [ecx + 8]*m.
// Uses xmm0..xmm5 as scratch; esi and ebx are saved/restored.
asm
push esi
push ebx
mov esi, [ebp + 8] //PIteration3D
mov ebx, [esi + 48] //was:PAligned16
movupd xmm2, [eax] //[x,y]
movsd xmm4, [ecx] //[z]
movapd xmm0, xmm2
maxpd xmm0, [ebx - 64] //const:-R,-R,R,R
maxsd xmm4, [ebx - 64]
minpd xmm0, [ebx - 48]
minsd xmm4, [ebx - 48]
addpd xmm0, xmm0 // 2*clamp(x,y)
addsd xmm4, xmm4 // 2*clamp(z)
subpd xmm0, xmm2 // folded x, y
subsd xmm4, [ecx] // folded z
movapd xmm1, xmm0 //x, y
movsd xmm5, xmm4
mulpd xmm1, xmm1 //x*x, y*y
mulsd xmm5, xmm5 //z*z
pshufd xmm2, xmm1, $4E //y*y, x*x copies and swaps hi<>lo
addsd xmm1, xmm5
addsd xmm1, xmm2 // w = sqr(r)
ucomisd xmm1, [ebx - 32] //<dOption2 //7/6 clocks ucomisd latency :-(
movsd xmm3, [ebx - 24] //dOption1 (movsd leaves the flags from ucomisd intact)
jb @u3
ucomisd xmm1, [ebx + 32] //<1 ?
movsd xmm3, [ebx - 16] //dPow = scale
jnb @u3
divsd xmm3, xmm1
@u3:
movhpd xmm4, [ecx + 8] // high lane := the double after z (w)
shufpd xmm3, xmm3, 0 //r, r
movupd xmm5, [esi + 24] //[J1,J2]
mulpd xmm0, xmm3
mulpd xmm4, xmm3 // z*m, w*m
addpd xmm0, xmm5
addsd xmm4, [esi + 40] //J3
movupd [eax], xmm0 // store x,y
movupd [ecx], xmm4 // store z,w
pop ebx
pop esi
end;
procedure HybridItIntPow2scale(var x, y, z, w: Double; PIteration3D: TPIteration3D); //sine bulb with scaling
// x87 power-2 iteration step with pre-division and post-multiplication by a scale
// factor; w is never accessed.
// Registers as used below: eax = @x, edx = @y, ecx = @z. Note the register roles
// are swapped relative to the neighbouring routines: edi = PIteration3D (from
// [ebp + 8]) and esi = pointer at [edi + 48].
// With s = [esi - 72], the inputs are first divided by s (x, y, z below are the
// divided values), then with a = x*x + y*y, A = (a - z*z)/a:
//   z := -2*z*Sqrt(a)*s + [edi + 40]
//   x := A*(x*x - y*y)*s + [edi + 24]
//   y := 2*x*y*A*s       + [edi + 32]
// Note: neither the division by s nor the division by a has an epsilon guard.
// esi and edi are saved/restored; the FPU stack is empty again on exit.
asm
push esi
push edi
mov edi, [ebp + 8]
mov esi, [edi + 48]
fld qword [ecx]
fld qword [edx]
fld qword [eax] //x,y,z
fld qword [esi - 72] // scaling
fld1
fdivrp // 1/scaling, x, y, z
fmul st(3), st(0)
fmul st(2), st(0)
fmulp // x, y, z each divided by scaling
fld st(1) //y,x,y,z
fmul st(0), st(2) // y*y,x,y,z
fld st(1) // x,y*y,x,y,z
fmul st(0), st(2) // x*x, y*y,x,y,z
fld st(0) // x*x, x*x, y*y,x,y,z
fadd st(0), st(2) // xx+yy, xx, yy,x,y,z
fld st(0) // xx+yy, xx+yy, xx, yy,x,y,z
fsqrt
fmul st(0), st(6) //*z
fadd st(0), st(0) //*2
fchs
fmul qword [esi - 72]
fadd qword [edi + 40]
fstp qword [ecx] //xx+yy, xx, yy,x,y,z
fld st(5) //z, xx+yy, xx, yy,x,y,z
fmulp st(6), st(0) //xx+yy, xx, yy,x,y,z*z
fld st(0) //xx+yy, xx+yy, xx, yy,x,y,z*z
fsubrp st(6), st(0) //xx+yy, xx, yy,x,y, a - z*z
fdivp st(5), st(0) //xx, yy,x,y, (a - z*z) / a = A
fsubrp //xx-yy,x,y, A
fmul st(0), st(3) //A(xx-yy),x,y, A
fmul qword [esi - 72]
fadd qword [edi + 24]
fstp qword [eax] //x,y, A
fmulp
fmulp //x*y*A
fadd st(0), st(0) //*2
fmul qword [esi - 72]
fadd qword [edi + 32]
fstp qword [edx]
pop edi
pop esi
end;
procedure HybridFolding(var x, y, z, w: Double; PIteration3D: TPIteration3D);
// Folds x, y and z in place, then forwards the same five arguments to the routine
// whose address is stored at [edi - 52].
// Registers as used below: eax = @x, edx = @y, ecx = @z (all left untouched so
// they are still valid for the forwarded call); esi = PIteration3D (from
// [ebp + 8]), edi = pointer at [esi + 48].
// Fold, with fold = [edi - 24]:  v := Abs(v + fold) - Abs(v - fold) - v.
// For the call, [ebp + 12] (the w argument) and esi are pushed in that order,
// mirroring this routine's own stack arguments. The pops that follow immediately
// assume the callee removes both pushed arguments itself - presumably it has the
// same signature and calling convention; confirm against whatever fills [edi - 52].
// esi, edi and ebx are saved/restored; the FPU stack is empty before the call.
asm
push esi
push edi
push ebx
mov esi, [ebp + 8] //PIteration3D
mov edi, [esi + 48]
fld qword [edi - 24] //fold
fld qword [eax] //x,fold
fld st(0) //x,x,fold folding with x = abs(x+fold) - abs(x-fold) - x
fsub st(0), st(2)
fabs
fadd st(0), st(1) //abs(x-fold)+x,x,fold
fxch //x,abs(x-fold)+x,fold
fadd st(0), st(2)
fabs
fsubrp //abs(x+fold)-(abs(x-fold)+x),fold
fstp qword [eax]
fld qword [edx] //y,fold
fld st(0)
fsub st(0), st(2)
fabs
fadd st(0), st(1)
fxch
fadd st(0), st(2)
fabs
fsubrp
fstp qword [edx]
fld qword [ecx] //z,fold
fld st(0)
fsub st(0), st(2) //z-fold,z,fold
fabs
fadd st(0), st(1) //z+abs(z-fold),z,fold
fxch st(2) //fold,z,z+abs(z-fold)
faddp //z+fold,z+abs(z-fold) (fold is consumed here)
fabs
fsubrp //z'
fstp qword [ecx]
mov ebx, [ebp + 12] // the w argument of this routine
push ebx
push esi
call [edi - 52]
pop ebx
pop edi
pop esi
end;
procedure HybridCustomIFS; //for IFS, different calling convention! esi+edi is @it3dext.x+128 and @Pvar
// The asm body is empty: no instructions are written between asm and end.
// NOTE(review): presumably a placeholder whose address serves as a marker or is
// patched/replaced elsewhere - confirm where this symbol is referenced.
asm
end;
procedure AexionC(var x, y, z, w: Double; PIteration3D: TPIteration3D);
// x87 iteration step in polar form that can additionally rewrite the three added
// constants stored at [esi + 24], [esi + 32], [esi + 40]. w is never accessed.
// Registers on entry as used below: eax = @x, edx = @y, ecx = @z; esi =
// PIteration3D (from [ebp + 8]), edi = pointer at [esi + 48].
//
// Part 1 (always executed), with pow = [edi - 16] and J1/J2/J3 = [esi + 24/32/40]:
//   r1 := x*x + y*y + z*z
//   th := pow * atan2(Sqrt(x*x + z*z), y)
//   ph := pow * atan2(z, x)
//   r  := r1 raised to (pow * [edi - 8])
//   x := r*Cos(ph)*Cos(th)        + J1
//   z := r*Sin(th)*[edi - 24]     + J3
//   y := r*Cos(th)*Sin(ph)        + J2
//
// Part 2 (skipped when dword [edi - 28] = 0) recomputes J1, J2, J3 in place:
//   pd := [edi - 40]; if dword [edi - 52] <> 0, pd is multiplied by the length of
//         (x - J1, y - J2, z - J3) using the freshly stored x, y, z.
//   r1 := length of (J1, J2, J3).
//   Two angles are taken with fpatan from either (J1, J2, J3) or the differences
//   (x - J1, y - J2, z - J3), selected and permuted by the flag dword [edi - 56]
//   (tests on bit values 1, 2, 4, 8 and 16 - see the inline comments).
//   If dword [edi - 32] <> 0 and the sign bit of the dword at [edx + eax - 4] is
//   clear, the top-of-stack angle is negated; eax is a byte offset of 0, 8 or 16
//   chosen by the flags. NOTE(review): this addresses the high dword of a double
//   located at @y + eax - 8, i.e. it assumes x, y, z are adjacent in memory -
//   confirm at callers.
//   Both angles are multiplied by pd and converted back:
//     J1 := r1*Cos(a)*Cos(b), J3 := r1*Sin(b)*[edi - 48], J2 := r1*Cos(b)*Sin(a)
//   where b is the angle in st(0) and a the angle in st(1) at label @@10.
//
// eax is clobbered in part 2 (used as an offset); esi, edi, ebx and ecx are
// saved/restored. The FPU stack is empty again on exit.
asm
push esi
push edi
push ebx
push ecx
mov esi, [ebp + 8] //PIteration3D
mov edi, [esi + 48]
fld qword [ecx]
fld qword [edx]
fld qword [eax] //x,y,z
fld st(1)
fmul st, st //yy,x,y,z
fxch st(2) //y,x,yy,z
fld st(3)
fmul st, st //zz,y,x,yy,z
fld st(2)
fmul st, st //xx,zz,y,x,yy,z
fld st(1) //zz,xx,zz,y,x,yy,z
fadd st, st(1)
faddp st(5), st //xx,zz,y,x,r1,z
faddp
fsqrt //sqrt(xx+zz),y,x,r1,z
fxch
fpatan //th,x,r1,z
fxch st(3)
fxch //x,z,r1,th
fpatan //ph,r1,th
fld qword [edi - 16] //pow,ph,r1,th
fmul st(3), st
fmul st(1), st
fmul qword [edi - 8] //pow*0.5,ph,r1,th
fxch //ph,pow',r1,th
fxch st(2) //r1,pow',ph,th
fldln2 //power function base,expo -> st, st(1)
fxch
fyl2x
fxch
fmulp
fldl2e
fmulp
fld st(0)
frndint
fsub st(1), st(0)
fxch
f2xm1
fld1
faddp
fscale
fstp st(1) //r1',ph,th
fxch st(2) //th, ph, r1
fsincos //ct,st, ph, r1
fxch st(2) //ph, st,ct, r1
fsincos //cosP,sinP, sinT,cosT, r1
fmul st, st(3)
fmul st, st(4)
fadd qword [esi + 24]
fstp qword [eax] //sinP, sinT,cosT, r1
fmulp st(2), st //sinT,cosT*SinP, r1
fmul st, st(2)
fmul qword [edi - 24]
fadd qword [esi + 40]
fstp qword [ecx] //cosT*SinP, r1
fmulp
fadd qword [esi + 32]
fstp qword [edx]
cmp dword [edi - 28], 0 // part 2 enabled?
jz @@1
fld qword [edi - 40] //pd^
cmp dword [edi - 52], 0 // scale pd by the distance of the new point from J?
jz @@2
fld qword [eax]
fsub qword [esi + 24]
fmul st, st
fld qword [edx]
fsub qword [esi + 32]
fmul st, st
faddp
fld qword [ecx]
fsub qword [esi + 40]
fmul st, st
faddp
fsqrt
fmulp
@@2: //pd^
fld qword [esi + 24]
fmul st, st
fld qword [esi + 32]
fmul st, st
faddp
fld qword [esi + 40]
fmul st, st
faddp
fsqrt //r1, pd^
mov ebx, [edi - 56]
test ebx, 16 //Modus Bit1: Flip atan theta, 2: Flip atan phi, 3: Flip theta and phi, 4: Flip CxCy, 5: diffs
jz @@4 // r,y z,x y<>x/z y<>x
fld qword [eax]
fsub qword [esi + 24]
fld qword [ecx]
fsub qword [esi + 40]
fld qword [edx]
fsub qword [esi + 32]
jmp @@5
@@4:
fld qword [esi + 24]
fld qword [esi + 40]
fld qword [esi + 32]
@@5: //Cy, Cz, Cx, r1, pd^
xor eax, eax //offset for cond phi test, normally x, or z if only flip-at2
xor ecx, ecx
add ecx, 8
test ebx, 8 //Modus bit4: flip CYCX
jz @@6
fxch //st(2) //(Cx,Cz,Cy)
add ecx, 8 //(Cz,Cx,Cy) test: Flip Cy<>Cz
@@6: //y, z, x, r1, pd^
fld st(1)
fmul st, st
fld st(3)
fmul st, st
faddp //xx+zz, y, z, x, r1, pd^
fsqrt //sqrt(sqr(j1)+sqr(j3)), y, z, x, r1, pd
test ebx, 1 //flip AT theta
jnz @@8
fxch
@@8:
fpatan //th, Cz, Cx, r1, pd
fxch st(2) //Cx, Cz, th, r1, pd
test ebx, 2 //flip AT phi
jz @@9
fxch
add eax, 24
sub eax, ecx // eax := 24 - ecx
@@9:
fpatan //ph, th, r1, pd
test ebx, 4
jz @@7
fxch
mov eax, ecx
@@7:
cmp dword [edi - 32], 0
jz @@10
test dword [edx + eax - 4], $80000000 // sign bit of the double ending at edx + eax
jnz @@10
fchs
@@10:
fmul st, st(3)
fxch st(3) //pd, th, r1, ph
fmulp //th, r1, ph
fsincos //costh,sinth,r1,ph
fxch st(3) //ph,sinth,r1,costh
fsincos //Cx,Sx,Sy,r1,Cy
fmul st, st(4)
fmul st, st(3)
fstp qword [esi + 24] //Sx,Sy,r1,Cy
fmulp st(3), st //Sy,r1,Cy*Sx
fmul st, st(1)
fmul qword [edi - 48]
fstp qword [esi + 40] //r1,Cy*Sx
fmulp
fstp qword [esi + 32]
@@1:
pop ecx
pop ebx
pop edi
pop esi
end;
// Feeds datasize bytes starting at data into the running CRC kept in the
// object's curcrc field, one byte at a time:
//   crc := CrcTable[(crc xor b) and $FF] xor (crc shr 8)
// Registers on entry: eax = Self, edx = @data, ecx = datasize.
// All general registers are preserved (pushad/popad).
// A zero or negative datasize leaves curcrc untouched.
procedure TCrc32Stream.add(var data; datasize:longint); assembler; register;
asm
pushad
test ecx, ecx
jle @done // a negative count must not be treated as a huge unsigned one
mov edi, eax // edi = Self
mov esi, edx // esi = read pointer
mov edx, [TCrc32Stream(edi).curcrc] // edx = running crc
mov ebx, OFFSET CrcTable // table base is loop-invariant
@lp1:
movzx eax, byte [esi] // eax = next input byte, zero-extended
inc esi
xor al, dl // table index = (crc xor b) and $FF
mov eax, [ebx + eax * 4] // 4-byte table entries
shr edx, 8
xor edx, eax
dec ecx
jnz @lp1
mov [TCrc32Stream(edi).curcrc], edx
@done:
popad
end;
// Returns (in ST(0)) the dot product of light with the vector
//   view - 2 * dot(norm, view) * norm
// using the first three Single components of each argument.
// Registers on entry: eax = norm, edx = light, ecx = view.
// No normalisation is performed in this routine despite its name.
function DotOf2VecNormalize(norm, light, view: TPSVec): Single;
asm
fld dword [eax]
fld dword [eax + 4]
fld dword [eax + 8] //norm2, norm1, norm0
// accumulate dot(norm, view) on top of the three norm components
fld dword [ecx]
fmul st, st(3)
fld dword [ecx + 4]
fmul st, st(3)
faddp
fld dword [ecx + 8]
fmul st, st(2)
faddp
fadd st, st //d2, norm2, norm1, norm0
// scale norm by d2 = 2*dot(norm, view)
fmul st(3), st
fmul st(2), st
fmulp //norm2', norm1', norm0'
// per component: (view[i] - norm[i]') * light[i], summed
fsubr dword [ecx + 8]
fmul dword [edx + 8]
fxch
fsubr dword [ecx + 4]
fmul dword [edx + 4]
faddp
fxch
fsubr dword [ecx]
fmul dword [edx]
faddp
end;
// Computes an ambient-shadow value and stores it to dAmbS.
//   Shadow = 1 if the signed word at PsiLight+12 is >= 16383,
//            else that word * (1/16383)
//   sAmplitude <  1 : dAmbS = 1 - sAmplitude * Shadow
//   sAmplitude >= 1 : d = 1 - Shadow; dAmbS = d + (d*d - d) * (sAmplitude - 1)
// Registers on entry: eax = @dAmbS, edx = @sAmplitude, ecx = PsiLight.
// NOTE(review): the first exit is a bare `ret`; that is only valid while the
// compiler generates no stack frame for this routine.
procedure calcAmbshadow(var dAmbS, sAmplitude: Single; PsiLight: TPsiLight5);
const s1d16383: Single = 1/16383;
asm
fld1
cmp word [ecx + 12], 16383
jl @@2
fld1
jmp @@3
@@2:
fild word [ecx + 12]
fmul s1d16383
@@3:
fld dword [edx] //Ampl, Shadow, 1
// eax is about to be clobbered by fnstsw, so keep @dAmbS in edx
mov edx, eax
fcom st(2)
fnstsw ax
// C0 (bit 0 of ah) -> CF: set when Ampl < 1
shr ah, 1
jc @@1
fxch
fsubr st, st(2) //dAmbS, Ampl, 1
fxch //Ampl,dAmbS,1
fsubrp st(2), st //dAmbS,Ampl-1
fld st
fmul st, st
fsub st, st(1) //Sqr(dAmbS)-dAmbS,dAmbS,Ampl-1
fmulp st(2), st
faddp
fstp dword [edx]
ret
@@1:
// Ampl < 1: 1 - Ampl*Shadow
fmulp
fsubp
fstp dword [edx]
end;
// Result[i] := Sqr(sv[i]) * s1d255 for i = 0..2.
// Registers on entry: eax = @sv, edx = @Result.
// Only the first three Single components are read and written.
function SqrSV255(const sv: TSVec): TSVec;
asm
fld dword [eax]
fmul st, st
fld dword [eax + 4]
fmul st, st
fld dword [eax + 8]
fmul st, st
fld s1d255 // k, sv2^2, sv1^2, sv0^2
fmul st(3), st
fmul st(2), st
fmulp // sv2^2*k, sv1^2*k, sv0^2*k
// Operand size stated explicitly: the components are 4-byte Singles, and a
// wider store would overwrite the neighbouring component.
fstp dword [edx + 8]
fstp dword [edx + 4]
fstp dword [edx]
end;
// Expands a 10-bit packed value: the low 7 bits are a mantissa, bits 7..9 a
// shift count; the result is mantissa shl count.
// In: eax = Win. Out: eax = edx = result. ecx is preserved.
function ConvertVLight(Win: Integer): Integer;
asm
push ecx
and eax, $3FF // keep the 10 packed bits
mov ecx, eax
and eax, $7F // eax = mantissa
shr ecx, 7 // cl = shift count (0..7)
shl eax, cl
mov edx, eax // result is mirrored into edx
pop ecx
end;
// Result[i] := SPos[i] + SPosPlus[i] * Step for i = 0..2.
// Registers on entry: eax = @SPos, edx = @SPosPlus, ecx = Step; the address
// of Result is read from [ebp + 8].
// NOTE(review): uses [ebp - 4] as scratch for the integer->float conversion;
// this relies on the compiler-generated frame and on the preceding push
// having reserved that slot -- confirm if the prologue changes.
function AddSVecWeight(const SPos, SPosPlus: TSVec; const Step: Integer): TSVec; //math3d: procedure AddSVecWeight(V1, V2: TPSVec; W: Double);
asm
push ecx
push ebx
mov ebx, [ebp + 8]
mov [ebp - 4], ecx
fld dword [edx]
fld dword [edx + 4]
fld dword [edx + 8]
fild dword [ebp - 4]
// stack: Step, p2, p1, p0 -> scale all three by Step
fmul st(3), st
fmul st(2), st
fmulp
fadd dword [eax + 8]
fstp dword [ebx + 8]
fadd dword [eax + 4]
fstp dword [ebx + 4]
fadd dword [eax]
fstp dword [ebx]
pop ebx
pop ecx
end;
// In-place tone scaling of the vector at sv1: x := x / Sqrt(Sqr(0.9 * x) + 1).
// eax = sv1.
// SSE path (SupportSSE <> 0): processes all four packed Singles (16 bytes)
// and uses the rsqrtps approximation instead of an exact divide.
// x87 path: processes the first three components exactly.
// NOTE(review): the SSE path leaves through a bare `ret`, which assumes the
// compiler generates no stack frame here.
procedure ScaleSVecHDR(sv1: TPSVec);
const s09: Single = 0.9;
asm
cmp SupportSSE, 0
jz @@1
movss xmm0, s09
movups xmm1, cSVec1
// broadcast 0.9 to all four lanes
shufps xmm0, xmm0, 0
movups xmm2, dqword [eax]
movaps xmm3, xmm2
mulps xmm2, xmm0
mulps xmm2, xmm2
addps xmm2, xmm1
rsqrtps xmm2, xmm2
mulps xmm3, xmm2
movups dqword [eax], xmm3
ret
@@1:
// x87 fallback; the constants 0.9 and 1 stay on the stack for all components
fld1
fld s09
fld dword [eax]
fmul st, st(1)
fmul st, st
fadd st, st(2)
fsqrt
fdivr dword [eax]
fstp dword [eax]
fld dword [eax + 4]
fmul st, st(1)
fmul st, st
fadd st, st(2)
fsqrt
fdivr dword [eax + 4]
fstp dword [eax + 4]
// last component consumes the two constants (fmulp / faddp)
fld dword [eax + 8]
fmulp
fmul st, st
faddp
fsqrt
fdivr dword [eax + 8]
fstp dword [eax + 8]
end;
// In place: s := s / Sqrt(Sqr(s * 0.9) + 1). eax = @s.
// The SSE path (SupportSSE <> 0) uses the rsqrtss approximation; the x87 path
// computes the exact square root and divide.
// NOTE(review): the SSE path exits with a bare `ret` (assumes no stack frame).
procedure ScaleSingleHDR(var s: Single);
const s09: Single = 0.9;
s1: Single = 1;
asm
cmp SupportSSE, 0
jz @@1
movss xmm0, dword [eax]
movss xmm1, xmm0
mulss xmm0, s09
mulss xmm0, xmm0
addss xmm0, s1
rsqrtss xmm0, xmm0
mulss xmm1, xmm0
movss dword [eax], xmm1
ret
@@1:
fld1 //x := x / Sqrt(Sqr(x * 0.9) + 1);
fld dword [eax]
fmul s09
fmul st, st
faddp
fsqrt
fdivr dword [eax]
fstp dword [eax]
end;
// In place: s := Sqrt(s*s / Sqrt(Sqr(s*s * 0.9) + 1)). eax = @s.
// The SSE path (SupportSSE <> 0) uses the rsqrtss approximation for the inner
// reciprocal square root; the x87 path is exact.
// NOTE(review): the SSE path exits with a bare `ret` (assumes no stack frame).
procedure ScaleSingleHDRsqr(var s: Single);
const s09: Single = 0.9;
s1: Single = 1;
asm
cmp SupportSSE, 0
jz @@1
movss xmm0, dword [eax]
mulss xmm0, xmm0
// xmm1 keeps s*s while xmm0 becomes 1/Sqrt(Sqr(0.9*s*s) + 1)
movss xmm1, xmm0
mulss xmm0, s09
mulss xmm0, xmm0
addss xmm0, s1
rsqrtss xmm0, xmm0
mulss xmm1, xmm0
sqrtss xmm1, xmm1
movss dword [eax], xmm1
ret
@@1:
fld dword [eax] //x := Sqrt(x*x / Sqrt(Sqr(x*x * 0.9) + 1));
fmul st, st
fld st //xx,xx
fmul s09
fmul st, st
fld1
faddp
fsqrt
fdivp
fsqrt
fstp dword [eax]
end;
// Converts the float vector at sv1 to three bytes at pc:
// each component is clamped to [0, cSVec1], multiplied by cSVec255 and
// converted to integer; component 2 goes to byte 0, component 1 to byte 1,
// component 0 to byte 2. Byte 3 of pc^ is not written.
// eax = sv1, edx = pc. Uses 16 bytes of stack as a spill buffer.
procedure SVec2ColSSE(sv1: TPSVec; pc: PCardinal);
asm // eax edx
add esp, -16
movups xmm0, dqword [eax]
movups xmm1, cSVec1
movups xmm2, cSVec255
xorps xmm3, xmm3
minps xmm0, xmm1
maxps xmm0, xmm3
mulps xmm0, xmm2
movups [esp], xmm0
cvtss2si eax, xmm0
// The two 16-bit stores overlap on purpose: each later store overwrites the
// upper byte of the previous one, so the order below must be kept.
fld dword [esp + 8]
fistp word [edx]
fld dword [esp + 4]
fistp word [edx + 1]
mov [edx + 2], al
add esp, 16
end;
// In place on the four packed Singles at sv (eax), per lane with r = input:
//   rc = max(r, sftc); w starts at wstart and is refined three times by
//        w := w * (w^3 + 2*rc) / (2*w^3 + rc)
//   result = (min(r, sftc) - sftc) * smul + w
// i.e. the iterated cube-root estimate for r >= sftc, continued by a straight
// line of slope smul below sftc.
// Uses xmm0..xmm7 and edx.
procedure LabCubicRootSSE(sv: TPSVec);
const wstart: array[0..3] of Single = (0.4275, 0.4275, 0.4275, 0.4275);
sftc: array[0..3] of Single = (216/24389, 216/24389, 216/24389, 216/24389);
smul: array[0..3] of Single = (841/108, 841/108, 841/108, 841/108);
asm
movups xmm0, [eax] //r
movaps xmm4, xmm0
movups xmm6, sftc
movups xmm1, wstart
maxps xmm4, xmm6
movups xmm7, smul
movaps xmm5, xmm4
minps xmm0, xmm6
addps xmm5, xmm5 //2r
// three refinement passes
mov edx, 3
@ll: movaps xmm2, xmm1
mulps xmm2, xmm2
mulps xmm2, xmm1 //www
movaps xmm3, xmm2
addps xmm3, xmm3
addps xmm2, xmm5
addps xmm3, xmm4
mulps xmm1, xmm2
divps xmm1, xmm3
dec edx
jnz @ll
// linear part: zero when r >= sftc
subps xmm0, xmm6
mulps xmm0, xmm7
addps xmm0, xmm1
movups [eax], xmm0
end;
// Variant of the packed Lab cube-root helper that avoids the divide.
// Per lane with r = input and rc = max(r, sftc), w starts at wstart and is
// refined three times by
//   w := (4/3) * rsqrt(rsqrt(w * rc)) - w / 3
// where the double rsqrtps approximates (w*rc)^(1/4).
// Result = (min(r, sftc) - sftc) * smul + w, written back to sv (eax).
// Uses xmm0..xmm7 and edx.
procedure LabCubicRoot2SSE(sv: TPSVec); //rsqrtps less precise!
const wstart: array[0..3] of Single = (0.3661, 0.3661, 0.3661, 0.3661);
sftc: array[0..3] of Single = (216/24389, 216/24389, 216/24389, 216/24389);
smul: array[0..3] of Single = (841/108, 841/108, 841/108, 841/108);
s1d3: array[0..3] of Single = (1/3, 1/3, 1/3, 1/3);
s4d3: array[0..3] of Single = (4/3, 4/3, 4/3, 4/3);
asm
movups xmm0, [eax] //r
movaps xmm4, xmm0
movups xmm6, sftc
movups xmm1, wstart
movups xmm5, s4d3
movups xmm3, s1d3
maxps xmm4, xmm6
movups xmm7, smul
minps xmm0, xmm6
mov edx, 3
@ll: movaps xmm2, xmm1
mulps xmm1, xmm4 //w*r
rsqrtps xmm1, xmm1
mulps xmm2, xmm3
rsqrtps xmm1, xmm1
mulps xmm1, xmm5
subps xmm1, xmm2
dec edx
jnz @ll
// linear part: zero when r >= sftc
subps xmm0, xmm6
mulps xmm0, xmm7
addps xmm0, xmm1
movups [eax], xmm0
end;
// In place on the four packed Singles at sv (eax), per lane with r = input:
//   result = max(r, sftc)^3 + (min(r, sftc) - sftc) * smul
// i.e. the cube for r >= sftc, continued by a straight line of slope smul
// below sftc.
procedure LabPow3SSE(sv: TPSVec);
const sftc: array[0..3] of Single = (6/29, 6/29, 6/29, 6/29);
smul: array[0..3] of Single = (108/841, 108/841, 108/841, 108/841);
asm
movups xmm0, [eax] //r
movaps xmm4, xmm0
movups xmm6, sftc
maxps xmm4, xmm6
movups xmm7, smul
movaps xmm5, xmm4
minps xmm0, xmm6
mulps xmm5, xmm5
subps xmm0, xmm6
mulps xmm5, xmm4 //rrr
mulps xmm0, xmm7
addps xmm0, xmm5
movups [eax], xmm0
end;
// NOTE(review): only the nested QuickSort helper is present here; the body of
// QuickSortInt itself is not part of this listing.
procedure QuickSortInt(count: Integer; var List: array of TSortItem);
// Recursive quicksort over 8-byte items, ascending by the signed dword at
// offset 0 of each item (iZ); the dword at offset 4 is moved along with it.
// The pivot is the key of List[R]. eax, edx and ecx hold the same values on
// return as on entry; ebx, esi and edi are saved and restored.
procedure QuickSort(const L, R: Integer; List: TPSortItem); //L:eax R:edx List:ecx
asm
push ebx
push esi
push edi
mov ebx, eax //Lpos := L
mov esi, edx //Rpos := R
dec ebx
mov edi, [ecx + edx * 8] //ListR := List[R].iZ;
@@1:
// advance Lpos while pivot > List[Lpos].iZ (stops at R at the latest)
inc ebx
cmp edi, [ecx + ebx * 8]
jg @@1
@@2:
// retreat Rpos while pivot < List[Rpos].iZ, until it meets Lpos
dec esi
cmp esi, ebx
jle @@4 //break
cmp edi, [ecx + esi * 8]
jl @@2
// swap items Lpos and Rpos (both dwords), borrowing eax/edx as scratch
push eax
push edx
mov eax, [ecx + ebx * 8]
mov edx, [ecx + esi * 8]
mov [ecx + esi * 8], eax
mov [ecx + ebx * 8], edx
mov eax, [ecx + ebx * 8 + 4]
mov edx, [ecx + esi * 8 + 4]
mov [ecx + esi * 8 + 4], eax
mov [ecx + ebx * 8 + 4], edx
pop edx
pop eax
jmp @@1
@@4:
// move the pivot item from R into its final slot Lpos
mov esi, [ecx + ebx * 8]
mov [ecx + edx * 8], esi
mov [ecx + ebx * 8], edi
mov esi, [ecx + ebx * 8 + 4]
mov edi, [ecx + edx * 8 + 4]
mov [ecx + edx * 8 + 4], esi
mov [ecx + ebx * 8 + 4], edi
// sort the left part [L .. Lpos-1] if it has more than one item
dec ebx
cmp ebx, eax
jle @@5
mov esi, edx
mov edx, ebx
call QuickSort
mov edx, esi
@@5:
// sort the right part [Lpos+1 .. R] if it has more than one item
add ebx, 2
cmp ebx, edx
jge @@6
mov esi, eax
mov eax, ebx
call QuickSort
mov eax, esi
@@6:
pop edi
pop esi
pop ebx
end;
// Converts StepCount to an integer, limits it to at most 16383 and packs it
// into a mantissa/shift form:
//   e = (index of highest set bit) - 6; if e > 0 then result = (v shr e) or (e shl 7)
//   otherwise the value is returned unchanged.
// StepCount is read from the stack at [ebp + 8]; result in eax; ecx preserved.
// NOTE(review): there is no lower clamp, so a negative converted value takes
// the packing path with bit 31 as the highest bit -- confirm callers never
// pass negative counts.
function RMcalcVLight(StepCount: Single): Integer;
asm
push ecx
fld dword [ebp + 8]
// the pushed ecx slot doubles as the conversion buffer
fistp dword [esp]
mov eax, [esp]
cmp eax, 16383
jle @1
mov eax, 16383
@1: bsr ecx, eax
// bsr sets ZF when the source is zero
jz @2
sub ecx, 6
jle @2
shr eax, cl
shl ecx, 7
or eax, ecx
@2: pop ecx
end;
// sRough := clamp(Sqrt(max(0, 7 * Sqr(dt2^) * dsG^ / |N|^2)) - 0.05, 0, 1)
// Registers on entry: eax = N (three Doubles), edx = @sRough, ecx = dt2;
// dsG is read from [ebp + 8].
// Two implementations are selected by SupportSSE2. They add different small
// offsets to numerator and denominator (d1em40 in the SSE2 path, d1em100 in
// the x87 path) and use different constants for the factor 7 (d7 vs s7).
procedure RMCalcRoughness(N: TPVec3D; var sRough: Single; dt2, dsG: PDouble);
asm
cmp SupportSSE2, 0
jz @@1
movupd xmm0, [eax]
movsd xmm1, [eax + 16]
movsd xmm2, [ecx]
mulpd xmm0, xmm0
mulsd xmm1, xmm1
mulsd xmm2, xmm2
addsd xmm1, xmm0
mov eax, [ebp + 8]
unpckhpd xmm0, xmm0
mulsd xmm2, [eax]
addsd xmm1, xmm0
mulsd xmm2, d7
addsd xmm1, d1em40
addsd xmm2, d1em40
xorpd xmm3, xmm3
divsd xmm2, xmm1
maxsd xmm2, xmm3
sqrtsd xmm2, xmm2
subsd xmm2, d005
maxsd xmm2, xmm3
minsd xmm2, d1p0
cvtsd2ss xmm4, xmm2
movss [edx], xmm4
jmp @end
@@1:
// x87 path: |N|^2 first
fld qword [eax]
fmul st, st
fld qword [eax + 8]
fmul st, st
faddp
fld qword [eax + 16]
fmul st, st
faddp
fadd d1em100
mov eax, [ebp + 8]
fld qword [ecx]
fmul st, st
fmul qword [eax]
fmul s7
fadd d1em100
fdivrp
// replace a negative quotient by 0 before the square root
ftst
fnstsw ax
shr ah, 1
jnc @1
fstp st
fldz
@1: fsqrt
fld s005 //0.05, sR'
fcom st(1)
fnstsw ax
shr ah, 1
jc @up
// value <= 0.05: result is 0
fcompp
xor eax, eax
mov [edx], eax
jmp @end
@up:
fsubp
// limit the result to 1
fld1
fcomp st(1)
fnstsw ax
and ah, 41H
jz @up2
fstp st
fld1
@up2:
fstp dword [edx]
@end:
end;
// Writes the start position for pixel (ix, iy) into C1..C3 of the record that
// pMCTparas.pIt3Dext points to.
//   MCTCameraOptic = 2 : C := Ystart (three Doubles copied)
//   otherwise, with g0..g5 the six consecutive Doubles starting at Vgrads:
//     C1 := Ystart[0] + ix*g0 + iy*g3
//     C2 := Ystart[1] + ix*g1 + iy*g4
//     C3 := Ystart[2] + ix*g2 + iy*g5
// Registers on entry: eax = pMCTparas, edx = ix, ecx = iy.
// eax is biased by $78 and edx by a further $78; the field displacements
// below subtract the same amounts.
// The SSE2 path stores C1 and C2 with one 16-byte move, so it relies on C2
// directly following C1.
// NOTE(review): two exits use a bare `ret` (assumes no stack frame).
procedure RMCalculateStartPos(pMCTparas: PMCTparameter; ix, iy: Integer);
asm
add eax, $78
cmp dword [eax + TMCTparameter.MCTCameraOptic - $78], 2
jne @@2
mov ecx, dword [eax + TMCTparameter.pIt3Dext - $78]
fld qword [eax + TMCTparameter.Ystart - $78]
fld qword [eax + TMCTparameter.Ystart - $78 + 8]
fld qword [eax + TMCTparameter.Ystart - $78 + 16]
fstp qword [ecx + TIteration3Dext.C3]
fstp qword [ecx + TIteration3Dext.C2]
fstp qword [ecx + TIteration3Dext.C1]
ret
@@2:
cmp SupportSSE2, 0
jz @@1
// ix and iy are pushed so that [esp] holds the pair (ix, iy) for conversion
push ecx
push edx
cvtpi2pd xmm7, [esp] //xx,yy
mov ecx, dword [eax + TMCTparameter.pIt3Dext - $78] //+68
lea edx, eax + $78 //TMCTparameter.Ystart
movapd xmm6, xmm7
unpckhpd xmm7, xmm7 //yy,yy
unpcklpd xmm6, xmm6 //xx,xx
movupd xmm0, [eax + TMCTparameter.Vgrads - $78]
movupd xmm2, [eax + TMCTparameter.Vgrads - $60]
movupd xmm4, [edx + TMCTparameter.Ystart - $78 - $78]
mulpd xmm0, xmm6
mulsd xmm6, [eax + TMCTparameter.Vgrads - $68]
mulpd xmm2, xmm7
mulsd xmm7, [eax + TMCTparameter.Vgrads - $50]
addpd xmm0, xmm2
addsd xmm6, xmm7
addpd xmm0, xmm4
addsd xmm6, [edx + TMCTparameter.Ystart - $78 - $68]
movupd [ecx + TIteration3Dext.C1], xmm0
movsd [ecx + TIteration3Dext.C3], xmm6
pop edx
pop ecx
ret
@@1:
// x87 path: iy is loaded first, then ix, giving the stack order ix, iy
push ecx
fild dword [esp]
push edx
fild dword [esp] //xx,yy
mov ecx, dword [eax + TMCTparameter.pIt3Dext - $78] //+68
lea edx, eax + $78 //TMCTparameter.Ystart
fld qword [eax + TMCTparameter.Vgrads - $78]
fmul st, st(1)
fld qword [eax + TMCTparameter.Vgrads - $78 + 24]
fmul st, st(3)
faddp
fadd qword [edx + TMCTparameter.Ystart - $78 - $78]
fstp qword [ecx + TIteration3Dext.C1]
fld qword [eax + TMCTparameter.Vgrads - $78 + 8]
fmul st, st(1)
fld qword [eax + TMCTparameter.Vgrads - $78 + 32]
fmul st, st(3)
faddp
fadd qword [edx + TMCTparameter.Ystart - $70 - $78]
fstp qword [ecx + TIteration3Dext.C2] //xx,yy
// the last component consumes ix and iy from the stack
fmul qword [eax + TMCTparameter.Vgrads - $78 + 16]
fxch
fmul qword [eax + TMCTparameter.Vgrads - $78 + 40]
faddp
fadd qword [edx + TMCTparameter.Ystart - $68 - $78]
fstp qword [ecx + TIteration3Dext.C3]
pop edx
pop ecx
end;
// Builds the view vector mVgradsFOV for column ix and rotates it.
//   CAFX := (FOVXoff - ix) * FOVXmul
//   MCTCameraOptic = 1 : mVgradsFOV := (-CAFX, CAFY, mctPlOpticZ), then
//                        NormaliseVectorVar(@mVgradsFOV)
//   MCTCameraOptic = 0 : BuildViewVectorDFOV(eax=@CAFY, edx=@CAFX, ecx=@mVgradsFOV)
//   otherwise          : BuildViewVectorDSphereFOV with the same registers
// Finally RotateVectorReverse(eax=@mVgradsFOV, edx=@VGrads) is called.
// Registers on entry: eax = pMCTparas, edx = ix. ebx holds pMCTparas + $1a0
// for the whole routine; all field displacements compensate for that bias.
procedure RMCalculateVgradsFOV(pMCTparas: PMCTparameter; ix: Integer);
asm
push ebx
push esi
push edx //to store ix in [esp] and fiload (esp := esp-4)
lea ebx, eax + $1a0
fild dword [esp] //ix
fsubr dword [ebx + TMCTparameter.FOVXoff - $1a0]
fmul dword [ebx + TMCTparameter.FOVXmul - $1a0]
fst qword [ebx + TMCTparameter.CAFX - $1a0] // $1a0
cmp dword [ebx + TMCTparameter.MCTCameraOptic - $1a0], 1 // $1fc
je @@3
// not optic 1: CAFX is no longer needed on the FPU stack
fstp st
lea ecx, [ebx + TMCTparameter.mVgradsFOV - $1a0]
lea edx, [ebx + TMCTparameter.CAFX - $1a0] // $1a0
lea eax, [ebx + TMCTparameter.CAFY - $1a0] // $1a8
cmp dword [ebx + TMCTparameter.MCTCameraOptic - $1a0], 0 // $1fc
je @@1
call BuildViewVectorDSphereFOV
jmp @@2
@@3:
fchs
fstp qword [ebx + TMCTparameter.mVgradsFOV - $1a0]
fld qword [ebx + TMCTparameter.CAFY - $1a0] // $1a8
fstp qword [ebx + TMCTparameter.mVgradsFOV - $1a0 + 8]
fld dword [ebx + TMCTparameter.mctPlOpticZ - $1a0] // $204
fstp qword [ebx + TMCTparameter.mVgradsFOV - $1a0 + 16]
lea eax, [ebx + TMCTparameter.mVgradsFOV - $1a0]
call NormaliseVectorVar
jmp @@2
@@1:
call BuildViewVectorDFOV
@@2:
lea edx, [ebx + TMCTparameter.VGrads - $1a0] // $80
lea eax, [ebx + TMCTparameter.mVgradsFOV - $1a0]
call RotateVectorReverse
pop edx //to Inc(esp, 4)
pop esi
pop ebx
end;
// Computes a colour index from the iteration record (pIt3Dext, kept in ebx)
// according to the byte ColorOption, and stores it through MinMaxClip15bit
// (eax = @Single on the stack, edx = destination) into mPsiLight.Otrap.
//   1 : ln(q[$70] / (q[8] + 1)) * mctColorMul
//   2 : (arctan2(q[$20]-q[$40], q[$18]-q[$38]) + Pi) * 5200
//   3 : (arctan2(q[$28]-q[$48], q[$18]-q[$38]) + Pi) * 5200
//   4 : (arctan2(q[$28]-q[$48], q[$20]-q[$40]) + Pi) * 5200
//   5 : first stores (2*arcsin(z/r) + Pi) * 5215 into mPsiLight.SIgradient,
//       with z = q[$28] and r = Sqrt(q[$18]^2 + q[$20]^2 + z^2 + d1em100),
//       then continues with an angle of q[$18], q[$20] scaled by 5215
//   0 or >= 6 : OTrap * 4096
// where q[off] is the Double at that byte offset of the iteration record.
// The dispatch uses a jump table embedded in the code right after the jmp.
procedure RMdoColor(pMCTparas: PMCTparameter);
const
cd5200: Single = 5200;
cd4096: Single = 4096;
cd5215: Single = 5215;
asm
push ebx
push edi
push edx //just to get dword [esp]
mov edi, [eax + TMCTparameter.mPsiLight]
mov ebx, [eax + TMCTparameter.pIt3Dext]
movzx edx, byte [eax + TMCTparameter.ColorOption] //coloroption
cmp edx, 6
jnb @@COelse
jmp dword [edx * 4 + @@jmptable]
@@jmptable:
dd @@COelse, @@CO1, @@CO2, @@CO3, @@CO4, @@CO5
@@CO1:
fld qword [ebx + 8] //Rold
fld1
faddp
fdivr qword [ebx + $70] //Rout
// ln(x) computed as ln(2) * log2(x)
fldln2
fxch
fyl2x
fmul dword [eax + TMCTparameter.mctColorMul] //mctColorMul
jmp @@up
nop
@@CO2:
fld qword [ebx+$20]
fsub qword [ebx+$40]
jmp @1
@@CO3:
fld qword [ebx+$28]
fsub qword [ebx+$48]
@1: fld qword [ebx+$18]
fsub qword [ebx+$38]
@2: fpatan
fldpi
faddp
fmul cd5200
jmp @@up
@@CO4:
fld qword [ebx+$28]
fsub qword [ebx+$48]
fld qword [ebx+$20]
fsub qword [ebx+$40]
jmp @2
@@CO5:
fld qword [ebx+$20]
fld st
fmul st, st //yy,y
fld qword [ebx+$18] //x,yy,y
fld st
fmul st, st //xx,x,yy,y
fxch st(3) //y,x,yy,xx
fpatan
fldpi
faddp
fmul cd5215 //s,yy,xx
fxch st(2) //xx,yy,s
faddp
fadd d1em100
fld qword [ebx+$28] //z,yy+xx,s norm vec[2] for arcsin
fld st
fmul st, st //zz,z,yy+xx,s
faddp st(2), st //z,rr,s
fxch //rr,z,s
fsqrt //r,z,s
fdivp //z/r,s
@@s2:
fld1 //arcsin(x) = arctan2(x, sqrt(1-x*x))
fld st(1)
fmul st(0), st(0)
fsubp
fsqrt
fpatan
fadd st, st
fldpi
faddp
fmul cd5215
// second value of option 5 goes to SIgradient; s stays on the FPU stack
fstp dword [esp]
lea edx, [edi + TsiLight5.SIgradient]
mov eax, esp
call MinMaxClip15bit
jmp @@up
@@COelse:
fld qword [ebx + TIteration3Dext.OTrap]
fmul cd4096
@@up:
// common tail: clip ST(0) into Otrap
fstp dword [esp]
lea edx, [edi + TsiLight5.Otrap]
mov eax, esp
call MinMaxClip15bit
pop edx
pop edi
pop ebx
end;
// NOTE(review): the procedure header belonging to this body is missing from
// this listing. From the register use: eax = pointer to a TMCTparameter
// record, edx = cut-plane number.
// cutplane > 0:
//   dword at PSL+6 := max(0, Round(8388352 - ZcMul*(Sqrt(mZZ*Zcorr + 1) - 1))) shl 8
//   i := cutplane - 1; v := the Double at Vgrads + $30 + i*8
//   NN := dm1e40 when Abs(v) < d1em40, else -1 / v
//   N := (Vgrads[i]*NN, (Double at Vgrads + $18 + i*8)*NN, -1)
//   MakeWNormalsFromDVec(eax = PSL, edx = @N)
// otherwise: dword at PSL+6 := $7fff0000, dword at PSL := 0, word at PSL+4 := $8001.
// PSL is pMCTparas.mPsiLight; 24 bytes of stack hold N.
const CS8388352: Single = 8388352;
asm
push ebx
push esi
push edi
add esp, -24
mov edi, [eax+TMCTparameter.mPsiLight] //PSL
mov esi, edx //cutplane
lea ebx, eax + 128 //MCTparas
test esi, esi //if cutplane>0
jle @@1
fld1
fld qword [ebx+TMCTparameter.mZZ-128] //+104 mZZ^,1 NN := 8388352 - ZcMul * (Sqrt(mZZ * Zcorr + 1) - 1);
fmul qword [ebx+TMCTparameter.Zcorr-128] //$274
fadd st, st(1)
fsqrt
fsubrp
fmul qword [ebx+TMCTparameter.ZcMul-128] //$26c
fsubr CS8388352 //NN
fistp dword [esp]
mov eax, [esp]
// negative results are replaced by 0
test eax, eax
jns @@3
xor eax, eax
@@3:
shl eax, 8 // PCardinal(@PSL.RoughZposFine)^ := iTmp shl 8;
mov [edi+6], eax
dec esi //VGrads: +128
fld qword [ebx+esi*8+TMCTparameter.VGrads+$30-128] // if Abs(Vgrads[2, CutPlane]) < 1e-40
fabs
fcomp d1em40
fnstsw ax
shr ah, 1
jnc @@4
fld dm1e40 // NN := -1e40
jmp @@5
@@4:
fld1 // NN := -1 / Vgrads[2, CutPlane];
fchs
fdiv qword [ebx+esi*8+TMCTparameter.VGrads+$30-128]
@@5:
fld qword [ebx+esi*8+TMCTparameter.VGrads-128] // N[0] := Vgrads[0, CutPlane] * NN;
fmul st, st(1)
fstp qword [esp]
fld qword [ebx+esi*8+TMCTparameter.VGrads+$18-128] // N[1] := Vgrads[1, CutPlane] * NN;
fmulp
fstp qword [esp+8]
fld1
fchs // N[2] := -1;
fstp qword [esp+16]
mov edx, esp // MakeWNormalsFromDVec(TPLNormals(PSL), @N);
mov eax, edi
call MakeWNormalsFromDVec
jmp @@6
@@1:
xor eax, eax
// NOTE(review): the next store has no explicit operand size
mov [edi+6], $7fff0000
mov dword [edi], eax
mov word [edi+4], $8001
@@6:
add esp, 24
pop edi
pop esi
pop ebx //}
end;
// Refines the current position along mVgradsFOV until DE is within s0001 of
// msDEstop or the iteration budget iDEAddSteps is used up.
// Registers on entry: eax = pMCTparas, edx = @DE, ecx = @RLastStepWidth.
// Per iteration:
//   mZZ += step; C1..C3 += mVgradsFOV * step
//   msDEstop := DEstop * (1 + mZZ * mctDEstopFactor)
//   DE := CalcDE(eax = pIt3Dext, edx = pMCTparas)
//   step := Abs(step) * s055, negated when DE < msDEstop
// The first step is RLastStepWidth * sm05. The step lives in 8 bytes at [esp].
// ebp is used as the iteration counter here, so no [ebp+..] access is valid
// between the push/pop of ebp. esi holds pMCTparas + $38; the field
// displacements compensate for that bias.
procedure RMdoBinSearch(pMCTparas: PMCTparameter; var DE, RLastStepWidth{, RLastDE}: Double);
asm
push ebx
push esi
push edi
push ebp
add esp, -8
mov edi, edx //@dTmp
lea esi, eax+$38 //@MCTParas (was:qTMandCalcThread)
mov ebx, [esi+TMCTparameter.pIt3Dext-$38]
mov ebp, [esi+TMCTparameter.iDEAddSteps-$38] //+$40
fld qword [ecx] // RLastStepWidth
fmul sm05
jmp @@2
@@4:
fld qword [esi+TMCTparameter.mZZ-$38] //+$68
fadd qword [esp]
fstp qword [esi+TMCTparameter.mZZ-$38] //+$68
fld qword [esi+TMCTparameter.mVgradsFOV-$38]
fld qword [esi+TMCTparameter.mVgradsFOV-$38 + 8]
fld qword [esi+TMCTparameter.mVgradsFOV-$38 + 16]
fld qword [esp]
// stack: step, v2, v1, v0 -> scale the direction by step and add it to C
fmul st(3), st
fmul st(2), st
fmulp
fadd qword [ebx+TIteration3Dext.C1 + 16]
fstp qword [ebx+TIteration3Dext.C1 + 16]
fadd qword [ebx+TIteration3Dext.C1 + 8]
fstp qword [ebx+TIteration3Dext.C1 + 8]
fadd qword [ebx+TIteration3Dext.C1]
fstp qword [ebx+TIteration3Dext.C1]
fld qword [esi+TMCTparameter.mZZ-$38] //+$68
fmul dword [esi+TMCTparameter.mctDEstopFactor-$38] //+$54
fld1
faddp
fmul dword [esi+TMCTparameter.DEstop-$38] //+$60
fstp dword [esi+TMCTparameter.msDEstop-$38] //+$38 msDEstop := DEstop * (1 + mZZ * mctDEstopFactor);
dec ebp
test ebp, ebp
jle @@3
lea edx, esi-$38
mov eax, ebx
call esi+TMCTparameter.CalcDE-$38
fstp qword [edi] //dTmp
fld qword [edi]
fcomp dword [esi+TMCTparameter.msDEstop-$38] //+$38
fnstsw ax
fld qword [esp]
fabs
fmul s055
// CF = C0 of the compare above: set when DE < msDEstop -> step backwards
shr ah, 1
jnc @@8
fchs
@@8:
@@2:
fstp qword [esp]
// loop while Abs(DE - msDEstop) >= s0001
fld qword [edi]
fsub dword [esi+TMCTparameter.msDEstop-$38] //+$38
fabs
fcomp s0001
fnstsw ax
shr ah, 1
jnc @@4
@@3:
add esp, 8
pop ebp
pop edi
pop esi
pop ebx
end; //}
// Packs depth and roughness into the dword at siLight+6:
//   z  := 8388352 - Round(ZcMul * (Sqrt(max(ZZ, 0) * Zcorr + 1) - 1)),
//         limited to 0..8388352
//   v  := z shl 8
//   if iSmNormals > 0 then v := v or Round(sRoughness * s255)
// Registers on entry: eax = siLight, edx = mct; ZZ is the Double on the
// stack at [ebp + 8] (its sign bit is bit 7 of the byte at [ebp + 15]).
// ebx is preserved; 4 bytes of stack serve as the conversion buffer.
procedure CalcZposAndRough(siLight: TPsiLight5; mct: PMCTparameter; const ZZ: Double);
asm
push ebx
sub esp, 4
fld1
test byte [ebp + 15], 128 //negative zz clip
jns @1
fldz
jmp @2
@1: fld qword [ebp + 8]
@2: fmul qword [edx + TMCTparameter.Zcorr]
fadd st(0), st(1)
fsqrt //at fsqrt? ZZ * Zcorr > 1?? zz=-642!!
fsubrp //invalid fp operation in critical ipol hybrid
fmul qword [edx + TMCTparameter.ZcMul]
fistp dword [esp]
mov ebx, 8388352
sub ebx, dword [esp]
// clamp to 0..8388352 before shifting into the upper 24 bits
test ebx, ebx
jnl @up1
xor ebx, ebx
@up1:
cmp ebx, 8388352
jle @up2
mov ebx, 8388352
@up2:
shl ebx, 8
cmp byte [edx + TMCTparameter.iSmNormals], 0
jle @up3
// NOTE(review): the roughness value is OR-ed in without masking to 8 bits
fld dword [edx + TMCTparameter.sRoughness]
fmul s255
fistp dword [esp]
or ebx, [esp]
@up3:
mov [eax + 6], ebx
add esp, 4
pop ebx
end;
// Fills PIA[0 .. Leng-1] from an array of 18-byte records starting at PsiLight.
// For each record: if the word at offset 8 is >= $8000 (unsigned) the output
// is 0, otherwise it is ((dword at offset 6) and $FFFFFF00) shr 1.
// Registers on entry: eax = PIA, edx = PsiLight, ecx = Leng.
// Nothing is written when Leng <= 0. esi is preserved.
procedure FirstATlevelHiQ(PIA: TPCardinalArray; PsiLight: TPsiLight5; Leng: Integer);
asm
push esi
// dec/js/inc: leave when Leng - 1 is negative, otherwise restore the count
dec ecx
js @@out
inc ecx
add edx, 8
@@1:
cmp word [edx], $8000
jnb @@2
mov esi, [edx-2]
and esi, $ffffff00
shr esi, 1
jmp @@3
@@2:
xor esi, esi
@@3:
mov [eax], esi
add edx, 18
add eax, 4
dec ecx
jnz @@1
@@out:
pop esi
end;
// For i := 0 to ya:
//   PIA[i] := (((SA[min(i + Step, ya)] + SA[max(i - Step, 0)]) shr 1) + PIA[i]) shr 1
// with consecutive 4-byte elements in both arrays.
// Registers on entry: eax = PIA, edx = SA, ecx = ya; Step is read from
// [ebp + 8]. Nothing is done when ya < 0.
// Locals in the frame: [ebp - 8] = ya, [ebp - 12] = remaining count.
procedure SmoothH(PIA, SA: TPCardinalArray; ya, Step: Integer);
asm
add esp, -12
push ebx
push esi
push edi
mov [ebp-8], ecx
mov ebx, edx
mov edi, [ebp+8]
mov edx, ecx
test edx, edx
jl @@2
inc edx
mov [ebp-12], edx
xor esi, esi
@@1:
// edx := max(i - Step, 0)
mov edx, esi
sub edx, edi
test edx, edx
jnl @@3
xor edx, edx
@@3:
// ecx := min(i + Step, ya)
mov ecx, edi
add ecx, esi
cmp ecx, [ebp-8]
jle @@4
mov ecx, [ebp-8]
@@4:
mov ecx, [ebx+ecx*4]
add ecx, [ebx+edx*4]
shr ecx, 1
add ecx, [eax]
shr ecx, 1
mov [eax], ecx
inc esi
add eax, 4
dec dword [ebp-12]
jnz @@1
@@2:
pop edi
pop esi
pop ebx
add esp, 12
end;
// Same averaging as SmoothH, but the destination advances by a caller-given
// byte stride instead of 4:
//   for i := 0 to ye:
//     p^ := (((SA[min(i + w, ye)] + SA[max(i - w, 0)]) shr 1) + p^) shr 1;
//     p advances by the dword at [ebp + 8] bytes
// where p starts at PIA and w is the dword at [ebp + 12].
// Registers on entry: eax = PIA, edx = SA, ecx = ye.
// NOTE(review): with the register convention pushing stack parameters left
// to right, [ebp + 12] should be Step and [ebp + 8] wid -- confirm.
// Locals in the frame: [ebp - 8] = ye, [ebp - 12] = remaining count.
procedure SmoothV(PIA, SA: TPCardinalArray; ye, Step, wid: Integer);
asm
add esp, -12
push ebx
push esi
push edi
mov [ebp-8], ecx
mov ebx, edx
mov edi, [ebp+12]
mov edx, ecx
test edx, edx
jl @@2
inc edx
mov [ebp-12], edx
xor esi, esi
@@1:
// edx := max(i - w, 0)
mov edx, esi
sub edx, edi
test edx, edx
jnl @@3
xor edx, edx
@@3:
// ecx := min(i + w, ye)
mov ecx, edi
add ecx, esi
cmp ecx, [ebp-8]
jle @@4
mov ecx, [ebp-8]
@@4:
mov ecx, [ebx+ecx*4]
add ecx, [ebx+edx*4]
shr ecx, 1
add ecx, [eax]
shr ecx, 1
mov [eax], ecx
inc esi
add eax, dword [ebp+8]
dec dword [ebp-12]
jnz @@1
@@2:
pop edi
pop esi
pop ebx
add esp, 12
end;
// If SI < i then SI is raised to i, limited to $7FFF; otherwise SI is left
// unchanged. (Despite the name, SI never decreases.)
// In: eax = @SI, edx = @i. Clobbers ecx and edx.
procedure MinSI(var SI: SmallInt; var i: Integer);
asm
mov ecx, [edx] // ecx = i
movsx edx, word [eax] // edx = SI, sign-extended
cmp edx, ecx
jge @@done // SI >= i: nothing to do
cmp ecx, $7FFF
jl @@store
mov ecx, $7FFF // i does not fit into a SmallInt
@@store:
mov word [eax], cx
@@done:
end;
// ANDs the dwords at p, p+18, p+36 and p+54 and returns only bit 31 of the
// result: $80000000 when all four have their top bit set, otherwise 0.
// In: eax = p. Out: eax = edx = result.
function NotOnlyBackGround4(p: Pointer): Integer;
asm
mov edx, $80000000 // start from the mask and fold the four samples in
and edx, [eax]
and edx, [eax + 18]
and edx, [eax + 36]
and edx, [eax + 54]
mov eax, edx
end;
// zp[k] := ((dword at p + 18*k) and $FFFFFF00) shr 1 for k = 0..3.
// In: eax = p, edx = @zp[0]. ecx is used as scratch.
procedure MakeZP4(p: Pointer; var zp: array of Integer);
asm
// (x and $FFFFFF00) shr 1 is computed as (x shr 1) and $7FFFFF80
mov ecx, [eax]
shr ecx, 1
and ecx, $7FFFFF80
mov [edx], ecx

mov ecx, [eax + 18]
shr ecx, 1
and ecx, $7FFFFF80
mov [edx + 4], ecx

mov ecx, [eax + 36]
shr ecx, 1
and ecx, $7FFFFF80
mov [edx + 8], ecx

mov ecx, [eax + 54]
shr ecx, 1
and ecx, $7FFFFF80
mov [edx + 12], ecx
end;
// Inline-asm core of TAmbHiQCalcR.Execute (the surrounding Pascal code is
// elided in this listing). For each of 32 directions (iDir = 31 downto 0):
//   - the direction is the pair of Singles at PS^; PS advances by 8 bytes
//   - the sample position starts at dir * sMinRad - ssub and advances by
//     sstep * dir, for at most StepCount samples
//   - each sample is jittered with a linear congruential generator
//     (seed := seed * $343FD + $269EC3): (seed shr 10) and iand is added to
//     x, (seed shr 16) and iand to y
//   - samples with zero squared offset are skipped
//   - the offset is added to the pixel position xy; coordinates outside the
//     map are mirrored (x < 0 -> -x, x >= MWidth -> MW2 - x, same for y with
//     MHeight/MH2); if the mirrored coordinate fails the WLo/WHi/HLo/HHi
//     test the whole direction is abandoned
//   - st := (PATL[y * MWidth + x] - zp) * rsqrt(squared offset)
//     it := st * sit * sZRT / (sZRT + Abs(st)), limited to [sm32768, s32767]
//     psm[iDir] := max(psm[iDir], it) as signed 16-bit values
// rsqrtss/rcpss are approximations, so results are not bit-exact with the
// Pascal formula in the comment below. All general registers are saved and
// restored; seed is written back at the end.
procedure TAmbHiQCalcR.Execute;
// ...
asm //~13s with 3 steps
push eax
push ebx
push ecx
push edx
push esi
push edi
mov esi, PATL
mov edx, psm
mov edi, seed
mov iDir, 31
xorps xmm2, xmm2
xorps xmm3, xmm3
xorps xmm4, xmm4
movss xmm5, sAbs
xorps xmm6, xmm6
xorps xmm7, xmm7
movlps xmm4, ssub //xmm4 = ssub
movlps xmm7, sstep //xmm7 = sstep
@foriDir: mov eax, PS
movlps xmm6, [eax] //xmm6 = PS[0,1]
movlps xmm2, sMinRad
movaps xmm1, xmm6
mulps xmm1, xmm2
subps xmm1, xmm4 //sxy-ssub
mov eax, StepCount
mov sc, eax
// advance the random generator kept in edi
@while: imul edi, $000343FD
add edi, $269EC3
mov eax, edi
movaps xmm0, xmm1
shr eax, 10
CVTSS2SI ecx, xmm0 //x2
mov ebx, eax
and ebx, iand
add ecx, ebx
shufps xmm0, xmm0, 1
shr eax, 6
CVTSS2SI ebx, xmm0 //y2
and eax, iand
add ebx, eax
// eax := x2*x2 + y2*y2 (ecx is saved around the squaring)
push ecx
mov eax, ebx
imul ecx, ecx
imul eax, eax
add eax, ecx
pop ecx
test eax, eax
jz @skip
CVTSI2SS xmm2, eax
add ecx, dword [xy]
add ebx, dword [xy + 4]
test ecx, ecx //reflection at borders
jns @@1
neg ecx
cmp ecx, WLo
jge @endwhile
jmp @@2
@@1: cmp ecx, MWidth
jl @@2
sub ecx, MW2
neg ecx
cmp ecx, WHi
jl @endwhile
@@2: test ebx, ebx
jns @@3
neg ebx
cmp ebx, HLo
jge @endwhile
jmp @con
@@3: cmp ebx, MHeight
jl @con
sub ebx, MH2
neg ebx
cmp ebx, HHi
jl @endwhile
@con: imul ebx, MWidth
add ebx, ecx
mov eax, [esi + ebx * 4] //PATL^[y2 * MWidth + x2]
sub eax, zp
CVTSI2SS xmm0, eax
RSQRTSS xmm2, xmm2
mulss xmm0, xmm2 //st
movss xmm3, xmm0
andps xmm0, xmm5
mulss xmm3, sit
addss xmm0, sZRT
mulss xmm3, sZRT
rcpss xmm0, xmm0
mulss xmm3, xmm0
minss xmm3, s32767
maxss xmm3, sm32768
CVTSS2SI eax, xmm3 //it := Round(st * sZRT * sit / (sZRT + Abs(st)) );
// keep the per-direction maximum (signed 16-bit compare)
mov ecx, iDir
cmp ax, word [edx + ecx * 2]
jle @skip
mov word [edx + ecx * 2], ax
@skip: movaps xmm3, xmm7 //sstep
mulps xmm3, xmm6 //DirXY
addps xmm1, xmm3 //sx,sy
dec sc
jnz @while
@endwhile: add PS, 8
dec iDir
jns @foriDir
mov seed, edi
pop edi
pop esi
pop edx
pop ecx
pop ebx
pop eax
end
// ...
// Inline-asm core of TAmbHiQCalcRpano.Execute (the surrounding Pascal code
// is elided in this listing). Identical to the TAmbHiQCalcR.Execute core
// except for the horizontal border rule: x wraps around instead of being
// mirrored (x < 0 -> x + MWidth, x >= MWidth -> x - MWidth); if the wrapped
// value is still outside 0..MWidth-1 the direction is abandoned.
// Per direction (iDir = 31 downto 0), for at most StepCount jittered samples:
//   st := (PATL[y * MWidth + x] - zp) * rsqrt(squared offset)
//   it := st * sit * sZRT / (sZRT + Abs(st)), limited to [sm32768, s32767]
//   psm[iDir] := max(psm[iDir], it) as signed 16-bit values
// Vertical borders are mirrored using HLo / MHeight / MH2 / HHi.
// All general registers are saved and restored; seed is written back.
procedure TAmbHiQCalcRpano.Execute;
// ...
asm //~13s with 3 steps
push eax
push ebx
push ecx
push edx
push esi
push edi
mov esi, PATL
mov edx, psm
mov edi, seed
mov iDir, 31
xorps xmm2, xmm2
xorps xmm3, xmm3
xorps xmm4, xmm4
movss xmm5, sAbs
xorps xmm6, xmm6
xorps xmm7, xmm7
movlps xmm4, ssub //xmm4 = ssub
movlps xmm7, sstep //xmm7 = sstep
@foriDir: mov eax, PS
movlps xmm6, [eax] //xmm6 = PS[0,1]
movlps xmm2, sMinRad
movaps xmm1, xmm6
mulps xmm1, xmm2
subps xmm1, xmm4 //sxy-ssub
mov eax, StepCount
mov sc, eax
// advance the random generator kept in edi
@while: imul edi, $000343FD
add edi, $269EC3
mov eax, edi
movaps xmm0, xmm1
shr eax, 10
CVTSS2SI ecx, xmm0 //x2
mov ebx, eax
and ebx, iand
add ecx, ebx
shufps xmm0, xmm0, 1
shr eax, 6
CVTSS2SI ebx, xmm0 //y2
and eax, iand
add ebx, eax
// eax := x2*x2 + y2*y2 (ecx is saved around the squaring)
push ecx
mov eax, ebx
imul ecx, ecx
imul eax, eax
add eax, ecx
pop ecx
test eax, eax
jz @skip
CVTSI2SS xmm2, eax
add ecx, dword [xy]
add ebx, dword [xy + 4]
test ecx, ecx // reflection at borders
jns @@1
// horizontal wrap-around (panorama)
add ecx, MWidth
test ecx, ecx
jns @@2
jmp @endwhile
@@1: cmp ecx, MWidth
jl @@2
sub ecx, MWidth
cmp ecx, MWidth
jnl @endwhile
@@2: test ebx, ebx
jns @@3
neg ebx
cmp ebx, HLo
jge @endwhile
jmp @con
@@3: cmp ebx, MHeight
jl @con
sub ebx, MH2
neg ebx
cmp ebx, HHi
jl @endwhile
@con: imul ebx, MWidth
add ebx, ecx
mov eax, [esi + ebx * 4] //PATL^[y2 * MWidth + x2]
sub eax, zp
CVTSI2SS xmm0, eax
RSQRTSS xmm2, xmm2
mulss xmm0, xmm2 //st
movss xmm3, xmm0
andps xmm0, xmm5
mulss xmm3, sit
addss xmm0, sZRT
mulss xmm3, sZRT
rcpss xmm0, xmm0
mulss xmm3, xmm0
minss xmm3, s32767
maxss xmm3, sm32768
CVTSS2SI eax, xmm3 //it := Round(st * sZRT * sit / (sZRT + Abs(st)) );
// keep the per-direction maximum (signed 16-bit compare)
mov ecx, iDir
cmp ax, word [edx + ecx * 2]
jle @skip
mov word [edx + ecx * 2], ax
@skip: movaps xmm3, xmm7 //sstep
mulps xmm3, xmm6 //DirXY
addps xmm1, xmm3 //sx,sy
dec sc
jnz @while
@endwhile: add PS, 8
dec iDir
jns @foriDir
mov seed, edi
pop edi
pop esi
pop edx
pop ecx
pop ebx
pop eax
end
// ...
// Inline-asm core of TAmbHiQCalcRT0.Execute (the surrounding Pascal code is
// elided in this listing). For each of 32 directions (iDir = 31 downto 0):
//   - the direction is the pair of Singles at PS^; PS advances by 8 bytes
//   - the sample position starts at dir * sMinRad - ssub and advances by
//     sstep * dir, for at most StepCount samples
//   - each sample is jittered with a linear congruential generator
//     (seed := seed * $343FD + $269EC3): (seed shr 10) and iand is added to
//     x, (seed shr 16) and iand to y
//   - samples with zero squared offset are skipped
//   - out-of-map coordinates are mirrored (x < 0 -> -x, x >= MWidth ->
//     MW2 - x, same for y with MHeight/MH2); if the mirrored coordinate
//     fails the WLo/WHi/HLo/HHi test the whole direction is abandoned
//   - st := (PATL[y * MWidth + x] - zp) * rsqrt(squared offset)
//     it := st * sit * sZRT / (st * st + sZRT), limited to [sm32768, s32767]
//     PSI[iDir] := max(PSI[iDir], it) as signed 16-bit values
// rsqrtss/rcpss are approximations. All general registers are saved and
// restored; seed is written back at the end.
procedure TAmbHiQCalcRT0.Execute;
// ...
asm
push eax
push ebx
push ecx
push edx
push esi
push edi
mov esi, PATL
mov edx, PSI
mov edi, seed
mov iDir, 31
xorps xmm1, xmm1
xorps xmm2, xmm2
xorps xmm3, xmm3
xorps xmm4, xmm4
xorps xmm5, xmm5
xorps xmm6, xmm6
movlps xmm4, ssub //xmm4 = ssub
movlps xmm5, sstep //xmm5 = sstep
@foriDir: mov eax, PS
movlps xmm1, sMinRad // (1.2 at stepw1)
movlps xmm6, [eax] //xmm6 = PS[0,1]
mulps xmm1, xmm6
subps xmm1, xmm4 //sxy-ssub (-0,5 at stepw1)
mov eax, StepCount
mov sc, eax
// advance the random generator kept in edi
@while: imul edi, $000343FD
add edi, $269EC3
mov eax, edi
movaps xmm0, xmm1 //sx, sy
shr eax, 10
CVTSS2SI ecx, xmm0
mov ebx, eax
and ebx, iand
add ecx, ebx
shufps xmm0, xmm0, 1
shr eax, 6
CVTSS2SI ebx, xmm0
and eax, iand
add ebx, eax
// eax := x2*x2 + y2*y2 (ecx is saved around the squaring)
push ecx
mov eax, ebx
imul ecx, ecx
imul eax, eax
add eax, ecx
pop ecx
test eax, eax
jz @skip
CVTSI2SS xmm2, eax
add ecx, dword [xy]
add ebx, dword [xy+4]
test ecx, ecx // reflection at borders
jns @@1
neg ecx
cmp ecx, WLo
jge @endwhile
jmp @@2
@@1: cmp ecx, MWidth
jl @@2
sub ecx, MW2
neg ecx
cmp ecx, WHi
jl @endwhile
@@2: test ebx, ebx
jns @@3
neg ebx
cmp ebx, HLo
jge @endwhile
jmp @con
@@3: cmp ebx, MHeight
jl @con
sub ebx, MH2
neg ebx
cmp ebx, HHi
jl @endwhile
@con: imul ebx, MWidth
add ebx, ecx
mov eax, [esi + ebx * 4] //PATL^[y2 * MWidth + x2]
sub eax, zp
CVTSI2SS xmm0, eax // (CVTPI2PS=sse, 2 int to single)
RSQRTSS xmm2, xmm2
mulss xmm0, xmm2 //st := (PATL^[y2 * MWidth + x2] - zp) / Sqrt(st);
movss xmm3, xmm0
mulss xmm0, xmm0
mulss xmm3, sit
addss xmm0, sZRT
mulss xmm3, sZRT
rcpss xmm0, xmm0
mulss xmm3, xmm0
minss xmm3, s32767
maxss xmm3, sm32768
CVTSS2SI eax, xmm3 //it := Round(st * sit * sZRT / (st * st + sZRT));
// keep the per-direction maximum (signed 16-bit compare)
mov ecx, iDir
cmp ax, word [edx + ecx * 2]
jle @skip
mov word [edx + ecx * 2], ax
@skip: movaps xmm3, xmm5 //sstep
mulps xmm3, xmm6 //DirXY
addps xmm1, xmm3 //sx,sy
dec sc
jnz @while
@endwhile: add PS, 8
dec iDir
jns @foriDir
mov seed, edi
pop edi
pop esi
pop edx
pop ecx
pop ebx
pop eax
end
// ...
// Inline-asm core of TAmbHiQCalcRT0pano.Execute (the surrounding Pascal code
// is elided in this listing). Identical to the TAmbHiQCalcRT0.Execute core
// except for the horizontal border rule: x wraps around instead of being
// mirrored (x < 0 -> x + MWidth, x >= MWidth -> x - MWidth); if the wrapped
// value is still outside 0..MWidth-1 the direction is abandoned.
// Per direction (iDir = 31 downto 0), for at most StepCount jittered samples:
//   st := (PATL[y * MWidth + x] - zp) * rsqrt(squared offset)
//   it := st * sit * sZRT / (st * st + sZRT), limited to [sm32768, s32767]
//   PSI[iDir] := max(PSI[iDir], it) as signed 16-bit values
// Vertical borders are mirrored using HLo / MHeight / MH2 / HHi.
// All general registers are saved and restored; seed is written back.
procedure TAmbHiQCalcRT0pano.Execute;
// ...
asm
push eax
push ebx
push ecx
push edx
push esi
push edi
mov esi, PATL
mov edx, PSI
mov edi, seed
mov iDir, 31
xorps xmm1, xmm1
xorps xmm2, xmm2
xorps xmm3, xmm3
xorps xmm4, xmm4
xorps xmm5, xmm5
xorps xmm6, xmm6
movlps xmm4, ssub //xmm4 = ssub
movlps xmm5, sstep //xmm5 = sstep
@foriDir: mov eax, PS
movlps xmm1, sMinRad // (1.2 at stepw1)
movlps xmm6, [eax] //xmm6 = PS[0,1]
mulps xmm1, xmm6
subps xmm1, xmm4 //sxy-ssub (-0,5 at stepw1)
mov eax, StepCount
mov sc, eax
// advance the random generator kept in edi
@while: imul edi, $000343FD
add edi, $269EC3
mov eax, edi
movaps xmm0, xmm1 //sx, sy
shr eax, 10
CVTSS2SI ecx, xmm0
mov ebx, eax
and ebx, iand
add ecx, ebx
shufps xmm0, xmm0, 1
shr eax, 6
CVTSS2SI ebx, xmm0
and eax, iand
add ebx, eax
// eax := x2*x2 + y2*y2 (ecx is saved around the squaring)
push ecx
mov eax, ebx
imul ecx, ecx
imul eax, eax
add eax, ecx
pop ecx
test eax, eax
jz @skip
CVTSI2SS xmm2, eax
add ecx, dword [xy]
add ebx, dword [xy+4]
test ecx, ecx // reflection at borders
jns @@1
// horizontal wrap-around (panorama)
add ecx, MWidth
test ecx, ecx
jns @@2
jmp @endwhile
@@1: cmp ecx, MWidth
jl @@2
sub ecx, MWidth
cmp ecx, MWidth
jnl @endwhile
@@2: test ebx, ebx
jns @@3
neg ebx
cmp ebx, HLo
jge @endwhile
jmp @con
@@3: cmp ebx, MHeight
jl @con
sub ebx, MH2
neg ebx
cmp ebx, HHi
jl @endwhile
@con: imul ebx, MWidth
add ebx, ecx
mov eax, [esi + ebx * 4] //PATL^[y2 * MWidth + x2]
sub eax, zp
CVTSI2SS xmm0, eax // (CVTPI2PS=sse, 2 int to single)
RSQRTSS xmm2, xmm2
mulss xmm0, xmm2 //st := (PATL^[y2 * MWidth + x2] - zp) / Sqrt(st);
movss xmm3, xmm0
mulss xmm0, xmm0
mulss xmm3, sit
addss xmm0, sZRT
mulss xmm3, sZRT
rcpss xmm0, xmm0
mulss xmm3, xmm0
minss xmm3, s32767
maxss xmm3, sm32768
CVTSS2SI eax, xmm3 //it := Round(st * sit * sZRT / (st * st + sZRT));
// keep the per-direction maximum (signed 16-bit compare)
mov ecx, iDir
cmp ax, word [edx + ecx * 2]
jle @skip
mov word [edx + ecx * 2], ax
@skip: movaps xmm3, xmm5 //sstep
mulps xmm3, xmm6 //DirXY
addps xmm1, xmm3 //sx,sy
dec sc
jnz @while
@endwhile: add PS, 8
dec iDir
jns @foriDir
mov seed, edi
pop edi
pop esi
pop edx
pop ecx
pop ebx
pop eax
end
// ...
// Tests the point vd against the first map of the global VolumeLightMap.
//   d   := vd - LightPos (converted to Single)
//   v   := RotMatrix row0 * d.x + row1 * d.y + row2 * d.z  (rows 16 bytes apart)
//   ix  := Round(clamp(v[0] * StretchSide1 + HSizeS, 0, CSizeS))
//   iy  := Round(clamp(v[1] * StretchSide1 + HSizeS, 0, CSizeS))
//   map := the pointer stored at VolumeLightMap.CubeSides
// Returns -1 (True) when v[2] compares below (or unordered with) the Single
// at map[iy * CubeSize + ix], otherwise 0.
// eax = vd. esi and edx are preserved; the pushed edx slot is the scratch
// buffer for the Double -> Single conversions.
function VolLightMapPosSSE(vd: TPVec3D): LongBool;
asm
push esi
push edx //to get esp buf
lea esi, VolumeLightMap
fld qword [eax]
fsub qword [esi + TVolumetricLightMap.LightPos]
fstp dword [esp]
fld qword [eax + 8]
movss xmm0, [esp]
fsub qword [esi + TVolumetricLightMap.LightPos + 8]
fstp dword [esp]
fld qword [eax + 16]
movss xmm1, [esp]
fsub qword [esi + TVolumetricLightMap.LightPos + 16]
fstp dword [esp]
// broadcast each difference to all four lanes
shufps xmm0, xmm0, 0
movss xmm2, [esp]
shufps xmm1, xmm1, 0
shufps xmm2, xmm2, 0
movups xmm4, [esi + TVolumetricLightMap.RotMatrix]
movups xmm5, [esi + TVolumetricLightMap.RotMatrix + 16]
movups xmm6, [esi + TVolumetricLightMap.RotMatrix + 32]
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
addps xmm4, xmm5
addps xmm4, xmm6
xorps xmm2, xmm2
// low lane of xmm5 := third component of the rotated vector
movhlps xmm5, xmm4
movss xmm1, [esi + TVolumetricLightMap.StretchSide1]
movss xmm3, [esi + TVolumetricLightMap.HSizeS]
movss xmm0, [esi + TVolumetricLightMap.CSizeS]
shufps xmm1, xmm1, 0
shufps xmm3, xmm3, 0
shufps xmm0, xmm0, 0
mulps xmm4, xmm1
addps xmm4, xmm3
maxps xmm4, xmm2
minps xmm4, xmm0
cvtss2si eax, xmm4
shufps xmm4, xmm4, 1
cvtss2si edx, xmm4
imul edx, dword [esi + TVolumetricLightMap.CubeSize]
mov esi, [esi + TVolumetricLightMap.CubeSides]
add edx, eax
xor eax, eax
comiss xmm5, [esi + edx * 4]
jnc @e
mov eax, -1
@e: pop edx
pop esi
end;
// Looks up a Single in one of six maps of the global VolumeLightMap, selected
// by the dominant axis of the vector vd (x, y, z = first three Singles).
//   side index (edx): 0/1 for x dominant, 2/3 for y, 4/5 for z; the odd index
//   is chosen when the dominant component is negative
//   f := SizeFactor / dominant component (signed)
//   (a, b) := the two remaining components in x,y,z order, each times f,
//             converted to integer
//   result := map[side][(b + HalfSize) * CubeSize + (a + HalfSize)]
// The value is returned in ST(0). eax = vd; esi and ebx are preserved.
// NOTE(review): there is no guard for a zero dominant component (division by
// zero) and no range check on the computed index.
function GetVolLightMapVecSSE(vd: TPSVec): Single;
asm
push esi
push ebx
xorps xmm4, xmm4
lea esi, VolumeLightMap
movups xmm5, [eax]
movups xmm7, cAbsSVec
// xmm0/xmm1/xmm2 low lanes := x / y / z
movaps xmm0, xmm5
movaps xmm1, xmm5
movhlps xmm2, xmm5
shufps xmm1, xmm1, 1
// xmm5/xmm6/xmm7 low lanes := |x| / |y| / |z|
andps xmm5, xmm7
movaps xmm6, xmm5
movhlps xmm7, xmm5
shufps xmm6, xmm6, 1
movss xmm3, [esi + TVolumetricLightMap.SizeFactor]
ucomiss xmm5, xmm6
jc @1
ucomiss xmm5, xmm7
jc @2
// x dominant
xor edx, edx
ucomiss xmm0, xmm4
adc edx, 0
@3: divss xmm3, xmm0
mulss xmm1, xmm3
mulss xmm2, xmm3
cvtss2si eax, xmm1
cvtss2si ebx, xmm2
jmp @e
// z dominant
@2: mov edx, 4
ucomiss xmm2, xmm4
adc edx, 0
@4: divss xmm3, xmm2
mulss xmm0, xmm3
mulss xmm1, xmm3
cvtss2si eax, xmm0
cvtss2si ebx, xmm1
jmp @e
@1: ucomiss xmm6, xmm7
jc @2
// y dominant
mov edx, 2
ucomiss xmm1, xmm4
adc edx, 0
@5: divss xmm3, xmm1
mulss xmm0, xmm3
mulss xmm2, xmm3
cvtss2si eax, xmm0
cvtss2si ebx, xmm2
@e: add ebx, [esi + TVolumetricLightMap.HalfSize]
add eax, [esi + TVolumetricLightMap.HalfSize]
imul ebx, dword [esi + TVolumetricLightMap.CubeSize]
mov esi, [esi + edx * 4 + TVolumetricLightMap.CubeSides]
add eax, ebx
fld dword [esi + eax * 4]
pop ebx
pop esi
end;
// Advances the object's seed (seed := seed * $343FD + $269EC3) and returns
// ((seed shr 8) and $7FFFFF) * dm in ST(0), i.e. a value in 0..1.
// In: eax = Self. Clobbers edx.
function TCalcAmbShadowDEThreadGeneral.GetRand: Double;
const dm: Double = 1 / $7FFFFF;
asm
imul edx, [eax + seed], $343FD
add edx, $269EC3
mov [eax + seed], edx
shr edx, 8
and edx, $7FFFFF
push edx // stack slot used only to feed fild
fild dword [esp]
fmul dm
pop edx
end;
// Advances the object's seed (seed := seed * $343FD + $269EC3) and returns
// ((seed shr 8) and $7FFFFF) * dm in ST(0), i.e. a value in 0..1.
// In: eax = Self. Clobbers edx.
function TCalcAmbShadowDEThreadGeneral2.GetRand: Double;
const dm: Double = 1 / $7FFFFF;
asm
imul edx, [eax + seed], $343FD
add edx, $269EC3
mov [eax + seed], edx
shr edx, 8
and edx, $7FFFFF
push edx // stack slot used only to feed fild
fild dword [esp]
fmul dm
pop edx
end;
// Returns the processor time-stamp counter (edx:eax, which is how a 32-bit
// Int64 result is returned). The instruction is emitted as raw opcode bytes.
function RdTsc: int64;
asm
db $0f, $31 // RdTsc
end;
// Returns i limited to at most 255 (signed compare). Values below 0 are
// returned unchanged -- there is no lower bound.
function Clamp255(i: Integer): Integer;
asm
cmp eax, 256
jl @done // i <= 255: keep it
mov eax, 255
@done:
end;
// Writes four cubic interpolation weights for parameter t to sv, each scaled
// by 6 (the four always sum to 6):
//   sv[0] = -t^3 + 3*t^2 - 2*t
//   sv[1] = 3*t^3 - 6*t^2 - 3*t + 6
//   sv[2] = -3*t^3 + 3*t^2 + 6*t
//   sv[3] = t^3 - t
// t is read from the stack at [ebp + 8]; eax = @sv.
// NOTE(review): some stack-state comments below label intermediate values as
// sv[2]/sv[3]; they denote 3*t^2 and t^3 at that point, not the final outputs.
procedure MakeCubicWeightsFromT(const t: Single; var sv: TSVec); //all weights 6 times bigger!
const s3: Single = 3;
s6: Single = 6;
asm
fld dword [ebp + 8]
fld st
fmul st, st //t*t,t
fld st
fmul st, st(2) //t³,t²,t
fld s3
fmul st(2), st //3, t³=sv[3], 3*t²=sv[2], t
fld st(2) //sv[2], 3, sv[3], sv[2], t
fsub st, st(2) //sv[2]-sv[3], 3, sv[3], sv[2], t
fsub st, st(4) //sv[2]-sv[3]-t, 3, sv[3], sv[2], t
fsub st, st(4) //sv[2]-sv[3]-2*t, 3, sv[3], sv[2], t
fstp dword [eax] //3, sv[3], sv[2], t
fld st(1) //sv[3], 3, sv[3], sv[2], t
fmul st, st(1) //3*sv[3], 3, sv[3], sv[2], t
fsub st, st(3) //3*sv[3]-sv[2], 3, sv[3], sv[2], t
fsub st, st(3) //3*sv[3]-2*sv[2], 3, sv[3], sv[2], t
// subtract 3*t and add 6 to finish sv[1]
fld st(4)
fmul st, st(2)
fsubp
fadd s6
fstp dword [eax + 4]
fmul st, st(1)
fsubp st(2), st //t³,3*t²-3*t³,t
fsub st, st(2)
fstp dword [eax + 12] //3*t²-3*t³,t
fxch
fmul s6
faddp
fstp dword [eax + 8]
end;
// Inline-asm part of GetCosTabVal (the Pascal code that sets up ip and w is
// elided in this listing).
//   base := @DiffCosTabNsmall + (Tnr * 128 + ip) * 4
//   a := sum over k = 0..3 of w[k] * Single at base + 4*k
//   b := sum over k = 0..3 of w[k] * Single at base + $800 + 4*k
//   Result := a + (b - a) * Rough
// Uses eax, edx and xmm0..xmm3.
function GetCosTabVal(const Tnr: Integer; const DotP, Rough: Single): Single;
// ...
asm
mov edx, Tnr
shl edx, 7
add edx, ip
lea eax, DiffCosTabNsmall + edx * 4
movups xmm2, w
movups xmm0, [eax]
movups xmm1, [eax + $800]
mulps xmm0, xmm2
mulps xmm1, xmm2
// horizontal sums: lane 0 of xmm3 ends up as a, lane 1 as b
movaps xmm3, xmm0
unpcklps xmm3, xmm1
unpckhps xmm0, xmm1
addps xmm3, xmm0
movhlps xmm0, xmm3
addps xmm3, xmm0
movaps xmm2, xmm3
shufps xmm2, xmm2, 1
subss xmm2, xmm3
mulss xmm2, Rough
addss xmm2, xmm3
movss Result, xmm2
end
// ...
// Same lookup as the non-squared variant, but both weighted sums are squared
// before they are blended.
// Only the asm part is shown in this excerpt: `ip` and `w` are set up in the
// elided Pascal code (presumably from DotP - TODO confirm against the full
// source).
// With A = sum of four consecutive table entries * w and B = the same sum
// taken $800 bytes further into the table, the result is
// A² + Rough * (B² - A²).
function GetCosTabValSqr(const Tnr: Integer; const DotP, Rough: Single): Single;
// ...
asm
mov edx, Tnr
shl edx, 7 // edx = Tnr * 128
add edx, ip // edx = Tnr * 128 + ip
lea eax, DiffCosTabNsmall + edx * 4 // address of entry edx (4 bytes per entry)
movups xmm2, w // four weights
movups xmm0, [eax] // a0..a3: four entries starting at the index
movups xmm1, [eax + $800] // b0..b3: four entries 2048 bytes further on
mulps xmm0, xmm2 // a(i) * w(i)
mulps xmm1, xmm2 // b(i) * w(i)
movaps xmm3, xmm0
unpcklps xmm3, xmm1 // xmm3 = a0, b0, a1, b1
unpckhps xmm0, xmm1 // xmm0 = a2, b2, a3, b3
addps xmm3, xmm0 // a0+a2, b0+b2, a1+a3, b1+b3
movhlps xmm0, xmm3 // move the upper pair down to lanes 0 and 1
addps xmm3, xmm0 // lane 0 = A (sum of a), lane 1 = B (sum of b)
mulps xmm3, xmm3 // lane 0 = A², lane 1 = B²
movaps xmm2, xmm3
shufps xmm2, xmm2, 1 // lane 0 of xmm2 = B²
subss xmm2, xmm3 // B² - A²
mulss xmm2, Rough // Rough * (B² - A²)
addss xmm2, xmm3 // A² + Rough * (B² - A²)
movss Result, xmm2
end
// ...
// Advances this thread's linear congruential generator and returns the new
// state, masked to 31 bits, multiplied by dSeedMul.
// dSeedMul is declared outside this block (presumably the reciprocal of the
// mask, giving a value in [0, 1] - TODO confirm).
//   In : EAX = Self (left unchanged)
//   Out: ST(0) = result
//   The only integer register modified is EDX.
function TMCCalcThread.GetRand: Double;
asm
  imul  edx, [eax + seed], $343FD     // state * 214013 ...
  add   edx, $269EC3                  // ... + 2531011
  mov   [eax + seed], edx             // store the advanced state
  and   edx, $7FFFFFFF                // clear the sign bit
  add   esp, -4                       // reserve a stack slot for FILD
  mov   [esp], edx
  fild  dword [esp]
  fmul  dSeedMul
  add   esp, 4                        // release the slot
end;
// Produces a unit-length direction on the full sphere from two input numbers
// u and v and writes it to the result record addressed by EDX:
//   [edx + 0]  = r * cos(u * PiM2)
//   [edx + 4]  = r * sin(u * PiM2)
//   [edx + 8]  = 1 - 2v
//   [edx + 12] = 0
// with r = 2 * sqrt(v * (1 - v)), which equals sqrt(1 - (1 - 2v)²).
// Input selection: if bDoDOF is zero, u = HaltonDiscX and v = HaltonDiscY are
// used; otherwise two GetRand calls supply them (first call -> v, second -> u).
// PiM2 is declared elsewhere (presumably 2*Pi - TODO confirm).
// In: EAX = Self, EDX = address of the result.
// Stack pictures in the comments list st(0) first.
function TMCCalcThread.GenSphereSVecOm: TSVec; //full sphere
asm
cmp dword [eax + TMCCalcThread.bDoDOF], 0
jnz @@1
fld dword [eax + TMCCalcThread.HaltonDiscY] //v
fld dword [eax + TMCCalcThread.HaltonDiscX] //u, v
jmp @@2
@@1:
push edx //GetRand overwrites EDX, which holds the result address
call GetRand //v (EAX still holds Self afterwards)
call GetRand //u, v
pop edx
@@2:
fmul PiM2 //u*PiM2, v
fsincos //cos, sin, v
fld1 //1, cos, sin, v
fsub st, st(3) //1-v, cos, sin, v
fmul st, st(3) //v*(1-v), cos, sin, v
fsqrt //sqrt(v*(1-v)), cos, sin, v
fadd st, st //r, cos, sin, v
fmul st(2), st //r, cos, r*sin, v
fmulp //r*cos, r*sin, v
fstp dword [edx] //stack: r*sin, v
fstp dword [edx + 4] //stack: v
fadd st, st //2v
fld1 //1, 2v
fsubrp //1-2v
fstp dword [edx + 8] //FPU stack is empty again
xor eax, eax
mov [edx + 12], eax //fourth component = 0
end;
// Reverses the order of the four bytes of a.
//   In : EAX = a
//   Out: EAX = a with bytes 0<->3 and 1<->2 exchanged
// Written as an exchange/rotate sequence; the value left in EAX is the same
// as what a single BSWAP EAX produces.
function ByteSwap(const a: integer): integer;
asm
  xchg  al, ah                        // b3 b2 b1 b0 -> b3 b2 b0 b1
  rol   eax, 16                       //             -> b0 b1 b3 b2
  xchg  al, ah                        //             -> b0 b1 b2 b3
end;
// Exchanges the two bytes of a 16-bit value.
//   In : AX = inp
//   Out: AX = byte-swapped value; the upper half of EAX is zero
function ByteSwap16(inp:word): word;
asm
  xchg  al, ah                        // swap low and high byte
  movzx eax, ax                       // clear bits 16..31
end;
// Builds a 4-byte quad from the three bytes RGB points to.
// Bytes 0..2 of the result are copied from RGB^ in order, byte 3 is zero.
// Exactly three bytes are read, so nothing beyond the pixel is touched.
//   In : EAX = Self (unused), EDX = RGB
//   Out: EAX = the quad
function TPngObject.RGB2Quad(RGB: pRGBPixel): TRGBQuad;
asm
  movzx eax, byte ptr [edx + 2]       // third byte, upper bits cleared
  shl   eax, 16                       // move it to bits 16..23
  mov   ax, [edx]                     // first two bytes into bits 0..15
end;
// Returns the dword stored at [EBP + 4].
// No frame is set up inside this asm body, so EBP is whatever the caller left
// in it.  NOTE(review): with a standard EBP frame in the caller, [EBP + 4] is
// the return address of the *calling* routine - confirm that every caller is
// compiled with a stack frame, otherwise the value is meaningless.
function ReturnAddr: Pointer;
asm
MOV EAX,[EBP+4] // sysutils.pas says [EBP-4], but [EBP+4] is what works here
end;
// Compares Length bytes at P1 and P2.
// Returns True when all bytes are equal (also when Length is 0), False at the
// first difference.  The bulk is compared dword-wise, the remaining 0..3 bytes
// byte-wise.
//   In : EAX = P1, EDX = P2, ECX = Length
//   Out: AL = result
// Assumes the direction flag is clear, as the string instructions walk upwards.
function CompareMem(P1, P2: Pointer; Length: Integer): Boolean; assembler;
asm
        PUSH    ESI
        PUSH    EDI
        MOV     ESI, P1
        MOV     EDI, P2
        XOR     EAX, EAX                // result := False
        MOV     EDX, ECX
        AND     EDX, 3                  // EDX = trailing byte count
        // The shift must stay the last flag-setting instruction before the
        // REPE: with a zero dword count CMPSD does not run and JNE sees the
        // ZF produced here (set, because the shifted count is zero).
        SHR     ECX, 2                  // ECX = dword count
        REPE    CMPSD
        JNE     @@Leave
        MOV     ECX, EDX                // MOV leaves the flags alone
        REPE    CMPSB
        JNE     @@Leave
        INC     EAX                     // result := True
@@Leave:
        POP     EDI
        POP     ESI
end;
// Fragment: the header and declarations of the enclosing routine are outside
// this excerpt; Colors, Count and SysInfo are declared there.
// For each of the Count 32-bit entries at Colors (processed from the last to
// the first) the three low bytes are reversed and the top byte is cleared, in
// place: $AABBCCDD becomes $00DDCCBB.
// When SysInfo reports processor level 3 the @@386 path is taken, which
// produces the same result without using BSWAP.
GetSystemInfo(SysInfo);
asm
MOV EDX, Colors
MOV ECX, Count
DEC ECX // index of the last entry
JS @@END // Count <= 0: nothing to do
LEA EAX, SysInfo
CMP [EAX].TSystemInfo.wProcessorLevel, 3
JE @@386
@@1: MOV EAX, [EDX+ECX*4]
BSWAP EAX // AA BB CC DD -> DD CC BB AA
SHR EAX,8 // -> 00 DD CC BB
MOV [EDX+ECX*4],EAX
DEC ECX
JNS @@1 // loop while the index is >= 0
JMP @@END
@@386:
PUSH EBX // EBX is used as scratch below and restored afterwards
@@2: XOR EBX,EBX
MOV EAX, [EDX+ECX*4]
MOV BH, AL
MOV BL, AH // EBX = 00 00 DD CC
SHR EAX,16 // AL = BB
SHL EBX,8 // EBX = 00 DD CC 00
MOV BL, AL // EBX = 00 DD CC BB
MOV [EDX+ECX*4],EBX
DEC ECX
JNS @@2 // loop while the index is >= 0
POP EBX
@@END:
end;
// Reports whether the byte Value occurs in the first Count bytes at Buf.
//   In : EAX = Buf, DL = Value, ECX = Count
//   Out: AL = True if found, False otherwise
// A Count of zero or less returns False without touching Buf.  REPNE SCASB
// with ECX = 0 executes no iteration and leaves the flags unchanged, so the
// branch after it would otherwise test flags from unrelated earlier code.
// Assumes the direction flag is clear, as SCASB walks upwards.
function Scan(Buf: PAnsiChar; Value: Byte; Count: integer): boolean; assembler;
asm
        PUSH    EDI
        MOV     EDI, Buf
        MOV     ECX, Count
        MOV     AL, Value
        TEST    ECX, ECX
        JLE     @@NotFound              // nothing to scan
        REPNE   SCASB                   // stops with ZF set on a match
        JNE     @@NotFound
        MOV     EAX, True
        JMP     @@Done
@@NotFound:
        XOR     EAX, EAX                // False
@@Done:
        POP     EDI
end;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment